In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,roc_auc_score
from sklearn.svm import SVC
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load data
data = pd.read_csv('Online_Dating_Behavior_Dataset.csv')

In [3]:
data.head()

Unnamed: 0,Gender,PurchasedVIP,Income,Children,Age,Attractiveness,Matches
0,0,1,51777,3,47,5,70
1,1,0,36646,0,42,7,130
2,0,0,53801,1,25,5,0
3,0,0,56105,0,35,8,0
4,0,0,55597,1,36,6,0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   Gender          1000 non-null   int64
 1   PurchasedVIP    1000 non-null   int64
 2   Income          1000 non-null   int64
 3   Children        1000 non-null   int64
 4   Age             1000 non-null   int64
 5   Attractiveness  1000 non-null   int64
 6   Matches         1000 non-null   int64
dtypes: int64(7)
memory usage: 54.8 KB


In [5]:
# is null?
isnull = data.isnull().sum()
isnull

Gender            0
PurchasedVIP      0
Income            0
Children          0
Age               0
Attractiveness    0
Matches           0
dtype: int64

In [6]:
# Preprocess data
selected_features = [
    'Gender', 'PurchasedVIP', 'Income', 'Age',
    'Children', 'Attractiveness'
]
X = data[selected_features]
Y = data[['Matches']]


In [7]:
# Scaling
scaler = MinMaxScaler()
X[['Gender', 'Age', 'PurchasedVIP', 'Income', 'Children', 'Attractiveness']] = scaler.fit_transform(X[['Gender', 'Age', 'PurchasedVIP', 'Income', 'Children', 'Attractiveness']])


In [8]:
# Split data
train_X, val_X, train_Y, val_Y = train_test_split(
    X, Y, random_state=0, train_size=0.8
)



In [14]:
# Train model
model = KNeighborsClassifier(n_neighbors=2, metric='euclidean', weights='uniform', algorithm='auto', leaf_size=50, p=2)
model.fit(train_X, train_Y)

In [15]:
# Evaluate model
val_prediction = model.predict(val_X)
Y_pred_proba = model.predict_proba(val_X)[:,1]
accuracy = accuracy_score(val_Y, val_prediction)
print(f'Model accuracy: {accuracy}')

Model accuracy: 0.68


In [11]:
print(confusion_matrix(val_Y, val_prediction))
print(classification_report(val_Y, val_prediction))

[[40  0  0  0  0  0  0  0  0  0  0]
 [ 0 49  2  0  0  0  0  0  0  0  0]
 [ 0  5  5  0  1  0  0  0  0  0  0]
 [ 0  1  6  5  1  0  0  0  0  0  0]
 [ 0  0  0  7  7  0  0  0  0  0  0]
 [ 0  0  1  2  7  3  0  1  0  0  0]
 [ 0  0  0  0  0  6  7  2  0  0  0]
 [ 0  0  0  0  0  1  7  5  0  1  0]
 [ 0  0  0  0  0  0  0  3  0  2  0]
 [ 0  0  0  0  0  0  0  0  2 10  0]
 [ 0  0  0  0  0  0  0  1  2  3  5]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        40
          70       0.89      0.96      0.92        51
          80       0.36      0.45      0.40        11
          90       0.36      0.38      0.37        13
         100       0.44      0.50      0.47        14
         110       0.30      0.21      0.25        14
         120       0.50      0.47      0.48        15
         130       0.42      0.36      0.38        14
         140       0.00      0.00      0.00         5
         150       0.62      0.83      0.71        12
        

In [12]:
auc = roc_auc_score(val_Y, Y_pred_proba)
print(auc)

ValueError: multi_class must be in ('ovo', 'ovr')

In [13]:
# Save model
joblib.dump(model, 'dating_model.pkl')

['dating_model.pkl']