## Models

We chose to use Naive Bayes, Decision Tree, and K Nearest Neighbors classifiers to predict our data.

In [32]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

def nb_model(x_train, x_test, y_train, y_test, var_smoothing=1e-9):
    model = GaussianNB(var_smoothing=var_smoothing)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    score = model.score(x_test, y_test)
    return y_pred, score

def tree_model(x_train, x_test, y_train, y_test, max_depth=None, criterion='gini'):
    model = DecisionTreeClassifier(max_depth=max_depth, criterion=criterion)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    score = model.score(x_test, y_test)
    return y_pred, score


def knn_model(x_train, x_test, y_train, y_test, weights='uniform', leaf_size=30, p=2):
    model = KNeighborsClassifier(weights=weights, leaf_size=leaf_size, p=p)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    score = model.score(x_test, y_test)
    return y_pred, score



Data is split for training and testing.

In [34]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('data/speed_dating_select.csv')
target = 'match'
desc_features = list(data.columns)
desc_features.remove(target)

x = data[desc_features]
y = data[target]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.25, random_state=42)

## Optimization

### Hyperparameter Optimization (Grid Search with Accuracy Score)

Log loss parameter was introdiced in scikit-learn version 1.1.2; please make sure your scikit-learn package is up to date before running.

In [35]:
def optimize_tree():
    depths = []
    crits = []
    scores = []
    for depth in [i for i in range(1,15)]:
        for crit in ['gini', 'entropy', 'log_loss']:
            _, score = tree_model(x_train, x_test, y_train, y_test, max_depth=depth, criterion=crit)
            scores.append(score)
            depths.append(depth)
            crits.append(crit)
    best = scores.index(max(scores))
    print(f'Maximum score {scores[best]} at depth {depths[best]} using {crits[best]}')
    return {
        'max_depth': depths[best],
        'criterion': crits[best]
    }

tree_params = optimize_tree()

Maximum score 0.8509822712026833 at depth 13 using gini


In [36]:
def optimize_knn():
    weights = []
    leaf_sizes = []
    ps = []
    scores = []
    for weight in ['uniform', 'distance']:
        for p in [1,2,3,4]:
            for leaf_size in [10, 30, 50, 100]:
                _, score = knn_model(x_train, x_test, y_train, y_test, weights=weight, leaf_size=leaf_size, p=p)
                scores.append(score)
                weights.append(weight)
                leaf_sizes.append(leaf_size)
                ps.append(p)
    best = scores.index(max(scores))
    print(f'Maximum score {scores[best]} with weights {weights[best]}, leaf size {leaf_sizes[best]} and p={ps[best]}')
    return {
        'weights': weights[best],
        'leaf_size': leaf_sizes[best],
        'p': ps[best]
    }

knn_params = optimize_knn()

Maximum score 0.8112122664111164 with weights uniform, leaf size 10 and p=1


In [37]:
def optimize_nb():
    smooths = []
    scores = []
    for smoothing in [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10, 1e-11]:
        _, score = nb_model(x_train, x_test, y_train, y_test, var_smoothing=smoothing)
        smooths.append(smoothing)
        scores.append(score)
    best = scores.index(max(scores))
    print(f'Maximum score {scores[best]} with smoothing {smooths[best]}')
    return {
        'var_smoothing': smooths[best]
    }

nb_params = optimize_nb()

Maximum score 0.8279827503593675 with smoothing 0.1


In [43]:
y_pred, score = nb_model(x_train, x_test, y_train, y_test, **nb_params)

print(score)

y_pred, score = tree_model(x_train, x_test, y_train, y_test, **tree_params)

print(score)

y_pred, score = knn_model(x_train, x_test, y_train, y_test, **knn_params)

print(score)

0.8279827503593675
0.8457115476760901
0.8112122664111164


## Evaluation