# Model Selection - Classification

Trying out several classification models and selecting the one that seems to perform best. (I honestly have no idea what I'm doing.)

## Setup

In [606]:
import csv
import json
import os
import time
from datetime import datetime

from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from pandas import DataFrame

# suppress future warnings for xgboost and unsupported kanji unicode in plots
# NOTE(review): both filters match entire warning categories for the whole
# session, not only the two cases named above. scikit-learn's
# ConvergenceWarning derives from UserWarning, so solver non-convergence in
# the cells below is silenced as well — consider narrowing the filters
# (e.g. with `message=` or `module=`).
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

import xgboost as xgb
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.model_selection import cross_val_score,GridSearchCV,train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [607]:
# Notebook-wide settings.
SEED = 1337                         # random_state shared by the splits and the seeded models
data_dir = os.path.abspath('data')  # absolute path of ./data (relative to the working directory)

# Date stamp (YYYYMMDD) embedded in the data file names. It is pinned to a
# fixed snapshot; swap in the commented line to use the current date instead.
# today = datetime.today().strftime('%Y%m%d')
today = datetime(2022, 9, 27).strftime('%Y%m%d')

In [608]:
# util to display general classifier performance
def print_model_results(y_test, y_pred):
    """Print the confusion matrix and the classification report for a prediction.

    Args:
        y_test: true labels.
        y_pred: predicted labels, in the same order as `y_test`.

    Reading the report:
        precision: true positives / (true positives + false positives)
        recall:    true positives / (true positives + false negatives)
        f1 score:  harmonic mean of precision and recall; closer to one is better
        support:   number of occurrences of the class in `y_test`
    """
    # scikit-learn convention: rows are true classes, columns are predicted classes
    confusion = metrics.confusion_matrix(y_test, y_pred)
    print(confusion)

    report = metrics.classification_report(y_test, y_pred, digits=6)
    print(report)

## Encoding / Feature Engineering

In [609]:
# binary encode a column of categorical data
def binary_encode(df: DataFrame, col: str) -> DataFrame:
    """Replace `col` with one 0/1 indicator column per distinct category.

    A cell of `col` may hold a single value, a list of values, or a JSON array
    serialized as text. Every distinct value becomes a column named
    `<col>_<value>` that is set for the rows containing that value.

    Args:
        df:  input frame; must contain an `id` column and `col`. It is not modified.
        col: name of the categorical column to encode.

    Returns:
        A new frame with one row per distinct `id` (sorted by `id`, fresh
        0..n-1 index), `id` as the first column, `col` removed and the
        indicator columns appended. Rows sharing an `id` are merged by taking
        the column-wise maximum.

    Raises:
        KeyError: if `col` or `id` is missing from `df`.
    """
    # Work on a re-indexed copy: the caller's frame stays untouched and the
    # index is guaranteed unique, which the row-wise alignment below relies on.
    frame = df.reset_index(drop=True)

    try:
        values = frame[col].apply(json.loads)  # deserialize JSON arrays
    except (ValueError, TypeError):
        # Best effort: the column is not (entirely) JSON text — e.g. plain
        # labels, numbers or already-parsed lists — so encode the raw values.
        values = frame[col]

    # One indicator row per (source row, category) pair ...
    indicators = pd.get_dummies(values.explode(), prefix=col, prefix_sep='_')
    # ... collapsed back to exactly one row per source row before joining, so
    # the concat below never has to align against a duplicated index.
    indicators = indicators.groupby(level=0).max()

    encoded = pd.concat([frame.drop(columns=[col]), indicators], axis=1)
    return encoded.groupby('id').max().reset_index()  # squash rows sharing an id

In [610]:
# collapsing 1-10 score to [BAD,EH,GOOD] rating...to make prediction easier
def score_to_rating(score: int) -> str:
    """Map a 1-10 score onto a coarse three-level rating.

    Args:
        score: numeric score; integers are expected, fractional values are
            bucketed by the same thresholds.

    Returns:
        'BAD' for scores up to 5, 'EH' for scores above 5 and up to 7,
        'GOOD' for anything higher.
    """
    if score <= 5:
        return 'BAD'
    # Upper bound only: the previous `6 <= score <= 7` test left a gap, so a
    # fractional score such as 5.5 matched no branch and came out as 'GOOD'.
    if score <= 7:
        return 'EH'
    # NOTE: NaN compares false against every threshold and still lands here.
    return 'GOOD'

def would_recommend(score: int) -> bool:
    """Binary target: False for scores below 7, True otherwise."""
    return not (score < 7)

In [611]:
# load the enriched dataset for the pinned snapshot date
enriched_path = os.path.join(data_dir, f'user-{today}-enriched.csv')
enriched_df = pd.read_csv(enriched_path)

In [612]:
# combine less common studios to OTHER

# rare_studios = enriched_df.filter(['id','studios']).value_counts().reset_index(name='count').query('count == 1')['id']
# enriched_df['studios'] = enriched_df['studios'].apply(lambda s: 'OTHER' if s in rare_studios else s)

# Note: didn't seem to help much

In [613]:
# categorical columns that get expanded into indicator columns further down
cols = ['genres', 'studios', 'tags', 'source', 'season_year', 'format']

enriched_df = (
    enriched_df
    # drop unused features
    .drop(columns=['episodes', 'season'])
    # derive the binary target from the score (see would_recommend) ...
    .assign(recommend=lambda frame: frame['score'].apply(would_recommend))
    # ... and remove the raw score so it cannot be used as a feature
    .drop(columns=['score'])
)

In [614]:
# Expand every categorical column listed in `cols` into 0/1 indicator columns.
# binary_encode collapses its result back to one row per `id`, so each entity
# stays a single row instead of being spread over one row per category value
# (which could put the same entity on both sides of a train/test split).

for col in cols:
    enriched_df = binary_encode(enriched_df, col)

# `id` was only needed as the grouping key while encoding; it is not a feature
enriched_df = enriched_df.drop(columns=['id'])

print(enriched_df.shape)

(556, 458)


## Split Data

In [615]:
# split data into train,test,validation sets (60% / 20% / 20% of the rows)

X = enriched_df.drop(columns=['recommend'])
y = enriched_df['recommend']

# step 1: hold out 20% as the test split (stratify=y is not enabled)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=SEED, shuffle=True,
)
# step 2: split the remaining 80% into 75% train / 25% validation
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, train_size=0.75, random_state=SEED, shuffle=True,
)

print('X_train =', X_train.shape, '; y_train =', y_train.shape)
print('X_valid =', X_valid.shape, '; y_valid =', y_valid.shape)
print('X_test =', X_test.shape, '; y_test =', y_test.shape)

# save to CSV for quick glancing data (feature matrices only; labels are not written)
for split_name, features in (('train', X_train), ('valid', X_valid), ('test', X_test)):
    features.to_csv(os.path.join(data_dir, f'user-{today}-cls-{split_name}.csv'), index=False)

X_train = (333, 457) ; y_train = (333,)
X_valid = (111, 457) ; y_valid = (111,)
X_test = (112, 457) ; y_test = (112,)


## Logistic Regression

In [662]:
%%time

# Baseline: LogisticRegression with library-default hyperparameters (only the
# seed is fixed), trained on the train split and scored on the test split.

model = LogisticRegression(random_state=SEED).fit(X_train, y_train)
y_pred = model.predict(X_test)

# eyeball the first ten true labels against the first ten predictions
print('Actual:     ', y_test.iloc[:10].tolist())
print('Predictions:', y_pred[:10].tolist())
print_model_results(y_test, y_pred)

Actual:      [True, False, False, False, True, True, False, True, False, True]
Predictions: [False, True, False, False, True, True, False, True, True, True]
[[24  7]
 [15 66]]
              precision    recall  f1-score   support

       False   0.615385  0.774194  0.685714        31
        True   0.904110  0.814815  0.857143        81

    accuracy                       0.803571       112
   macro avg   0.759747  0.794504  0.771429       112
weighted avg   0.824195  0.803571  0.809694       112

CPU times: total: 125 ms
Wall time: 38.5 ms


In [663]:
%%time

# Exhaustive grid search (5-fold CV) over LogisticRegression hyperparameters.
#   https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
#   https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# The estimator searched is whatever `model` currently holds, and the search is
# fitted on the validation split (X_valid / y_valid), not on the train split.

hyper_best = None  # a non-empty params dict here skips the search below

if not hyper_best:
    # untried alternatives:
    #   'penalty': ['l2']
    #   'penalty': ['l1'] together with 'solver': ['liblinear', 'saga']
    param_grid = dict(
        solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        max_iter=[5, 10, 25, 50, 100, 250, 500],
        C=[0.01, 0.1, 1.0, 10.0, 100.0],
    )
    hyper_search = GridSearchCV(model, param_grid, cv=5, verbose=1, n_jobs=4)
    hyper_search.fit(X_valid, y_valid)
    hyper_best = hyper_search.best_params_

print('Best hyperparameters =', hyper_best)

Fitting 5 folds for each of 175 candidates, totalling 875 fits
Best hyperparameters = {'C': 10.0, 'max_iter': 5, 'solver': 'sag'}
CPU times: total: 406 ms
Wall time: 6.93 s


In [664]:
%%time

# Refit LogisticRegression on the train split using the hyperparameters in
# `hyper_best`, then score it on the test split for comparison with the baseline.

model = LogisticRegression(**hyper_best, random_state=SEED).fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Actual:     ', y_test.iloc[:10].tolist())
print('Predictions:', y_pred[:10].tolist())
print_model_results(y_test, y_pred)

Actual:      [True, False, False, False, True, True, False, True, False, True]
Predictions: [True, True, True, False, True, True, False, True, False, True]
[[23  8]
 [13 68]]
              precision    recall  f1-score   support

       False   0.638889  0.741935  0.686567        31
        True   0.894737  0.839506  0.866242        81

    accuracy                       0.812500       112
   macro avg   0.766813  0.790721  0.776405       112
weighted avg   0.823922  0.812500  0.816511       112

CPU times: total: 15.6 ms
Wall time: 23 ms


## Stochastic Gradient Descent Classifier

In [619]:
%%time

# Baseline: SGDClassifier with library-default hyperparameters (only the seed
# is fixed), trained on the train split and scored on the test split.

model = SGDClassifier(random_state=SEED).fit(X_train, y_train)
y_pred = model.predict(X_test)

# eyeball the first ten true labels against the first ten predictions
print('Actual:     ', y_test.iloc[:10].tolist())
print('Predictions:', y_pred[:10].tolist())
print_model_results(y_test, y_pred)

Actual:      [True, False, False, False, True, True, False, True, False, True]
Predictions: [False, True, False, False, True, True, False, True, True, True]
[[24  7]
 [21 60]]
              precision    recall  f1-score   support

       False   0.533333  0.774194  0.631579        31
        True   0.895522  0.740741  0.810811        81

    accuracy                       0.750000       112
   macro avg   0.714428  0.757467  0.721195       112
weighted avg   0.795274  0.750000  0.761202       112

CPU times: total: 109 ms
Wall time: 23.5 ms


In [620]:
%%time

# Exhaustive grid search (5-fold CV) over SGDClassifier hyperparameters.
#   https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
#   https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# The estimator searched is whatever `model` currently holds, and the search is
# fitted on the validation split (X_valid / y_valid), not on the train split.
# NOTE(review): the 'log' loss name was renamed to 'log_loss' in newer
# scikit-learn releases — confirm against the installed version.

hyper_best = None  # a non-empty params dict here skips the search below

if not hyper_best:
    # untried alternatives:
    #   'learning_rate': ['constant', 'optimal', 'adaptive']
    #   'eta0': [0.5, 1, 2, 4, 8, 16, 32]
    param_grid = dict(
        loss=['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_error', 'huber'],
        penalty=['l2', 'l1', 'elasticnet'],
        max_iter=[2, 4, 8, 16, 32, 64, 128],
    )
    hyper_search = GridSearchCV(model, param_grid, cv=5, verbose=1)
    hyper_search.fit(X_valid, y_valid)
    hyper_best = hyper_search.best_params_

print('Best hyperparameters =', hyper_best)

Fitting 5 folds for each of 147 candidates, totalling 735 fits
Best hyperparameters = {'loss': 'squared_hinge', 'max_iter': 4, 'penalty': 'l2'}
CPU times: total: 6.75 s
Wall time: 6.42 s


In [621]:
%%time

# Refit SGDClassifier on the train split using the hyperparameters in
# `hyper_best`, then score it on the test split for comparison with the baseline.

model = SGDClassifier(**hyper_best, random_state=SEED).fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Actual:     ', y_test.iloc[:10].tolist())
print('Predictions:', y_pred[:10].tolist())
print_model_results(y_test, y_pred)

Actual:      [True, False, False, False, True, True, False, True, False, True]
Predictions: [False, True, False, False, True, True, False, True, True, True]
[[22  9]
 [16 65]]
              precision    recall  f1-score   support

       False   0.578947  0.709677  0.637681        31
        True   0.878378  0.802469  0.838710        81

    accuracy                       0.776786       112
   macro avg   0.728663  0.756073  0.738195       112
weighted avg   0.795500  0.776786  0.783068       112

CPU times: total: 15.6 ms
Wall time: 18.5 ms


## KNN

In [622]:
%%time

# Baseline: KNeighborsClassifier with library-default hyperparameters,
# trained on the train split and scored on the test split.

model = KNeighborsClassifier().fit(X_train, y_train)
y_pred = model.predict(X_test)

# eyeball the first ten true labels against the first ten predictions
print('Actual:     ', y_test.iloc[:10].tolist())
print('Predictions:', y_pred[:10].tolist())
print_model_results(y_test, y_pred)

Actual:      [True, False, False, False, True, True, False, True, False, True]
Predictions: [True, False, False, False, True, False, False, True, False, True]
[[25  6]
 [26 55]]
              precision    recall  f1-score   support

       False   0.490196  0.806452  0.609756        31
        True   0.901639  0.679012  0.774648        81

    accuracy                       0.714286       112
   macro avg   0.695918  0.742732  0.692202       112
weighted avg   0.787758  0.714286  0.729008       112

CPU times: total: 109 ms
Wall time: 26 ms


In [623]:
%%time

# Exhaustive grid search (5-fold CV) over KNeighborsClassifier hyperparameters.
#   https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
#   https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# The estimator searched is whatever `model` currently holds, and the search is
# fitted on the validation split (X_valid / y_valid), not on the train split.

hyper_best = None  # a non-empty params dict here skips the search below

if not hyper_best:
    param_grid = dict(
        n_neighbors=[5, 10, 15, 20, 25],
        algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
        leaf_size=[10, 20, 30, 40, 50],
    )
    hyper_search = GridSearchCV(model, param_grid, cv=5, verbose=1)
    hyper_search.fit(X_valid, y_valid)
    hyper_best = hyper_search.best_params_

print('Best hyperparameters =', hyper_best)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best hyperparameters = {'algorithm': 'kd_tree', 'leaf_size': 20, 'n_neighbors': 20}
CPU times: total: 13.4 s
Wall time: 5.49 s


In [624]:
%%time

# Refit KNeighborsClassifier on the train split using the hyperparameters in
# `hyper_best`, then score it on the test split for comparison with the baseline.

model = KNeighborsClassifier(**hyper_best).fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Actual:     ', y_test.iloc[:10].tolist())
print('Predictions:', y_pred[:10].tolist())
print_model_results(y_test, y_pred)

Actual:      [True, False, False, False, True, True, False, True, False, True]
Predictions: [False, False, False, False, False, True, False, False, False, True]
[[28  3]
 [41 40]]
              precision    recall  f1-score   support

       False   0.405797  0.903226  0.560000        31
        True   0.930233  0.493827  0.645161        81

    accuracy                       0.607143       112
   macro avg   0.668015  0.698526  0.602581       112
weighted avg   0.785076  0.607143  0.621590       112

CPU times: total: 234 ms
Wall time: 89 ms


## Decision Tree

In [668]:
%%time

# Baseline: DecisionTreeClassifier with library-default hyperparameters (only
# the seed is fixed), trained on the train split and scored on the test split.

model = DecisionTreeClassifier(random_state=SEED).fit(X_train, y_train)
y_pred = model.predict(X_test)

# eyeball the first ten true labels against the first ten predictions
print('Actual:     ', y_test.iloc[:10].tolist())
print('Predictions:', y_pred[:10].tolist())
print_model_results(y_test, y_pred)

Actual:      [True, False, False, False, True, True, False, True, False, True]
Predictions: [True, True, False, False, False, True, True, True, True, False]
[[18 13]
 [22 59]]
              precision    recall  f1-score   support

       False   0.450000  0.580645  0.507042        31
        True   0.819444  0.728395  0.771242        81

    accuracy                       0.687500       112
   macro avg   0.634722  0.654520  0.639142       112
weighted avg   0.717187  0.687500  0.698115       112

CPU times: total: 15.6 ms
Wall time: 19 ms


In [669]:
%%time

# Exhaustive grid search (5-fold CV) over DecisionTreeClassifier hyperparameters.
#   https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
#   https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# The estimator searched is whatever `model` currently holds, and the search is
# fitted on the validation split (X_valid / y_valid), not on the train split.

hyper_best = None  # a non-empty params dict here skips the search below

if not hyper_best:
    # untried alternatives:
    #   'criterion': ['gini', 'entropy']
    #   'splitter': ['best', 'random']
    #   'max_features': ['auto', 'sqrt', 'log2']
    #   'min_samples_leaf': [1, 2, 3, 4]
    param_grid = dict(
        max_depth=[2, 4, 8, 16, 32],
        max_leaf_nodes=[2, 4, 8, 16, 32],
    )
    hyper_search = GridSearchCV(model, param_grid, cv=5, verbose=1)
    hyper_search.fit(X_valid, y_valid)
    hyper_best = hyper_search.best_params_

print('Best hyperparameters =', hyper_best)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best hyperparameters = {'max_depth': 2, 'max_leaf_nodes': 2}
CPU times: total: 1.03 s
Wall time: 1.04 s


In [670]:
%%time

# Refit DecisionTreeClassifier on the train split using the hyperparameters in
# `hyper_best`, then score it on the test split for comparison with the baseline.

model = DecisionTreeClassifier(**hyper_best, random_state=SEED).fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Actual:     ', y_test.iloc[:10].tolist())
print('Predictions:', y_pred[:10].tolist())
print_model_results(y_test, y_pred)

Actual:      [True, False, False, False, True, True, False, True, False, True]
Predictions: [True, True, True, True, True, True, True, True, True, True]
[[ 0 31]
 [ 0 81]]
              precision    recall  f1-score   support

       False   0.000000  0.000000  0.000000        31
        True   0.723214  1.000000  0.839378        81

    accuracy                       0.723214       112
   macro avg   0.361607  0.500000  0.419689       112
weighted avg   0.523039  0.723214  0.607050       112

CPU times: total: 15.6 ms
Wall time: 13.5 ms


## Random Forest

In [698]:
%%time

# Baseline: RandomForestClassifier with library-default hyperparameters (only
# the seed is fixed), trained on the train split and scored on the test split.

model = RandomForestClassifier(random_state=SEED).fit(X_train, y_train)
y_pred = model.predict(X_test)

# eyeball the first ten true labels against the first ten predictions
print('Actual:     ', y_test.iloc[:10].tolist())
print('Predictions:', y_pred[:10].tolist())
print_model_results(y_test, y_pred)

Actual:      [True, False, False, False, True, True, False, True, False, True]
Predictions: [True, True, True, False, True, True, False, True, True, True]
[[15 16]
 [ 6 75]]
              precision    recall  f1-score   support

       False   0.714286  0.483871  0.576923        31
        True   0.824176  0.925926  0.872093        81

    accuracy                       0.803571       112
   macro avg   0.769231  0.704898  0.724508       112
weighted avg   0.793760  0.803571  0.790394       112

CPU times: total: 156 ms
Wall time: 158 ms


In [699]:
%%time

# Exhaustive grid search (5-fold CV) over RandomForestClassifier hyperparameters.
#   https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
#   https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# The estimator searched is whatever `model` currently holds, and the search is
# fitted on the validation split (X_valid / y_valid), not on the train split.

hyper_best = None  # a non-empty params dict here skips the search below

if not hyper_best:
    # untried alternatives:
    #   'n_estimators': [75, 100, 125]
    #   'max_features': ['sqrt', 'log2', None]
    #   'criterion': ['gini', 'entropy']
    #   'class_weight': ['balanced', 'balanced_subsample']
    #   'min_samples_split': [10, 15, 25, 50]
    #   'min_samples_leaf': [5, 10, 15]
    #   'oob_score': [False, True]
    #   'max_leaf_nodes': [2, 4, 8, 16, 32, 64]
    param_grid = dict(
        max_depth=[1, 3, 5, 7, 9],
        max_features=[8, 16, 32, 64, 128],
    )
    hyper_search = GridSearchCV(model, param_grid, cv=5, verbose=1, n_jobs=4)
    hyper_search.fit(X_valid, y_valid)
    hyper_best = hyper_search.best_params_

print('Best hyperparameters =', hyper_best)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best hyperparameters = {'max_depth': 3, 'max_features': 64}
CPU times: total: 312 ms
Wall time: 4.85 s


In [700]:
%%time

# Refit RandomForestClassifier on the train split using the hyperparameters in
# `hyper_best`, then score it on the test split for comparison with the baseline.

model = RandomForestClassifier(**hyper_best, random_state=SEED).fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Actual:     ', y_test.iloc[:10].tolist())
print('Predictions:', y_pred[:10].tolist())
print_model_results(y_test, y_pred)

Actual:      [True, False, False, False, True, True, False, True, False, True]
Predictions: [True, True, True, True, True, True, True, True, True, True]
[[ 2 29]
 [ 0 81]]
              precision    recall  f1-score   support

       False   1.000000  0.064516  0.121212        31
        True   0.736364  1.000000  0.848168        81

    accuracy                       0.741071       112
   macro avg   0.868182  0.532258  0.484690       112
weighted avg   0.809334  0.741071  0.646957       112

CPU times: total: 141 ms
Wall time: 146 ms


## Gradient Boosting Classifier

In [733]:
%%time

# Baseline: GradientBoostingClassifier with library-default hyperparameters
# (only the seed is fixed), trained on the train split and scored on the test split.

model = GradientBoostingClassifier(random_state=SEED).fit(X_train, y_train)
y_pred = model.predict(X_test)

# eyeball the first ten true labels against the first ten predictions
print('Actual:     ', y_test.iloc[:10].tolist())
print('Predictions:', y_pred[:10].tolist())
print_model_results(y_test, y_pred)

Actual:      [True, False, False, False, True, True, False, True, False, True]
Predictions: [True, True, True, False, False, True, False, True, False, True]
[[21 10]
 [11 70]]
              precision    recall  f1-score   support

       False   0.656250  0.677419  0.666667        31
        True   0.875000  0.864198  0.869565        81

    accuracy                       0.812500       112
   macro avg   0.765625  0.770808  0.768116       112
weighted avg   0.814453  0.812500  0.813406       112

CPU times: total: 281 ms
Wall time: 281 ms


In [734]:
%%time

# Exhaustive grid search (5-fold CV) over GradientBoostingClassifier hyperparameters.
#   https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
#   https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# The estimator searched is whatever `model` currently holds, and the search is
# fitted on the validation split (X_valid / y_valid), not on the train split.

hyper_best = None  # a non-empty params dict here skips the search below

if not hyper_best:
    # untried alternatives:
    #   'criterion': ['friedman_mse', 'squared_error', 'mse']
    #   'max_features': ['auto', 'sqrt', 'log2']
    param_grid = dict(
        learning_rate=[0.1, 1, 10],
        n_estimators=[5, 50, 100, 250, 500],
        max_depth=[5, 7, 9, 11, 13],
        max_features=[64, 128, 256],
    )
    hyper_search = GridSearchCV(model, param_grid, cv=5, verbose=1, n_jobs=4)
    hyper_search.fit(X_valid, y_valid)
    hyper_best = hyper_search.best_params_

print('Best hyperparameters =', hyper_best)

Fitting 5 folds for each of 225 candidates, totalling 1125 fits
Best hyperparameters = {'learning_rate': 0.1, 'max_depth': 9, 'max_features': 256, 'n_estimators': 50}
CPU times: total: 1.69 s
Wall time: 42.3 s


In [735]:
%%time

# Refit GradientBoostingClassifier on the train split using the hyperparameters
# in `hyper_best`, then score it on the test split for comparison with the baseline.

model = GradientBoostingClassifier(**hyper_best, random_state=SEED).fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Actual:     ', y_test.iloc[:10].tolist())
print('Predictions:', y_pred[:10].tolist())
print_model_results(y_test, y_pred)

Actual:      [True, False, False, False, True, True, False, True, False, True]
Predictions: [True, True, False, False, False, True, False, True, False, True]
[[23  8]
 [11 70]]
              precision    recall  f1-score   support

       False   0.676471  0.741935  0.707692        31
        True   0.897436  0.864198  0.880503        81

    accuracy                       0.830357       112
   macro avg   0.786953  0.803067  0.794098       112
weighted avg   0.836276  0.830357  0.832672       112

CPU times: total: 281 ms
Wall time: 323 ms


## XGBoost Classifier

In [634]:
%%time

# Baseline: XGBClassifier with library-default hyperparameters (only the seed
# is fixed), trained on the train split and scored on the test split.
#
# eval_metric: the target (`recommend`) has two classes, so the binary
# 'logloss' metric is used; 'mlogloss' is the multiclass variant. No eval_set
# is passed to fit(), so the metric is not evaluated in this cell.

model = xgb.XGBClassifier(random_state=SEED, eval_metric='logloss')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# eyeball the first ten true labels against the first ten predictions
print('Actual:     ', y_test.head(10).tolist())
print('Predictions:', y_pred.tolist()[0:10])
print_model_results(y_test, y_pred)

Actual:      [True, False, False, False, True, True, False, True, False, True]
Predictions: [True, True, False, False, False, True, False, True, True, True]
[[21 10]
 [13 68]]
              precision    recall  f1-score   support

       False   0.617647  0.677419  0.646154        31
        True   0.871795  0.839506  0.855346        81

    accuracy                       0.794643       112
   macro avg   0.744721  0.758463  0.750750       112
weighted avg   0.801450  0.794643  0.797445       112

CPU times: total: 2.16 s
Wall time: 392 ms


In [635]:
%%time

# Exhaustive grid search (5-fold CV) over XGBClassifier hyperparameters.
#   https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier
#   https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# The estimator searched is whatever `model` currently holds, and the search is
# fitted on the validation split (X_valid / y_valid), not on the train split.
# NOTE(review): the grid as written has 5 x 5 = 25 candidates, but the output
# recorded under this cell reports 125 candidates and an `n_estimators` value.
# It appears to come from a run where `n_estimators` was still part of the
# grid, so re-running this cell may select different parameters.

hyper_best = None  # a non-empty params dict here skips the search below

if not hyper_best:
    # currently disabled:
    #   'n_estimators': [5, 50, 100, 250, 500]
    param_grid = dict(
        learning_rate=[0.01, 0.1, 1, 10, 100],
        max_depth=[1, 3, 5, 7, 9],
    )
    hyper_search = GridSearchCV(model, param_grid, cv=5, verbose=1)
    hyper_search.fit(X_valid, y_valid)
    hyper_best = hyper_search.best_params_

print('Best hyperparameters =', hyper_best)

Fitting 5 folds for each of 125 candidates, totalling 625 fits
Best hyperparameters = {'learning_rate': 1, 'max_depth': 5, 'n_estimators': 500}
CPU times: total: 28min 27s
Wall time: 4min 55s


In [636]:
%%time

# Refit XGBClassifier on the train split using the hyperparameters in
# `hyper_best`, then score it on the test split for comparison with the baseline.
#
# eval_metric: the target (`recommend`) has two classes, so the binary
# 'logloss' metric is used; 'mlogloss' is the multiclass variant. No eval_set
# is passed to fit(), so the metric is not evaluated in this cell.

model = xgb.XGBClassifier(**hyper_best, random_state=SEED, eval_metric='logloss')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('Actual:     ', y_test.head(10).tolist())
print('Predictions:', y_pred.tolist()[0:10])
print_model_results(y_test, y_pred)

Actual:      [True, False, False, False, True, True, False, True, False, True]
Predictions: [False, True, False, False, False, True, False, True, True, True]
[[20 11]
 [18 63]]
              precision    recall  f1-score   support

       False   0.526316  0.645161  0.579710        31
        True   0.851351  0.777778  0.812903        81

    accuracy                       0.741071       112
   macro avg   0.688834  0.711470  0.696307       112
weighted avg   0.761386  0.741071  0.748359       112

CPU times: total: 9.8 s
Wall time: 1.65 s


## Conclusion

TODO: