# Classifiers

## Imports

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

## Data Prep

In [3]:
# Load the prepared feature table and outcome table. Each CSV carries an
# "Unnamed: 0" column (presumably a row index written out when the file was
# saved -- confirm against the prep step); it is removed right after reading.
features = pd.read_csv("../data/prepped/modeling_features.csv").drop(
    columns=["Unnamed: 0"]
)
labels = pd.read_csv("../data/prepped/modeling_outcome.csv").drop(
    columns=["Unnamed: 0"]
)

In [4]:
# Turn the continuous rating into a categorical label: snap every value to the
# nearest multiple of 0.25 and store it as a string (e.g. 3.3 -> "3.25").
#
# float() keeps this cell safe to re-run. After the first execution the column
# already holds strings; without the cast, `i * 4` would repeat the string and
# round() would raise a TypeError. With it, a second run reproduces the same
# labels.
#
# Note: round() sends exact halves to the nearest even integer, so a rating that
# sits exactly midway between two steps may move down (3.125 -> "3.0") or up
# (3.375 -> "3.5").
labels["rating_value"] = [
    str(round(float(i) * 4) / 4) for i in labels["rating_value"]
]

In [5]:
# Hold out 25% of the rows as a test set; the seed is fixed so the same rows
# are selected on every run.
(
    train_features,
    test_features,
    train_outcome,
    test_outcome,
) = train_test_split(
    features, labels["rating_value"], random_state=42, test_size=0.25
)

# Ten shuffled folds with a fixed seed, used as the cross-validation splitter.
folds = KFold(shuffle=True, n_splits=10, random_state=42)

## KNN Classifier

In [6]:
# Fit a k-nearest-neighbours classifier. Scaling lives inside the pipeline so
# the scaler is fitted together with the classifier on whatever data the search
# passes in. The step names are the lowercased class names that make_pipeline
# would generate; the parameter-grid key below depends on them.
pipeline = Pipeline(
    steps=[
        ("minmaxscaler", MinMaxScaler()),
        ("kneighborsclassifier", KNeighborsClassifier()),
    ]
)

# Single-candidate grid: the search only cross-validates k = 10.
pipeline_params = {"kneighborsclassifier__n_neighbors": [10]}

model = GridSearchCV(estimator=pipeline, param_grid=pipeline_params, cv=folds)
model.fit(train_features, train_outcome)

# Score the fitted search on the held-out test split.
score = model.score(test_features, test_outcome)

In [7]:
# Report the KNN test-set score and the parameters selected by the search.
for caption, value in (
    ("model score:", score),
    ("best params:", model.best_params_),
):
    print(caption, value)

model score: 0.5304878048780488
best params: {'kneighborsclassifier__n_neighbors': 10}


## MLP Classifier

In [8]:
# Build the scaler + multi-layer perceptron pipeline. The step names are the
# lowercased class names that make_pipeline would generate; the parameter-grid
# keys below depend on them.
pipeline = Pipeline(
    steps=[
        ("minmaxscaler", MinMaxScaler()),
        ("mlpclassifier", MLPClassifier()),
    ]
)

# Every entry has a single value, so the "grid" is one fixed configuration that
# gets cross-validated once per fold.
pipeline_params = {
    # network shape and non-linearity
    "mlpclassifier__hidden_layer_sizes": [100],
    "mlpclassifier__activation": ["relu"],
    # optimiser and regularisation
    "mlpclassifier__solver": ["adam"],
    "mlpclassifier__alpha": [0.001],
    "mlpclassifier__learning_rate": ["constant"],
    # fixed seed for repeatable weight initialisation
    "mlpclassifier__random_state": [42],
    # NOTE(review): 0.07 is far from scikit-learn's default beta_1 of 0.9 for
    # Adam -- confirm this value is intentional rather than a typo.
    "mlpclassifier__beta_1": [0.07],
    "mlpclassifier__beta_2": [0.999],
    "mlpclassifier__early_stopping": [False],
}

# Run the cross-validated search (verbose so fold progress is printed), then
# score the fitted search on the held-out test split.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=pipeline_params,
    cv=folds,
    verbose=True,
)
model.fit(train_features, train_outcome)
score = model.score(test_features, test_outcome)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    7.8s finished


In [9]:
# Report the MLP test-set score and the parameters selected by the search.
for caption, value in (
    ("model score:", score),
    ("best params:", model.best_params_),
):
    print(caption, value)

model score: 0.5853658536585366
best params: {'mlpclassifier__activation': 'relu', 'mlpclassifier__alpha': 0.001, 'mlpclassifier__beta_1': 0.07, 'mlpclassifier__beta_2': 0.999, 'mlpclassifier__early_stopping': False, 'mlpclassifier__hidden_layer_sizes': 100, 'mlpclassifier__learning_rate': 'constant', 'mlpclassifier__random_state': 42, 'mlpclassifier__solver': 'adam'}
