In [40]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

#defining the model
nb = CategoricalNB()

In [41]:
df_training_activity = pd.read_csv('../data/archive/train_pca.csv')
df_testing_activity = pd.read_csv('../data/archive/test_pca.csv')

In [42]:
#Have to first load in original features so we can scale them 
df_training = pd.read_csv('../data/archive/train.csv')
df_testing = pd.read_csv('../data/archive/test.csv')

In [43]:
x_training = df_training.iloc[:,:-2]
x_testing = df_testing.iloc[:,:-2]

In [44]:
scaler = MinMaxScaler()

x_test = df_testing.iloc[:, :-1]
y_test = df_testing.iloc[:,-1]

In [29]:
#Now we have scaled them to be between 0 and 1, and we can convert them to PCA
x_training.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,551,552,553,554,555,556,557,558,559,560
0,0.644292,0.489853,0.433547,0.002361,0.008748,0.043237,0.002444,0.008546,0.038236,0.032638,...,0.475511,0.350991,0.14794,0.43703,0.5152,0.267793,0.491741,0.079377,0.798251,0.470687
1,0.639209,0.491795,0.43824,0.000877,0.012824,0.019839,0.000596,0.012749,0.021157,0.028466,...,0.594891,0.201676,0.070666,0.521131,0.496283,0.133774,0.853429,0.077606,0.798487,0.472842
2,0.639827,0.490266,0.443269,0.00231,0.017058,0.010528,0.00174,0.018464,0.011266,0.030654,...,0.726616,0.304605,0.122488,0.434094,0.58895,0.550707,0.906041,0.075533,0.798722,0.475441
3,0.639587,0.4869,0.438359,0.001954,0.008596,0.004662,0.00145,0.008767,0.005349,0.030654,...,0.721515,0.442374,0.264192,0.475464,0.493554,0.820538,0.257822,0.075675,0.7996,0.476168
4,0.638314,0.491715,0.442319,0.000931,0.009945,0.004759,0.000839,0.010331,0.004779,0.028765,...,0.558768,0.324393,0.153612,0.556466,0.561271,0.847339,0.192392,0.076067,0.801776,0.478054


In [31]:
pca = PCA(n_components = 40)

pca.fit(x_training)

x_training = pca.transform(x_training)
x_testing = pca.transform(x_testing)

x_training = pd.DataFrame(x_training)
x_testing = pd.DataFrame(x_testing)

In [32]:
#Now we have successfully fit the pca to both training and testing
x_training.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,-2.788729,-0.306267,-0.791485,0.794359,-0.693789,-0.041354,0.229888,0.411134,-0.52466,-0.215664,...,-0.57982,-0.28645,-0.188469,-0.138483,0.323477,-0.409058,0.114507,-0.062163,0.133459,-0.074963
1,-2.783821,-0.209484,-0.995852,0.373836,-0.296267,0.347477,-0.239007,-0.45716,-0.31612,-0.456797,...,-0.153565,0.054424,-0.04584,-0.253236,-0.187346,0.075533,-0.039927,-0.078495,-0.062283,0.130094
2,-2.753392,-0.028101,-1.126889,0.285697,-0.080894,-0.000106,0.041177,0.033117,-0.012706,-0.112034,...,0.217723,-0.042256,0.134062,-0.222337,0.146976,0.055712,-0.297752,-0.21105,0.082555,0.064711
3,-2.854574,0.326918,-1.103524,0.104164,-0.35761,0.532167,-0.124968,-0.381702,0.081263,0.222316,...,0.017576,-0.141325,0.388283,-0.064696,0.208816,0.000142,-0.046567,-0.011367,-0.011509,-0.261614
4,-2.891715,0.468004,-1.184149,0.118531,-0.217459,0.230438,-0.101686,-0.077527,0.130354,0.264381,...,0.138579,0.039666,0.027968,-0.093668,0.169453,0.276978,-0.114002,0.144861,-0.068293,0.109491


In [46]:
x_train = x_training
y_train = df_training_activity.iloc[:,-1]

x_test = x_testing
y_test = df_testing_activity.iloc[:,-1]

In [47]:
#Making a grid of values we want our grid search to test to find the best parameters

grid_values = [{'alpha': [.01,.05,.1,1] , 'min_categories': list(range(10,100,10))}]

nb_classifier = GridSearchCV(estimator = nb, param_grid = grid_values, cv = StratifiedKFold(n_splits = 5),
                      scoring = ['accuracy', 'precision_micro', 'f1_micro'], refit = False, verbose = 0)

nb_model = nb_classifier.fit(x_train, y_train)

In [48]:
#initializing our lists so we can later store the best params that achieve best scores across metrics
accuracy_best_params = []
roc_auc_best_params = []
f1_best_params = []

accuracy_best_params.append(nb_model.cv_results_['params'][ np.argmin(nb_model.cv_results_['rank_test_accuracy'])])
roc_auc_best_params.append(nb_model.cv_results_['params'][ np.argmin(nb_model.cv_results_['rank_test_precision_micro'])])
f1_best_params.append(nb_model.cv_results_['params'][ np.argmin(nb_model.cv_results_['rank_test_f1_micro']) ])

In [49]:
accuracy_best_params

[{'alpha': 1, 'min_categories': 30}]

In [50]:
roc_auc_best_params

[{'alpha': 1, 'min_categories': 30}]

In [51]:
f1_best_params

[{'alpha': 1, 'min_categories': 30}]

In [52]:
#finding the accuracy metrics
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
accuracy_test_score = []

nb_clf = CategoricalNB(alpha= .01, min_categories = 10)
model = nb_clf.fit(x_train, y_train)

y_pred = model.predict(x_test)

accuracy_test_score.append(accuracy_score(y_test, y_pred))

In [53]:
accuracy_test_score

[0.1985069562266712]