# ML Notebook 4: Classification
## Regression Classification and Multi-Class Classification

In [2]:
import pandas as pd
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import SVR

from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Model 1: Regression classification using our visual feature topic matrix as features

In [8]:
# Load the train and test tables; pandas decompresses the zipped CSVs on read
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/SCUT-FBP5500_v2/train_bovw.zip',
        'data/SCUT-FBP5500_v2/test_bovw.zip',
    )
)

In [9]:
# Features are every column from the third onward; the target is the 'rating' column
X_train, y_train = train.iloc[:, 2:], train['rating']
X_test, y_test = test.iloc[:, 2:], test['rating']

In [16]:
# Sanity check: number of labels in the test split
y_test.shape

(2200,)

In [17]:
# NOTE(review): in the visible notebook y_pred is first assigned inside the
# grid-search cell further down, so this cell raises NameError on a fresh kernel
# unless that cell has already been run.
y_pred.shape

(2200,)

In [22]:
# Grid search over two regressors (SVR and random forest), scored with negated
# mean squared error, followed by pickling the best search and a results summary.

# Pipelines (scaler + estimator).
# NOTE(review): these pipelines are NOT passed to the grid searches below, which
# tune the bare estimators on unscaled features. Kept so any later cell that
# refers to them still works.
pipe_rf = Pipeline([('scl', StandardScaler()),
                    ('clf', RandomForestRegressor(random_state=42))])

pipe_svr = Pipeline([('scl', StandardScaler()),
                     ('clf', SVR())])

# Set grid search params
param_range = [9, 10, 20, 30]
param_range_fl = [1.0, 0.5]
estimators = [200, 250, 300, 350]

grid_params_rf = [{'max_depth': param_range,
                   'n_estimators': estimators,
                   'min_samples_split': param_range[1:]}]

grid_params_svr = [{'kernel': ['rbf'],
                    'C': param_range_fl}]

# Construct grid searches
jobs = -1  # let scikit-learn use every available core

# criterion must be exactly 'squared_error'; a stray character inside the string
# makes scikit-learn reject the estimator when it is fitted.
gs_rf = GridSearchCV(estimator=RandomForestRegressor(random_state=42, criterion='squared_error'),
                     param_grid=grid_params_rf,
                     scoring='neg_mean_squared_error',
                     cv=5,
                     n_jobs=jobs)
gs_svr = GridSearchCV(estimator=SVR(),
                      param_grid=grid_params_svr,
                      scoring='neg_mean_squared_error',
                      cv=5,
                      n_jobs=jobs)

# Grid searches to run, in order; grid_dict maps each position to a display name
grids = [gs_svr, gs_rf]
grid_dict = {0: 'Support Vector Machine', 1: 'Random Forest'}

# name -> [cv_results_, best_params_, best CV score, test score]
grid_dict_results = {}

# Fit the grid search objects
print('Performing model optimizations...')
# Scores are NEGATED mean squared errors: always <= 0, and higher (closer to 0)
# is better. Starting from -inf guarantees the first model is recorded; starting
# from 0.0 would mean no model is ever selected and nothing useful is pickled.
best_acc = float('-inf')
best_clf = 0
best_gs = None
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    gs.fit(X_train, y_train)
    print('Best params are : %s' % gs.best_params_)
    # Best cross-validated score (negated MSE) found on the training data
    print('Best training mean squared error score: %.3f' % gs.best_score_)
    # Predictions of the refit best estimator; kept in y_pred for later inspection
    y_pred = gs.predict(X_test)
    # Test-set score of the best estimator, using the same negated-MSE scorer
    acc = gs.score(X_test, y_test)
    print('Test set mean squared error  for best params: %.3f ' % acc)
    grid_dict_results[grid_dict[idx]] = [gs.cv_results_, gs.best_params_, gs.best_score_, acc]
    # Track the model with the highest (least negative) test score
    if acc > best_acc:
        best_acc = acc
        best_gs = gs
        best_clf = idx
print('\nClassifier with best test set score ( mean squared error ): %s' % grid_dict[best_clf])

# Save the best grid search object and the results summary to file
gs_dump_file = 'best_grid_search_pipeline.pkl'
dict_dump_file = 'ml_gs_dict.pkl'
path1 = 'data/SCUT-FBP5500_v2/' + gs_dump_file
path2 = 'data/SCUT-FBP5500_v2/' + dict_dump_file
with open(path1, 'wb') as f:
    pickle.dump(best_gs, f)
with open(path2, 'wb') as f:
    pickle.dump(grid_dict_results, f)
print('\nSaved %s grid search pipeline to file: %s' % (grid_dict[best_clf], gs_dump_file))

Performing model optimizations...

Estimator: Support Vector Machine
Best params are : {'C': 1.0, 'kernel': 'rbf'}
Best training mean squared error score: -0.331
Test set mean squared error  for best params: -0.313 

Estimator: Random Forest
Best params are : {'max_depth': 30, 'min_samples_split': 10, 'n_estimators': 350}
Best training mean squared error score: -0.393
Test set mean squared error  for best params: -0.364 

Classifier with best test set score ( mean squared error ): Support Vector Machine

Saved Support Vector Machine grid search pipeline to file: best_grid_search_pipeline.pkl


# mean squared error for best model: 0.313 
# ----------------------------------------

# Model 2: Multi-Class classification using all of our features

In [None]:
# if using google colab for outsourced processing uncomment this cell

## Pandas needs to be updated on google drive to read in pickle files
# !pip install --upgrade pandas

# from google.colab import drive
# drive.mount('/drive')

## TRAINING

In [None]:
# This cell puts the model into `clf`.
# The model can be re-loaded at various training stages because a checkpoint is
# pickled after every training batch.

# First run only: start from a fresh classifier instead of loading a checkpoint
#clf = PassiveAggressiveClassifier(random_state=42, n_jobs=-1, warm_start=True)

# Later runs: load the pickled checkpoint.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
# The `with` block makes sure the file handle is closed after loading.
with open('/drive/My Drive/Milestone 2 Project/ML_data/trained_PAC_CLF_batch_0013_model.pkl', 'rb') as model_file:
    clf = pickle.load(model_file)

In [None]:
def trainer(filename, batch_num):
    """
    Run one incremental training pass on a single batch file, then checkpoint the model.

    The storage locations are hard-coded below and may need updating for a
    different storage set up.

    Params:
    filename: name of the batch data file, e.g. train_batch_000.zip
    batch_num: number embedded in the name of the pickled checkpoint

    Return:
    None. The classifier held in the global `clf` is updated via partial_fit
    and pickled to disk.
    """
    batch = pd.read_csv("/drive/My Drive/Milestone 2 Project/ML_data/" + filename)

    # Ratings are rounded to whole numbers because they serve as class labels
    labels = batch['rating'].round(0).astype(int)
    # The first two columns (rating and filename) are not features
    features = batch.iloc[:, 2:]

    # partial_fit is told the full label set [1,2,3,4,5] on every call
    clf.partial_fit(features, labels, classes=[1, 2, 3, 4, 5])

    # Checkpoint the updated classifier
    checkpoint = f"/drive/My Drive/Milestone 2 Project/ML_data/trained_PAC_CLF_batch_00{batch_num}_model.pkl"
    with open(checkpoint, 'wb') as out:
        pickle.dump(clf, out)

    print(f"training {filename} complete")


In [None]:
# Train on batches 0..13 in order. Every call updates the global `clf` and
# writes a checkpoint, so to resume after an interruption raise the start of
# the range to the first batch that has not been trained yet.
# File names follow the pattern train_batch_00<N>.zip
# (train_batch_000.zip ... train_batch_009.zip, train_batch_0010.zip ... train_batch_0013.zip).
for batch_num in range(14):
    trainer(f'train_batch_00{batch_num}.zip', batch_num)

## TESTING

In [None]:
# load in the classifier that we want to test
#clf = pickle.load(open('/drive/My Drive/Milestone 2 Project/ML_data/trained_PAC_CLF_batch_0013_model.pkl', 'rb'))


# Maps a caller-chosen key to a DataFrame of filename, true rating and predicted rating
test_scores = {}


def testing_func(filename, ix):
    """
    Score one test batch with the global `clf` and store the result in `test_scores`.

    Params:
    filename: name of the test batch file, e.g. test_batch_001.zip
    ix: key under which the resulting DataFrame is stored in `test_scores`

    Return:
    None. `test_scores[ix]` is set to a DataFrame with the columns
    'filename', 'rating' and 'pred_rating'.
    """
    batch = pd.read_csv('/drive/My Drive/Milestone 2 Project/ML_data/' + filename)

    # True label: rating cast to float, rounded, then cast to int
    # Classification classes are [1,2,3,4,5]
    true_rating = batch['rating'].astype('float').round(0).astype('int')

    # Features start at the third column
    predicted = clf.predict(batch.iloc[:, 2:])

    test_scores[ix] = pd.DataFrame.from_dict(
        {'filename': batch['filename'], 'rating': true_rating, 'pred_rating': predicted}
    )


In [None]:
# Score test batches 1..8; results are stored in test_scores under keys 0..7.
# File names follow the pattern test_batch_00<N>.zip.
for ix in range(8):
    testing_func(f'test_batch_00{ix + 1}.zip', ix)

In [None]:
# Save output
# with open(f"/drive/My Drive/Milestone 2 Project/ML_data/test_scores.pkl", 'wb') as out:
#  pickle.dump(test_scores,out)

In [None]:
# To reuse previously saved scores instead of re-running the test cells, uncomment:
# with open("/drive/My Drive/Milestone 2 Project/ML_data/test_scores.pkl", 'rb') as f:
#     test_scores = pickle.load(f)

# Stack the per-batch result frames held in test_scores into a single frame.
# (Iterating `test` here would pick up the Model 1 test DataFrame, whose
# `.values` attribute is an array and cannot be called.)
# A single concat over all frames avoids re-copying the growing frame on each step.
score_frames = list(test_scores.values())
test_scores_df = pd.concat(score_frames) if score_frames else pd.DataFrame()

In [None]:
# Share of test rows whose predicted class equals the rounded rating
pac_clf_accu_scr = accuracy_score(
    y_true=test_scores_df['rating'],
    y_pred=test_scores_df['pred_rating'],
)
# 0.3030769230769231
pac_clf_accu_scr

In [None]:
# Confusion matrix of true vs. predicted rating class over all test batches
rating_classes = [1, 2, 3, 4, 5]
cm = confusion_matrix(test_scores_df.rating, test_scores_df.pred_rating, labels=rating_classes)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=rating_classes)
# Cells hold integer counts, so render them without a decimal part
disp.plot(values_format='d')
# Take the accuracy from the computed score so the title cannot go stale after retraining
plt.title(f'Passive Aggressive Classifier Accuracy = {pac_clf_accu_scr:.1%}')
plt.show()