# Imports and Load Data

In [None]:
# Standard scientific Python imports
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns # for visualisation
import numpy as np

from numpy import mean
from numpy import std
from numpy import arange

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import RFE

from statsmodels.stats.outliers_influence import variance_inflation_factor


# Global matplotlib font settings applied to every figure in the notebook
font = dict(size=18)
plt.rc('font', **font)

In [None]:
# Load the training labels/features and the test features.
# NOTE(review): `rep` is an absolute, machine-specific folder; adjust it to
# wherever the CSV files live before running on another machine.
rep = '/Users/Cherry0904/Desktop/MSc/Practicals/SML Practical/'

# `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0.
# Squeezing the single-column frame afterwards yields the same Series.
y_train = pd.read_csv(rep + 'y_train.csv', index_col=0).squeeze('columns')

# Three header rows are read, so the feature columns form a 3-level MultiIndex
# (each column label is a tuple).
X_train = pd.read_csv(rep + 'X_train.csv', index_col=0, header=[0, 1, 2])
X_test = pd.read_csv(rep + 'X_test.csv', index_col=0, header=[0, 1, 2])

# Features and labels side by side in a single frame
Xy = pd.concat([X_train, y_train], axis=1)

# Standardise: the scaler is fitted on the training features only and then
# applied unchanged to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sd = scaler.transform(X_train)
X_test_sd = scaler.transform(X_test)

# Functions

The Export Function:

In [None]:
def export_to_csv(y_hat, filename):
    """Write predictions to a CSV file in the layout uploaded to Kaggle.

    The file has a single data column named 'Genre' holding ``y_hat`` and an
    index column named 'Id'.

    Parameters
    ----------
    y_hat : array-like
        Predicted labels, one per row.
    filename : str or path-like
        Destination of the CSV file.
    """
    predictions = pd.DataFrame(data={'Genre': y_hat}).rename_axis('Id')
    predictions.to_csv(filename)

Define the function that assesses performance for each classifier, using three repeats of 4-fold CV:

In [None]:
def cv_clf(clf, X, y):
    """Print the cross-validated accuracy of ``clf`` on ``(X, y)``.

    Uses three repeats of stratified 4-fold cross-validation (fixed seed 15)
    and prints the mean and standard deviation of the held-out fold
    accuracies. Nothing is returned.
    """
    folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=15)
    outcome = cross_validate(
        clf, X, y,
        scoring='accuracy',
        cv=folds,
        n_jobs=-1,
        return_train_score=True,
    )
    # Only the held-out scores are reported
    held_out = outcome['test_score']
    summary = (mean(held_out), std(held_out))
    print('Mean Testing Accuracy: %.3f (%.3f)' % summary)

Define the grid search function for tuning parameters:

In [None]:
def grid_search(clf, X, y, parameter, values):
    """Tune one hyper-parameter of ``clf`` by cross-validated grid search.

    Parameters
    ----------
    clf : estimator
        The classifier (or pipeline) to tune.
    X, y : array-like
        Features and labels used for the search.
    parameter : str
        Name of the parameter to vary.
    values : sequence
        Candidate values for ``parameter``.

    Prints the best mean validation accuracy and the configuration that
    achieved it. Nothing is returned.
    """
    # Single-parameter grid
    param_grid = {parameter: values}
    # Three repeats of stratified 4-fold CV with a fixed seed
    folds = RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=1)
    fitted = GridSearchCV(
        clf, param_grid, scoring='accuracy', cv=folds, n_jobs=-1
    ).fit(X, y)
    # Report the winning configuration
    print('Mean Validation Accuracy: %.3f' % fitted.best_score_)
    print('Config: %s' % fitted.best_params_)

# Naive Bayes 

Note: For Feature Selection:
- ANOVA correlation method does not improve test accuracy
- Cannot apply the RFE method, as RFE() requires an estimator that exposes `coef_` or `feature_importances_`, which GaussianNB does not provide
- Did not try removing collinear variables, as NB is not affected by collinearity

### On Untransformed data

In [None]:
# Baseline Gaussian NB, cross-validated on the raw (unscaled) features;
# the result on scaled data is worse.
nb = GaussianNB()
cv_clf(clf=nb, X=X_train, y=y_train)

Mean Testing Accuracy: 0.415 (0.014)


### Feature Selection - ANOVA<br>
Select the k=300 features with the highest ANOVA F-scores with respect to the output class.

In [None]:
# Hold out 20% of the (raw) training data for validation
X_tr, X_te, y_tr, y_te = train_test_split(
    X_train, y_train, test_size=0.20, random_state=15
)

# Standardise: fit the scaler on the training part only, then apply to both parts
scaler = StandardScaler().fit(X_tr)
X_tr_sd = scaler.transform(X_tr)
X_te_sd = scaler.transform(X_te)

In [None]:
# ANOVA feature selection: keep the 300 features with the highest F-score.
# The selector is fitted on the training part only; the hold-out part is then
# reduced to the same 300 columns with `transform`. Refitting on the hold-out
# part would use its labels and could select a different set of columns, so
# the classifier would be scored on features it was not trained on.
fs = SelectKBest(score_func=f_classif, k=300)
X_tr_sd = fs.fit_transform(X_tr_sd, y_tr)
X_te_sd = fs.transform(X_te_sd)

In [None]:
# Fit Gaussian NB on the selected training features and report hold-out accuracy
nb = GaussianNB().fit(X_tr_sd, y_tr)
nb.score(X_te_sd, y_te)

0.3358333333333333

### Project data on PCA/LDA components

Projecting the data onto the LDA discriminant coordinates improves the test accuracy significantly.

In [None]:
# Hold out 20% of the (raw) training data for validation
X_tr, X_te, y_tr, y_te = train_test_split(
    X_train, y_train, test_size=0.20, random_state=15
)

# Standardise: fit the scaler on the training part only, then apply to both parts
scaler = StandardScaler().fit(X_tr)
X_tr_sd = scaler.transform(X_tr)
X_te_sd = scaler.transform(X_te)

# PCA: learn the first k principal components from the training part,
# then project both parts onto them
k = 10
PC = PCA(n_components=k).fit(X_tr_sd)
ZPC_tr = PC.transform(X_tr_sd)
ZPC_te = PC.transform(X_te_sd)

# LDA: learn 7 discriminant coordinates from the training part (supervised),
# then project both parts onto them
LDA = LinearDiscriminantAnalysis(n_components=7).fit(X_tr_sd, y_tr)
ZLDA_tr = LDA.transform(X_tr_sd)
ZLDA_te = LDA.transform(X_te_sd)

In [None]:
nb = GaussianNB()

# Gaussian NB on the PCA projections (PCA does badly)
ts_score = nb.fit(ZPC_tr, y_tr).score(ZPC_te, y_te)
print("First", k, "PCs:", ts_score)

# Same classifier refitted on the LDA projections
ts_score = nb.fit(ZLDA_tr, y_tr).score(ZLDA_te, y_te)
print("LDA Components:", ts_score)

First 10 PCs: 0.39
LDA Components: 0.5425


### Experiment with Other Scalers

Tried MinMaxScaler and QuantileTransformer. Overall, quantile transform to normal slightly improves the mean test accuracy from 0.561 to 0.565.

0. LDA + Standardised data

In [None]:
# Standardised data: StandardScaler -> LDA (7 components) -> Gaussian NB
trans = StandardScaler()
LDA = LinearDiscriminantAnalysis(n_components=7)
model = GaussianNB()
pipeline = Pipeline(steps=[
    ('t', trans),
    ('lda', LDA),
    ('m', model),
])

# Score the whole pipeline with three repeats of stratified 4-fold CV
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=15)
n_scores = cross_val_score(
    pipeline, X_train, y_train,
    scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise',
)

print("Mean Testing Accuracy:", np.mean(n_scores))

Mean Testing Accuracy: 0.5607777777777777


1. LDA + Normalised data:

In [None]:
# Normalised data: MinMaxScaler -> LDA (7 components) -> Gaussian NB
trans = MinMaxScaler()
LDA = LinearDiscriminantAnalysis(n_components=7)
model = GaussianNB()
pipeline = Pipeline(steps=[
    ('t', trans),
    ('lda', LDA),
    ('m', model),
])

# Score the whole pipeline with three repeats of stratified 4-fold CV
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=15)
n_scores = cross_val_score(
    pipeline, X_train, y_train,
    scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise',
)

print("Mean Testing Accuracy:", np.mean(n_scores))

Mean Testing Accuracy: 0.5607777777777777


2. LDA + Uniform-Scaled data:

In [None]:
# Uniform-scaled data: QuantileTransformer(uniform) -> LDA (7 components) -> Gaussian NB
trans = QuantileTransformer(output_distribution='uniform')
LDA = LinearDiscriminantAnalysis(n_components=7)
model = GaussianNB()
pipeline = Pipeline(steps=[
    ('t', trans),
    ('lda', LDA),
    ('m', model),
])

# Score the whole pipeline with three repeats of stratified 4-fold CV
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=15)
n_scores = cross_val_score(
    pipeline, X_train, y_train,
    scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise',
)

print("Mean Testing Accuracy:", np.mean(n_scores))

Mean Testing Accuracy: 0.5592222222222222


3. LDA + Normal-Scaled data:

In [None]:
# Normal-scaled data: QuantileTransformer(normal) -> LDA (7 components) -> Gaussian NB
trans = QuantileTransformer(output_distribution='normal')
LDA = LinearDiscriminantAnalysis(n_components=7)
model = GaussianNB()
pipeline = Pipeline(steps=[
    ('t', trans),
    ('lda', LDA),
    ('m', model),
])

# Score the whole pipeline with three repeats of stratified 4-fold CV
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=15)
n_scores = cross_val_score(
    pipeline, X_train, y_train,
    scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise',
)

print("Mean Testing Accuracy:", np.mean(n_scores))

Mean Testing Accuracy: 0.5652777777777778


Fit the model on the full training data:

In [None]:
# Quantile-transform to a normal distribution; fitted on the full training set
# and applied unchanged to the test set
scaler = QuantileTransformer(output_distribution='normal').fit(X_train)
X_train_qn = scaler.transform(X_train)
X_test_qn = scaler.transform(X_test)

# Project both sets onto 7 LDA discriminant coordinates learned from the training set
LDA = LinearDiscriminantAnalysis(n_components=7).fit(X_train_qn, y_train)
Z_train = LDA.transform(X_train_qn)
Z_test = LDA.transform(X_test_qn)

# Train Gaussian NB on the projected training data and predict the test labels
nb = GaussianNB().fit(Z_train, y_train)
y_hat = nb.predict(Z_test)

# Export to CSV file 
# export_to_csv(y_hat,'Predictions_NB.csv')