# Least Squares

On PCA components, LDA components, and full data

# Imports

In [1]:
# Standard scientific Python imports
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns # for visualisation
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import linear_model

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

import random
random.seed(15)

# Default plotting parameters
font = {'size'   : 18}
plt.rc('font', **font)

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
# Load training dataset
base_dir = "/content/drive/My Drive/SML Group Practical/Data/" #works for Kyla's drive
y_train = pd.read_csv(base_dir + 'y_train.csv', index_col = 0, squeeze = True)
X_train = pd.read_csv(base_dir + 'X_train.csv', index_col = 0, header=[0, 1, 2])
X_test = pd.read_csv(base_dir + 'X_test.csv', index_col = 0, header=[0, 1, 2])

#Create version with them together
Xy = pd.concat([X_train, y_train], axis = 1)

#scaled version
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

# Wrong Approach

* Obtain LDA and PCA projections (for PC, first 10) for X_train
* The problem with what I did here: if then try to do training and testing on these, even with cross-validation, you have overfit because all your data came from the projection you learned - that is, we're not examining whether still works when you have new data to which you apply the LDA transformation learned on the training data.  Below, I instead split off some data from the beginning for this purpose.

In [None]:
k = 10
PC = PCA(n_components = k)
PC.fit(X_train) #PC's from training data
ZPC_train = PC.transform(X_train) #project training data

k2 = 5
LDA = LinearDiscriminantAnalysis(n_components = k2)
LDA.fit(X_train, y_train)
ZLDA_train = LDA.transform(X_train)

Basic Least Squares with No Penalty for LDA, PCA, Full Data, with cross-validation

In [None]:
# Least Squares, no penalty
clf = linear_model.RidgeClassifier(alpha = 0)
# use three repeats of 10-fold CV
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model accuracy

def cross_val_results(X_train, y_train):
  cv_results = cross_validate(clf, X_train, y_train, scoring='accuracy',
                            cv=cv, n_jobs=-1, return_train_score=True)
  tr_scores = cv_results['train_score']
  ts_scores = cv_results['test_score']
  return(tr_scores, ts_scores)

tr_scores1, ts_scores1 = cross_val_results(X_train_sc, y_train)
tr_scores2, ts_scores2 = cross_val_results(ZPC_train, y_train)
tr_scores3, ts_scores3 = cross_val_results(ZLDA_train, y_train)

# summarize result
print("Full data")
print('Mean Training Accuracy: %.3f (%.3f)' % (np.mean(tr_scores1), np.std(tr_scores1)))
print('Mean Testing Accuracy: %.3f (%.3f)' % (np.mean(ts_scores1), np.std(ts_scores1)))

print("First", k, "PCs")
print('Mean Training Accuracy: %.3f (%.3f)' % (np.mean(tr_scores2), np.std(tr_scores2)))
print('Mean Testing Accuracy: %.3f (%.3f)' % (np.mean(ts_scores2), np.std(ts_scores2)))

print("LDA Components")
print('Mean Training Accuracy: %.3f (%.3f)' % (np.mean(tr_scores3), np.std(tr_scores3)))
print('Mean Testing Accuracy: %.3f (%.3f)' % (np.mean(ts_scores3), np.std(ts_scores3)))


Full data
Mean Training Accuracy: 0.688 (0.003)
Mean Testing Accuracy: 0.561 (0.017)
First 10 PCs
Mean Training Accuracy: 0.351 (0.002)
Mean Testing Accuracy: 0.346 (0.012)
LDA Components
Mean Training Accuracy: 0.630 (0.002)
Mean Testing Accuracy: 0.629 (0.016)


Grid Search to choose penalty for LDA and Full Data

In [None]:
# define grid
grid = dict()
grid['alpha'] = (0.0001, 0.001, 0.01, np.arange(0, 10, 0.5))
# define search
search = GridSearchCV(clf, grid, scoring='accuracy', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(X_train, y_train)
# summarize
print("Full Data")
print('Mean Testing Accuracy: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

Full Data
Mean Testing Accuracy: 0.562
Config: {'alpha': 0.01}


In [None]:
# define grid
grid = dict()
grid['alpha'] = (0.0001, 0.001, 0.01, np.arange(0, 10, 0.5))
# define search
search = GridSearchCV(clf, grid, scoring='accuracy', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(ZLDA_train, y_train)
# summarize
print("LDA")
print('Mean Testing Accuracy: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

LDA
Mean Testing Accuracy: 0.680
Config: {'alpha': 0.0001}


# Correct Approach

* not bothering with PCA since it does badly even with overfit

In [4]:
#Train-test split
X_tr, X_te, y_tr, y_te = train_test_split(X_train_sc, y_train , test_size = 0.20 , random_state=15)


In [None]:
#Fit LDA projections on training data, obtain projections for both
k = 5
LDA = LinearDiscriminantAnalysis(n_components = k)
LDA.fit(X_tr, y_tr)
ZLDA_train = LDA.transform(X_tr)
ZLDA_test = LDA.transform(X_te)

#set-up least squares classifier
clf = linear_model.RidgeClassifier(alpha = 0)
# use 6-fold
cv = RepeatedStratifiedKFold(n_splits=6, n_repeats=1, random_state=1)

#Grid Search      -- this still has overfit problem actually but not worth dealing with atm
# define grid
grid = dict()
grid['alpha'] = (0.0001, 0.001, 0.01, np.arange(0, 5, 0.5))
# define search
search = GridSearchCV(clf, grid, scoring='accuracy', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(ZLDA_train, y_tr)
# summarize
print("LDA")
print('Mean Validation Accuracy: %.3f' % results.best_score_)      
print('Config: %s' % results.best_params_)

LDA
Mean Validation Accuracy: 0.642
Config: {'alpha': 0.0001}


In [None]:
#Final evaluation on held-out set
clf = linear_model.RidgeClassifier(alpha = 0.0001)
clf.fit(ZLDA_train, y_tr)
print("Train Accuracy:", clf.score(ZLDA_train, y_tr))
print("Test Accuracy:", clf.score(ZLDA_test, y_te))

#that's more like it

Train Accuracy: 0.6427083333333333
Test Accuracy: 0.5308333333333334


Compare to Least Squares with untransformed data

In [None]:
#set-up least squares classifier
clf = linear_model.RidgeClassifier(alpha = 0)
# use 1 repeat of 6-fold
cv = RepeatedStratifiedKFold(n_splits=6, n_repeats=1, random_state=1)

#Grid Search
# define grid
grid = dict()
grid['alpha'] = (0.0001, 0.001, 0.01, np.arange(0, 5, 0.5))
# define search
search = GridSearchCV(clf, grid, scoring='accuracy', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(X_tr, y_tr)
# summarize
print("Full Data")
print('Mean Testing Accuracy: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

Full Data
Mean Testing Accuracy: 0.549
Config: {'alpha': 0.0001}


In [None]:
#Final evaluation on held-out set
clf = linear_model.RidgeClassifier(alpha = 0.01)
clf.fit(X_tr, y_tr)
print("Train Accuracy:", clf.score(X_tr, y_tr))
print("Test Accuracy:", clf.score(X_te, y_te))

Train Accuracy: 0.7008333333333333
Test Accuracy: 0.5475


**Training least squares model on all the training data for purpose of submission**


In [None]:
clf = linear_model.RidgeClassifier(alpha = 0.01)
clf.fit(X_train_sc, y_train)
y_hat = clf.predict(X_test)

In [None]:
# Function to format the predictions in a dataframe and export to a csv file, to be uploaded on kaggle
def export_to_csv(y_hat, filename):
    df = pd.DataFrame({'Genre': y_hat})
    df.index.name = 'Id'
    df.to_csv(filename)

export_to_csv(y_hat, "LS_predictions.csv")

# Summary:

* Least Squares applied to full training is essentially equivalent when applied to LDA with all 7 components. If you decrease # of LDA components, does worse ( as expected), but not much worse

* Least squares applied to PC does terribly (even with some overfit advantage) - not so surprising; not worth pursuing further

*  **New Baseline:** 0.55 