# Model analysis

#### Required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from processing.data_preprocessing import data_prep

from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier

import warnings
warnings.filterwarnings("ignore")

### Data preparation

In [2]:
data = data_prep('../data/extension/csv/data.csv')
data.head()

Unnamed: 0,year,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X55,X56,X57,X58,X59,X61,X62,X63,X64,X65
0,1year,0.20912,0.49988,0.47225,1.9447,14.786,0.0,0.25834,0.99601,1.6996,...,2304.6,0.1213,0.42002,0.853,0.0,3.2732,107.35,3.4,60.987,0
1,1year,0.24866,0.69592,0.26713,1.5548,-1.1523,0.0,0.30906,0.43695,1.309,...,6332.7,0.24114,0.81774,0.76599,0.69484,3.951,134.27,2.7185,5.2078,0
2,1year,0.081483,0.30734,0.45879,2.4928,51.952,0.14988,0.092704,1.8661,1.0571,...,20545.0,0.054015,0.14207,0.94598,0.0,3.6147,86.435,4.2228,5.5497,0
3,1year,0.18732,0.61323,0.2296,1.4063,-7.3128,0.18732,0.18732,0.6307,1.1559,...,3186.6,0.13485,0.48431,0.86515,0.12444,4.3158,127.21,2.8692,7.898,0
4,1year,0.22822,0.49794,0.35969,1.7502,-47.717,0.0,0.28139,1.0083,1.9786,...,7616.8,0.13932,0.45457,0.85891,0.023002,8.9949,88.444,4.1269,12.299,0


#### Train data and test data

In [3]:
train_ = pd.DataFrame()
test_ = pd.DataFrame()
for year in data.year.unique():
    tr, ts = train_test_split(data[data.year == year], test_size=0.2, random_state=9)
    train_ = train_.append(tr, ignore_index=True)
    test_ = test_.append(ts, ignore_index=True)

train_.shape, test_.shape

((34537, 61), (8637, 61))

#### Data standardization

In [4]:
categ = ['year', 'X65']

scaler =  StandardScaler()
scaler.fit(train_.drop(columns=categ))

train = pd.DataFrame(scaler.transform(train_.drop(columns=categ)), 
                      columns=train_.columns.drop(categ))
test = pd.DataFrame(scaler.transform(test_.drop(columns=categ)), 
                     columns=test_.columns.drop(categ))

train = pd.concat([train, train_[categ]], axis=1)
test = pd.concat([test, test_[categ]], axis=1)

In [5]:
train_y = train.pop('X65')
train_x = train.copy()

test_y = test.pop('X65')
test_x = test.copy()

#### Encode categorical variable : year

In [6]:
train_x = pd.get_dummies(train_x)
test_x = pd.get_dummies(test_x)

train_x.shape, test_x.shape

((34537, 64), (8637, 64))

### Model testing

12 max features because in visualisation part, we saw that only 12 was needed to reach the biggest parts of information contained in dataset

#### Logistic regression

In [7]:
lr = LogisticRegression()
cross_val_score(lr, train_x, train_y, cv=5)

array([0.95251882, 0.94991314, 0.95106414, 0.95149848, 0.95294629])

In [8]:
LogisticRegression(penalty='elasticnet', C=2)

LogisticRegression(C=2, penalty='elasticnet')

#### Linear Discriminant Analysis

In [9]:
lda = LDA(n_components=1)
cross_val_score(lda, train_x, train_y, cv=5)

array([0.95165026, 0.94557035, 0.94889243, 0.95034023, 0.95251194])

In [10]:
cross_val_score(LDA(solver='lsqr', tol=1e-3), train_x, train_y, cv=5)

array([0.9507817 , 0.95179502, 0.94918199, 0.95222238, 0.0479224 ])

#### Random Forest

In [None]:
rf = RandomForestClassifier()
cross_val_score(rf, train_x, train_y, cv=5)

In [None]:
#rf = RandomForestClassifier(n_estimators=500, criterion='entropy', 
#                            max_depth=50, max_features=20, ccp_alpha=0.2, max_samples=0.7)
#cross_val_score(rf, train_x, train_y, cv=5)

#### Gradient Boosting

In [None]:
gbm = GradientBoostingClassifier()
cross_val_score(gbm, train_x, train_y, cv=5)

In [None]:
#gbm = GradientBoostingClassifier(loss='exponential', learning_rate=0.01, n_estimators=500, 
#                                 criterion='mse', max_depth=10, max_features=20, tol=1e-3, 
#                                 ccp_alpha=0.2)
#cross_val_score(gbm, train_x, train_y, cv=5)

#### SVM

In [None]:
svm = SVC()
cross_val_score(svm, train_x, train_y, cv=5)

In [None]:
svm = SVC(kernel='linear', C=2, shrinking=True, tol=1e-3)
cross_val_score(svm, train_x, train_y, cv=5)

#### Bonus neural network : MultiLayer Perceptron

In [None]:
mlp = MLPClassifier()
mlp.fit(train_x, train_y)
mlp.score(train_x, train_y)