# SVM, LR and Random Forest

Siew Wei Feng

### Load data

In [1]:
import pickle

# Reload the previously pickled train/test splits.
# NOTE: pickle.load can execute arbitrary code while deserialising, so these
# files must come from a trusted source.
# Each file is opened in a `with` block so the handle is closed even if
# unpickling raises.
with open('x_train.pckl', 'rb') as f:
    x_train = pickle.load(f)

with open('x_test.pckl', 'rb') as f:
    x_test = pickle.load(f)

with open('y_train.pckl', 'rb') as f:
    y_train = pickle.load(f)

with open('y_test.pckl', 'rb') as f:
    y_test = pickle.load(f)


In [2]:
# Inspect the training features (the rich repr elides the middle rows and columns)
x_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-0.139053,0.085954,0.134424,0.007430,0.019548,0.039812,0.024023,0.047265,0.009495,0.017839,...,0.349064,0.107314,-0.197132,0.109178,0.292737,0.191383,0.073168,0.016185,0.120109,-0.019581
1,-0.155482,0.092528,0.087958,-0.006443,0.032637,0.091513,-0.024549,0.001986,0.040707,0.020159,...,0.328685,0.113341,-0.047412,-0.083207,0.077209,0.150267,0.034841,-0.167317,0.075683,0.011139
2,-0.147152,0.135515,0.160577,-0.011785,0.035533,0.032933,-0.039414,0.091675,0.009731,0.005287,...,0.443020,0.318361,0.036487,0.141171,0.242982,0.058814,-0.170503,0.040575,-0.080709,-0.162065
3,-0.151337,0.025105,0.133567,0.029539,-0.009014,0.011786,-0.076242,0.088142,-0.022388,-0.060334,...,0.385004,-0.086752,0.113248,0.035045,-0.017515,0.114395,-0.237837,0.083261,0.048203,0.010538
4,-0.145445,0.109250,0.171711,0.040833,-0.017474,-0.001341,0.001930,0.105575,0.028514,-0.056779,...,0.384351,0.089327,-0.154313,0.092013,0.032642,0.073208,0.131769,0.129483,0.101375,0.053342
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11995,-0.161575,0.003154,0.151651,-0.039999,0.029624,0.084016,-0.018009,0.060704,0.002283,-0.040601,...,0.310920,0.087655,-0.079134,0.030907,0.132617,0.114127,0.015371,-0.037972,0.098409,-0.032257
11996,-0.089031,0.156806,0.168515,0.054757,0.014283,-0.005096,-0.012236,0.101809,-0.019814,0.000736,...,0.432577,0.264423,0.253214,-0.064371,0.176547,0.154475,-0.207800,-0.040884,0.112471,0.043367
11997,-0.141774,0.106623,0.119957,0.017220,0.031261,0.023168,-0.023757,0.079469,0.008146,-0.016680,...,0.151211,0.072479,-0.107399,-0.055039,0.132766,0.046557,-0.031553,-0.185985,0.186369,0.081448
11998,-0.081258,0.037862,0.136440,0.056546,0.019306,-0.010508,-0.038380,-0.004174,-0.036269,0.052961,...,0.339121,0.150784,-0.219521,0.030969,0.006442,0.127398,-0.092799,0.062099,-0.062101,-0.069058


### Rescale and reformat data

In [2]:
import pandas as pd
# Pool the stored train and test partitions row-wise; the k-fold routine
# further down draws its own train/test splits from the pooled data.
x = pd.concat([x_train, x_test], axis=0)
y = pd.concat([y_train, y_test], axis=0)

In [3]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the values in each column to the range [0, 1].
# NOTE(review): the scaler is fitted on the pooled data before cross-validation,
# so every held-out fold contributes to the min/max used for scaling.
scaler = MinMaxScaler()
# fit_transform already returns a NumPy array, so no DataFrame round trip is needed
x = scaler.fit_transform(x)
# Per-column maxima observed during fitting; named so that the builtin max()
# is not shadowed for the rest of the notebook session.
x_col_max = scaler.data_max_

In [4]:
# Sanity check: dimensions of the scaled feature matrix
x.shape

(16000, 200)

In [5]:
# Flatten the labels into a 1-D NumPy vector so they can be indexed with the
# integer fold indices used during cross-validation.
# Wrapping in pd.DataFrame first (instead of calling the unbound
# pd.DataFrame.to_numpy) keeps the cell re-runnable once y is already an array.
y = pd.DataFrame(y).to_numpy().reshape((len(y),))

In [6]:
# Sanity check: the labels should now be a flat vector with one entry per row of x
y.shape

(16000,)

### Model training and evaluation (5-fold cross validation)

In [7]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from statistics import mean

def k_fold_cross_validation_scores(clf,x,y,k):
    """Run shuffled k-fold cross-validation and print the averaged metrics.

    For each fold the classifier is refit on the training rows and used to
    predict the held-out rows. Accuracy, precision, recall and F1 (each called
    with its default arguments) are collected per fold, and their means are
    printed once all folds are done. Nothing is returned.

    Parameters
    ----------
    clf : estimator whose ``fit(X, y)`` returns an object exposing ``predict(X)``.
        It is refit in place on every fold.
    x : feature array; must support indexing with the integer index arrays
        produced by ``KFold.split``.
    y : label array, indexed the same way as ``x``.
    k : number of folds.
    """
    splitter = KFold(n_splits=k, shuffle=True, random_state=21)

    # (report line, scoring function) pairs, in the order they are printed
    report = (
        ("\nAverage Accuracy: %2.3f", accuracy_score),
        ("\nAverage Precision: %2.3f", precision_score),
        ("\nAverage Recall: %2.3f", recall_score),
        ("\nAverage F1-score: %2.3f", f1_score),
    )
    fold_scores = [[] for _ in report]

    for fold_no, (train_idx, test_idx) in enumerate(splitter.split(x), start=1):
        print("\nFold %d" %(fold_no))

        features_fit, features_eval = x[train_idx], x[test_idx]
        labels_fit, labels_eval = y[train_idx], y[test_idx]

        predicted = clf.fit(features_fit, labels_fit).predict(features_eval)

        # Score this fold with every metric before recording anything
        results = [scorer(labels_eval, predicted) for _, scorer in report]
        for bucket, value in zip(fold_scores, results):
            bucket.append(value)

    for (template, _), bucket in zip(report, fold_scores):
        print (template % (mean(bucket)))

In [8]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [9]:
# Support vector classifier with an RBF kernel, scored with 5-fold cross-validation
svm = SVC(C=1, kernel='rbf', gamma='scale')
k_fold_cross_validation_scores(svm, x, y, 5)


Fold 1

Fold 2

Fold 3

Fold 4

Fold 5

Average Accuracy: 0.570

Average Precision: 0.562

Average Recall: 0.637

Average F1-score: 0.596


In [10]:
# Logistic regression (liblinear solver), scored with 5-fold cross-validation
# NOTE(review): the recorded run reports 0.479 average accuracy, lower than the
# other two models; worth confirming the labels and features before relying on it.
lr = LogisticRegression(solver='liblinear', C=1, random_state=42)
k_fold_cross_validation_scores(lr, x, y, 5)


Fold 1

Fold 2

Fold 3

Fold 4

Fold 5

Average Accuracy: 0.479

Average Precision: 0.480

Average Recall: 0.490

Average F1-score: 0.484


In [11]:
# Random forest of 100 depth-limited trees, scored with 5-fold cross-validation
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
k_fold_cross_validation_scores(rf, x, y, 5)


Fold 1

Fold 2

Fold 3

Fold 4

Fold 5

Average Accuracy: 0.556

Average Precision: 0.553

Average Recall: 0.597

Average F1-score: 0.573
