# Ex00 Regularization

In [53]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import joblib
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np

## 1. Preprocessing

In [54]:
df = pd.read_csv('../data/dayofweek.csv')
df.head()

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,-0.788667,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.756764,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.724861,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.692958,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.661055,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [55]:
X = df.drop(columns=['dayofweek'])
y = df['dayofweek']

In [56]:
# stratify гарантирует сохранение пропорций классов в обучающей и тестовой выборке
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size =0.2, random_state=21, stratify=y)

## 2. Logreg regularization

### a. Default regularization

In [57]:
logreg = LogisticRegression(random_state=21, fit_intercept=False)
logreg.fit(X_train, y_train)
log_pred = logreg.predict(X_test)

In [58]:
def crossval(n_splits, X, y, model):
    # KFold кросс-валидация для регрессионных моделей
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=21)
    accuracy_train = []
    accuracy_test = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        
        train_accuracy = accuracy_score(y_train,y_train_pred)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        
        accuracy_train.append(train_accuracy)
        accuracy_test.append(test_accuracy)
        
        print(f"train - {train_accuracy} | valid - {test_accuracy}")
    
    avg_accuracy = np.mean(accuracy_test)
    std_accuracy = np.std(accuracy_test)
    print(f'Average accuracy on crossval is {avg_accuracy:.5f}')
    print(f'Std is {std_accuracy:.5f}')

In [59]:
%%time
crossval(10, X, y, logreg)

train - 0.6282135794330916 | valid - 0.7396449704142012
train - 0.6526038233355307 | valid - 0.6153846153846154
train - 0.6539222148978246 | valid - 0.6094674556213018
train - 0.6361239288068556 | valid - 0.5443786982248521
train - 0.6453526697429136 | valid - 0.6331360946745562
train - 0.6453526697429136 | valid - 0.5798816568047337
train - 0.6291172595520421 | valid - 0.5714285714285714
train - 0.6442687747035574 | valid - 0.6190476190476191
train - 0.6363636363636364 | valid - 0.6011904761904762
train - 0.6403162055335968 | valid - 0.6130952380952381
Average accuracy on crossval is 0.61267
Std is 0.04916
CPU times: user 1min 38s, sys: 1min 4s, total: 2min 42s
Wall time: 15.8 s


In [60]:
crossval(10, X, y, logreg)

train - 0.6282135794330916 | valid - 0.7396449704142012
train - 0.6526038233355307 | valid - 0.6153846153846154
train - 0.6539222148978246 | valid - 0.6094674556213018
train - 0.6361239288068556 | valid - 0.5443786982248521
train - 0.6453526697429136 | valid - 0.6331360946745562
train - 0.6453526697429136 | valid - 0.5798816568047337
train - 0.6291172595520421 | valid - 0.5714285714285714
train - 0.6442687747035574 | valid - 0.6190476190476191
train - 0.6363636363636364 | valid - 0.6011904761904762
train - 0.6403162055335968 | valid - 0.6130952380952381
Average accuracy on crossval is 0.61267
Std is 0.04916


### b. Optimizing regularization parameters

In [61]:
logreg_no_penalty = LogisticRegression(penalty = None, random_state=21, fit_intercept=False, solver='saga')

In [62]:
%%time
crossval(10, X, y, logreg_no_penalty)



train - 0.6605141727092947 | valid - 0.7514792899408284




train - 0.6631509558338826 | valid - 0.6272189349112426




train - 0.6565589980224127 | valid - 0.621301775147929




train - 0.6618325642715887 | valid - 0.5739644970414202




train - 0.6644693473961767 | valid - 0.6627218934911243




train - 0.6690837178642056 | valid - 0.6153846153846154




train - 0.6541501976284585 | valid - 0.6190476190476191




train - 0.6574440052700923 | valid - 0.625




train - 0.6548089591567853 | valid - 0.6071428571428571
train - 0.6712779973649539 | valid - 0.6309523809523809
Average accuracy on crossval is 0.63342
Std is 0.04452
CPU times: user 3.86 s, sys: 2.75 s, total: 6.61 s
Wall time: 3.98 s




In [63]:
logreg_l1 = LogisticRegression(penalty = 'l1', random_state=21, fit_intercept=False, solver='liblinear')

In [64]:
%%time
crossval(10, X, y, logreg_l1)

train - 0.6117336849044166 | valid - 0.6863905325443787
train - 0.6341463414634146 | valid - 0.5976331360946746
train - 0.6427158866183257 | valid - 0.591715976331361
train - 0.6150296638101516 | valid - 0.5325443786982249
train - 0.6295319709953856 | valid - 0.5976331360946746
train - 0.6301911667765326 | valid - 0.5562130177514792
train - 0.6251646903820817 | valid - 0.5535714285714286
train - 0.6370223978919631 | valid - 0.625
train - 0.6258234519104084 | valid - 0.6130952380952381
train - 0.621870882740448 | valid - 0.5892857142857143
Average accuracy on crossval is 0.59431
Std is 0.04093
CPU times: user 1.24 s, sys: 1.97 s, total: 3.21 s
Wall time: 681 ms


In [65]:
logreg_l2 = LogisticRegression(penalty = 'l2', random_state=21, fit_intercept=False)

In [66]:
%%time
crossval(10, X, y, logreg_l2)

train - 0.6282135794330916 | valid - 0.7396449704142012
train - 0.6526038233355307 | valid - 0.6153846153846154
train - 0.6539222148978246 | valid - 0.6094674556213018
train - 0.6361239288068556 | valid - 0.5443786982248521
train - 0.6453526697429136 | valid - 0.6331360946745562
train - 0.6453526697429136 | valid - 0.5798816568047337
train - 0.6291172595520421 | valid - 0.5714285714285714
train - 0.6442687747035574 | valid - 0.6190476190476191
train - 0.6363636363636364 | valid - 0.6011904761904762
train - 0.6403162055335968 | valid - 0.6130952380952381
Average accuracy on crossval is 0.61267
Std is 0.04916
CPU times: user 1min 49s, sys: 40.8 s, total: 2min 29s
Wall time: 20.9 s


## 3. SVM regularization

### a. Default regularization

In [67]:
svc = SVC(probability=True, kernel='linear', random_state=21)

In [68]:
def crossval_svc(n_splits, X, y, model):
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=21)
    accuracy_train = []
    accuracy_test = []
    
    for train_index, test_index in kf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        
        train_accuracy = accuracy_score(y_train,y_train_pred)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        
        accuracy_train.append(train_accuracy)
        accuracy_test.append(test_accuracy)
        
        print(f"train - {train_accuracy} | valid - {test_accuracy}")
    
    avg_accuracy = np.mean(accuracy_test)
    std_accuracy = np.std(accuracy_test)
    print(f'Average accuracy on crossval is {avg_accuracy:.5f}')
    print(f'Std is {std_accuracy:.5f}')

In [69]:
%%time
crossval_svc(10, X, y, svc)

train - 0.7013843111404087 | valid - 0.7159763313609467
train - 0.6967699406723797 | valid - 0.6863905325443787
train - 0.7040210942649967 | valid - 0.7100591715976331
train - 0.6994067237969677 | valid - 0.6390532544378699
train - 0.7112722478576137 | valid - 0.621301775147929
train - 0.7033618984838497 | valid - 0.6982248520710059
train - 0.6903820816864296 | valid - 0.6785714285714286
train - 0.7048748353096179 | valid - 0.6904761904761905
train - 0.6989459815546772 | valid - 0.7142857142857143
train - 0.7002635046113307 | valid - 0.6190476190476191
Average accuracy on crossval is 0.67734
Std is 0.03553
CPU times: user 5.88 s, sys: 0 ns, total: 5.88 s
Wall time: 6.35 s


### b. Optimizing regularization parameters

In [70]:
# увеличение штрафа за ошибочные классификации
svc_10 = SVC(probability=True, kernel='linear', random_state=21, C = 10)

In [71]:
%%time
crossval_svc(10, X, y, svc_10)

train - 0.7752142386288727 | valid - 0.757396449704142
train - 0.7758734344100198 | valid - 0.7396449704142012
train - 0.7798286090969018 | valid - 0.757396449704142
train - 0.7804878048780488 | valid - 0.7514792899408284
train - 0.7851021753460777 | valid - 0.6982248520710059
train - 0.7666446934739618 | valid - 0.757396449704142
train - 0.7819499341238472 | valid - 0.7738095238095238
train - 0.7819499341238472 | valid - 0.7559523809523809
train - 0.7779973649538867 | valid - 0.7857142857142857
train - 0.766798418972332 | valid - 0.7142857142857143
Average accuracy on crossval is 0.74913
Std is 0.02470
CPU times: user 9.12 s, sys: 0 ns, total: 9.12 s
Wall time: 10.4 s


In [72]:
svc_05 = SVC(probability=True, kernel='linear', random_state=21, C = 0.5)

In [73]:
%%time
crossval_svc(10, X, y, svc_05)

train - 0.6789716545814107 | valid - 0.6982248520710059
train - 0.6816084377059987 | valid - 0.6627218934911243
train - 0.6802900461437047 | valid - 0.6745562130177515
train - 0.6730388925510876 | valid - 0.6035502958579881
train - 0.6947923533289387 | valid - 0.621301775147929
train - 0.6888595912986157 | valid - 0.6745562130177515
train - 0.6666666666666666 | valid - 0.5952380952380952
train - 0.6837944664031621 | valid - 0.6785714285714286
train - 0.6798418972332015 | valid - 0.6964285714285714
train - 0.6870882740447958 | valid - 0.625
Average accuracy on crossval is 0.65301
Std is 0.03630
CPU times: user 5.59 s, sys: 0 ns, total: 5.59 s
Wall time: 5.95 s


## 4. Tree

### a. Default regularization

In [74]:
tree = DecisionTreeClassifier(max_depth=10, random_state=21)

In [75]:
%%time
crossval_svc(10, X, y, tree)

train - 0.8200395517468688 | valid - 0.7928994082840237
train - 0.8266315095583389 | valid - 0.6982248520710059
train - 0.8292682926829268 | valid - 0.7633136094674556
train - 0.8180619644034278 | valid - 0.7159763313609467
train - 0.8226763348714569 | valid - 0.7455621301775148
train - 0.8055372445616348 | valid - 0.7751479289940828
train - 0.8333333333333334 | valid - 0.7559523809523809
train - 0.8155467720685112 | valid - 0.7678571428571429
train - 0.8122529644268774 | valid - 0.7738095238095238
train - 0.8175230566534915 | valid - 0.6904761904761905
Average accuracy on crossval is 0.74792
Std is 0.03306
CPU times: user 169 ms, sys: 0 ns, total: 169 ms
Wall time: 220 ms


### b. Optimizing regularization parameters

In [76]:
tree_5 = DecisionTreeClassifier(max_depth=5, random_state=21)

In [77]:
%%time
crossval_svc(10, X, y, tree_5)

train - 0.6209624258404747 | valid - 0.621301775147929
train - 0.6104152933421226 | valid - 0.5502958579881657
train - 0.6242584047462096 | valid - 0.5976331360946746
train - 0.6097560975609756 | valid - 0.591715976331361
train - 0.6130520764667106 | valid - 0.5384615384615384
train - 0.6038233355306526 | valid - 0.5621301775147929
train - 0.6119894598155468 | valid - 0.6190476190476191
train - 0.6119894598155468 | valid - 0.5952380952380952
train - 0.6073781291172595 | valid - 0.6428571428571429
train - 0.621870882740448 | valid - 0.5595238095238095
Average accuracy on crossval is 0.58782
Std is 0.03255
CPU times: user 108 ms, sys: 0 ns, total: 108 ms
Wall time: 182 ms


In [78]:
tree_15 = DecisionTreeClassifier(max_depth=15, random_state=21)

In [79]:
%%time
crossval_svc(10, X, y, tree_15)

train - 0.9512195121951219 | valid - 0.8875739644970414
train - 0.9499011206328279 | valid - 0.8284023668639053
train - 0.9630850362557679 | valid - 0.8816568047337278
train - 0.951878707976269 | valid - 0.8698224852071006
train - 0.944627554383652 | valid - 0.8757396449704142
train - 0.9340804218852999 | valid - 0.8757396449704142
train - 0.9552042160737813 | valid - 0.8869047619047619
train - 0.958498023715415 | valid - 0.8809523809523809
train - 0.9453227931488801 | valid - 0.8928571428571429
train - 0.9578392621870883 | valid - 0.8630952380952381
Average accuracy on crossval is 0.87427
Std is 0.01742
CPU times: user 138 ms, sys: 0 ns, total: 138 ms
Wall time: 268 ms


## 5. Random forest

### a. Default regularization

In [80]:
forest = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)

In [81]:
%%time
crossval_svc(10, X, y, forest)

train - 0.970336189848385 | valid - 0.9053254437869822
train - 0.9670402109426499 | valid - 0.8757396449704142
train - 0.9690177982860909 | valid - 0.9112426035502958
train - 0.974291364535267 | valid - 0.893491124260355
train - 0.962425840474621 | valid - 0.8698224852071006
train - 0.966381015161503 | valid - 0.9408284023668639
train - 0.9703557312252964 | valid - 0.9226190476190477
train - 0.9703557312252964 | valid - 0.9166666666666666
train - 0.9683794466403162 | valid - 0.8988095238095238
train - 0.9756258234519104 | valid - 0.8869047619047619
Average accuracy on crossval is 0.90214
Std is 0.02069
CPU times: user 1.31 s, sys: 0 ns, total: 1.31 s
Wall time: 1.39 s


### b. Optimizing regularization parameters

In [82]:
forest_depth_5 = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=21)

In [83]:
%%time
crossval_svc(10, X, y, forest_depth_5)

train - 0.6038233355306526 | valid - 0.5857988165680473
train - 0.6150296638101516 | valid - 0.5976331360946746
train - 0.6123928806855636 | valid - 0.6094674556213018
train - 0.5833882663150955 | valid - 0.5621301775147929
train - 0.5959129861568886 | valid - 0.5207100591715976
train - 0.5866842452208306 | valid - 0.5562130177514792
train - 0.5915678524374176 | valid - 0.5595238095238095
train - 0.569828722002635 | valid - 0.5892857142857143
train - 0.5961791831357048 | valid - 0.5714285714285714
train - 0.6027667984189723 | valid - 0.5773809523809523
Average accuracy on crossval is 0.57296
Std is 0.02388
CPU times: user 1.19 s, sys: 0 ns, total: 1.19 s
Wall time: 1.33 s


In [84]:
forest_depth_20 = RandomForestClassifier(n_estimators=50, max_depth=20, random_state=21)

In [85]:
%%time
crossval_svc(10, X, y, forest_depth_20)

train - 0.996044825313118 | valid - 0.9289940828402367
train - 0.998022412656559 | valid - 0.9053254437869822
train - 0.998022412656559 | valid - 0.9349112426035503
train - 0.996044825313118 | valid - 0.9230769230769231
train - 0.998681608437706 | valid - 0.9230769230769231
train - 0.998022412656559 | valid - 0.9526627218934911
train - 0.9967061923583662 | valid - 0.9404761904761905
train - 0.997364953886693 | valid - 0.9107142857142857
train - 0.9960474308300395 | valid - 0.9345238095238095
train - 0.9953886693017128 | valid - 0.9107142857142857
Average accuracy on crossval is 0.92645
Std is 0.01411
CPU times: user 1.75 s, sys: 0 ns, total: 1.75 s
Wall time: 2.06 s


In [86]:
forest_est_20 = RandomForestClassifier(n_estimators=20, max_depth=14, random_state=21)

In [87]:
%%time
crossval_svc(10, X, y, forest_est_20)

train - 0.9650626235992089 | valid - 0.9053254437869822
train - 0.9591298615688859 | valid - 0.863905325443787
train - 0.956493078444298 | valid - 0.8994082840236687
train - 0.966381015161503 | valid - 0.9112426035502958
train - 0.952537903757416 | valid - 0.8579881656804734
train - 0.955833882663151 | valid - 0.9289940828402367
train - 0.9552042160737813 | valid - 0.9226190476190477
train - 0.9492753623188406 | valid - 0.8869047619047619
train - 0.9552042160737813 | valid - 0.8928571428571429
train - 0.9696969696969697 | valid - 0.875
Average accuracy on crossval is 0.89442
Std is 0.02261
CPU times: user 693 ms, sys: 0 ns, total: 693 ms
Wall time: 985 ms


In [88]:
forest_est_100 = RandomForestClassifier(n_estimators=100, max_depth=14, random_state=21)

In [89]:
%%time
crossval_svc(10, X, y, forest_est_100)

train - 0.970995385629532 | valid - 0.9230769230769231
train - 0.9630850362557679 | valid - 0.8875739644970414
train - 0.965721819380356 | valid - 0.8994082840236687
train - 0.970336189848385 | valid - 0.8994082840236687
train - 0.967699406723797 | valid - 0.8875739644970414
train - 0.968358602504944 | valid - 0.9408284023668639
train - 0.9644268774703557 | valid - 0.9226190476190477
train - 0.9736495388669302 | valid - 0.9107142857142857
train - 0.97167325428195 | valid - 0.9166666666666666
train - 0.9769433465085638 | valid - 0.8869047619047619
Average accuracy on crossval is 0.90748
Std is 0.01739
CPU times: user 2.74 s, sys: 0 ns, total: 2.74 s
Wall time: 3.5 s


In [90]:
forest_best = RandomForestClassifier(n_estimators=70, max_depth=20, random_state=21)

In [91]:
%%time
crossval_svc(10, X, y, forest_best)

train - 0.997363216875412 | valid - 0.9289940828402367
train - 0.996704021094265 | valid - 0.9230769230769231
train - 0.998681608437706 | valid - 0.9349112426035503
train - 0.996044825313118 | valid - 0.9289940828402367
train - 0.998681608437706 | valid - 0.9230769230769231
train - 0.996704021094265 | valid - 0.9526627218934911
train - 0.9967061923583662 | valid - 0.9404761904761905
train - 0.9953886693017128 | valid - 0.9166666666666666
train - 0.9953886693017128 | valid - 0.9285714285714286
train - 0.9960474308300395 | valid - 0.9166666666666666
Average accuracy on crossval is 0.92941
Std is 0.01049
CPU times: user 2.49 s, sys: 0 ns, total: 2.49 s
Wall time: 2.64 s


## 6. Predictions

the best model is Random Forest (n_est = 70, max_depth = 20)

In [92]:
forest_best.fit(X_train, y_train)

In [93]:
pred = forest_best.predict(X)

In [94]:
accuracy_score(pred, y)

0.9833926453143536

Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).

In [95]:
analyze_df = pd.DataFrame(y).reset_index().drop(columns='index')

In [96]:
analyze_df['pred'] = pred

The most errors: Monday - 0.4% of all samples

In [97]:
analyze_df[analyze_df.dayofweek != analyze_df.pred].dayofweek.value_counts() / len(pred)

dayofweek
0    0.004745
1    0.004152
5    0.002372
2    0.001779
4    0.001779
3    0.001186
6    0.000593
Name: count, dtype: float64

In [98]:
joblib.dump(forest_best, 'random_forest_model.joblib')

['random_forest_model.joblib']