# Day 09. Exercise 00
# Regularization

## 0. Imports

In [435]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix
import joblib

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [436]:
df = pd.read_csv('../data/dayofweek.csv')
df

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,-0.788667,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.756764,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.724861,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.692958,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.661055,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,-0.533442,0.945382,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,-0.629151,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,-0.597248,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,-0.565345,0.945382,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [437]:
X = df.drop('dayofweek',axis=1).values
y = df['dayofweek'].values


In [438]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=21,stratify=y)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [439]:
%%time

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)

train_scores = []
valid_scores = []

for train_index, val_index in skf.split(X_train, y_train):
    X_tr, X_val = X_train[train_index], X_train[val_index]
    y_tr, y_val = y_train[train_index], y_train[val_index]
    
    model = LogisticRegression(random_state=21, fit_intercept=False)
    model.fit(X_tr, y_tr)
    
    train_acc = accuracy_score(y_tr, model.predict(X_tr))
    val_acc = accuracy_score(y_val, model.predict(X_val))
    
    train_scores.append(train_acc)
    valid_scores.append(val_acc)
    
    print(f"train -  {train_acc:.5f}   |   valid -  {val_acc:.5f}")

print(f"Average accuracy on crossval is {np.mean(valid_scores):.5f}")
print(f"Std is {np.std(valid_scores):.5f}")


train -  0.64056   |   valid -  0.65926
train -  0.63561   |   valid -  0.62222
train -  0.64468   |   valid -  0.60000
train -  0.64056   |   valid -  0.64444
train -  0.65375   |   valid -  0.60741
train -  0.62902   |   valid -  0.60000
train -  0.66117   |   valid -  0.60000
train -  0.63726   |   valid -  0.54074
train -  0.63756   |   valid -  0.66418
train -  0.64745   |   valid -  0.61194
Average accuracy on crossval is 0.61502
Std is 0.03399
CPU times: user 2.71 s, sys: 2.98 ms, total: 2.71 s
Wall time: 275 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [440]:
%%time 

model = LogisticRegression(random_state=21,fit_intercept=False,penalty='l2',solver='newton-cg')
skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=21)


valid_scores = []


for train_index,valid_index in skf.split(X_train, y_train):
    X_tr, X_val = X_train[train_index], X_train[val_index]
    y_tr, y_val = y_train[train_index], y_train[val_index]

    model.fit(X_tr,y_tr)
    train_pred = model.predict(X_tr)
    valid_pred = model.predict(X_val)

    train_acc = accuracy_score(y_tr, train_pred)
    val_acc = accuracy_score(y_val,valid_pred)
    valid_scores.append(val_acc)

    print(f"train - {train_acc:.5f}   |   valid - {val_acc:.5f}")

print(f"Average accuracy on crossval is {np.mean(valid_scores):.5f}")
print(f"Std is {np.std(valid_scores):.5f}")

train - 0.64056   |   valid - 0.64179
train - 0.63397   |   valid - 0.61194
train - 0.64303   |   valid - 0.64179
train - 0.64056   |   valid - 0.64179
train - 0.65375   |   valid - 0.63433
train - 0.62819   |   valid - 0.61194
train - 0.66035   |   valid - 0.64179
train - 0.63726   |   valid - 0.61940
train - 0.63839   |   valid - 0.62687
train - 0.64745   |   valid - 0.61194
Average accuracy on crossval is 0.62836
Std is 0.01284
CPU times: user 954 ms, sys: 2.01 ms, total: 956 ms
Wall time: 214 ms


In [441]:
%%time 

model = LogisticRegression(random_state=21,fit_intercept=False,penalty='l1',solver='liblinear')
skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=21)


valid_scores = []


for train_index,valid_index in skf.split(X_train, y_train):
    X_tr, X_val = X_train[train_index], X_train[val_index]
    y_tr, y_val = y_train[train_index], y_train[val_index]

    model.fit(X_tr,y_tr)
    train_pred = model.predict(X_tr)
    valid_pred = model.predict(X_val)

    train_acc = accuracy_score(y_tr, train_pred)
    val_acc = accuracy_score(y_val,valid_pred)
    valid_scores.append(val_acc)

    print(f"train - {train_acc:.5f}   |   valid - {val_acc:.5f}")

print(f"Average accuracy on crossval is {np.mean(valid_scores):.5f}")
print(f"Std is {np.std(valid_scores):.5f}")

train - 0.62242   |   valid - 0.63433
train - 0.62242   |   valid - 0.62687
train - 0.62242   |   valid - 0.62687
train - 0.62737   |   valid - 0.64179
train - 0.63397   |   valid - 0.61194
train - 0.61913   |   valid - 0.61194
train - 0.63974   |   valid - 0.64179
train - 0.62737   |   valid - 0.61194
train - 0.61367   |   valid - 0.61194
train - 0.62768   |   valid - 0.57463
Average accuracy on crossval is 0.61940
Std is 0.01888
CPU times: user 97.9 ms, sys: 999 μs, total: 98.9 ms
Wall time: 98.1 ms


In [442]:
%%time 

model = LogisticRegression(random_state=21,fit_intercept=False,penalty=None,solver='newton-cg')
skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=21)


valid_scores = []


for train_index,valid_index in skf.split(X_train, y_train):
    X_tr, X_val = X_train[train_index], X_train[val_index]
    y_tr, y_val = y_train[train_index], y_train[val_index]

    model.fit(X_tr,y_tr)
    train_pred = model.predict(X_tr)
    valid_pred = model.predict(X_val)

    train_acc = accuracy_score(y_tr, train_pred)
    val_acc = accuracy_score(y_val,valid_pred)
    valid_scores.append(val_acc)

    print(f"train - {train_acc:.5f}   |   valid - {val_acc:.5f}")

print(f"Average accuracy on crossval is {np.mean(valid_scores):.5f}")
print(f"Std is {np.std(valid_scores):.5f}")

train - 0.66035   |   valid - 0.64179
train - 0.67189   |   valid - 0.63433
train - 0.65787   |   valid - 0.61940
train - 0.66117   |   valid - 0.64179
train - 0.66777   |   valid - 0.63433
train - 0.64716   |   valid - 0.61940
train - 0.66859   |   valid - 0.64925
train - 0.66529   |   valid - 0.62687
train - 0.65980   |   valid - 0.63433
train - 0.66969   |   valid - 0.62687
Average accuracy on crossval is 0.63284
Std is 0.00932
CPU times: user 319 ms, sys: 2 ms, total: 321 ms
Wall time: 320 ms


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [443]:
%%time 

model = SVC(probability=True,random_state=21,kernel='linear')
skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=21)

val_scores = []

for train_index,train_val in skf.split(X_train, y_train):
    
    X_tr, X_val = X_train[train_index], X_train[val_index]
    y_tr, y_val = y_train[train_index], y_train[val_index]

    model.fit(X_tr,y_tr)
    train_pred = model.predict(X_tr)
    valid_pred = model.predict(X_val)

    train_acc = accuracy_score(y_tr, train_pred)
    val_acc = accuracy_score(y_val,valid_pred)
    val_scores.append(val_acc)

    print(f"train - {train_acc:.5f}   |   valid - {val_acc:.5f}")

print(f"Average accuracy on crossval is {np.mean(val_scores):.5f}")
print(f"Std is {np.std(val_scores):.5f}")

train - 0.70651   |   valid - 0.71642
train - 0.68920   |   valid - 0.65672
train - 0.69744   |   valid - 0.69403
train - 0.68920   |   valid - 0.67910
train - 0.69497   |   valid - 0.66418
train - 0.68673   |   valid - 0.67164
train - 0.69827   |   valid - 0.67164
train - 0.70486   |   valid - 0.67164
train - 0.68863   |   valid - 0.68657
train - 0.71005   |   valid - 0.64179
Average accuracy on crossval is 0.67537
Std is 0.01953
CPU times: user 1.65 s, sys: 1 ms, total: 1.65 s
Wall time: 1.65 s


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [444]:
%%time

C_values = [0.1, 1, 10,100]
best_score = 0
best_C = None

for C in C_values:
    print(f"\nTraining model with C={C}")
    model = SVC(probability=True, random_state=21, kernel='linear', C=C)
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
    
    val_scores = []
    
    for train_index, val_index in skf.split(X_train, y_train):
        X_tr, X_val = X_train[train_index], X_train[val_index]
        y_tr, y_val = y_train[train_index], y_train[val_index]
        
        model.fit(X_tr, y_tr)
        
        train_pred = model.predict(X_tr)
        val_pred = model.predict(X_val)
        
        train_acc = accuracy_score(y_tr, train_pred)
        val_acc = accuracy_score(y_val, val_pred)
        
        val_scores.append(val_acc)
        
        # print(f"train - {train_acc:.5f}   |   valid - {val_acc:.5f}")
    
    avg_val = np.mean(val_scores)
    std_val = np.std(val_scores)
    
    print(f"Average accuracy on crossval is {avg_val:.5f}")
    print(f"Std is {std_val:.5f}")
    
    if avg_val > best_score:
        best_score = avg_val
        best_C = C

print(f"\nBest C value: {best_C} with average accuracy: {best_score:.5f}")


Training model with C=0.1
Average accuracy on crossval is 0.55787
Std is 0.02522

Training model with C=1
Average accuracy on crossval is 0.65286
Std is 0.03800

Training model with C=10
Average accuracy on crossval is 0.71814
Std is 0.04026

Training model with C=100
Average accuracy on crossval is 0.75001
Std is 0.02849

Best C value: 100 with average accuracy: 0.75001
CPU times: user 15.1 s, sys: 1e+03 ns, total: 15.1 s
Wall time: 15.1 s


## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [445]:
%%time 

model = DecisionTreeClassifier(max_depth=10,random_state=21)
skf = StratifiedKFold(n_splits=10)

val_scores = []

for train_index,train_val in skf.split(X_train, y_train):
    
    X_tr, X_val = X_train[train_index], X_train[val_index]
    y_tr, y_val = y_train[train_index], y_train[val_index]

    model.fit(X_tr,y_tr)
    train_pred = model.predict(X_tr)
    valid_pred = model.predict(X_val)

    train_acc = accuracy_score(y_tr, train_pred)
    val_acc = accuracy_score(y_val,valid_pred)
    val_scores.append(val_acc)

    print(f"train - {train_acc:.5f}   |   valid - {val_acc:.5f}")

print(f"Average accuracy on crossval is {np.mean(val_scores):.5f}")
print(f"Std is {np.std(val_scores):.5f}")

train - 0.81039   |   valid - 0.80597
train - 0.77741   |   valid - 0.73881
train - 0.83347   |   valid - 0.79851
train - 0.79720   |   valid - 0.79104
train - 0.82440   |   valid - 0.81343
train - 0.80379   |   valid - 0.76119
train - 0.80709   |   valid - 0.81343
train - 0.80132   |   valid - 0.79104
train - 0.80807   |   valid - 0.79851
train - 0.80478   |   valid - 0.76866
Average accuracy on crossval is 0.78806
Std is 0.02317
CPU times: user 34.9 ms, sys: 2 ms, total: 36.9 ms
Wall time: 36.2 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [446]:
%%time

max_depths = [10, 20, 30,100]
best_score = 0
best_C = None

for depth in max_depths:
    print(f"\nTraining model with max_depth={depth}")
    model = DecisionTreeClassifier(max_depth=depth,random_state=21)
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=21)
    
    val_scores = []
    
    for train_index, val_index in skf.split(X_train, y_train):
        X_tr, X_val = X_train[train_index], X_train[val_index]
        y_tr, y_val = y_train[train_index], y_train[val_index]
        
        model.fit(X_tr, y_tr)
        
        train_pred = model.predict(X_tr)
        val_pred = model.predict(X_val)
        
        train_acc = accuracy_score(y_tr, train_pred)
        val_acc = accuracy_score(y_val, val_pred)
        
        val_scores.append(val_acc)
        
        # print(f"train - {train_acc:.5f}   |   valid - {val_acc:.5f}")
    
    avg_val = np.mean(val_scores)
    std_val = np.std(val_scores)
    
    print(f"Average accuracy on crossval is {avg_val:.5f}")
    print(f"Std is {std_val:.5f}")
    
    if avg_val > best_score:
        best_score = avg_val
        best_C = depth

print(f"\nBest max_depth value: {best_C} with average accuracy: {best_score:.5f}")


Training model with max_depth=10
Average accuracy on crossval is 0.71963
Std is 0.04791

Training model with max_depth=20
Average accuracy on crossval is 0.88798
Std is 0.01862

Training model with max_depth=30
Average accuracy on crossval is 0.88797
Std is 0.01840

Training model with max_depth=100
Average accuracy on crossval is 0.88797
Std is 0.01840

Best max_depth value: 20 with average accuracy: 0.88798
CPU times: user 150 ms, sys: 0 ns, total: 150 ms
Wall time: 149 ms


## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [447]:
%%time 

model = RandomForestClassifier(n_estimators=50,max_depth=14,random_state=21)
skf = StratifiedKFold(n_splits=10)

val_scores = []

for train_index,train_val in skf.split(X_train, y_train):
    
    X_tr, X_val = X_train[train_index], X_train[val_index]
    y_tr, y_val = y_train[train_index], y_train[val_index]

    model.fit(X_tr,y_tr)
    train_pred = model.predict(X_tr)
    valid_pred = model.predict(X_val)

    train_acc = accuracy_score(y_tr, train_pred)
    val_acc = accuracy_score(y_val,valid_pred)
    val_scores.append(val_acc)

    print(f"train - {train_acc:.5f}   |   valid - {val_acc:.5f}")

print(f"Average accuracy on crossval is {np.mean(val_scores):.5f}")
print(f"Std is {np.std(val_scores):.5f}")

train - 0.96455   |   valid - 0.94776
train - 0.96208   |   valid - 0.93284
train - 0.96785   |   valid - 0.94776
train - 0.96455   |   valid - 0.96269
train - 0.96538   |   valid - 0.94776
train - 0.96538   |   valid - 0.93284
train - 0.97115   |   valid - 0.97015
train - 0.96867   |   valid - 0.96269
train - 0.97364   |   valid - 0.96269
train - 0.97941   |   valid - 0.96269
Average accuracy on crossval is 0.95299
Std is 0.01251
CPU times: user 569 ms, sys: 1.99 ms, total: 571 ms
Wall time: 572 ms


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [448]:
max_depths = [10,20,30,40,60]
n_estimators = [20,40,60,80]

best_score = 0
best_params = {}

for n_estimator in n_estimators:
    for depth in max_depths:
        val_scores = []
        model = RandomForestClassifier(
            max_depth=depth,
            n_estimators = n_estimator,
            random_state=21
        )
        skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=21)

        for train_index, val_index in skf.split(X_train, y_train):
            X_tr, X_val = X[train_index], X[val_index]
            y_tr, y_val = y[train_index], y[val_index]

            model.fit(X_tr, y_tr)
            val_pred = model.predict(X_val)
            val_acc = accuracy_score(y_val, val_pred)
            val_scores.append(val_acc)

            avg_score = np.mean(val_scores)
            # print(avg_score)
            if avg_score > best_score:
                best_score = avg_score
                best_params = {
                            "max_depth": depth,
                            "n_estimators": n_estimator,

                        }

print("✅ Best Parameters Found:")
print(best_params)
print(f"🎯 Best Cross-Validated Accuracy: {best_score:.5f}")


✅ Best Parameters Found:
{'max_depth': 40, 'n_estimators': 60}
🎯 Best Cross-Validated Accuracy: 0.94444


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [449]:
best_model = RandomForestClassifier(max_depth=20,n_estimators=80,random_state=21)
best_model.fit(X_train,y_train)
y_pred = best_model.predict(X_test)
accuracy =accuracy_score(y_pred,y_test)
accuracy

0.9260355029585798

In [450]:
results = pd.DataFrame({'true': y_test, 'pred': y_pred})
results

Unnamed: 0,true,pred
0,1,1
1,5,5
2,6,6
3,3,3
4,2,2
...,...,...
333,3,3
334,1,1
335,2,2
336,1,1


In [459]:

weekday = {0: "Monday", 1: "Tuesday", 2: "Wednesday", 3: "Thursday",
           4: "Friday", 5: "Saturday", 6: "Sunday"}


errors = results[results['true'] != results['pred']]['true'].value_counts()
totals = results['true'].value_counts()

errors


true
0    7
1    6
5    4
4    3
3    2
2    2
6    1
Name: count, dtype: int64

In [455]:
error_percent = (errors / totals * 100).round(2).fillna(0)

print("Misclassification percentages per weekday:")
for day_num in error_percent.sort_values(ascending=False).index:
    print(f"{weekday[day_num]}: {error_percent[day_num]}%")

worst_day = error_percent.idxmax()
print(f"\nWeekday with most errors: {weekday[worst_day]} ({error_percent[worst_day]}%)")

Misclassification percentages per weekday:
Monday: 25.93%
Friday: 14.29%
Tuesday: 10.91%
Saturday: 7.41%
Wednesday: 6.67%
Thursday: 2.5%
Sunday: 1.41%

Weekday with most errors: Monday (25.93%)


In [453]:
joblib.dump(best_model,"../data/Randomforest_model.pkl")

['../data/Randomforest_model.pkl']