# Machine Learning in action - Underfitting vs overfitting
Parameters to play with: 

1) Linear regression vs neural net
2) Size - max: 3 layers, 30 nodes each
2) Train test split and no of iterations - overfitting/underfitting

Run button
Feedback: overfitting/underfitting - 2 plots, train actual vs predicted, test actual vs predicted  + written feedback + table of best params (use same as notebook 1)
Return to best model (optional)

Remedies: 


## Establish a benchmark: linear regression with optimal train/test split

In [1]:
%matplotlib widget
import Neural_net_scripts as nr
from sklearn.linear_model import LinearRegression

Imports done
Imports done
Prepared data


Layer 1: 20
Layer 2: 20
Layer 3: 20
Training your neural net (20, 20, 20)


In [2]:
nr.main

VBox(children=(HTML(value='<div style="font-size: 2em; font-weight: bold;display: flex;justify-content: center…

In [3]:
prediction_log

NameError: name 'prediction_log' is not defined

In [4]:
df_raw = nr.fetch_data()
target_var = 'Mobile_Traffic'

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import numpy as np

X = df_raw.drop(target_var, axis=1).values
y = df_raw[target_var].values

scores = []
for seed in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=seed)

    mlr = LinearRegression()
    mlr.fit(X_train, y_train)
    scores.append(mlr.score(X_test, y_test))
    print("Seed: {}, Score: {}".format(seed, mlr.score(X_test, y_test)))
print(np.average(scores))

Seed: 0, Score: 0.6354047579860557
Seed: 1, Score: 0.6408509244510331
Seed: 2, Score: 0.6328064456796397
Seed: 3, Score: 0.6385722156781369
Seed: 4, Score: 0.6379332927853032
Seed: 5, Score: 0.635063619414485
Seed: 6, Score: 0.6389149144464745
Seed: 7, Score: 0.6300050165564818
Seed: 8, Score: 0.6371424306987963
Seed: 9, Score: 0.6234916114689609
Seed: 10, Score: 0.6297854512144565
Seed: 11, Score: 0.6357664743886664
Seed: 12, Score: 0.6298167577316891
Seed: 13, Score: 0.6358278049725525
Seed: 14, Score: 0.6352247043390941
Seed: 15, Score: 0.6339664294164502
Seed: 16, Score: 0.6329044179388226
Seed: 17, Score: 0.6457335636069221
Seed: 18, Score: 0.6318033016790028
Seed: 19, Score: 0.635716273223299
Seed: 20, Score: 0.642375543609397
Seed: 21, Score: 0.637532718377549
Seed: 22, Score: 0.6344241921846625
Seed: 23, Score: 0.6369590773794525
Seed: 24, Score: 0.6320137526391083
Seed: 25, Score: 0.6373269259903882
Seed: 26, Score: 0.6328311184454175
Seed: 27, Score: 0.6351218184048905
Seed: 

In [6]:
scores = []
train_scores = []
for seed in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

    mlr = LinearRegression()
    mlr.fit(X_train, y_train)
    scores.append(mlr.score(X_test, y_test))
    train_scores.append(mlr.score(X_train, y_train))
    print("Seed: {}, Score: {}".format(seed, mlr.score(X_test, y_test)))
print(np.average(scores))
print(np.average(train_scores))


Seed: 0, Score: 0.630423300863734


Seed: 1, Score: 0.6537876538985032
Seed: 2, Score: 0.6367943367459634
Seed: 3, Score: 0.643070278896716
Seed: 4, Score: 0.6484852704053355
Seed: 5, Score: 0.6275820376620447
Seed: 6, Score: 0.6385564932362453
Seed: 7, Score: 0.6322994488590652
Seed: 8, Score: 0.6384553562757685
Seed: 9, Score: 0.6277289223508289
Seed: 10, Score: 0.6282316311008977
Seed: 11, Score: 0.6364677424194262
Seed: 12, Score: 0.62389519536296
Seed: 13, Score: 0.6401162772388713
Seed: 14, Score: 0.631453487476944
Seed: 15, Score: 0.6431139307010235
Seed: 16, Score: 0.6295913535466364
Seed: 17, Score: 0.6379219259081762
Seed: 18, Score: 0.6112020588275214
Seed: 19, Score: 0.629980075531949
Seed: 20, Score: 0.642875071799984
Seed: 21, Score: 0.6408655408993815
Seed: 22, Score: 0.6456507527186293
Seed: 23, Score: 0.6386786252580402
Seed: 24, Score: 0.6269609640919594
Seed: 25, Score: 0.6383620190579112
Seed: 26, Score: 0.6438286945522586
Seed: 27, Score: 0.6359727966272104
Seed: 28, Score: 0.6391727628357495
Seed: 

## Let's see if we can improve on that benchmark with a nerual net

In [7]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score

def mlp_model(X, Y):

    estimator=MLPRegressor()


    param_grid = {'hidden_layer_sizes': [(5, 5, 5), (20,20,20)],# (40,40,40), (20, 15, 15, 20), (19, 18, 23)],
            'activation': ['relu', 'tanh','logistic'],
            'alpha': [0.0001, 0.05],
            'learning_rate': ['constant', 'adaptive'],
            'solver': ['adam']}

    gsc = GridSearchCV(
        estimator,
        param_grid,
        cv=3, verbose=4, n_jobs=-1)

    grid_result = gsc.fit(X, Y)

    print('grid search completed')
    best_params = grid_result.best_params_

    best_mlp = MLPRegressor(hidden_layer_sizes = best_params["hidden_layer_sizes"], 
                            activation =best_params["activation"],
                            solver=best_params["solver"],
                            max_iter= 500, n_iter_no_change = 100, random_state=42
                )

    scoring = {
            'abs_error': 'neg_mean_absolute_error',
            'squared_error': 'neg_mean_squared_error',
            'r2':'r2'}

    #scores = cross_validate(best_mlp, X, Y, cv=3, scoring=scoring, return_train_score=True, return_estimator = True)
    scores = []
    train_scores = []
    for seed in range(5):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
        
        model = MLPRegressor(hidden_layer_sizes = best_params["hidden_layer_sizes"], 
                            activation =best_params["activation"],
                            solver=best_params["solver"],
                            max_iter= 1000, n_iter_no_change =100, random_state=seed
                )
        model.fit(X_train, y_train)
        scores.append(model.score(X_test, y_test))
        train_scores.append(model.score(X_train, y_train))
        #print("Seed: {}, Score: {}".format(seed, mlr.score(X_test, y_test)))
    
    print(np.average(scores))
    return grid_result, scores, train_scores

In [8]:
grid_result, scores, train_scores = mlp_model(X, y)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
grid search completed
0.6675211340148642


In [9]:
grid_result.best_params_

{'activation': 'tanh',
 'alpha': 0.05,
 'hidden_layer_sizes': (20, 20, 20),
 'learning_rate': 'constant',
 'solver': 'adam'}

In [10]:
np.average(train_scores)

0.6970854922306791

In [11]:
np.average(scores)

0.6675211340148642

In [12]:
scores

[0.6456311366841327,
 0.6734177901431624,
 0.6575006962214536,
 0.6691237221642344,
 0.6919323248613383]

In [None]:
scores

In [None]:
scores['test_r2']

In [None]:
nr.set_up_layer_sliders()

In [None]:
nr.run_get_slider_values()

In [None]:
# run button - run with watchers of sliders, show test results - similar to linear reg
nr.run_get_slider_values()

In [None]:
nr.main

In [3]:
from sklearn.metrics import mean_absolute_error
from sklearn.neural_network import MLPRegressor

scores = []
mae_scores=[]
train_scores=[]
for seed in range(2):
        print(str(seed))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
        
        model = MLPRegressor(hidden_layer_sizes =(60, 60, 60), 
                            activation ='relu',
                            solver='adam',
                            learning_rate='constant',
                            max_iter= 500, n_iter_no_change =100, random_state=seed
                )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        mae_scores.append(mae)
        scores.append(model.score(X_test, y_test))
        train_scores.append(model.score(X_train, y_train))
        #print("Seed: {}, Score: {}".format(seed, mlr.score(X_test, y_test)))

print(np.average(scores))

0


NameError: name 'train_test_split' is not defined

In [15]:
scores

[0.6368380326012488, 0.6278386409631944]

In [16]:
mae_scores

[1.0665475272207048, 1.09652056118331]

In [17]:
train_scores

[0.7419694068801929, 0.7439410103190577]

In [26]:
scores = []
mae_scores=[]
train_scores=[]
for seed in range(2):
        print(str(seed))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
        
        model = MLPRegressor(hidden_layer_sizes =(20, 20, 20), 
                            activation ='relu',
                            solver='adam',
                            learning_rate='constant',
                            max_iter= 1200, n_iter_no_change =100, random_state=seed
                )
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        mae_scores.append(mae)
        scores.append(model.score(X_test, y_test))
        train_scores.append(model.score(X_train, y_train))
        #print("Seed: {}, Score: {}".format(seed, mlr.score(X_test, y_test)))

print(np.average(scores))

0
1
0.6644857455204309


In [27]:
mae_scores

[1.0344818566755567, 1.0386097813579158]

In [28]:
train_scores

[0.7008449027182735, 0.7028561381885705]

In [25]:
scores

[0.664144742064494, 0.6798226936920146]