## Devoir 4: Features Scaling - Régression Linéaire Multiple

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pygments.lexers import go
from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:
dataset=pd.read_csv("Advertising.csv")
X = dataset[['TV', 'Radio', 'Newspaper']].values
Y = dataset[['Sales']].values

In [None]:
## theta
theta = np.random.rand(4, 1)
print(theta)

[[0.40672694]
 [0.69813057]
 [0.63272059]
 [0.68475969]]


In [None]:
def model(X, theta):
    return X.dot(theta)

In [None]:
def cost_function(X, Y, theta):
    m = len(Y)
    return 1/(2*m) * np.sum((model(X, theta) - Y)**2)

In [None]:
def grad(X, Y, theta):
    m = len(Y)
    return 1/m * X.T.dot(model(X, theta) - Y)

In [None]:
def gradient_descent(X, Y, theta, learning_rate, n_iterations):
    cost_history = np.zeros(n_iterations)
    for i in range(0, n_iterations):
        theta = theta - learning_rate * grad(X, Y, theta)
        cost_history[i] = cost_function(X, Y, theta)
    return theta, cost_history

In [None]:
def coef_det(y, pred):
    u = ((y - pred)**2).sum()
    v = ((y - y.mean())**2).sum()
    return 1 - u/v

## normalisation from scatch

In [None]:
def normalize(X):
    X = (X - X.mean()) / X.std()
    return X

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

X_train = normalize(X_train)
X_test = normalize(X_test)
X_train = np.hstack((X_train, np.ones((X_train.shape[0], 1))))

print(X_train.shape)

print(X_train[:10])

(160, 4)
[[ 0.62602631 -0.77516426 -0.57591833  1.        ]
 [ 1.41524722 -0.75446339 -0.79198373  1.        ]
 [-0.31715737 -0.52934136 -0.42066176  1.        ]
 [-0.06874686 -0.71176783 -0.63802096  1.        ]
 [ 2.02333547 -0.84373592 -0.67295369  1.        ]
 [-0.38055381 -0.82691646 -0.69624217  1.        ]
 [ 0.03863894 -0.66777847 -0.34691488  1.        ]
 [ 1.03098721 -0.56556789 -0.62767052  1.        ]
 [ 0.47853257 -0.80104036 -0.42971839  1.        ]
 [ 0.11238581 -0.51899092 -0.66777847  1.        ]]


## entrainement

In [None]:
n_iterations = 10000
learning_rate = 0.01

theta_final, cost_history = gradient_descent(X_train, Y_train, theta, learning_rate, n_iterations)


print(theta_final)

print("Cout final:" ,cost_history[-1])


[[ 4.23186905]
 [ 6.82362028]
 [ 0.61189781]
 [15.0550387 ]]
Cout final: 1.3529990225239517


In [None]:
X_test = np.hstack((X_test, np.ones((X_test.shape[0], 1))))
Y_pred = model(X_test, theta_final)


coef_determination = coef_det(Y_test, Y_pred)
print("coef_determination : ", coef_determination)
print("theta final: ", theta_final)

coef_determination :  0.8788156035221214
theta final:  [[ 4.23186905]
 [ 6.82362028]
 [ 0.61189781]
 [15.0550387 ]]


## normalisation avec standard scaler

In [None]:
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
X_train = np.hstack((X_train, np.ones((X_train.shape[0], 1))))
print(X_train[:10])

X_train.shape

[[-0.40424839 -1.02823707 -0.33767538  1.        ]
 [ 0.32060772 -0.91982774 -1.16143931  1.        ]
 [-1.27051084  0.2591237   0.25425079  1.        ]
 [-1.04235941 -0.6962335  -0.57444585  1.        ]
 [ 0.8791034  -1.38734296 -0.70762924  1.        ]
 [-1.32873699 -1.29926038 -0.79641817  1.        ]
 [-0.94373145 -0.46586368  0.53541572  1.        ]
 [-0.03231403  0.06940738 -0.53498411  1.        ]
 [-0.5397133  -1.16374872  0.21972176  1.        ]
 [-0.875999    0.31332837 -0.68789837  1.        ]]


(160, 4)

In [None]:
## entrainement
n_iterations = 10000
learning_rate = 0.01


theta_final_2, cost_history_2 = gradient_descent(X_train, Y_train, theta, learning_rate, n_iterations)

X_test = np.hstack((X_test, np.ones((X_test.shape[0], 1))))
Y_pred_2 = model(X_test, theta_final_2)

coef_determination_2 = coef_det(Y_test, Y_pred_2)
print("coef_determination : ", coef_determination)
print("theta final: ", theta_final_2)



coef_determination :  0.8788156035221214
theta final:  [[ 4.58720774]
 [ 1.48984025]
 [ 0.08791597]
 [15.330625  ]]


In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots


fig = make_subplots(rows=1, cols=2)


fig.add_trace(go.Scatter(y=cost_history, mode='lines', name='From Scratch'), row=1, col=1)


fig.add_trace(go.Scatter(y=cost_history_2, mode='lines', name='From Standard Scaler'), row=1, col=2)


fig.update_layout(title='Cost Function Comparison', xaxis=dict(title='Iterations'), yaxis=dict(title='Cost'))


fig.show()

In [None]:
# print the theta_final and coef_determination for both normalisation
print("Normalisation from scratch")
print("theta final: ", theta_final)
print("coef_determination : ", coef_determination)
print("\n\nNormalisation with standard scaler\n\n")
print("theta final: ", theta_final_2)
print("coef_determination : ", coef_determination_2)

Normalisation from scratch
theta final:  [[ 4.23186905]
 [ 6.82362028]
 [ 0.61189781]
 [15.0550387 ]]
coef_determination :  0.8788156035221214


Normalisation with standard scaler


theta final:  [[ 4.58720774]
 [ 1.48984025]
 [ 0.08791597]
 [15.330625  ]]
coef_determination :  0.9059011844150844
