In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the advertising data set into a DataFrame.
# NOTE(review): absolute, machine-specific path — this cell only runs where this
# exact directory exists; consider a path relative to a configurable data dir.
df = pd.read_csv('/home/legacy/anaconda3/Data Science Master Class/UNZIP_FOR_NOTEBOOKS_FINAL/DATA/Advertising.csv')

In [3]:
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


## Train / Test Split procedure:

0. Clean and adjust data as necessary for X and Y
1. Split Data in Train/Test for both X and Y
2. Fit/Train Scaler on Training X Data
3. Scale (transform) both X Train and X Test data with the fitted scaler.
4. Create a model
5. Fit/Train model on X Train Data
6. Evaluate model on X Test data (by creating predictions and comparing to Y_test)
7. Adjust parameters as necessary and repeat steps 5 and 6.


In [4]:
# Feature matrix: every column of df except the target column 'sales'.
X = df.drop('sales', axis=1)

In [5]:
y = df['sales']

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 30% of the rows as the test set; random_state is fixed so the
# split (and therefore the metrics below) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

### Scaling data:

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
scaler = StandardScaler()

In [10]:
# Fit the scaler on the training set only, so that no statistics from the
# test set leak into the preprocessing step:
scaler.fit(X_train)

StandardScaler()

In [11]:
# Replace X_train with its scaled version, using the scaler fitted above.
# NOTE: this overwrites X_train, so re-running this cell on its own would scale
# already-scaled data; re-run from the train/test split cell instead.
X_train = scaler.transform(X_train)

In [12]:
X_test = scaler.transform(X_test)

### Ridge without CV:


In [13]:
from sklearn.linear_model import Ridge

In [14]:
# Ridge regression with a deliberately large regularization strength
# (alpha=100); compared further down against alpha=1.
model = Ridge(alpha=100)

In [15]:
model.fit(X_train, y_train)

Ridge(alpha=100)

In [16]:
y_pred = model.predict(X_test)

In [17]:
from sklearn.metrics import mean_squared_error

In [18]:
mean_squared_error(y_test, y_pred)

7.341775789034129

In [19]:
[y_test, y_pred]

[37     14.7
 109    19.8
 31     11.9
 89     16.7
 66      9.5
 119     6.6
 54     20.2
 74     17.0
 145    10.3
 142    20.1
 148    10.9
 112    14.1
 174    11.5
 55     23.7
 141    19.2
 149    10.1
 25     12.0
 34      9.5
 170     8.4
 39     21.5
 172     7.6
 153    19.0
 175    27.0
 61     24.2
 65      9.3
 50     11.4
 42     20.7
 129     9.7
 179    12.6
 2       9.3
 12      9.2
 133    19.6
 90     11.2
 22      5.6
 41     17.1
 32      9.6
 125    10.6
 196     9.7
 158     7.3
 180    10.5
 16     12.5
 186    10.3
 144    11.4
 121     7.0
 80     11.8
 18     11.3
 78      5.3
 48     14.8
 4      12.9
 15     22.4
 1      10.4
 43     12.9
 102    14.8
 164    11.9
 9      10.6
 155     3.2
 36     25.4
 190    10.8
 33     17.4
 45     14.9
 Name: sales, dtype: float64,
 array([15.34908128, 17.05755308, 12.73784965, 16.18231062, 10.85075815,
         9.87999576, 17.6105132 , 15.80786278, 11.32616781, 17.30158479,
        12.8883864 , 13.64670913, 13.7163672

In [20]:
model_two = Ridge(alpha=1)

In [21]:
model_two.fit(X_train, y_train)

Ridge(alpha=1)

In [22]:
y_pred_two = model_two.predict(X_test)

In [23]:
mean_squared_error(y_test, y_pred_two)

2.3190215794287514

In [24]:
[y_test, y_pred_two]

[37     14.7
 109    19.8
 31     11.9
 89     16.7
 66      9.5
 119     6.6
 54     20.2
 74     17.0
 145    10.3
 142    20.1
 148    10.9
 112    14.1
 174    11.5
 55     23.7
 141    19.2
 149    10.1
 25     12.0
 34      9.5
 170     8.4
 39     21.5
 172     7.6
 153    19.0
 175    27.0
 61     24.2
 65      9.3
 50     11.4
 42     20.7
 129     9.7
 179    12.6
 2       9.3
 12      9.2
 133    19.6
 90     11.2
 22      5.6
 41     17.1
 32      9.6
 125    10.6
 196     9.7
 158     7.3
 180    10.5
 16     12.5
 186    10.3
 144    11.4
 121     7.0
 80     11.8
 18     11.3
 78      5.3
 48     14.8
 4      12.9
 15     22.4
 1      10.4
 43     12.9
 102    14.8
 164    11.9
 9      10.6
 155     3.2
 36     25.4
 190    10.8
 33     17.4
 45     14.9
 Name: sales, dtype: float64,
 array([15.73544249, 19.56177685, 11.47282584, 16.99614361,  9.19583919,
         7.06034338, 20.24078477, 17.27047482,  9.7997058 , 19.18969381,
        12.40827613, 13.88321006, 13.7233062

 ## Cross Validation:
 
 - To tune parameters without ever touching the final test set, we are going to split the data twice.

### Train | Validation | Test Split :

The final test set is often also called a "hold-out" set, since you should not adjust parameters based on it, but instead use it only for reporting final expected performance.

0. Clean and adjust data as necessary for X and Y.
1. Split data in Train/Validation/Test for both X and Y.
2. Fit/Train Scaler on Training X Data.
3. Scale (transform) X Train, X Eval and X Test data with the fitted scaler.
4. Create Model.
5. Fit/Train Model on X Train Data.
6. Evaluate Model on X Evaluation Data (by creating predictions and comparing to Y_eval).
7. Adjust parameters as necessary, and repeat steps 5 and 6. 
8. Get final metrics on the test set (you are not allowed to go back and adjust after this!).

In [25]:
X = df.drop('sales', axis=1)

In [26]:
y = df['sales']

In [28]:
# First split: keep 70% of the rows for training; the remaining 30% ("other")
# is split again below into evaluation and test sets.
X_train, X_other, y_train, y_other = train_test_split(X, y, test_size=0.3, random_state=101)

In [29]:
# Second split: divide the 30% "other" portion in half, giving an evaluation
# set and a final test set of roughly 15% of the original data each.
X_eval, X_test, y_eval, y_test = train_test_split(X_other, y_other, test_size=0.5, random_state=101)

In [30]:
from sklearn.preprocessing import StandardScaler

In [31]:
scaler = StandardScaler()

In [33]:
scaler.fit(X_train) # Only uses train set to avoid data leakage.

StandardScaler()

In [34]:
X_train = scaler.transform(X_train)

In [35]:
X_test = scaler.transform(X_test)

In [36]:
X_eval = scaler.transform(X_eval)

In [37]:
from sklearn.linear_model import Ridge

In [38]:
model_one = Ridge(alpha=100)

In [39]:
model_one.fit(X_train, y_train)

Ridge(alpha=100)

In [40]:
y_eval_pred = model_one.predict(X_eval)

In [41]:
from sklearn.metrics import mean_squared_error

In [42]:
mean_squared_error(y_eval, y_eval_pred)

7.320101458823872

In [43]:
model_two = Ridge(alpha=1)

In [44]:
model_two.fit(X_train, y_train)

Ridge(alpha=1)

In [45]:
new_pred_eval = model_two.predict(X_eval)

In [46]:
mean_squared_error(y_eval, new_pred_eval)

2.3837830750569866

In [49]:
# One-time prediction on the held-out test set with model_two (alpha=1), the
# model that scored the lower MSE on the evaluation set above.
y_final_test_pred = model_two.predict(X_test)

In [50]:
mean_squared_error(y_test, y_final_test_pred)

2.254260083800517

## Cross Val

### Cross val score:

In [51]:
X = df.drop('sales', axis=1)

In [52]:
y = df['sales']

In [53]:
from sklearn.model_selection import train_test_split

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)


In [55]:
from sklearn.preprocessing import StandardScaler

In [56]:
scaler = StandardScaler()

In [57]:
scaler.fit(X_train)

StandardScaler()

In [58]:
X_train = scaler.transform(X_train)

In [59]:
X_test = scaler.transform(X_test)

In [60]:
from sklearn.model_selection import cross_val_score

In [61]:
# Define the estimator explicitly instead of relying on the `model` variable
# left over from the "Ridge without CV" section, so this cell does not depend
# on state created far above.
model = Ridge(alpha=100)
# 5-fold cross-validation on the training data only; the scores are negated
# MSE values (scikit-learn scorers follow "higher is better").
scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)

In [62]:
scores

array([ -9.32552967,  -4.9449624 , -11.39665242,  -7.0242106 ,
        -8.38562723])

In [63]:
# The scorer returns negated MSE, so take the absolute value of the fold
# average to read it as an ordinary (positive) mean squared error.
abs(scores.mean())

8.215396464543607

In [64]:
model = Ridge(alpha=1)

In [65]:
scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=5)

In [66]:
scores

array([-3.15513238, -1.58086982, -5.40455562, -2.21654481, -4.36709384])

In [67]:
abs(scores.mean())

3.344839296530695

In [68]:
model.fit(X_train, y_train)

Ridge(alpha=1)

In [69]:
y_final_pred = model.predict(X_test)

In [70]:
mean_squared_error(y_test, y_final_pred)

2.3190215794287514

### Cross_validate function:

In [71]:
# CREATE X AND Y:
X = df.drop('sales', axis=1)

In [72]:
y = df['sales']

In [73]:

# TRAIN TEST SPLIT:
from sklearn.model_selection import train_test_split

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [76]:
# SCALE DATA:
from sklearn.preprocessing import StandardScaler

In [77]:
scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler()

In [78]:
X_train = scaler.transform(X_train)

In [79]:
X_test = scaler.transform(X_test)

In [80]:
from sklearn.model_selection import cross_validate

In [81]:
model = Ridge(alpha=100)

In [82]:
# cross_validate accepts several metrics at once and also reports fit/score
# times; here: 10 folds, scored with negated MSE and negated MAE.
scores = cross_validate(model, X_train, y_train, scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'], cv=10)

In [83]:
scores = pd.DataFrame(scores)

In [84]:
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.003865,0.002415,-6.060671,-1.810212
1,0.002293,0.001308,-10.627031,-2.541958
2,0.001569,0.001094,-3.993426,-1.469594
3,0.001155,0.001144,-5.009494,-1.862769
4,0.002028,0.000764,-9.1418,-2.520697
5,0.001048,0.000704,-13.086256,-2.459995
6,0.00093,0.000757,-3.839405,-1.451971
7,0.000891,0.000608,-9.058786,-2.377395
8,0.000842,0.000682,-9.055457,-2.443344
9,0.000945,0.000565,-5.778882,-1.899797


In [85]:
scores.mean()

fit_time                        0.001557
score_time                      0.001004
test_neg_mean_squared_error    -7.565121
test_neg_mean_absolute_error   -2.083773
dtype: float64

In [86]:
# Now, with alpha=1:
model = Ridge(alpha=1)

In [87]:
scores = cross_validate(model, X_train, y_train, scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'], cv=10)

In [88]:
scores

{'fit_time': array([0.0024879 , 0.00136209, 0.00114441, 0.00101757, 0.00118732,
        0.00105858, 0.00085759, 0.00076604, 0.000911  , 0.00088215]),
 'score_time': array([0.00129986, 0.00096536, 0.00082421, 0.00086069, 0.00076675,
        0.00074768, 0.00066423, 0.00069499, 0.0007205 , 0.00060868]),
 'test_neg_mean_squared_error': array([-2.96250773, -3.05737833, -2.1737403 , -0.83303438, -3.46401792,
        -8.2326467 , -1.90586431, -2.76504844, -4.98950515, -2.84643818]),
 'test_neg_mean_absolute_error': array([-1.45717399, -1.5553078 , -1.23877012, -0.76893775, -1.43448944,
        -1.4943158 , -1.08136203, -1.25001123, -1.58097132, -1.22332553])}

In [89]:
scores = pd.DataFrame(scores)

In [90]:
scores

Unnamed: 0,fit_time,score_time,test_neg_mean_squared_error,test_neg_mean_absolute_error
0,0.002488,0.0013,-2.962508,-1.457174
1,0.001362,0.000965,-3.057378,-1.555308
2,0.001144,0.000824,-2.17374,-1.23877
3,0.001018,0.000861,-0.833034,-0.768938
4,0.001187,0.000767,-3.464018,-1.434489
5,0.001059,0.000748,-8.232647,-1.494316
6,0.000858,0.000664,-1.905864,-1.081362
7,0.000766,0.000695,-2.765048,-1.250011
8,0.000911,0.000721,-4.989505,-1.580971
9,0.000882,0.000609,-2.846438,-1.223326


In [91]:
scores.mean()

fit_time                        0.001167
score_time                      0.000815
test_neg_mean_squared_error    -3.323018
test_neg_mean_absolute_error   -1.308467
dtype: float64

In [92]:
model.fit(X_train, y_train)

Ridge(alpha=1)

In [93]:
y_final_pred = model.predict(X_test)

In [94]:
mean_squared_error(y_test, y_final_pred) # Final test-set MSE: the single number to report as expected performance.

2.3190215794287514