## Importing initial libraries

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

### Read in the data set

In [2]:
# Load the advertising data; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='Advertising.csv')

### Initial EDA

In [3]:
# Peek at the first five rows to confirm the file parsed into the expected columns.
df.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [4]:
# Summary statistics for every numeric column (count, mean, std, quartiles, min/max).
df.describe()

Unnamed: 0,TV,radio,newspaper,sales
count,200.0,200.0,200.0,200.0
mean,147.0425,23.264,30.554,14.0225
std,85.854236,14.846809,21.778621,5.217457
min,0.7,0.0,0.3,1.6
25%,74.375,9.975,12.75,10.375
50%,149.75,22.9,25.75,12.9
75%,218.825,36.525,45.1,17.4
max,296.4,49.6,114.0,27.0


In [5]:
# Check column dtypes and non-null counts to spot missing values before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   radio      200 non-null    float64
 2   newspaper  200 non-null    float64
 3   sales      200 non-null    float64
dtypes: float64(4)
memory usage: 6.4 KB


## Data preparation

In [7]:
# Target is the `sales` column; the features are every remaining column.
target_col = 'sales'
y = df[target_col]
X = df.drop(columns=[target_col])

### Train | Validation | Hold out test set
**70% training data, 15% validation, 15% hold-out test set**

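1st split - 70/30 (training set vs. the remaining 30%)

2nd split - 50/50 (the remaining 30% is halved into the validation and hold-out test sets)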

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# First split: 70% of the rows become the training set. The remaining 30%
# (X_test / y_test) is not a final test set yet -- it is the pool that gets
# divided into validation and hold-out sets afterwards.
# The stray doctest continuation prompt ("... ") was removed: it only ran
# because IPython strips pasted prompts and is a syntax error in plain Python.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=101)

In [10]:
# Second split: halve the 30% remainder into a validation set (used while
# tuning) and a hold-out set (used once, for the final metrics), i.e. 15% each.
# The stray doctest continuation prompt ("... ") was removed: it only ran
# because IPython strips pasted prompts and is a syntax error in plain Python.
X_validation, X_holdout, y_validation, y_holdout = train_test_split(
    X_test, y_test, test_size=0.5, random_state=101)

## Model training

In [11]:
from sklearn.ensemble import RandomForestRegressor

In [23]:
# Hyperparameters for the candidate model; n_estimators is the value being tuned
# against the validation set, random_state keeps the forest reproducible.
rf_params = {'n_estimators': 30, 'random_state': 101}
model = RandomForestRegressor(**rf_params)

In [24]:
# Fit on the training split only; the validation and hold-out rows stay unseen.
model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=30, random_state=101)

### Model evaluation

In [25]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [26]:
# Predictions on the validation split, used to compare hyperparameter settings.
y_preds = model.predict(X_validation)

In [34]:
# Validation MAE, in the same units as `sales`; judge its size relative to the
# mean of the target (see the `sales` summary statistics).
mean_absolute_error(y_validation,y_preds)

0.6575555555555552

In [35]:
# Validation RMSE (square root of the MSE, so it is back in `sales` units);
# judge its size relative to the standard deviation of the target.
np.sqrt(mean_squared_error(y_validation,y_preds))

0.8542009478215644

In [36]:
# Summary statistics of the target, to put the MAE / RMSE values in context.
df['sales'].describe()

count    200.000000
mean      14.022500
std        5.217457
min        1.600000
25%       10.375000
50%       12.900000
75%       17.400000
max       27.000000
Name: sales, dtype: float64

In [30]:
# Validation results recorded while tuning the number of trees:
# n_estimators=3  --> MAE 0.85, RMSE 1.1
# n_estimators=30 --> MAE 0.65, RMSE 0.85

## Final metrics on hold out data
**No more tuning of the model after this point**

In [31]:
# Predict on the hold-out split, which was not used for fitting or for tuning.
final_predictions = model.predict(X_holdout)

In [33]:
# Final MAE on the hold-out split (same units as `sales`).
mean_absolute_error(y_holdout,final_predictions)

0.5937777777777775

In [32]:
# Final RMSE on the hold-out split (square root of the MSE, in `sales` units).
np.sqrt(mean_squared_error(y_holdout,final_predictions))

0.745323693040418

### The final model on entire dataset

In [37]:
# Fresh, unfitted estimator with the same hyperparameters that were chosen on
# the validation set; it is the one that gets fitted on the whole data set.
final_model = RandomForestRegressor(random_state=101, n_estimators=30)

In [38]:
# Fit on all rows (train + validation + hold-out) now that evaluation is done.
final_model.fit(X,y)

RandomForestRegressor(n_estimators=30, random_state=101)

### Saving your final model

In [39]:
import joblib

In [40]:
# Persist the fitted model; the file is written relative to the working directory.
joblib.dump(final_model,'final_model.pkl')

['final_model.pkl']

In [42]:
# Feature names, in the order the model was trained on.
X.columns.tolist()

['TV', 'radio', 'newspaper']

In [43]:
# Save the feature names next to the model so that callers can rebuild inputs
# with the same columns in the same order.
feature_names = X.columns.tolist()
joblib.dump(feature_names, filename='col_names.pkl')

['col_names.pkl']

### Loading the model as a test

In [44]:
# Reload the saved feature names.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
new_columns = joblib.load('col_names.pkl')

In [45]:
# Display the reloaded names to confirm they survived the save / load round trip.
new_columns

['TV', 'radio', 'newspaper']

In [46]:
# Reload the persisted model.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
loaded_model = joblib.load('final_model.pkl')

#### Make a prediction on new data

In [47]:
# The model was fitted on a DataFrame, so it remembers its feature names.
# Passing a bare nested list relies purely on positional order and makes recent
# scikit-learn versions warn that X does not have valid feature names.
# Building the sample with the saved column names keeps the input explicit.
new_sample = pd.DataFrame([[230.1, 37.8, 69.2]], columns=new_columns)
loaded_model.predict(new_sample)

array([21.99])