In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Load the advertising data; the path is relative to the notebook's working directory.
data = pd.read_csv("Advertising.csv")

In [3]:
# Preview the first rows to check the columns (TV, radio, newspaper, sales).
data.head()

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


In [4]:
# Summary statistics for every numeric column (count, mean, std, min, quartiles, max).
data.describe()

Unnamed: 0,TV,radio,newspaper,sales
count,200.0,200.0,200.0,200.0
mean,147.0425,23.264,30.554,14.0225
std,85.854236,14.846809,21.778621,5.217457
min,0.7,0.0,0.3,1.6
25%,74.375,9.975,12.75,10.375
50%,149.75,22.9,25.75,12.9
75%,218.825,36.525,45.1,17.4
max,296.4,49.6,114.0,27.0


In [5]:
# Target: the sales column. Features: every remaining column.
y = data['sales']
X = data.drop(columns='sales')

In [6]:
# Hold out 30% of the rows; the next cell halves this hold-out into a
# validation set and a final test set, so train_test_split is applied twice.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=48
)

In [7]:
# Split the 30% hold-out in half: one part for validation, the other for the final test.
X_validation, X_final_test, y_validation, y_final_test = train_test_split(
    X_test,
    y_test,
    test_size=0.5,
    random_state=48,
)

In [8]:
# Row count of the full feature matrix, for comparison with the split sizes below.
len(X)

200

In [9]:
# Training rows: the 70% left after holding out test_size=0.3.
len(X_train)

140

In [10]:
# Validation rows: one half of the 30% hold-out.
len(X_validation)

30

In [11]:
# Final-test rows: the other half of the 30% hold-out.
len(X_final_test)

30

**Model Training**

In [12]:
# Small random forest (10 trees); random_state is fixed so the fit is reproducible.
model = RandomForestRegressor(n_estimators=10, random_state=48)

In [13]:
# Fit the forest on the training split only.
model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=10, random_state=48)

In [14]:
# Predict on the validation split, which was not used for fitting.
validation_pred = model.predict(X_validation)

In [15]:
# Validation-set error: MAE, and RMSE computed as the square root of the MSE.
validation_mae = mean_absolute_error(y_validation, validation_pred)
validation_rmse = np.sqrt(mean_squared_error(y_validation, validation_pred))
print(f"Mean absolute error is {validation_mae}")
print(f"Root Mean squared error is {validation_rmse}")

Mean absolute error is 0.7110000000000003
Root Mean squared error is 0.91207638569


**Final Test Set Prediction**

In [16]:
# Predict on the final test split, which was not used for fitting.
final_test_pred = model.predict(X_final_test)

In [17]:
# Final-test error: MAE, and RMSE computed as the square root of the MSE.
final_test_mae = mean_absolute_error(y_final_test, final_test_pred)
final_test_rmse = np.sqrt(mean_squared_error(y_final_test, final_test_pred))
print(f"Mean absolute error is {final_test_mae}")
print(f"Root Mean squared error is {final_test_rmse}")

Mean absolute error is 0.5996666666666667
Root Mean squared error is 0.7811721961257965


In [18]:
# Fresh estimator with the same hyperparameters as the model evaluated above.
final_model = RandomForestRegressor(n_estimators=10, random_state=48)

In [19]:
# Refit on all rows (X, y) so the saved model uses every available sample.
final_model.fit(X,y)

RandomForestRegressor(n_estimators=10, random_state=48)

**Saving the Final Model**

In [20]:
import joblib

In [21]:
# Persist the fitted model to disk; joblib.dump returns the list of files written.
joblib.dump(final_model, 'final_model.pkl')

['final_model.pkl']

In [22]:
# Persist the feature names, in training order, alongside the model.
joblib.dump(X.columns.tolist(), 'column_names.pkl')

['column_names.pkl']

**Load the Model**

In [23]:
# Reload the saved feature-name list.
new_columns = joblib.load('column_names.pkl')

In [24]:
# Display the reloaded names to confirm they match the training features.
new_columns

['TV', 'radio', 'newspaper']

In [25]:
# Reload the persisted model. joblib.load unpickles the file, so only load
# artifacts that come from a trusted source.
loaded_model = joblib.load('final_model.pkl')

In [26]:
# Predict for a row taken from the dataset (TV=44.5, radio=39.3, newspaper=45.1);
# the recorded sales for that row are 10.4, so the prediction should be close.
# The model was fitted on a DataFrame, so wrap the values in a DataFrame with the
# saved column names: this keeps the feature order explicit and avoids
# scikit-learn's "X does not have valid feature names" warning.
sample = pd.DataFrame([[44.5, 39.3, 45.1]], columns=new_columns)
predicted_value = loaded_model.predict(sample)

In [27]:
# Display the prediction; the recorded sales value for this row is 10.4.
predicted_value

array([10.42])