# Import Data

In [6]:
import pandas as pd
# Load the advertising dataset from a CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='Advertising.csv')

In [7]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,TV,radio,newspaper,sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


# Data preparation

In [8]:
# 'sales' is the prediction target; every other column is a feature.
y = df['sales']
X = df.drop(columns='sales')

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Keep 70% of the rows for training and set 30% aside; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=101,
)

In [11]:
# Halve the 30% set-aside portion: one half becomes the validation set and the
# other half the final hold-out test set (15% of the full data each).
(X_validation, X_holdout_test,
 y_validation, y_holdout_test) = train_test_split(
    X_test, y_test, test_size=0.5, random_state=101
)

# Model training

In [12]:
from sklearn.ensemble import RandomForestRegressor

In [13]:
# Baseline random forest: 10 trees, seeded so repeated runs give the same model.
model = RandomForestRegressor(random_state=101, n_estimators=10)

In [14]:
# Fit the baseline on the training split only.
model.fit(X=X_train, y=y_train)

RandomForestRegressor(n_estimators=10, random_state=101)

# Model evaluation

In [15]:
# Predict sales for the validation rows with the fitted model.
validation_predictions = model.predict(X=X_validation)

In [16]:
from sklearn.metrics import mean_absolute_error,mean_squared_error

In [17]:
# MAE on the validation set
mean_absolute_error(y_true=y_validation, y_pred=validation_predictions)

0.6636666666666673

In [18]:
# RMSE on the validation set: square root of the mean squared error
mean_squared_error(y_true=y_validation, y_pred=validation_predictions) ** 0.5

0.7831368547918899

# Hyperparameter Tuning

In [19]:
# Try 35 trees instead of 10 (same seed, same training split) so the validation
# metrics can be compared against the baseline.
model = RandomForestRegressor(random_state=101, n_estimators=35)
model.fit(X=X_train, y=y_train)

RandomForestRegressor(n_estimators=35, random_state=101)

In [20]:
# Re-score the validation rows with the 35-tree model.
validation_predictions = model.predict(X=X_validation)

In [21]:
# MAE on the validation set for the 35-tree model
mean_absolute_error(y_true=y_validation, y_pred=validation_predictions)

0.6759047619047621

In [22]:
# RMSE on the validation set for the 35-tree model
mean_squared_error(y_true=y_validation, y_pred=validation_predictions) ** 0.5

0.8585352183157281

# Test on Holdout

In [23]:
# Rebuild the 35-tree model and fit it on the training split before scoring it
# on the hold-out rows.
model = RandomForestRegressor(random_state=101, n_estimators=35)
model.fit(X=X_train, y=y_train)

RandomForestRegressor(n_estimators=35, random_state=101)

In [24]:
# Predict sales for the hold-out rows.
test_predictions = model.predict(X=X_holdout_test)

In [25]:
# MAE on the hold-out set
mean_absolute_error(y_true=y_holdout_test, y_pred=test_predictions)

0.5817142857142852

In [26]:
# RMSE on the hold-out set: square root of the mean squared error
mean_squared_error(y_true=y_holdout_test, y_pred=test_predictions) ** 0.5

0.730550812603694

# Full training

In [27]:
# Final estimator: same 35-tree, seeded configuration as the evaluated model.
final_model = RandomForestRegressor(random_state=101, n_estimators=35)

In [28]:
# Train on every available row (the full X and y), not just the training split.
final_model.fit(X=X, y=y)

RandomForestRegressor(n_estimators=35, random_state=101)

# Save model

In [29]:
import joblib

In [30]:
# Persist the fitted final model to disk in the working directory.
joblib.dump(value=final_model, filename='final_model.pkl')

['final_model.pkl']

In [31]:
# Show the feature columns (and their order) that final_model was fitted on.
X.columns

Index(['TV', 'radio', 'newspaper'], dtype='object')

In [32]:
# Same column names as a plain Python list, the form that gets saved below.
X.columns.tolist()

['TV', 'radio', 'newspaper']

In [33]:
# Save the feature names next to the model so the expected input columns and
# their order can be recovered when the model is loaded later.
joblib.dump(value=X.columns.tolist(), filename='column_names.pkl')

['column_names.pkl']

# Loading model

In [34]:
# Read back the saved list of feature names.
col_names = joblib.load(filename='column_names.pkl')

In [35]:
# Display the loaded list to confirm it matches the names that were saved.
col_names

['TV', 'radio', 'newspaper']

In [36]:
# Restore the persisted model. joblib files are pickle-based, so only load
# files from a trusted source (this one was written earlier in the notebook).
loaded_model = joblib.load(filename='final_model.pkl')

In [37]:
# Predict for one sample (TV=230.1, radio=37.8, newspaper=69.2).
# The model was fitted on a DataFrame, so the sample is passed as a one-row
# DataFrame labelled with the saved column names rather than a bare nested list.
# This keeps the feature names and order explicit, and avoids the
# "X does not have valid feature names" warning that scikit-learn versions
# which validate feature names emit for unlabelled input.
sample = pd.DataFrame([[230.1, 37.8, 69.2]], columns=col_names)
loaded_model.predict(sample)

array([21.98857143])