# Revision II

To better understand our recommendation and gain confidence in it, we fit a different model and check
whether the recommendations change.

**Prepare our coding setup**

In [1]:
# code starts here
# some of the tools we use
from itertools import product

from ipywidgets import interact
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# These imports may not work out of the box when you run the code in Google Colab
# from util import XNAMES, YNAME, COLORS
# from util import plot_coefficients, generate_valid_budget_allocations, create_model_explorer, generate_budget_plans, predict_sales_and_compare_plans

# Therefore I include the code literally here.
# util.py
"""
Utilities to simplify live coding.
"""
from itertools import product

from IPython.display import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


# Plot colour (hex code) for each advertising channel.
COLORS = dict(
    TV='#1b9e77',
    radio='#d95f02',
    newspaper='#7570b3',
)
# Names of the feature columns, in the same order as the keys of COLORS.
XNAMES = list(COLORS)
# Name of the target column.
YNAME = 'sales'


def plot_coefficients(coef, names):
    """Draw the coefficients as a bar chart, one bar per name, and return the figure."""
    coefficients = pd.Series(coef, index=names, name='coef')
    fig, ax = plt.subplots()
    coefficients.plot.bar(ax=ax)
    ax.set_title('Coefficients')
    return fig


def create_model_explorer(df, model):
    """Build a plotting callback for trying out budget plans against ``model``.

    The returned function takes one budget per channel, asks ``model`` for the
    predicted sales of that plan and marks the prediction on scatter plots of
    ``df`` (one panel per entry of ``XNAMES``, all sharing the ``YNAME`` axis).
    """

    def explore_budget_plan(tv_budget=140, radio_budget=20, newspaper_budget=30):
        budgets = [tv_budget, radio_budget, newspaper_budget]
        planned_budget = pd.DataFrame([budgets], columns=XNAMES)
        sales_pred = model.predict(planned_budget)

        fig, axs = plt.subplots(figsize=(14, 6), ncols=3, sharey=True)
        for xname, ax in zip(XNAMES, axs):
            # data from df first, then the planned budget on top as a large black cross
            df.plot.scatter(ax=ax, x=xname, y=YNAME, color=COLORS[xname])
            ax.scatter(planned_budget[xname].iloc[0], sales_pred, color='k', s=800, marker='+')
            # TODO add vlines and hlines
        return fig

    return explore_budget_plan
 

def generate_valid_budget_allocations(total_budget, step=2):
    """Enumerate every split of ``total_budget`` across the channels in ``XNAMES``.

    Each channel receives a multiple of ``step`` between 0 and ``total_budget``
    and the three budgets of a row always add up to exactly ``total_budget``.

    Parameters
    ----------
    total_budget : int
        Budget to distribute. If it is not reachable with the given ``step``
        (e.g. an odd total with ``step=2``) or is negative, no allocation
        exists and an empty frame is returned.
    step : int, default 2
        Granularity of the per-channel budgets.

    Returns
    -------
    pandas.DataFrame
        One row per allocation with columns ``XNAMES``, ordered
        lexicographically by (first, second, third) channel.
    """
    budget_options = range(0, total_budget + 1, step)

    budget_allocations = []
    # Only the first two budgets are free; the third one is whatever is left.
    # This avoids walking the full three-way product just to discard most of it.
    for tv_budget, radio_budget in product(budget_options, repeat=2):
        newspaper_budget = total_budget - tv_budget - radio_budget
        # range membership also rejects negative remainders and off-grid values
        if newspaper_budget in budget_options:
            budget_allocations.append((tv_budget, radio_budget, newspaper_budget))
    return pd.DataFrame(budget_allocations, columns=XNAMES)


def generate_budget_plans(reference, extra):
    """Return the ``XNAMES`` budgets of ``reference`` with ``extra`` added on top."""
    reference_budget = reference[XNAMES].values
    return reference_budget + extra


def predict_sales_and_compare_plans(model, reference, budget_plans):
    """Predict sales for every budget plan and express each plan relative to ``reference``.

    Returns a pair ``(absolute_difference, relative_difference)``. Both frames
    hold the columns of ``budget_plans`` plus ``predicted_sales`` and are
    sorted by ``predicted_sales`` in descending order. ``budget_plans`` itself
    is left untouched.
    """
    plan = budget_plans.assign(predicted_sales=model.predict(budget_plans))

    baseline = reference.values
    delta = plan - baseline

    def best_first(frame):
        return frame.sort_values('predicted_sales', ascending=False)

    return best_first(delta), best_first(delta / baseline)


# IPython setup: show matplotlib figures inline in the notebook
%matplotlib inline
# IPython setup: load the autoreload extension and enable it in mode 2
%load_ext autoreload
%autoreload 2

In [2]:
# Advertising data set; the first CSV column is used as the row index
ADVERTISING_CSV_URL = 'https://www.statlearning.com/s/Advertising.csv'
df = pd.read_csv(ADVERTISING_CSV_URL, index_col=0)
df.head()

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


## Update Model

1. Choose a model
2. Select hyperparameters
  - Grid search
  - Cross validation
3. Evaluate

In [3]:
from sklearn.model_selection import train_test_split

# features / target, then hold out 20% of the rows as a test set (fixed seed)
X, y = df[XNAMES], df[YNAME]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# a random forest requires little tuning
# still, we vary n_estimators to include CV in this final example
# and step through the entire process
param_grid = {'n_estimators': [50, 100, 200]}

# A random forest is a stochastic model: fix its seed (same value as used for
# the train/test split) so that the CV scores and the selected model are
# reproducible when the notebook is restarted and run from the top.
grid_search_cv = GridSearchCV(RandomForestRegressor(oob_score=True, random_state=42),
                              param_grid=param_grid,
                              cv=10,  # 10-fold cross validation on the training split
                              return_train_score=True,
                              n_jobs=3)
grid_search_cv.fit(X_train, y_train)

In [10]:
# observe that, as noted above (random forest requires little tuning), changing the parameter had little impact
cv_results = grid_search_cv.cv_results_
for key in ('params', 'mean_test_score'):
    print(key, cv_results[key])

params [{'n_estimators': 50}, {'n_estimators': 100}, {'n_estimators': 200}]
mean_test_score [0.96355323 0.96416679 0.96677467]


In [11]:
# Evaluation model: the best estimator found by the grid search, fitted on the training split only.
# NOTE(review): GridSearchCV was created without an explicit `refit` argument; with the
# scikit-learn default it has presumably already refit best_estimator_ on X_train, which
# would make the fit below redundant. Confirm before removing.
model = grid_search_cv.best_estimator_
model.fit(X_train, y_train)  # for use in evaluation
# we skip this part, cf. the previous notebooks

In [12]:
# compute test score to compare with previous model
# (scored on the held-out 20% test split; the model was fitted on the training split only)
model.score(X_test, y_test)

0.9839256960402362

In [13]:
from sklearn.base import clone

# Fit an unfitted copy (same hyperparameters) of the best estimator on ALL data.
# Calling .fit directly on grid_search_cv.best_estimator_ would refit that very
# object in place, so the evaluation model would silently be replaced by one
# that has seen the test set, and re-running the test-score cell would then
# report an overly optimistic score.
model = clone(grid_search_cv.best_estimator_)
model.fit(X, y)  # for use in production
# we use this final model to support our decision making

## TODO

Apply analysis and discussion from previous model to this one.

# Concluding Remarks

- There is a range of different search strategies for choosing
  hyperparameters. For us, using a grid search is fine.
- The model itself is a hyperparameter as well. In this class, 
  it is sufficient to change it manually.
- Random forests are popular *off-the-shelf* models. They are easy
  to use and often provide a good baseline.
- Ideally, the decision making process is modelled in such a way that 
  different models can be plugged in as needed, as we did above.

# Follow Up Questions & Exercises

- How does the feature importance estimation compare between the random forest and 
  the linear model with polynomial terms?
- How does a tree-based model behave outside the range of the training data, i.e. when extrapolating (compared to the linear model)?