<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [None]:
import requests
import re
import dateutil.parser
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import pickle
import sys
import datetime as dt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy


from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score


%matplotlib inline
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt

In [None]:
# Read the prepared modeling DataFrame back from disk.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open("modeling_data.pkl", mode="rb") as picklefile:
    df = pickle.load(picklefile)

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Correlation heatmap over the numeric/boolean columns only. The frame also
# holds text columns (Title, MPAA, Month, Distributor and Actor are read from
# it further down), and DataFrame.corr() raises on those in pandas >= 2.0
# instead of silently dropping them.
numeric_df = df.select_dtypes(include=["number", "bool"])
sns.heatmap(numeric_df.corr(), cmap="seismic", annot=True, vmin=-1, vmax=1);

In [None]:
# Pairwise scatter plots of the numeric predictors together with the target.
pairplot_columns = [
    'Runtime(min)',
    'Widest Release',
    'Budget Adj',
    'Franchise Flag',
    'GDP',
    'Days in Release Calc',
    'Netflix',
    'CNK',
    'Domestic Total Gross',
]
plot_df = df.loc[:, pairplot_columns]
sns.pairplot(plot_df);

In [None]:
# Features (X) and target (y) are pulled from the same frame so they share an index.
feature_columns = [
    'Runtime(min)',
    'Widest Release',
    'Budget Adj',
    'Franchise Flag',
    'GDP',
    'Days in Release Calc',
    'Netflix',
    'CNK',
    'FF2',
]
X = df.loc[:, feature_columns]
y = df.loc[:, 'Domestic Total Gross']

# Working copy that later cells extend with engineered features.
model_df = X.copy()

# Hold out 20% as the final test set; the remaining 80% is used for train/validation.
Xt_v, X_test, yt_v, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)


In [None]:
# Distribution of the target on the train/validation rows.
# NOTE(review): distplot is deprecated since seaborn 0.11; sns.histplot(..., kde=True)
# is the suggested replacement once the seaborn version is confirmed.
sns.distplot(yt_v);

In [None]:
# Summary statistics of the train/validation target.
yt_v.describe()

In [None]:
# Peek at the first rows of the feature frame.
model_df.head()

In [None]:
# Sanity check: the train/val rows selected from model_df by index must match
# what train_test_split returned. A single boolean is shown instead of a full
# element-wise frame (which would also report False wherever both sides are NaN).
Xt_v.equals(model_df.loc[Xt_v.index])

In [None]:
def split_reg_and_validate(X, y, a):
    '''
    Cross validate train/val data on three models (linear, LASSO, Ridge) and report results.

    Uses a shuffled 5-fold KFold (random_state=73). Inside every fold a fresh
    StandardScaler is fitted on the training part only and then applied to the
    validation part, so the validation rows never influence the scaling.
    Per-fold and mean R^2 scores are printed, the coefficients of the
    best-scoring linear and LASSO folds are printed, and a residual plot is
    drawn for the LASSO model of the LAST fold.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names label the printed coefficients.
    y : array-like
        Target values, row-aligned with X.
    a : float
        Regularization strength, passed as ``alpha`` to both Lasso and Ridge.

    Returns
    -------
    tuple
        ``(lm, lasso, std)``: the LinearRegression from the fold with the best
        linear validation R^2, the Lasso from the fold with the best LASSO
        validation R^2, and the StandardScaler of the best *linear* fold.
        NOTE: when the best LASSO fold differs from the best linear fold, the
        returned scaler is not the one the returned Lasso was trained with.
    '''
    features = X.columns
    # Plain arrays, so the positional indices produced by KFold can be used directly.
    X_ar = np.array(X)
    y_ar = np.array(y)

    kf = KFold(n_splits=5, shuffle=True, random_state=73)

    lm_r2s = []       # linear regression: train R^2 per fold
    cv_lm_r2s = []    # linear regression: validation R^2 per fold
    lm_list = []      # linear regression: fitted model per fold
    lm_coefs = []     # linear regression: coefficients per fold

    lasso_r2s = []    # LASSO: validation R^2 per fold
    lasso_list = []   # LASSO: fitted model per fold
    lasso_coefs = []  # LASSO: coefficients per fold

    ridge_r2s = []    # Ridge: validation R^2 per fold

    stds = []         # fitted scaler per fold

    for train_ind, val_ind in kf.split(X_ar, y_ar):

        X_train, y_train = X_ar[train_ind], y_ar[train_ind]
        X_val, y_val = X_ar[val_ind], y_ar[val_ind]

        # Fit the scaler on the training part only, then reuse it for validation.
        std = StandardScaler()
        X_train_std = std.fit_transform(X_train)
        stds.append(std)
        X_val_std = std.transform(X_val)

        # Simple linear regression
        lm = LinearRegression()
        lm.fit(X_train_std, y_train)
        lm_list.append(lm)
        lm_r2s.append(lm.score(X_train_std, y_train))
        cv_lm_r2s.append(lm.score(X_val_std, y_val))
        lm_coefs.append(lm.coef_)

        # Lasso model
        lasso = Lasso(alpha=a, max_iter=1000000)
        lasso.fit(X_train_std, y_train)
        lasso_list.append(lasso)
        lasso_r2s.append(lasso.score(X_val_std, y_val))
        lasso_coefs.append(lasso.coef_)

        # Ridge model (only its validation score is reported)
        ridge = Ridge(alpha=a)
        ridge.fit(X_train_std, y_train)
        ridge_r2s.append(ridge.score(X_val_std, y_val))

    print('Simple regression train scores: ', lm_r2s)
    print('Simple regression cv scores: ', cv_lm_r2s)
    print('LASSO regression cv scores: ', lasso_r2s)
    print('Ridge regression cv scores: ', ridge_r2s)

    print(f'Simple mean train r^2: {np.mean(lm_r2s):.3f} +- {np.std(lm_r2s):.3f}')
    print(f'Simple mean cv r^2: {np.mean(cv_lm_r2s):.3f} +- {np.std(cv_lm_r2s):.3f}')
    print(f'LASSO mean cv r^2: {np.mean(lasso_r2s):.3f} +- {np.std(lasso_r2s):.3f}')
    print(f'Ridge mean cv r^2: {np.mean(ridge_r2s):.3f} +- {np.std(ridge_r2s):.3f}')

    # Index of the fold with the highest validation R^2 for each model family
    # (first such fold on ties), computed once and reused below.
    best_lm_fold = cv_lm_r2s.index(max(cv_lm_r2s))
    best_lasso_fold = lasso_r2s.index(max(lasso_r2s))

    print('Simple reg coefs: \n', dict(zip(features, lm_coefs[best_lm_fold])))
    print('LASSO coefs: \n', dict(zip(features, lasso_coefs[best_lasso_fold])))

    # Residual plot for the LASSO model of the last fold (loop variables are
    # still bound to that fold here). The zero reference line spans the whole
    # axis instead of a hard-coded x range that ignored the scale of y.
    plt.scatter(y_val, y_val - lasso.predict(X_val_std), alpha=.1)
    plt.axhline(0, color='C1')
    plt.xlabel('Actual value (last validation fold)')
    plt.ylabel('Actual - LASSO prediction')

    return lm_list[best_lm_fold], lasso_list[best_lasso_fold], stds[best_lm_fold]

In [None]:
# Baseline: cross-validate the raw features on the train/val rows with alpha=1000.
lm1, lasso1, std1 = split_reg_and_validate(model_df.loc[Xt_v.index],yt_v,1000)

In [None]:
# Feature names available for engineering in the following cells.
Xt_v.columns

In [None]:
# Second model: extend the baseline features with squared terms.
# Maps each new column name to the column it squares.
squared_terms = {
    'WR2': 'Widest Release',
    'B2': 'Budget Adj',
    'GDP2': 'GDP',
    'FF2^2': 'FF2',
}

X2 = model_df.copy()
for squared_name, base_name in squared_terms.items():
    X2[squared_name] = X2[base_name] ** 2

lm2, lasso2, std2 = split_reg_and_validate(X2.loc[Xt_v.index], yt_v, 100000)

In [None]:
# Third model: add pairwise interaction terms on top of the squared terms.
# Each entry is (new column, left factor, right factor).
interaction_terms = [
    ('B_x_WR', 'Budget Adj', 'Widest Release'),
    ('B_x_DR', 'Budget Adj', 'Days in Release Calc'),
    ('B_x_FF', 'Budget Adj', 'Franchise Flag'),
    ('B_x_GDP', 'Budget Adj', 'GDP'),
    ('WR_x_DR', 'Widest Release', 'Days in Release Calc'),
    ('RT_x_DR', 'Runtime(min)', 'Days in Release Calc'),
    ('RT_x_FF', 'Runtime(min)', 'Franchise Flag'),
    ('DR_x_FF', 'Days in Release Calc', 'Franchise Flag'),
    ('DR_x_CNK', 'Days in Release Calc', 'CNK'),
]

X3 = X2.copy()
for term_name, left_col, right_col in interaction_terms:
    X3[term_name] = X3[left_col] * X3[right_col]

lm3, lasso3, std3 = split_reg_and_validate(X3.loc[Xt_v.index], yt_v, 1000)

In [None]:
X4 = X3.copy()

# Add categorical variables from df for EVERY row, not just the train/val rows.
# Assigning only df.loc[Xt_v.index, ...] left the held-out test rows as NaN,
# which get_dummies encodes as all-zero dummies -- the final test evaluation
# (which reads the test rows of the derived frame) then ran without any
# rating / month / distributor / actor information. X4 and df share the same
# index, so plain column assignment aligns row by row.
categorical_sources = {
    'MPAA': 'MPAA',
    'Month': 'Month',
    'Dist': 'Distributor',
    'Actor': 'Actor',
}
for dummy_prefix, source_col in categorical_sources.items():
    X4[dummy_prefix] = df[source_col]

X4 = pd.get_dummies(X4, columns=['MPAA', 'Month', 'Dist', 'Actor'])

# Cross-validation still only sees the train/val rows.
lm4, lasso4, std4 = split_reg_and_validate(X4.loc[Xt_v.index], yt_v, 100000)

In [None]:
X5 = X4.copy()

# Interaction terms that involve the dummy variables created above.
X5['FF_x_RDJ'] = X5['Franchise Flag'] * X5['Actor_Robert Downey, Jr.']
X5['FF_x_TP'] = X5['Franchise Flag'] * X5['Actor_Tyler Perry']
X5['FF_x_BV'] = X5['Franchise Flag'] * X5['Dist_Buena Vista']
X5['FF2_x_June'] = X5['FF2'] * X5['Month_June']
X5['WR2_x_FF2'] = X5['WR2'] * X5['FF2']

# Bind the result like the earlier model cells do; leaving the call as the
# cell's last bare expression dumps a tuple of estimator reprs into the output.
lm5, lasso5, std5 = split_reg_and_validate(X5.loc[Xt_v.index], yt_v, 100000)

In [None]:
# Fit the final scaler on the train/val rows only; test rows are transformed later.
trainval_features = X5.loc[Xt_v.index].values
std_t = StandardScaler()
std_t.fit(trainval_features)

In [None]:
# Standardize the train/val rows with the scaler fitted on those same rows.
X5tr = std_t.transform(X5.loc[Xt_v.index].values)

In [None]:
# Candidate regularization strengths: 1000 points, log-spaced from 1e2 to 1e6.
alphavec = np.logspace(2, 6, num=1000)

# 5-fold cross-validated search over alphavec on the standardized train/val data.
lasso_model5 = LassoCV(alphas=alphavec, cv=5, max_iter=100000)
lasso_model5.fit(X5tr, yt_v)

# In-sample R^2 of the refitted best model (not a validation score).
lasso_model5.score(X5tr, yt_v)

In [None]:
# Regularization strength selected by the cross-validated search.
final_alpha = lasso_model5.alpha_
final_alpha

In [None]:
# Shape of the held-out test rows in the fully engineered feature frame.
X5.loc[X_test.index].shape

In [None]:
# Standardize the test rows with the scaler that was fitted on train/val rows only.
X_te = std_t.transform(X5.loc[X_test.index].values)

In [None]:
# Confirm the standardized test matrix kept the same shape.
X_te.shape

In [None]:
# Final model: LASSO at the selected alpha, trained on all train/val rows.
lasso_final = Lasso(max_iter=100000, alpha=final_alpha)
lasso_final.fit(X5tr, yt_v)

# In-sample R^2 on the train/val data.
train_val_r2 = lasso_final.score(X5tr, yt_v)
train_val_r2

In [None]:
# R^2 of the final model on the held-out test set.
final_score = lasso_final.score(X_te,y_test)
final_score

In [None]:
# Test-set predictions, in the same row order as X_te / y_test.
final_pred = lasso_final.predict(X_te)

In [None]:
# Predicted vs. actual gross on the test set. x and y are passed by keyword:
# seaborn >= 0.12 accepts only `data` positionally and raises TypeError otherwise.
sns.regplot(x=final_pred, y=y_test, scatter_kws={'alpha': .2});

In [None]:
# Start the test-set report from the engineered features of the held-out rows.
final_test_df = X5.loc[X_test.index]

In [None]:
# Assemble the test-set report: features, actual gross, prediction, residuals
# and title. It is rebuilt straight from X5 so the cell can be re-run; joining
# onto the previous final_test_df fails on the second run because the joined
# columns already exist.
final_test_df = X5.loc[X_test.index].join(y_test)
final_test_df['preds'] = lasso_final.predict(X_te)
final_test_df['resid'] = final_test_df['preds'] - final_test_df['Domestic Total Gross']
final_test_df['resid abs'] = np.absolute(final_test_df['resid'])
final_test_df = final_test_df.join(df['Title'])

# Show the largest misses. The sort is for display only, so the stored frame
# keeps its original row order (the earlier sort_values / groupby calls threw
# their results away and had no effect).
final_test_df.sort_values(by='resid abs', ascending=False).head(10)

In [None]:
# Same predicted-vs-actual view, drawn from the report frame. Keyword x/y are
# required by seaborn >= 0.12 and also label both axes from the Series names.
sns.regplot(
    x=final_test_df['preds'],
    y=final_test_df['Domestic Total Gross'],
    scatter_kws={'alpha': .2},
);

In [None]:
#Mean Absolute Error (MAE)
def mae(y_true, y_pred):
    """Return the mean absolute difference between predictions and targets.

    Both inputs are converted to float arrays first, so plain lists are
    accepted and values are always compared position by position (two pandas
    Series with different indexes are not silently re-aligned by label).

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    numpy.float64
        Mean of ``|y_pred - y_true|``.
    """
    true_values = np.asarray(y_true, dtype=float)
    predicted_values = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(predicted_values - true_values))

In [None]:
# Mean absolute error of the final model on the held-out test set.
mae(y_test,final_pred)

In [None]:
# Persist the test-set report (features, predictions, residuals, titles).
with open('final_test_df.pkl', mode='wb') as picklefile:
    pickle.dump(final_test_df, picklefile)