In [10]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


# Example: Movie-going and Weather

This notebook will illustrate the entire supervised machine learning process in the context of predicting movie attendance based on the weather on opening weekend.

## 1. Figure out your question

How many people would be expected to attend a movie on a weekend with temperatures in the X1s, precipitation of X2, humidity of X3, . . . ?

## 2. Obtain a labeled dataset

In [11]:
import pandas as pd
import numpy as np

In [12]:
moviedata=pd.read_csv('/content/gdrive/My Drive/Econ 484/datasets/opening_wkend.csv')
print(moviedata.head())
print("Shape: {}".format(str(moviedata.shape)))

   tickets_wk1d_r  ...  res_own_mat10_90_0
0       -1.718881  ...           -0.000291
1       -1.311943  ...           -0.009499
2        0.958356  ...            0.020613
3       -1.305518  ...           -0.103631
4        2.094907  ...            0.205718

[5 rows x 193 columns]
Shape: (1671, 193)


Let's define our "label" (y) vector and our "feature" matrix (X):

In [13]:
y = moviedata.filter(items=['tickets_wk1d_r'])
X = moviedata.filter(like='res_own',axis=1)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler() #create scaler object
scaler.fit(X) #feed the scaler object the x
x_scaled = scaler.transform(X) #then actually standardize it
x_scaled.shape

print('our y vector is:\n',y.head)
print('our X matrix is:\n',X.head)

our y vector is:
 <bound method NDFrame.head of       tickets_wk1d_r
0          -1.718881
1          -1.311943
2           0.958356
3          -1.305518
4           2.094907
...              ...
1666        2.575074
1667        6.183819
1668       -2.352185
1669       -2.064675
1670       -1.981390

[1671 rows x 1 columns]>
our X matrix is:
 <bound method NDFrame.head of       res_own_snow  res_own_rain  ...  res_own_mat10_90_6  res_own_mat10_90_0
0        -0.115566     -0.084558  ...            0.002458           -0.000291
1         0.041614     -0.100350  ...           -0.003902           -0.009499
2         0.065586      0.139913  ...            0.011824            0.020613
3         0.000226     -0.102481  ...           -0.135380           -0.103631
4        -0.000154     -0.041737  ...            0.107601            0.205718
...            ...           ...  ...                 ...                 ...
1666     -0.031120     -0.062038  ...           -0.029375           -0.035598
16

## 3. Divide into training and set sets

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_scaled,y,random_state=42)

## 4. Pick an appropriate method

In [15]:
from sklearn.linear_model import Lasso

## 5. Choose regularization parameters via cross-validation on the training set

By hand if you really want:

In [16]:
from sklearn.model_selection import cross_val_score

In [17]:
lasso = Lasso(alpha=.032, max_iter=100000) #this is just an alpha (lambda) that we picked out of nowhere. we can tweak it to find ones with higher r^2 returned values
scores = cross_val_score(lasso,X_train,y_train,cv=5)
print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.4f}".format(scores.mean()))

#for i in [.0001, .0005,.001, .002,.0022, .003, .004, .006, .008, .01, .012, .014, .016 ,.018, .02 ]:

  #lasso = Lasso(alpha=i, max_iter=100000) #this is just an alpha (lambda) that we picked out of nowhere. we can tweak it to find ones with higher r^2 returned values
  #scores = cross_val_score(lasso,X_train,y_train,cv=5)
  #print("Cross-validation scores: {}".format(scores))
  #print("Average cross-validation score: {:.4f}".format(scores.mean()))


#you can also standardize the data which helps improve the score. Thas can help score, get rid of multicollinearity, and leave an interpretable model
# https://statisticsbyjim.com/regression/standardize-variables-regression/


Cross-validation scores: [ 0.04373883 -0.01780268  0.04292143  0.01118177  0.04983462]
Average cross-validation score: 0.0260


Or use GridSearchCV and do it automatically:

In [18]:
from sklearn.model_selection import GridSearchCV
# define grid for alpha
alpha_grid = {'alpha': [.0001, .0005,.001, .002,.0022, .003, .004, .006, .008, .01, .012, .014, .016 ,.018, .02 ,.49],'max_iter': [100000]}
grid_search = GridSearchCV(Lasso(),alpha_grid,cv=5,return_train_score=True)
best_model=grid_search.fit(X_train,y_train)
print("Best alpha: ",best_model.best_estimator_.get_params()['alpha'])

  positive)


Best alpha:  0.02


Or, even easier, just use LassoCV:

In [19]:
from sklearn.linear_model import LassoCV
lassocv = LassoCV(cv=5).fit(X_train, np.ravel(y_train))
print(lassocv.score(X_train,y_train))
lassocv.alpha_

  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng

0.08242934213125919


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


0.031625433905831085

## 6. Fit model on whole training set using the cross-validated parameters

In [20]:
lassowcvalpha=Lasso(alpha = lassocv.alpha_,max_iter=100000).fit(X_train,y_train)

## 7. Evaluate model by applying it to test set

In [21]:
print('Lasso score on test set: {:.4f}'.format(lassowcvalpha.score(X_test,y_test)))

Lasso score on test set: 0.0543


## 8. Repeat 4-7 for several methods

In [27]:
from sklearn.linear_model import RidgeCV
Ridgecv = RidgeCV(cv=5).fit(X_train, np.ravel(y_train))
print(Ridgecv.score(X_train,y_train))
Ridgecv.alpha_

Ridgecvalpha=Ridge(alpha = Ridgecv.alpha_,max_iter=100000).fit(X_train,y_train)
print('Ridge score on test set: {:.4f}'.format(Ridgecvalpha.score(X_test,y_test)))

#fix later

0.19078511795168207


NameError: ignored

## 9. Apply to new observations for which we have no labels

In [23]:
Xnew=pd.read_csv('/content/gdrive/My Drive/Econ 484/datasets/newobs.csv')
yhatnew=lassowcvalpha.predict(Xnew)
print("predicted residualized ticket sales for new observation: ",yhatnew)

predicted residualized ticket sales for new observation:  [0.00153969]
