In [2]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.linear_model import Ridge, RidgeCV, LassoCV
from numpy import array
from numpy import argmax
from math import sqrt
from sklearn.metrics import mean_squared_error

In [5]:
# read traning and testing data
pd_training_data = pd.read_csv("dodgers_training (1).csv")

pd_testing_data=pd.read_csv("dodgers_testing (1).csv")

pd_train=pd_training_data[["month","attend","day_of_week","temp","skies","bobblehead"]]

pd_test=pd_testing_data[["month","attend","day_of_week","temp","skies","bobblehead"]]
print(pd_train.head())

  month  attend day_of_week  temp   skies bobblehead
0   AUG   36878      Monday    80  Clear          NO
1   AUG   40284    Saturday    70  Clear          NO
2   AUG   55024     Tuesday    80  Clear         YES
3   APR   26376      Monday    60  Cloudy         NO
4   AUG   37084   Wednesday    84  Clear          NO


In [6]:
#one hot encoding for skies
training_skies=pd.get_dummies(pd_train['skies'])
pd_train=pd_train.drop('skies',axis=1)
pd_train=pd_train.join(training_skies)

testing_skies=pd.get_dummies(pd_test['skies'])
pd_test=pd_test.drop('skies',axis=1)
pd_test=pd_test.join(testing_skies)

print(pd_train.head())
print(pd_test.head())

  month  attend day_of_week  temp bobblehead  Clear   Cloudy
0   AUG   36878      Monday    80         NO       1       0
1   AUG   40284    Saturday    70         NO       1       0
2   AUG   55024     Tuesday    80        YES       1       0
3   APR   26376      Monday    60         NO       0       1
4   AUG   37084   Wednesday    84         NO       1       0
  month  attend day_of_week  temp bobblehead  Clear   Cloudy
0   AUG   40173   Wednesday    75         NO       1       0
1   JUN   45210    Saturday    68         NO       1       0
2   AUG   32659      Monday    79         NO       1       0
3   JUN   55279     Tuesday    66        YES       0       1
4   JUL   55359      Sunday    75        YES       1       0


In [7]:
#one hot encoding for skies for month
training_month=pd.get_dummies(pd_train['month'])
pd_train=pd_train.drop('month',axis=1)
pd_train=pd_train.join(training_month)


test_month=pd.get_dummies(pd_test['month'])
pd_test=pd_test.drop('month',axis=1)
pd_test=pd_test.join(test_month)


In [8]:
#one hot encoding for skies for days of week
training_day_of_week=pd.get_dummies(pd_train['day_of_week'])
pd_train=pd_train.drop('day_of_week',axis=1)
pd_train=pd_train.join(training_day_of_week)

test_day_of_week=pd.get_dummies(pd_test['day_of_week'])
pd_test=pd_test.drop('day_of_week',axis=1)
pd_test=pd_test.join(test_day_of_week)
 


In [9]:
#one hot encoding for skies for days of bobblehead
train_bobblehead=pd.get_dummies(pd_train['bobblehead'])
pd_train=pd_train.drop('bobblehead',axis=1)
pd_train=pd_train.join(train_bobblehead)


test_bobblehead=pd.get_dummies(pd_test['bobblehead'])
pd_test=pd_test.drop('bobblehead',axis=1)
pd_test=pd_test.join(test_bobblehead)




In [10]:
# training dataset
X_pd_train = pd_train.iloc[0:,1:]
Y_pd_train = pd_train.iloc[0:,0:1]

#testing dataset
X_pd_test = pd_test.iloc[0:,1:]
Y_pd_test = pd_test.iloc[0:,0:1]

X_pd_test['Thursday']=0

#rearranging columns in testing dataset to match it with training dataset
cols = X_pd_train.columns.tolist()
X_pd_test = X_pd_test[cols]

# scale train dataset
scaler = preprocessing.StandardScaler().fit(X_pd_train)
X_pd_train_scaled = scaler.transform(X_pd_train)

# scale test dataset
X_pd_test_scaled = scaler.transform(X_pd_test)


### Using the training dataset, train 100 L2-regularized linear models corresponding to 100
#regularization coefficients evenly spaced between 0.1 and 1000. Use the leave-one-out
#cross-validation.
step1=(1000-0.1)/(100-1)
l2_cv = RidgeCV(alphas=np.arange(0.1,1000,step=step1),cv=None,store_cv_values=True)
l2_cv.fit(X_pd_train_scaled, Y_pd_train)

#same operaton with L1-regularization
l1_cv = LassoCV(alphas=np.arange(0.1,1000,step=step1),cv=None,max_iter=100000)
l1_cv.fit(X_pd_train_scaled, Y_pd_train.values.ravel())


#Model without regularizaion
ols = linear_model.LinearRegression(fit_intercept=True)
ols.fit(X_pd_train_scaled,Y_pd_train)
print(ols.intercept_)
print(ols.coef_)

In [12]:
#Q1)[10pts] List the means and standard deviations of the encoded attributes before scaling
print('original mean of training dataset =\n', np.mean(pd_train, axis=0))
print('original standard deviation of training dataset = \n', np.std(pd_train, axis=0))

print('original mean of testing dataset =\n', np.mean(pd_test, axis=0))
print('original standard deviation of testing dataset = \n', np.std(pd_test, axis=0))

original mean of training dataset =
 attend       39664.803571
temp            73.642857
Clear            0.803571
Cloudy           0.196429
APR              0.160714
AUG              0.214286
JUL              0.089286
JUN              0.089286
MAY              0.232143
OCT              0.017857
SEP              0.196429
Friday           0.178571
Monday           0.142857
Saturday         0.160714
Sunday           0.196429
Thursday         0.089286
Tuesday          0.089286
Wednesday        0.142857
NO               0.910714
YES              0.089286
dtype: float64
original standard deviation of training dataset = 
 attend       7868.654889
temp            8.868944
Clear           0.397296
Cloudy          0.397296
APR             0.367267
AUG             0.410326
JUL             0.285156
JUN             0.285156
MAY             0.422200
OCT             0.132432
SEP             0.397296
Friday          0.382993
Monday          0.349927
Saturday        0.367267
Sunday          0.397296
T

In [13]:
#[15pts] What is the best L2 regularization coefficient? Provide the corresponding linear
#coefficients.
print('For L2 Regularization->')
l2_alpha=l2_cv.alpha_
print('alpha = ', l2_alpha) 
print('coef = ', l2_cv.coef_)


For L2 Regularization->
alpha =  50.6
coef =  [[  515.85380007   271.19412682  -271.19412682  -273.56102806
    457.54654532  -216.88248689  1102.57100111  -605.71237164
    -75.78927596  -186.41964828    67.69322043  -835.67182144
    490.25315842   238.14165655  -341.53918562  1070.28517076
   -617.19884398 -1145.58152754  1145.58152754]]


In [14]:
#What is the best L1 regularization coefficient? Provide the corresponding linear
#coefficients
print('\nFor L1 Regularization->')
l1_alpha=l1_cv.alpha_
print('alpha = ', l1_alpha) 
coef= l1_cv.coef_
print(coef)



For L1 Regularization->
alpha =  616.2
[ 7.86580235e+02  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  6.49813690e+02 -0.00000000e+00  1.78831426e+03 -6.34824198e+01
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00 -1.02533871e+03
  1.26701250e+02  0.00000000e+00 -2.71435063e+02  1.32734869e+03
 -7.50782105e+02 -2.53047289e+03  3.11826755e-12]


In [15]:
#[15pts] What are the predictive attributes selected as a result of L1 regularization?
i=0
predictive_attributes=[]
for x in X_pd_train.columns:
    if(coef[i]!=0):
        predictive_attributes.append(x)
    i=i+1
print("Predictive attributes selected as result of L1 regularization:\n",predictive_attributes)

Predictive attributes selected as result of L1 regularization:
 ['temp', 'AUG', 'JUN', 'MAY', 'Monday', 'Saturday', 'Thursday', 'Tuesday', 'Wednesday', 'NO', 'YES']


In [16]:
#For the models corresponding to the best L2 and L1 regularization coefficients
#list the following root-mean-square-error (RMSE): (1) RMSE on the training set, (2)
##expected prediction RMSE obtained during the cross-validation, (3) RMSE on the testing
#set. For the linear model without regularization list the following RMSE: (1) RMSE on
#the training set, and (2) RMSE on the testing set

#expected prediction RMSE obtained during the cross-validation

L2_MSE=l2_cv.cv_values_
RMSE_mean=np.mean(L2_MSE,axis=0)
RMSE_min=np.min(np.sqrt(RMSE_mean))
print("L2 Expected prediction RMSE obtained during the cross-validation",RMSE_min)

L1_MSE=l1_cv.mse_path_
L1_RMSE_mean=np.mean(L1_MSE,axis=1)
L1_RMSE_min=np.min(np.sqrt(L1_RMSE_mean))
print("L1 Expected prediction RMSE obtained during the cross-validation",L1_RMSE_min)


# L2
l2_train_pred = l2_cv.predict(X_pd_train_scaled)
print('L2 - RMSE train = ', sqrt(mean_squared_error(l2_train_pred,Y_pd_train)))

l2_test_pred=l2_cv.predict(X_pd_test_scaled)

print('L2 - RMSE test = ', sqrt(mean_squared_error(l2_test_pred,Y_pd_test)))

#L1
l1_train_pred = l1_cv.predict(X_pd_train_scaled)
print('L1 - RMSE train = ', sqrt(mean_squared_error(l1_train_pred,Y_pd_train)))

l1_test_pred=l1_cv.predict(X_pd_test_scaled)

print('L1 - RMSE test = ', sqrt(mean_squared_error(l1_test_pred,Y_pd_test)))

#noreg

noreg_train_pred = ols.predict(X_pd_train_scaled)
print('No regularization - RMSE train = ', sqrt(mean_squared_error(noreg_train_pred,Y_pd_train)))

noreg_test_pred=ols.predict(X_pd_test_scaled)

print('No regularization - RMSE test = ', sqrt(mean_squared_error(noreg_test_pred,Y_pd_test)))

L2 Expected prediction RMSE obtained during the cross-validation 6737.660948839507
L1 Expected prediction RMSE obtained during the cross-validation 6753.242263861002
L2 - RMSE train =  5648.86421638529
L2 - RMSE test =  6710.58206628092
L1 - RMSE train =  5568.617853858854
L1 - RMSE test =  6804.849508052487
No regularization - RMSE train =  5254.5387249249425
No regularization - RMSE test =  7058.684526782614


In [17]:
#[15pts] Predict the attendance on a clear Monday in June when the expected temperature
#is 72 for all three models with and without bobbleheads. Does bobblehead promotion
#have an impact on the attendance?
#1)With bobblehead
with_bobblehead=pd.Series([72,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1])
with_bobblehead_scaled=scaler.transform(with_bobblehead.values.reshape(1, -1))
#2)Without bobblehead
without_bobblehead=pd.Series([72,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0])
without_bobblhead_scaled=scaler.transform(without_bobblehead.values.reshape(1, -1))

#L2 regularization
l2_with_bobblhead_pred = l2_cv.predict(with_bobblehead_scaled)

print("L2- with bobblhead:",l2_with_bobblhead_pred[0])

l2_without_bobblhead_pred = l2_cv.predict(without_bobblhead_scaled)

print("L2- without bobblhead:",l2_without_bobblhead_pred[0])

#L1 regularization
l1_with_bobblhead_pred = l1_cv.predict(with_bobblehead_scaled)

print("L1- with bobblhead:",l1_with_bobblhead_pred)

l1_without_bobblhead_pred = l1_cv.predict(without_bobblhead_scaled)

print("L1- without bobblhead:",l1_without_bobblhead_pred)

#no regularization

noreg_with_bobblhead_pred = ols.predict(with_bobblehead_scaled)

print("No regularization- with bobblhead:",noreg_with_bobblhead_pred[0])

noreg_without_bobblhead_pred = ols.predict(without_bobblhead_scaled)

print("No regularization- without bobblhead:",noreg_without_bobblhead_pred[0])

L2- with bobblhead: [48673.22150451]
L2- without bobblhead: [40638.44252892]
L1- with bobblhead: [50416.6365564]
L1- without bobblhead: [41542.63245279]
No regularization- with bobblhead: [51684.84997165]
No regularization- without bobblhead: [41823.22849175]


Conclusion: Yes bobblehead as impact on attedance of the game as you can see from the prediction result.
