In [1]:
%matplotlib inline
import pandas as pd
import os
import matplotlib as plt
import seaborn as sns
import numpy as np
import statsmodels.formula.api as sm

In [2]:
# Custom-style the notebook's HTML output by injecting two CSS files.

# HTML is part of the public IPython.display API; importing it from
# IPython.core.display is deprecated.
from IPython.display import HTML

# NOTE(review): these are absolute, machine-specific paths; the notebook will
# only run where these files exist. Consider moving them next to the notebook.
csspath1 = r'C:\COURSERA\PYCON2015_TUTORIALS\Brandon Rhodes - Pandas From The Ground Up - PyCon 2015\style-table.css'
csspath2 = r'C:\COURSERA\PYCON2015_TUTORIALS\Brandon Rhodes - Pandas From The Ground Up - PyCon 2015\style-notebook.css'

# Read both stylesheets inside a context manager so the file handles are
# closed deterministically instead of being left open until garbage collection.
with open(csspath1) as table_css, open(csspath2) as notebook_css:
    css = table_css.read() + notebook_css.read()

# The last expression is rendered by the notebook, which applies the styles.
HTML('<style>{}</style>'.format(css))

In [3]:
# Read the Elantra data from CSV and preview the first three rows.

elantra = pd.read_csv('DATA/elantra.csv')
elantra.iloc[:3]

Unnamed: 0,Month,Year,ElantraSales,Unemployment,Queries,CPI_energy,CPI_all
0,1,2010,7690,9.7,153,213.377,217.466
1,1,2011,9659,9.1,259,229.353,221.082
2,1,2012,10900,8.2,354,244.178,227.666


In [4]:
# Chronological split of the data set:
#   - training set: every observation from 2012 or earlier
#   - testing set:  every observation after 2012 (i.e. 2013 and 2014)

train = elantra.loc[elantra['Year'] <= 2012]
test = elantra.loc[elantra['Year'] > 2012]

In [5]:
# (rows, columns) of the training and testing sets, shown as a pair.
(train.shape, test.shape)

((36, 7), (14, 7))

In [6]:
# List the column labels of the loaded data frame.
elantra.columns

Index(['Month', 'Year', 'ElantraSales', 'Unemployment', 'Queries',
       'CPI_energy', 'CPI_all'],
      dtype='object')

In [7]:
# First model: ordinary least squares regression of monthly Elantra sales
# on Unemployment, CPI_all, CPI_energy and Queries, fitted on the whole
# training set.

formula = "ElantraSales ~ Unemployment + CPI_all + CPI_energy + Queries"

ols_model1 = sm.ols(formula=formula, data=train)
linreg1 = ols_model1.fit()
print(linreg1.summary2())

                      Results: Ordinary least squares
Model:                  OLS                Adj. R-squared:       0.354     
Dependent Variable:     ElantraSales       AIC:                  689.9816  
Date:                   2016-05-03 00:03   BIC:                  697.8992  
No. Observations:       36                 Log-Likelihood:       -339.99   
Df Model:               4                  F-statistic:          5.803     
Df Residuals:           31                 Prob (F-statistic):   0.00132   
R-squared:              0.428              Scale:                1.0854e+07
---------------------------------------------------------------------------
               Coef.      Std.Err.     t    P>|t|     [0.025       0.975]  
---------------------------------------------------------------------------
Intercept    95385.3636 170663.8142  0.5589 0.5802 -252685.7802 443456.5074
Unemployment -3179.8996   3610.2623 -0.8808 0.3852  -10543.0780   4183.2788
CPI_all       -297.6456    704.836

In [8]:
# How many variables are significant, or have levels that are significant?
# Use 0.10 as the p-value cutoff.
#
# The Intercept is not an independent variable, so it is excluded before
# counting; otherwise a significant intercept would inflate the answer.
# errors='ignore' keeps this working for a model fitted without an intercept.

predictor_pvalues = linreg1.pvalues.drop('Intercept', errors='ignore')
(predictor_pvalues < 0.1000).sum()

0

In [9]:
# Fitted coefficient of the Unemployment variable in the first model.

linreg1.params['Unemployment']

-3179.899573378636

In [14]:
# Second model: account for a seasonal effect by adding Month to the
# predictors used before (Unemployment, CPI_all, CPI_energy, Queries).
# Month enters the formula as a plain numeric column, and the training and
# testing data frames are left unmodified.

formula = "ElantraSales ~ Unemployment + CPI_all + CPI_energy + Queries + Month"

ols_model2 = sm.ols(formula=formula, data=train)
linreg2 = ols_model2.fit()
print(linreg2.summary2())

                      Results: Ordinary least squares
Model:                 OLS                 Adj. R-squared:        0.340     
Dependent Variable:    ElantraSales        AIC:                   691.5836  
Date:                  2016-05-03 00:03    BIC:                   701.0847  
No. Observations:      36                  Log-Likelihood:        -339.79   
Df Model:              5                   F-statistic:           4.609     
Df Residuals:          30                  Prob (F-statistic):    0.00308   
R-squared:             0.434               Scale:                 1.1092e+07
----------------------------------------------------------------------------
                Coef.      Std.Err.     t    P>|t|     [0.025       0.975]  
----------------------------------------------------------------------------
Intercept    148330.4877 195373.5066  0.7592 0.4536 -250675.4435 547336.4189
Unemployment  -4137.2826   4008.5579 -1.0321 0.3103  -12323.8499   4049.2847
CPI_all        -517.99

In [11]:
# In the new model, given two monthly periods that are otherwise identical
# in Unemployment, CPI_all, CPI_energy and Queries, what is the absolute
# difference in predicted Elantra sales when one period is in January
# (Month = 1) and the other is in March (Month = 3)?
#
# With Month as a numeric predictor, the difference is the Month coefficient
# times the gap in month numbers. abs() guarantees an absolute difference
# even if the fitted coefficient is negative.

abs(linreg2.params['Month'] * (3 - 1))

221.37053930261632

In [12]:
# In the new model, given two monthly periods that are otherwise identical
# in Unemployment, CPI_all, CPI_energy and Queries, what is the absolute
# difference in predicted Elantra sales when one period is in January
# (Month = 1) and the other is in May (Month = 5)?
#
# With Month as a numeric predictor, the difference is the Month coefficient
# times the gap in month numbers. abs() guarantees an absolute difference
# even if the fitted coefficient is negative.

abs(linreg2.params['Month'] * (5 - 1))

442.74107860523264

In [18]:
# Third model: re-run the regression with Month modeled as a factor
# (categorical) variable. Wrapping the column as C(Month) in the formula
# does this without overwriting the numeric Month column, which is still
# needed later in the problem.
#
# Question answered by the printed summary: what is the model R-squared?

formula = "ElantraSales ~ Unemployment + CPI_all + CPI_energy + Queries + C(Month)"

ols_model3 = sm.ols(formula=formula, data=train)
linreg3 = ols_model3.fit()
print(linreg3.summary2())

                       Results: Ordinary least squares
Model:                  OLS                 Adj. R-squared:        0.684     
Dependent Variable:     ElantraSales        AIC:                   670.5157  
Date:                   2016-05-03 00:07    BIC:                   695.8521  
No. Observations:       36                  Log-Likelihood:        -319.26   
Df Model:               15                  F-statistic:           6.044     
Df Residuals:           20                  Prob (F-statistic):    0.000147  
R-squared:              0.819               Scale:                 5.3172e+06
-----------------------------------------------------------------------------
                  Coef.      Std.Err.     t    P>|t|     [0.025      0.975]  
-----------------------------------------------------------------------------
Intercept      312509.2802 144061.8671  2.1693 0.0423  12001.4913 613017.0690
C(Month)[T.2]    2254.9978   1943.2486  1.1604 0.2595  -1798.5477   6308.5433
C(Month)[

In [19]:
# Flag each term of the third model (including every Month level) as
# significant or not, using 0.10 as the p-value cutoff.

linreg3.pvalues.lt(0.1000)

Intercept          True
C(Month)[T.2]     False
C(Month)[T.3]      True
C(Month)[T.4]      True
C(Month)[T.5]      True
C(Month)[T.6]      True
C(Month)[T.7]      True
C(Month)[T.8]      True
C(Month)[T.9]      True
C(Month)[T.10]    False
C(Month)[T.11]    False
C(Month)[T.12]     True
Unemployment       True
CPI_all            True
CPI_energy         True
Queries           False
dtype: bool

In [31]:
# Which variables is CPI_energy highly correlated with? "Highly" means the
# absolute value of the correlation exceeds 0.6. Month is treated as a
# numeric variable here, not as a factor.

for var in ['Month', 'Unemployment', 'Queries', 'CPI_all']:
    # Off-diagonal entry of the 2x2 correlation matrix for the pair.
    pair_corr = np.corrcoef(train['CPI_energy'], train[var])[0, 1]
    print('Correlation of CPI_energy and', var, abs(pair_corr))

Correlation of CPI_energy and Month 0.17601982066
Correlation of CPI_energy and Unemployment 0.800718807452
Correlation of CPI_energy and Queries 0.832838105861
Correlation of CPI_energy and CPI_all 0.913225909008


In [33]:
# Which variables is Queries highly correlated with? "Highly" means the
# absolute value of the correlation exceeds 0.6. Month is treated as a
# numeric variable here, not as a factor.

for var in ['Month', 'Unemployment', 'CPI_energy', 'CPI_all']:
    # Off-diagonal entry of the 2x2 correlation matrix for the pair.
    pair_corr = np.corrcoef(train['Queries'], train[var])[0, 1]
    print('Correlation of Queries and', var, abs(pair_corr))

Correlation of Queries and Month 0.0158442958745
Correlation of Queries and Unemployment 0.641109291034
Correlation of Queries and CPI_energy 0.832838105861
Correlation of Queries and CPI_all 0.75367323625


In [42]:
# Which variables, and in what order, are removed when simplifying the model?
#
# The variable with the highest p-value is "Queries". After removing it and
# looking at the model again, no variable is insignificant at the 0.10
# p-level. Month does have a few levels that are insignificant, but it is
# kept because many of its levels are very significant.

formula = "ElantraSales ~ Unemployment + CPI_all + CPI_energy + C(Month)"

ols_model4 = sm.ols(formula=formula, data=train)
linreg4 = ols_model4.fit()

# Order terms from largest to smallest p-value, then flag those below 0.10.
pvalues_desc = linreg4.pvalues.sort_values(ascending=False)
print(pvalues_desc < 0.1000)

C(Month)[T.2]     False
C(Month)[T.10]    False
C(Month)[T.11]     True
Intercept          True
CPI_all            True
C(Month)[T.9]      True
Unemployment       True
C(Month)[T.12]     True
CPI_energy         True
C(Month)[T.3]      True
C(Month)[T.5]      True
C(Month)[T.8]      True
C(Month)[T.4]      True
C(Month)[T.6]      True
C(Month)[T.7]      True
dtype: bool


In [43]:
# Predict Elantra sales for the test set with the simplified model (linreg4).

predictions = linreg4.predict(exog=test)

In [50]:
# Sum of squared errors (SSE) of the model's predictions on the test set.

test_errors = predictions - test.ElantraSales
sum(test_errors ** 2)

190757747.44427505

In [55]:
# What would the baseline method predict for every observation in the test
# set?
#
# The baseline used in the R-squared calculation (to compute SST, the total
# sum of squares) ignores all independent variables and simply predicts the
# mean of ElantraSales in the training set for each observation.

train['ElantraSales'].mean()

14462.25

In [125]:
# Test-set R-squared: 1 - SSE / SST, where
#   SSE = squared error of the model's predictions on the test set
#   SST = squared error of the baseline, which predicts the training-set
#         mean of ElantraSales for every test observation

baseline_prediction = np.mean(train.ElantraSales)

SSE = sum((test.ElantraSales - predictions) ** 2)
SST = sum((test.ElantraSales - baseline_prediction) ** 2)
test_set_r_squared = 1 - SSE / SST

(SSE, SST, test_set_r_squared)

(190757747.44427505, 701375142.375, 0.72802322762918248)

In [62]:
# Largest absolute error among the test-set predictions.

(predictions - test.ElantraSales).abs().max()

7491.4876927105215

In [109]:
# Find the test observation with the largest absolute prediction error; its
# row label identifies the (Month, Year) period looked up in the next cell.

a = (predictions - test.ElantraSales).abs()
a.sort_values(ascending=False).iloc[:1]

13    7491.487693
Name: ElantraSales, dtype: float64

In [107]:
# Month and Year of the test observation with the largest absolute
# prediction error. The row label is computed with idxmax() instead of being
# hard-coded from an earlier cell's output, so this cell stays correct if the
# data or the model changes.
worst_label = (predictions - test.ElantraSales).abs().idxmax()

# Select the two columns before the row lookup so the result keeps the
# columns' own dtype.
test[['Month', 'Year']].loc[worst_label]

Month       3
Year     2013
Name: 13, dtype: int64