# DS-SF-25 | Codealong 07 | Introduction to Regression and Model Fit, Part 2 | Answer Key

In [2]:
import os

import numpy as np
import pandas as pd
# Notebook display settings: cap printed frames at 20 rows and 10 columns,
# and render them with the HTML representation.
pd.set_option('display.max_rows', 20)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import feature_selection, linear_model

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

import seaborn as sns

## Part A - Model's F-statistic

In [3]:
# Load the Zillow dataset, using its `ID` column as the row index.
zillow_csv = os.path.join('..', 'datasets', 'zillow-07.csv')
df = pd.read_csv(zillow_csv, index_col='ID')

> ### `SalePrice` as a function of `Size`

In [4]:
# Simple OLS regression of SalePrice on Size (statsmodels formula API);
# the summary table reports the model's F-statistic and its p-value.
model = smf.ols(formula='SalePrice ~ Size', data=df).fit()
model.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.236
Model:,OLS,Adj. R-squared:,0.235
Method:,Least Squares,F-statistic:,297.4
Date:,"Thu, 04 Aug 2016",Prob (F-statistic):,2.67e-58
Time:,14:18:11,Log-Likelihood:,-1687.9
No. Observations:,967,AIC:,3380.0
Df Residuals:,965,BIC:,3390.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,0.1551,0.084,1.842,0.066,-0.010 0.320
Size,0.7497,0.043,17.246,0.000,0.664 0.835

0,1,2,3
Omnibus:,1842.865,Durbin-Watson:,1.704
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3398350.943
Skew:,13.502,Prob(JB):,0.0
Kurtosis:,292.162,Cond. No.,4.4


> ### `SalePrice` as a function of `IsAStudio`

In [5]:
# Simple OLS regression of SalePrice on the IsAStudio indicator.
# `model` is kept: the next two cells read its F-value and p-value.
model = smf.ols(formula='SalePrice ~ IsAStudio', data=df).fit()
model.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.07775
Date:,"Thu, 04 Aug 2016",Prob (F-statistic):,0.78
Time:,14:18:11,Log-Likelihood:,-1847.4
No. Observations:,986,AIC:,3699.0
Df Residuals:,984,BIC:,3709.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,1.3811,0.051,27.088,0.000,1.281 1.481
IsAStudio,0.0829,0.297,0.279,0.780,-0.501 0.666

0,1,2,3
Omnibus:,1682.807,Durbin-Watson:,1.488
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1342290.714
Skew:,10.942,Prob(JB):,0.0
Kurtosis:,182.425,Cond. No.,5.92


### Model's F-value (with significance level of `5%`)

In [6]:
# F-statistic of the most recent fit, i.e. SalePrice ~ IsAStudio
model.fvalue

0.077751247187816605

### Corresponding p-value

In [7]:
# p-value of that F-statistic (same SalePrice ~ IsAStudio fit)
model.f_pvalue

0.78042689060360249

## Part B1 - Linear Regression Modeling with `sklearn`

In [8]:
def summary(X, y, model):
    """Print F-tests, R^2 and coefficients for a fitted linear model.

    Works unchanged on Python 2 and Python 3: every line is emitted through a
    single-argument ``print(...)`` call, which both versions render the same.

    Parameters
    ----------
    X : pandas.DataFrame
        Regressors. Column labels are used to name the coefficients.
    y : array-like
        Response, passed as-is to ``f_regression`` and ``model.score``.
    model : fitted estimator
        Object exposing ``score(X, y)``, ``intercept_`` and an iterable
        ``coef_`` with one entry per column of ``X`` (the notebook passes a
        fitted ``linear_model.LinearRegression``).

    Returns
    -------
    None
        The report is written to standard output only.
    """
    fvalues, f_pvalues = feature_selection.f_regression(X, y)
    # The label text (including "not join") is kept verbatim so the output
    # matches previously recorded runs of this notebook.
    print('F-statistic (not join but instead done sequentially for each regressor)')
    print('- F-value {}'.format(fvalues))
    print('- p-value {}'.format(f_pvalues))
    print('')

    print('R^2 = {}'.format(model.score(X, y)))
    print('')

    print('Coefficients')
    print('- beta_0 (Intercept) = {}'.format(model.intercept_))
    # Coefficients are numbered from 1; beta_0 is the intercept printed above.
    for i, (name, coef) in enumerate(zip(X.columns, model.coef_), 1):
        print('- beta_{} ({}) = {}'.format(i, name, coef))

> ### Remove samples with `NaN` in `IsAStudio`, `Size`, or `LotSize`

In [9]:
# Keep only the rows that have no NaN in IsAStudio, Size or LotSize.
required_columns = ['IsAStudio', 'Size', 'LotSize']
df = df.dropna(axis='index', subset=required_columns)

### SalePrice ~ IsAStudio with `statsmodels`

In [10]:
# Refit SalePrice ~ IsAStudio on the NaN-filtered frame; the sklearn cell
# below fits the same model.
smf.ols(formula='SalePrice ~ IsAStudio', data=df).fit().summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.0
Model:,OLS,Adj. R-squared:,-0.001
Method:,Least Squares,F-statistic:,0.2519
Date:,"Thu, 04 Aug 2016",Prob (F-statistic):,0.616
Time:,14:18:11,Log-Likelihood:,-1159.0
No. Observations:,545,AIC:,2322.0
Df Residuals:,543,BIC:,2331.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,1.5571,0.088,17.615,0.000,1.383 1.731
IsAStudio,0.2589,0.516,0.502,0.616,-0.755 1.272

0,1,2,3
Omnibus:,860.527,Durbin-Watson:,1.337
Prob(Omnibus):,0.0,Jarque-Bera (JB):,301122.117
Skew:,8.992,Prob(JB):,0.0
Kurtosis:,116.741,Cond. No.,5.93


> ### SalePrice ~ IsAStudio with `sklearn`

In [11]:
# Double brackets select a one-column DataFrame (not a Series) as features.
X = df[['IsAStudio']]
y = df['SalePrice']

model = linear_model.LinearRegression()
model.fit(X, y)

summary(X, y, model)

F-statistic (not join but instead done sequentially for each regressor)
- F-value [ 0.25187926]
- p-value [ 0.61595836]

R^2 = 0.000463650973037

Coefficients
- beta_0 (Intercept) = 1.55707559924
- beta_1 (IsAStudio) = 0.258924400756


### SalePrice ~ Size + LotSize with `statsmodels`

In [12]:
# Two-regressor model with statsmodels; the sklearn cell below fits the same
# model.
smf.ols(formula='SalePrice ~ Size + LotSize', data=df).fit().summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.224
Model:,OLS,Adj. R-squared:,0.221
Method:,Least Squares,F-statistic:,78.29
Date:,"Thu, 04 Aug 2016",Prob (F-statistic):,1.3599999999999999e-30
Time:,14:18:11,Log-Likelihood:,-1090.0
No. Observations:,545,AIC:,2186.0
Df Residuals:,542,BIC:,2199.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,-0.1902,0.173,-1.098,0.273,-0.530 0.150
Size,0.8171,0.069,11.907,0.000,0.682 0.952
LotSize,0.0500,0.037,1.362,0.174,-0.022 0.122

0,1,2,3
Omnibus:,974.589,Durbin-Watson:,1.648
Prob(Omnibus):,0.0,Jarque-Bera (JB):,812622.772
Skew:,11.225,Prob(JB):,0.0
Kurtosis:,190.833,Cond. No.,9.81


> ### SalePrice ~ Size + LotSize with `sklearn`

In [13]:
# Feature matrix with the two regressors, response as a Series.
X = df[['Size', 'LotSize']]
y = df['SalePrice']

model = linear_model.LinearRegression()
model.fit(X, y)

summary(X, y, model)

F-statistic (not join but instead done sequentially for each regressor)
- F-value [ 154.47734612   11.74608887]
- p-value [  2.18094764e-31   6.55921409e-04]

R^2 = 0.224134357118

Coefficients
- beta_0 (Intercept) = -0.190237755455
- beta_1 (Size) = 0.81709073459
- beta_2 (LotSize) = 0.0500489289305


## Part B2 - Linear Regression Modeling with `sklearn` (cont.)

In [14]:
# Switch datasets: `df` now holds the advertising data.
advertising_csv = os.path.join('..', 'datasets', 'advertising.csv')
df = pd.read_csv(advertising_csv)

In [15]:
# Inspect the advertising data (the view is capped by display.max_rows)
df

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9
5,8.7,48.9,75.0,7.2
6,57.5,32.8,23.5,11.8
7,120.2,19.6,11.6,13.2
8,8.6,2.1,1.0,4.8
9,199.8,2.6,21.2,10.6


## Plots

> ### Sales ~ TV

In [15]:
# Scatter plot of Sales against TV with a fitted regression line.
# x, y and data are passed by keyword: recent seaborn releases reject the
# positional form, while keywords work on old and new versions alike.
sns.lmplot(x='TV', y='Sales', data=df)

> ### Sales ~ Radio

In [16]:
# Scatter plot of Sales against Radio with a fitted regression line.
# x, y and data are passed by keyword: recent seaborn releases reject the
# positional form, while keywords work on old and new versions alike.
sns.lmplot(x='Radio', y='Sales', data=df)

> ### Sales ~ Newspaper

In [17]:
# Scatter plot of Sales against Newspaper with a fitted regression line.
# x, y and data are passed by keyword: recent seaborn releases reject the
# positional form, while keywords work on old and new versions alike.
sns.lmplot(x='Newspaper', y='Sales', data=df)

## Simple linear regressions

> ### Sales ~ TV

In [18]:
# Simple regression of Sales on TV; kept in `model_tv` for the residual
# plots further down.
model_tv = smf.ols(formula='Sales ~ TV', data=df).fit()
model_tv.summary()

> ### Sales ~ Radio

In [19]:
# Simple regression of Sales on Radio; kept in `model_radio` for the
# residual plots further down.
model_radio = smf.ols(formula='Sales ~ Radio', data=df).fit()
model_radio.summary()

> ### Sales ~ Newspaper

In [20]:
# Simple regression of Sales on Newspaper; kept in `model_newspaper` for
# the residual plots further down.
model_newspaper = smf.ols(formula='Sales ~ Newspaper', data=df).fit()
model_newspaper.summary()

## Residuals

> ### Sales ~ TV

In [21]:
# Q-Q plot of the Sales ~ TV residuals.
# The trailing semicolon suppresses the cell's output value.
sm.qqplot(model_tv.resid, line='s');

In [22]:
# Regression diagnostic plots for the TV regressor of the Sales ~ TV fit.
# The trailing semicolon suppresses the cell's output value.
sm.graphics.plot_regress_exog(model_tv, 'TV');

> ### Sales ~ Radio

In [23]:
# Q-Q plot of the Sales ~ Radio residuals.
# The trailing semicolon suppresses the cell's output value.
sm.qqplot(model_radio.resid, line='s');

In [24]:
# Regression diagnostic plots for the Radio regressor of the Sales ~ Radio fit.
# The trailing semicolon suppresses the cell's output value.
sm.graphics.plot_regress_exog(model_radio, 'Radio');

> ### Sales ~ Newspaper

In [25]:
# Q-Q plot of the Sales ~ Newspaper residuals.
# The trailing semicolon suppresses the cell's output value.
sm.qqplot(model_newspaper.resid, line='s');

In [26]:
# Regression diagnostic plots for the Newspaper regressor of the
# Sales ~ Newspaper fit.
# The trailing semicolon suppresses the cell's output value.
sm.graphics.plot_regress_exog(model_newspaper, 'Newspaper');

> ### Sales ~ TV + Radio + Newspaper

In [27]:
# Multiple regression of Sales on all three advertising channels.
model = smf.ols(formula='Sales ~ TV + Radio + Newspaper', data=df).fit()
model.summary()

> ### Sales ~ TV + Radio

In [28]:
# Same model without Newspaper; `model` feeds the three diagnostic cells
# that follow.
model = smf.ols(formula='Sales ~ TV + Radio', data=df).fit()
model.summary()

In [29]:
# Q-Q plot of the Sales ~ TV + Radio residuals.
# The trailing semicolon suppresses the cell's output value.
sm.qqplot(model.resid, line='s');

In [30]:
# Regression diagnostic plots for the TV regressor of the Sales ~ TV + Radio fit.
# The trailing semicolon suppresses the cell's output value.
sm.graphics.plot_regress_exog(model, 'TV');

In [31]:
# Regression diagnostic plots for the Radio regressor of the
# Sales ~ TV + Radio fit.
# The trailing semicolon suppresses the cell's output value.
sm.graphics.plot_regress_exog(model, 'Radio');

## Part C - Interaction Effects

### Sales ~ TV + Radio + TV * Radio

In [32]:
# Model with an interaction effect: the `TV * Radio` part of the formula
# contributes the `TV:Radio` term that is plotted a few cells below.
model = smf.ols(formula='Sales ~ TV + Radio + TV * Radio', data=df).fit()
model.summary()

In [33]:
# Q-Q plot of the interaction model's residuals.
# The trailing semicolon suppresses the cell's output value.
sm.qqplot(model.resid, line='s');

In [34]:
# Regression diagnostic plots for the TV regressor of the interaction model.
# The trailing semicolon suppresses the cell's output value.
sm.graphics.plot_regress_exog(model, 'TV');

In [35]:
# Regression diagnostic plots for the Radio regressor of the interaction model.
# The trailing semicolon suppresses the cell's output value.
sm.graphics.plot_regress_exog(model, 'Radio');

In [36]:
# Regression diagnostic plots for the TV:Radio interaction term itself.
# The trailing semicolon suppresses the cell's output value.
sm.graphics.plot_regress_exog(model, 'TV:Radio');

## Part D - One-hot encoding for categorical variables

In [37]:
# Reload the Zillow dataset from disk, discarding the changes made to `df`
# in the earlier parts.
zillow_csv = os.path.join('..', 'datasets', 'zillow-07.csv')
df = pd.read_csv(zillow_csv, index_col='ID')

In [38]:
# Discard the studios (rows flagged IsAStudio == 1).
studio_ids = df[df.IsAStudio == 1].index
df = df.drop(studio_ids)

In [39]:
# Starting point: BathCount used directly as a single regressor, before it
# is one-hot encoded below.
smf.ols(formula='SalePrice ~ BathCount', data=df).fit().summary()

> ### What's the bathrooms' distribution in the dataset?

In [40]:
# Number of properties per BathCount value (NaN included), ordered by value
df.BathCount.value_counts(dropna = False).sort_index()

> ### Let's keep properties with 1, 2, 3, or 4 bathrooms

In [41]:
# Restrict the data to properties with 1, 2, 3 or 4 bathrooms.
has_1_to_4_baths = df['BathCount'].isin([1, 2, 3, 4])
df = df[has_1_to_4_baths]

In [42]:
# Re-check the distribution after the filter: only 1-4 should remain
df.BathCount.value_counts(dropna = False).sort_index()

> ### Let's use `pandas`'s `get_dummies` to create our one-hot encoding

In [43]:
# One indicator (dummy) column per distinct BathCount value, each named
# with the 'Bath_' prefix.
baths_df = pd.get_dummies(df['BathCount'], prefix='Bath')

In [44]:
# Inspect the dummy columns as produced by get_dummies
baths_df

In [45]:
# The dummy columns are named 'Bath_1.0', ... (presumably because BathCount
# is stored as a float); drop the '.0' suffix so the names can be used as
# terms in a formula.
bath_labels = {
    'Bath_1.0': 'Bath_1',
    'Bath_2.0': 'Bath_2',
    'Bath_3.0': 'Bath_3',
    'Bath_4.0': 'Bath_4',
}
baths_df = baths_df.rename(columns=bath_labels)

In [46]:
# Inspect the dummy columns again, now with the renamed labels
baths_df

In [47]:
# Attach the dummy columns to the main frame (join aligns on the index)
df = df.join([baths_df])

In [48]:
# List the columns to confirm the Bath_* dummies are now part of `df`
df.columns

## Activity | One-hot encoding for categorical variables

> ### `SalePrice` as a function of `Bath_2`, `Bath_3`, and `Bath_4`

In [49]:
# Bath_1 is the omitted dummy, so 1-bathroom properties are the baseline.
# The formula contains only the bath dummies, as stated in the heading above
# and as in the three sibling models below; the stray `Size` term made this
# fit non-comparable with them.
smf.ols(formula = 'SalePrice ~ Bath_2 + Bath_3 + Bath_4', data = df).fit().summary()

> ### `SalePrice` as a function of `Bath_1`, `Bath_3`, and `Bath_4`

In [50]:
# Bath_2 is the omitted dummy, so 2-bathroom properties are the baseline.
smf.ols(formula='SalePrice ~ Bath_1 + Bath_3 + Bath_4', data=df).fit().summary()

> ### `SalePrice` as a function of `Bath_1`, `Bath_2`, and `Bath_4`

In [51]:
# Bath_3 is the omitted dummy, so 3-bathroom properties are the baseline.
smf.ols(formula='SalePrice ~ Bath_1 + Bath_2 + Bath_4', data=df).fit().summary()

> ### `SalePrice` as a function of `Bath_1`, `Bath_2`, and `Bath_3`

In [52]:
# Bath_4 is the omitted dummy, so 4-bathroom properties are the baseline.
smf.ols(formula='SalePrice ~ Bath_1 + Bath_2 + Bath_3', data=df).fit().summary()