# DS-SF-30 | Codealong 09: Linear Regression, Part 3

In [2]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 20)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import linear_model

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

import seaborn as sns

## Part A - Model Fit and Customer Retention

In [4]:
train_df = pd.DataFrame({'t': range(8),
                        'survival_rate': [1., .869, .743, .653, .593, .551, .517, .491]},
                       columns = ['t', 'survival_rate'])

In [7]:
train_df

Unnamed: 0,t,survival_rate
0,0,1.0
1,1,0.869
2,2,0.743
3,3,0.653
4,4,0.593
5,5,0.551
6,6,0.517
7,7,0.491


In [None]:
Next step is to do feature engineering. You have year so then do that

In [10]:
train_df_log = train_df.survival_rate.apply(np.log10)
train_df_sqrt = train_df.apply(np.sqrt)
train_df_cbrt = train_df.apply(np.cbrt)




print "The square root is {1}".format(train_df_log, train_df_sqrt)
print "The log is {0}".format(train_df_log, train_df_sqrt)

The square root is           t  survival_rate
0  0.000000       1.000000
1  1.000000       0.932202
2  1.414214       0.861974
3  1.732051       0.808084
4  2.000000       0.770065
5  2.236068       0.742294
6  2.449490       0.719027
7  2.645751       0.700714
The log is 0    0.000000
1   -0.060980
2   -0.129011
3   -0.185087
4   -0.226945
5   -0.258848
6   -0.286509
7   -0.308919
Name: survival_rate, dtype: float64


In [12]:
train_df.plot(kind = 1, y = "survival_rate", x = 't' )

train_df_sqrt[t] = train_df.apply(np.sqrt)


AttributeError: 'int' object has no attribute 'lower'

## Part B - One-Hot Encoding for Categorical Variables and SF Housing

In [15]:
df = pd.read_csv(os.path.join('..', 'datasets', 'dataset-09-zillow.csv'), index_col = 'ID')

In [16]:
df.drop(df[df.IsAStudio == 1].index, inplace = True)

In [17]:
smf.ols(formula = 'SalePrice ~ Baths', data = df).fit().summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.137
Model:,OLS,Adj. R-squared:,0.136
Method:,Least Squares,F-statistic:,146.6
Date:,"Thu, 12 Jan 2017",Prob (F-statistic):,1.94e-31
Time:,20:39:21,Log-Likelihood:,-1690.7
No. Observations:,929,AIC:,3385.0
Df Residuals:,927,BIC:,3395.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,0.3401,0.099,3.434,0.001,0.146 0.535
Baths,0.5242,0.043,12.109,0.000,0.439 0.609

0,1,2,3
Omnibus:,1692.623,Durbin-Watson:,1.582
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2167434.305
Skew:,12.317,Prob(JB):,0.0
Kurtosis:,238.345,Cond. No.,5.32


> ### What's the bathrooms' distribution in the dataset?

In [None]:
# TODO

> ### Let's keep properties with 1, 2, 3, or 4 bathrooms

In [18]:
df = df[df.Baths.isin([1,2,3,4])]

In [19]:
df

Unnamed: 0_level_0,Address,DateOfSale,SalePrice,IsAStudio,Beds,Baths,Size,LotSize,BuiltInYear
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
15063505,"740 Francisco St, San Francisco, CA",11/30/15,2.1500,0.0,,2.0,1.430,2.435,1948.0
15064044,"199 Chestnut St APT 5, San Francisco, CA",12/11/15,1.5000,0.0,1.0,1.0,1.060,,1930.0
15064257,"111 Chestnut St APT 403, San Francisco, CA",1/15/16,0.9700,0.0,2.0,2.0,1.299,,1993.0
15064295,"111 Chestnut St APT 702, San Francisco, CA",12/17/15,0.9400,0.0,2.0,2.0,1.033,,1993.0
15064391,"1821 Grant Ave APT 101, San Francisco, CA",12/15/15,0.8350,0.0,1.0,1.0,1.048,,1975.0
15064536,"2300 Leavenworth St, San Francisco, CA",12/4/15,2.8300,0.0,3.0,2.0,2.115,1.271,1913.0
15064669,"1055 Lombard St # C, San Francisco, CA",12/31/15,2.1900,0.0,2.0,3.0,1.182,,1986.0
15065032,"101 Lombard St APT 116E, San Francisco, CA",12/7/15,0.8000,0.0,1.0,1.0,1.000,,1983.0
15065140,"1407 Montgomery St APT 2, San Francisco, CA",1/28/16,0.9760,0.0,1.0,1.0,1.000,,1870.0
15065727,"1360 Montgomery St APT 1, San Francisco, CA",11/16/15,0.7200,0.0,1.0,1.0,0.552,,1937.0


> ### Let's use `pandas`'s `get_dummies` to create a one-hot encoding of the `Bath` categorical feature

In [20]:
baths_df = pd.get_dummies(df.Baths, prefix = 'Bath')

In [21]:
baths_df

Unnamed: 0_level_0,Bath_1.0,Bath_2.0,Bath_3.0,Bath_4.0
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
15063505,0.0,1.0,0.0,0.0
15064044,1.0,0.0,0.0,0.0
15064257,0.0,1.0,0.0,0.0
15064295,0.0,1.0,0.0,0.0
15064391,1.0,0.0,0.0,0.0
15064536,0.0,1.0,0.0,0.0
15064669,0.0,0.0,1.0,0.0
15065032,1.0,0.0,0.0,0.0
15065140,1.0,0.0,0.0,0.0
15065727,1.0,0.0,0.0,0.0


In [23]:
baths_df.rename(columns = {'Bath_1.0': 'Bath_1',
                           'Bath_2.0': 'Bath_2',
                           'Bath_3.0': 'Bath_3',
                           'Bath_4.0': 'Bath_4'}, inplace = True)

In [22]:
baths_df

Unnamed: 0_level_0,Bath_1.0,Bath_2.0,Bath_3.0,Bath_4.0
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
15063505,0.0,1.0,0.0,0.0
15064044,1.0,0.0,0.0,0.0
15064257,0.0,1.0,0.0,0.0
15064295,0.0,1.0,0.0,0.0
15064391,1.0,0.0,0.0,0.0
15064536,0.0,1.0,0.0,0.0
15064669,0.0,0.0,1.0,0.0
15065032,1.0,0.0,0.0,0.0
15065140,1.0,0.0,0.0,0.0
15065727,1.0,0.0,0.0,0.0


In [25]:
df = df.join([baths_df])

In [26]:
df.columns

Index([u'Address', u'DateOfSale', u'SalePrice', u'IsAStudio', u'Beds',
       u'Baths', u'Size', u'LotSize', u'BuiltInYear', u'Bath_1', u'Bath_2',
       u'Bath_3', u'Bath_4'],
      dtype='object')

### Activity | Linear modeling using `Bath`'s one-hot encoding

> ### `SalesPrice` as a function of `Bath_2`, `Bath_3`, and `Bath_4`

In [27]:
smf.ols(formula = 'SalePrice ~ Bath_2 + Bath_3 + Bath_4', data = df).fit().summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.043
Model:,OLS,Adj. R-squared:,0.039
Method:,Least Squares,F-statistic:,11.78
Date:,"Thu, 12 Jan 2017",Prob (F-statistic):,1.49e-07
Time:,20:46:36,Log-Likelihood:,-1314.2
No. Observations:,794,AIC:,2636.0
Df Residuals:,790,BIC:,2655.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,0.9914,0.070,14.249,0.000,0.855 1.128
Bath_2,0.2831,0.099,2.855,0.004,0.088 0.478
Bath_3,0.4808,0.142,3.383,0.001,0.202 0.760
Bath_4,1.2120,0.232,5.231,0.000,0.757 1.667

0,1,2,3
Omnibus:,1817.972,Durbin-Watson:,1.867
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8069883.811
Skew:,19.917,Prob(JB):,0.0
Kurtosis:,495.28,Cond. No.,5.79


> ### `SalesPrice` as a function of `Bath_1`, `Bath_3`, and `Bath_4`

In [28]:
smf.ols(formula = 'SalePrice ~ Bath_1 + Bath_3 + Bath_4', data = df).fit().summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.043
Model:,OLS,Adj. R-squared:,0.039
Method:,Least Squares,F-statistic:,11.78
Date:,"Thu, 12 Jan 2017",Prob (F-statistic):,1.49e-07
Time:,20:47:34,Log-Likelihood:,-1314.2
No. Observations:,794,AIC:,2636.0
Df Residuals:,790,BIC:,2655.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,1.2745,0.071,18.040,0.000,1.136 1.413
Bath_1,-0.2831,0.099,-2.855,0.004,-0.478 -0.088
Bath_3,0.1977,0.143,1.386,0.166,-0.082 0.478
Bath_4,0.9290,0.232,4.003,0.000,0.473 1.384

0,1,2,3
Omnibus:,1817.972,Durbin-Watson:,1.867
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8069883.811
Skew:,19.917,Prob(JB):,0.0
Kurtosis:,495.28,Cond. No.,5.84


> ### `SalesPrice` as a function of `Bath_1`, `Bath_2`, and `Bath_4`

In [29]:
smf.ols(formula = 'SalePrice ~ Bath_1 + Bath_2 + Bath_4', data = df).fit().summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.043
Model:,OLS,Adj. R-squared:,0.039
Method:,Least Squares,F-statistic:,11.78
Date:,"Thu, 12 Jan 2017",Prob (F-statistic):,1.49e-07
Time:,20:48:10,Log-Likelihood:,-1314.2
No. Observations:,794,AIC:,2636.0
Df Residuals:,790,BIC:,2655.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,1.4722,0.124,11.881,0.000,1.229 1.715
Bath_1,-0.4808,0.142,-3.383,0.001,-0.760 -0.202
Bath_2,-0.1977,0.143,-1.386,0.166,-0.478 0.082
Bath_4,0.7313,0.253,2.886,0.004,0.234 1.229

0,1,2,3
Omnibus:,1817.972,Durbin-Watson:,1.867
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8069883.811
Skew:,19.917,Prob(JB):,0.0
Kurtosis:,495.28,Cond. No.,7.52


> ### `SalesPrice` as a function of `Bath_1`, `Bath_2`, and `Bath_3`

In [30]:
smf.ols(formula = 'SalePrice ~ Bath_1 + Bath_2 + Bath_3', data = df).fit().summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.043
Model:,OLS,Adj. R-squared:,0.039
Method:,Least Squares,F-statistic:,11.78
Date:,"Thu, 12 Jan 2017",Prob (F-statistic):,1.49e-07
Time:,20:48:47,Log-Likelihood:,-1314.2
No. Observations:,794,AIC:,2636.0
Df Residuals:,790,BIC:,2655.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5
,coef,std err,t,P>|t|,[95.0% Conf. Int.]
Intercept,2.2035,0.221,9.969,0.000,1.770 2.637
Bath_1,-1.2120,0.232,-5.231,0.000,-1.667 -0.757
Bath_2,-0.9290,0.232,-4.003,0.000,-1.384 -0.473
Bath_3,-0.7313,0.253,-2.886,0.004,-1.229 -0.234

0,1,2,3
Omnibus:,1817.972,Durbin-Watson:,1.867
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8069883.811
Skew:,19.917,Prob(JB):,0.0
Kurtosis:,495.28,Cond. No.,11.7


## Part C - The Adverstising Dataset

In [None]:
df = pd.read_csv(os.path.join('..', 'datasets', 'dataset-09-advertising.csv'))

In [None]:
df

### Plots

> ### Sales ~ TV

In [None]:
sns.lmplot(x = 'TV', y = 'Sales', data = df)

> ### Sales ~ Radio

In [None]:
sns.lmplot(x = 'Radio', y = 'Sales', data = df)

> ### Sales ~ Newspaper

In [None]:
sns.lmplot(x = 'Newspaper', y = 'Sales', data = df)

### Simple linear regressions

> ### Sales ~ TV

In [None]:
model_tv = smf.ols(formula = 'Sales ~ TV', data = df).fit()

model_tv.summary()

> ### Sales ~ Radio

In [None]:
model_radio = smf.ols(formula = 'Sales ~ Radio', data = df).fit()

model_radio.summary()

> ### Sales ~ Newspaper

In [None]:
model_newspaper = smf.ols(formula = 'Sales ~ Newspaper', data = df).fit()

model_newspaper.summary()

### Residuals

> ### Sales ~ TV

In [None]:
sm.qqplot(model_tv.resid, line = 's')

pass

In [None]:
sm.graphics.plot_regress_exog(model_tv, 'TV')

pass

> ### Sales ~ Radio

In [None]:
sm.qqplot(model_radio.resid, line = 's')

pass

In [None]:
sm.graphics.plot_regress_exog(model_radio, 'Radio')

pass

> ### Sales ~ Newspaper

In [None]:
sm.qqplot(model_newspaper.resid, line = 's')

pass

In [None]:
sm.graphics.plot_regress_exog(model_newspaper, 'Newspaper')

pass

> ### Sales ~ TV + Radio + Newspaper

In [None]:
# TODO

model.summary()

> ### Sales ~ TV + Radio

In [None]:
# TODO

model.summary()

In [None]:
sm.qqplot(model.resid, line = 's')

pass

In [None]:
sm.graphics.plot_regress_exog(model, 'TV')

pass

In [None]:
sm.graphics.plot_regress_exog(model, 'Radio')

pass

## Part D - Interaction Effects and Advertising

### Sales ~ TV + Radio + TV * Radio

In [None]:
model = smf.ols(formula = 'Sales ~ TV + Radio + TV * Radio', data = df).fit()

model.summary()

In [None]:
sm.qqplot(model.resid, line = 's')

pass

In [None]:
sm.graphics.plot_regress_exog(model, 'TV')

pass

In [None]:
sm.graphics.plot_regress_exog(model, 'Radio')

pass

In [None]:
sm.graphics.plot_regress_exog(model, 'TV:Radio')

pass