In [1]:
import sklearn.metrics as metrics
import sklearn.linear_model as linear_model
import statsmodels.api as sm
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

  from pandas.core import datetools


In [2]:
## Load the data into a DataFrame
stores=pd.read_csv('/Users/austinlasseter/DSI-EC-2/projects/datasets/stores_postEDA.csv')
stores=stores.drop(['Unnamed: 0'], axis=1)
stores.head()

Unnamed: 0,store_id,fips,metro,unemployment,income,population,ave_bottle_price,ave_number_bottles,annual_profit_per_store,unemp_bins,income_bins,pop_bins
0,2113,19187,5.0,4.0,48013.0,38013.0,15.963071,4.763409,33857.9524,"(3, 4]","(47000, 50000]","(30000, 50000]"
1,2152,19033,5.0,3.7,53109.0,44151.0,12.897199,4.13101,28489.271,"(3, 4]","(50000, 70000]","(30000, 50000]"
2,2178,19005,6.0,5.0,49439.0,14330.0,15.046966,7.699587,110017.9018,"(4, 5]","(47000, 50000]","(0, 30000]"
3,2200,19161,9.0,3.1,54660.0,10350.0,17.455233,3.970626,88540.297,"(3, 4]","(50000, 70000]","(0, 30000]"
4,2205,19145,6.0,5.1,49507.0,15932.0,15.036204,6.048682,91329.2742,"(5, 6]","(47000, 50000]","(0, 30000]"


In [3]:
# List out all my variables
stores.columns

Index(['store_id', 'fips', 'metro', 'unemployment', 'income', 'population',
       'ave_bottle_price', 'ave_number_bottles', 'annual_profit_per_store',
       'unemp_bins', 'income_bins', 'pop_bins'],
      dtype='object')

## Model 1

In [5]:
# regression model #1
dep = stores['annual_profit_per_store'] # This is the outcome I want to predict
indep = stores.drop(['store_id', 'fips','unemp_bins', 'income_bins', 'pop_bins', 'annual_profit_per_store'], axis = 'columns') # These are the features that predict it
indep = sm.add_constant(indep) # Add the intercept
model = sm.OLS(dep,indep) # Instantiate the model
results = model.fit() # Fit the model
results.summary() # Summarize the results

0,1,2,3
Dep. Variable:,annual_profit_per_store,R-squared:,0.131
Model:,OLS,Adj. R-squared:,0.127
Method:,Least Squares,F-statistic:,32.2
Date:,"Wed, 07 Feb 2018",Prob (F-statistic):,2.96e-36
Time:,16:56:01,Log-Likelihood:,-15865.0
No. Observations:,1291,AIC:,31740.0
Df Residuals:,1284,BIC:,31780.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.624e+04,2.67e+04,-1.359,0.174,-8.85e+04,1.61e+04
metro,-2868.8224,991.464,-2.894,0.004,-4813.889,-923.755
unemployment,2330.8888,2287.192,1.019,0.308,-2156.155,6817.933
income,-0.4603,0.303,-1.520,0.129,-1.055,0.134
population,-0.0036,0.014,-0.256,0.798,-0.032,0.024
ave_bottle_price,7941.7891,583.090,13.620,0.000,6797.876,9085.702
ave_number_bottles,780.4217,319.458,2.443,0.015,153.705,1407.139

0,1,2,3
Omnibus:,462.171,Durbin-Watson:,1.099
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1561.691
Skew:,1.77,Prob(JB):,0.0
Kurtosis:,7.061,Cond. No.,3460000.0


In [None]:
# Hmm. That model only has an r2 of 13%, and only 3 significant predictors. Let's try it differently

## Model 2 - Population

In [None]:
# Let's use some dummies for our binned variables

In [8]:
# Distribution of pop:
stores.pop_bins.value_counts()

(0, 30000]          487
(100000, 200000]    204
(300000, 500000]    189
(30000, 50000]      188
(70000, 100000]     117
(200000, 300000]     91
(50000, 70000]       15
Name: pop_bins, dtype: int64

In [25]:
# Let's convert population to dummy variables
model2 = pd.concat([stores, pd.get_dummies(stores['pop_bins'])], axis = 1);

In [17]:
model2.columns

Index(['store_id', 'fips', 'metro', 'unemployment', 'income', 'population',
       'ave_bottle_price', 'ave_number_bottles', 'annual_profit_per_store',
       'unemp_bins', 'income_bins', 'pop_bins', '(0, 30000]',
       '(100000, 200000]', '(200000, 300000]', '(30000, 50000]',
       '(300000, 500000]', '(50000, 70000]', '(70000, 100000]'],
      dtype='object')

In [None]:
# Let's see if population alone can predict store profits:

In [20]:
# regression model # 2
dep = model2['annual_profit_per_store'] # This is the outcome I want to predict
indep = model2.drop(['store_id', 'fips', 'metro', 'unemployment', 'income', 'population',
       'ave_bottle_price', 'ave_number_bottles', 'annual_profit_per_store',
       'unemp_bins', 'income_bins', 'pop_bins'], axis = 'columns') # These are the features that predict it
indep = sm.add_constant(indep) # Add the intercept
model = sm.OLS(dep,indep) # Instantiate the model
results = model.fit() # Fit the model
results.summary() # Summarize the results

0,1,2,3
Dep. Variable:,annual_profit_per_store,R-squared:,0.01
Model:,OLS,Adj. R-squared:,0.005
Method:,Least Squares,F-statistic:,2.155
Date:,"Wed, 07 Feb 2018",Prob (F-statistic):,0.0449
Time:,17:08:25,Log-Likelihood:,-15949.0
No. Observations:,1291,AIC:,31910.0
Df Residuals:,1284,BIC:,31950.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.036e+04,2266.177,17.812,0.000,3.59e+04,4.48e+04
"(0, 30000]",4561.2748,3164.410,1.441,0.150,-1646.708,1.08e+04
"(100000, 200000]",1.686e+04,4096.392,4.116,0.000,8824.452,2.49e+04
"(200000, 300000]",1623.2266,5589.324,0.290,0.772,-9341.984,1.26e+04
"(30000, 50000]",1.307e+04,4215.624,3.100,0.002,4799.434,2.13e+04
"(300000, 500000]",1.03e+04,4207.687,2.447,0.015,2040.845,1.86e+04
"(50000, 70000]",-6615.5716,1.28e+04,-0.517,0.605,-3.17e+04,1.85e+04
"(70000, 100000]",569.8808,5043.752,0.113,0.910,-9325.019,1.05e+04

0,1,2,3
Omnibus:,524.357,Durbin-Watson:,1.031
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1803.743
Skew:,2.043,Prob(JB):,0.0
Kurtosis:,7.103,Cond. No.,4270000000000000.0


In [None]:
# Certain population categories ((100000, 200000],(30000, 50000] and (300000, 500000]) have a significant effect
# But the R-squared is only 1 percent

## Model 3 - Unemployment

In [21]:
# Distribution of unemployment:
stores.unemp_bins.value_counts()

(3, 4]    748
(4, 5]    286
(2, 3]    214
(5, 6]     27
(6, 7]     16
Name: unemp_bins, dtype: int64

In [23]:
# Let's convert population to dummy variables
model3 = pd.concat([stores, pd.get_dummies(stores['unemp_bins'])], axis = 1);

In [26]:
# regression model # 3
dep = model3['annual_profit_per_store'] # This is the outcome I want to predict
indep = model3.drop(['store_id', 'fips', 'metro', 'unemployment', 'income', 'population',
       'ave_bottle_price', 'ave_number_bottles', 'annual_profit_per_store',
       'unemp_bins', 'income_bins', 'pop_bins'], axis = 'columns') # These are the features that predict it
indep = sm.add_constant(indep) # Add the intercept
model = sm.OLS(dep,indep) # Instantiate the model
results = model.fit() # Fit the model
results.summary() # Summarize the results

0,1,2,3
Dep. Variable:,annual_profit_per_store,R-squared:,0.005
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,1.479
Date:,"Wed, 07 Feb 2018",Prob (F-statistic):,0.206
Time:,17:13:39,Log-Likelihood:,-15953.0
No. Observations:,1291,AIC:,31920.0
Df Residuals:,1286,BIC:,31940.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4.589e+04,3103.422,14.785,0.000,3.98e+04,5.2e+04
"(2, 3]",5978.7313,4420.047,1.353,0.176,-2692.563,1.47e+04
"(3, 4]",-156.6971,3530.606,-0.044,0.965,-7083.077,6769.683
"(4, 5]",3891.6105,4128.329,0.943,0.346,-4207.389,1.2e+04
"(5, 6]",1.799e+04,9388.406,1.916,0.056,-426.320,3.64e+04
"(6, 7]",1.818e+04,1.19e+04,1.525,0.128,-5207.499,4.16e+04

0,1,2,3
Omnibus:,526.328,Durbin-Watson:,1.057
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1818.064
Skew:,2.05,Prob(JB):,0.0
Kurtosis:,7.122,Cond. No.,5470000000000000.0


In [None]:
# None of the unemployment brackets have a sig effect on annual profits
# The model has an R2 of only 5%
# Let's drop unemployment from our final model

## Model 3 - Average Household Income

In [29]:
# Distribution of income:
stores.income_bins.value_counts()

(50000, 70000]     1049
(47000, 50000]      170
(40000, 47000]       24
(70000, 80000]       22
(80000, 100000]      15
(0, 40000]           11
Name: income_bins, dtype: int64

In [30]:
# Let's convert to dummy variables
model4 = pd.concat([stores, pd.get_dummies(stores['income_bins'])], axis = 1);

In [31]:
# regression model # 3
dep = model4['annual_profit_per_store'] # This is the outcome I want to predict
indep = model4.drop(['store_id', 'fips', 'metro', 'unemployment', 'income', 'population',
       'ave_bottle_price', 'ave_number_bottles', 'annual_profit_per_store',
       'unemp_bins', 'income_bins', 'pop_bins'], axis = 'columns') # These are the features that predict it
indep = sm.add_constant(indep) # Add the intercept
model = sm.OLS(dep,indep) # Instantiate the model
results = model.fit() # Fit the model
results.summary() # Summarize the results

0,1,2,3
Dep. Variable:,annual_profit_per_store,R-squared:,0.004
Model:,OLS,Adj. R-squared:,-0.0
Method:,Least Squares,F-statistic:,0.9657
Date:,"Wed, 07 Feb 2018",Prob (F-statistic):,0.438
Time:,17:15:54,Log-Likelihood:,-15953.0
No. Observations:,1291,AIC:,31920.0
Df Residuals:,1285,BIC:,31950.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.836e+04,4043.365,9.488,0.000,3.04e+04,4.63e+04
"(0, 40000]",-2019.5699,1.49e+04,-0.135,0.892,-3.13e+04,2.73e+04
"(40000, 47000]",2.722e+04,1.05e+04,2.582,0.010,6538.864,4.79e+04
"(47000, 50000]",1.032e+04,5452.566,1.892,0.059,-381.249,2.1e+04
"(50000, 70000]",1.001e+04,4303.187,2.327,0.020,1572.511,1.85e+04
"(70000, 80000]",-2556.1223,1.09e+04,-0.234,0.815,-2.4e+04,1.89e+04
"(80000, 100000]",-4612.8484,1.3e+04,-0.356,0.722,-3e+04,2.08e+04

0,1,2,3
Omnibus:,523.603,Durbin-Watson:,1.058
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1794.498
Skew:,2.042,Prob(JB):,0.0
Kurtosis:,7.085,Cond. No.,3.16e+16


In [33]:
# Two employment bracket (40000, 47000] and (50000, 70000] a sig effect on annual profits
# The model has an R2 of only 4%

## Model 5 - Drop unemployment and population

In [36]:
# regression model # 5
dep = stores['annual_profit_per_store'] # This is the outcome I want to predict
indep = stores.drop(['store_id', 'fips','unemp_bins', 'income_bins', 
                     'pop_bins', 'annual_profit_per_store', 'unemployment', 'population'], 
                    axis = 'columns') # These are the features that predict it
indep = sm.add_constant(indep) # Add the intercept
model = sm.OLS(dep,indep) # Instantiate the model
results = model.fit() # Fit the model
results.summary() # Summarize the results

0,1,2,3
Dep. Variable:,annual_profit_per_store,R-squared:,0.13
Model:,OLS,Adj. R-squared:,0.127
Method:,Least Squares,F-statistic:,48.07
Date:,"Wed, 07 Feb 2018",Prob (F-statistic):,1.04e-37
Time:,17:24:10,Log-Likelihood:,-15866.0
No. Observations:,1291,AIC:,31740.0
Df Residuals:,1286,BIC:,31770.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.693e+04,1.87e+04,-0.904,0.366,-5.36e+04,1.98e+04
metro,-3010.5093,818.032,-3.680,0.000,-4615.334,-1405.685
income,-0.6353,0.251,-2.530,0.012,-1.128,-0.143
ave_bottle_price,7886.4864,580.175,13.593,0.000,6748.293,9024.680
ave_number_bottles,801.4734,318.702,2.515,0.012,176.240,1426.707

0,1,2,3
Omnibus:,460.873,Durbin-Watson:,1.098
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1548.136
Skew:,1.768,Prob(JB):,0.0
Kurtosis:,7.035,Cond. No.,732000.0


In [None]:
# All 4 of our predictors have a significant effect
# The R-squared is only 14%

In [40]:
# Let's go with that as our model.