In [1]:
import sklearn.metrics as metrics
import sklearn.linear_model as linear_model
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import numpy as np
import statsmodels.api as sm
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

  from pandas.core import datetools


## The purpose of this notebook is to recommend 3 counties

In [2]:
## Load the data into a DataFrame
stores=pd.read_csv('/Users/austinlasseter/DSI-EC-2/projects/datasets/stores_modeling.csv')
stores=stores.drop(['Unnamed: 0'], axis=1)

In [3]:
# The strategy: limit it to suburban counties, then rank on density, and regress on the top 10.
# Urban population of 20,000 or more, not adjacent to a metro area
# https://www.ers.usda.gov/data-products/rural-urban-continuum-codes/documentation/

In [4]:
print(stores.columns)
stores.shape

Index(['store_id', 'fips', 'metro', 'unemployment', 'income', 'population',
       'ave_bottle_price', 'ave_number_bottles', 'annual_profit_per_store',
       'number_of_stores', 'density', 'unemp_bins', 'income_bins', 'pop_bins',
       'nstores_bins', 'density_bins', 'suburbs', 'town', 'rural'],
      dtype='object')


(1291, 19)

## Explore the 5 suburban counties

In [5]:
# How many stores are there in the 5 suburban counties? Just 88.
stores.metro.value_counts()

2.0    453
3.0    246
6.0    192
7.0    170
5.0     88
8.0     52
4.0     49
9.0     41
Name: metro, dtype: int64

In [6]:
# Confirm that our new dataset only has 88 stores in it.
suburbs=stores[stores['suburbs']==1]
suburbs.shape

(88, 19)

In [7]:
# What are the fips codes of these 5 counties?
stores[stores.metro==5].fips.value_counts()

19057    20
19187    19
19033    19
19111    16
19179    14
Name: fips, dtype: int64

In [8]:
# What's their average annual profit, and how does it compare to the average of all stores in non-suburb counties?
print(stores[stores.suburbs!=5].annual_profit_per_store.mean())
for n in [19033, 19057, 19111, 19179, 19187]:
    print(n, suburbs[suburbs['fips']==n].annual_profit_per_store.mean())
    # 3 have a higher average profit, 2 are slightly lower.

48249.49529961269
19033 93602.73237894736
19057 47770.31228999996
19111 64065.429512500006
19179 77392.45350000013
19187 41929.80885263157


In [9]:
# Based just on this information alone, I'd rank the counties as follows:

# 19033 Cerro Gordo: $93,602
# 19179 Wapello: $77,392
# 19111 Lee: $64,065
# 19057 Des Moines: $47,770
# 19187 Webster: $41,929

In [10]:
# Rank the 5 counties based on density of people per store.
x=suburbs.groupby('fips').density.mean()
print(x)
sorted(x)
# Based on this information, I'd predict that 19179 (Wapello) is the best place to put a new store
# followed by 19033 (Cerro Gordo)
# and 19187 (Webster) is the worst. Let's test that.

fips
19033    2207.550000
19057    1920.238095
19111    2109.529412
19179    2544.642857
19187    1900.650000
Name: density, dtype: float64


[1900.6500000000008,
 1920.2380952380954,
 2109.5294117647063,
 2207.5500000000002,
 2544.642857142856]

## Regression Model on the 5 suburban counties

In [11]:
# Dummify the 5 fips codes
suburbs = pd.concat([suburbs, pd.get_dummies(suburbs['fips'])], axis = 1);

In [12]:
suburbs.columns

Index([               'store_id',                    'fips',
                         'metro',            'unemployment',
                        'income',              'population',
              'ave_bottle_price',      'ave_number_bottles',
       'annual_profit_per_store',        'number_of_stores',
                       'density',              'unemp_bins',
                   'income_bins',                'pop_bins',
                  'nstores_bins',            'density_bins',
                       'suburbs',                    'town',
                         'rural',                     19033,
                           19057,                     19111,
                           19179,                     19187],
      dtype='object')

In [13]:
# regression model # 7c
dep = suburbs['annual_profit_per_store'] # This is the outcome I want to predict
indep = suburbs.drop([               'store_id',                    'fips',
                         'metro',            'unemployment',
                        'income',              'population',
              
       'annual_profit_per_store',        'number_of_stores',
                                               'unemp_bins',
                   'income_bins',                'pop_bins',
                  'nstores_bins',            'density_bins',
                       'suburbs',                    'town',
                         'rural',                    19187],
                    axis = 'columns') # These are the features that predict it
indep = sm.add_constant(indep) # Add the intercept
model = sm.OLS(dep,indep) # Instantiate the model
results = model.fit() # Fit the model
results.summary() # Summarize the results

0,1,2,3
Dep. Variable:,annual_profit_per_store,R-squared:,0.223
Model:,OLS,Adj. R-squared:,0.165
Method:,Least Squares,F-statistic:,3.867
Date:,"Fri, 09 Feb 2018",Prob (F-statistic):,0.00192
Time:,06:47:52,Log-Likelihood:,-1102.5
No. Observations:,88,AIC:,2219.0
Df Residuals:,81,BIC:,2236.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-3.938e+04,1.43e+04,-2.754,0.007,-6.78e+04,-1.09e+04
ave_bottle_price,1.275e+04,3182.472,4.007,0.000,6418.710,1.91e+04
ave_number_bottles,912.2247,1882.386,0.485,0.629,-2833.133,4657.582
density,-53.2624,23.567,-2.260,0.027,-100.153,-6.372
19033,6.582e+04,2.4e+04,2.743,0.007,1.81e+04,1.14e+05
19057,1.695e+04,2.25e+04,0.752,0.454,-2.79e+04,6.18e+04
19111,3.912e+04,2.46e+04,1.592,0.115,-9769.273,8.8e+04
19179,7.165e+04,2.94e+04,2.441,0.017,1.33e+04,1.3e+05

0,1,2,3
Omnibus:,14.535,Durbin-Watson:,1.746
Prob(Omnibus):,0.001,Jarque-Bera (JB):,16.039
Skew:,0.995,Prob(JB):,0.000329
Kurtosis:,3.645,Cond. No.,8.35e+18


In [14]:
# Based on these results, and in comparison to Webster county, 
# we conclude that annual per-store profit will increase in each county by the following amounts:
# 19179 Wapello: $71,650
# 19033 Cerro Gordo: $65,820
# 19057 Des Moines: No sig difference
# 19111 Lee: No sig difference

In [15]:
# Also, the R-squared is actually not so bad: 22%

## What about in comparison to the state as a whole?

In [16]:
# Let's convert each of the 5 counties to dummy variables
stores = pd.concat([stores, pd.get_dummies(stores['fips']==19179)], axis = 1);
stores.rename(columns = {True: 'Wapello'}, inplace=True)
stores = pd.concat([stores, pd.get_dummies(stores['fips']==19033)], axis = 1);
stores.rename(columns = {True: 'Cerro Gordo'}, inplace=True)
stores = pd.concat([stores, pd.get_dummies(stores['fips']==19057)], axis = 1);
stores.rename(columns = {True: 'Des Moines'}, inplace=True)
stores = pd.concat([stores, pd.get_dummies(stores['fips']==19111 )], axis = 1);
stores.rename(columns = {True: 'Lee'}, inplace=True)
stores = pd.concat([stores, pd.get_dummies(stores['fips']==19187 )], axis = 1);
stores.rename(columns = {True: 'Webster'}, inplace=True)
stores.drop([False], axis=1, inplace=True)
stores.columns

Index(['store_id', 'fips', 'metro', 'unemployment', 'income', 'population',
       'ave_bottle_price', 'ave_number_bottles', 'annual_profit_per_store',
       'number_of_stores', 'density', 'unemp_bins', 'income_bins', 'pop_bins',
       'nstores_bins', 'density_bins', 'suburbs', 'town', 'rural', 'Wapello',
       'Cerro Gordo', 'Des Moines', 'Lee', 'Webster'],
      dtype='object')

In [17]:
stores.columns

Index(['store_id', 'fips', 'metro', 'unemployment', 'income', 'population',
       'ave_bottle_price', 'ave_number_bottles', 'annual_profit_per_store',
       'number_of_stores', 'density', 'unemp_bins', 'income_bins', 'pop_bins',
       'nstores_bins', 'density_bins', 'suburbs', 'town', 'rural', 'Wapello',
       'Cerro Gordo', 'Des Moines', 'Lee', 'Webster'],
      dtype='object')

In [31]:
# regression model # 7c
dep = stores['annual_profit_per_store'] # This is the outcome I want to predict
indep = stores.drop(['store_id', 'fips', 'metro', 'unemployment', 'income', 'population',
      'annual_profit_per_store',
       'number_of_stores', 'density', 'unemp_bins', 'income_bins', 'pop_bins',
       'nstores_bins', 'density_bins', 'suburbs', 'town', 'rural'],
                    axis = 'columns') # These are the features that predict it
indep = sm.add_constant(indep) # Add the intercept
model = sm.OLS(dep,indep) # Instantiate the model
results = model.fit() # Fit the model
results.summary() # Summarize the results

0,1,2,3
Dep. Variable:,annual_profit_per_store,R-squared:,0.134
Model:,OLS,Adj. R-squared:,0.129
Method:,Least Squares,F-statistic:,28.37
Date:,"Fri, 09 Feb 2018",Prob (F-statistic):,1.73e-36
Time:,06:53:30,Log-Likelihood:,-15863.0
No. Observations:,1291,AIC:,31740.0
Df Residuals:,1283,BIC:,31780.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-6.559e+04,8949.819,-7.329,0.000,-8.31e+04,-4.8e+04
ave_bottle_price,7681.5179,577.174,13.309,0.000,6549.210,8813.826
ave_number_bottles,949.0051,315.785,3.005,0.003,329.494,1568.516
Wapello,3.043e+04,1.42e+04,2.150,0.032,2660.774,5.82e+04
Cerro Gordo,4.394e+04,1.22e+04,3.608,0.000,2.01e+04,6.78e+04
Des Moines,5149.7776,1.19e+04,0.433,0.665,-1.82e+04,2.85e+04
Lee,1.863e+04,1.33e+04,1.405,0.160,-7388.381,4.46e+04
Webster,-6562.5395,1.22e+04,-0.539,0.590,-3.05e+04,1.73e+04

0,1,2,3
Omnibus:,448.922,Durbin-Watson:,1.58
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1460.666
Skew:,1.732,Prob(JB):,0.0
Kurtosis:,6.893,Cond. No.,165.0


## Conclusion: Build your store in Wapello or Cerro Gordo county.