# Predicting Sales

In [1]:
# Restaurant Site Selection
# prepare for Python version 3x features and functions
from __future__ import division, print_function

# import packages for analysis and modeling
import pandas as pd  # data frame operations
import numpy as np  # arrays and math functions
import statsmodels.api as sm  # statistical models (including regression)
import statsmodels.formula.api as smf  # statistical models (including regression)

# read data for Studenmund's Restaurants
# creating data frame restdata
restdata = pd.read_csv('studenmunds_restaurants.csv')

# print the first five rows of the data frame
print(pd.DataFrame.head(restdata)) 

    sales  competition  population  income
0  107919            3       65044   13240
1  118866            5      101376   22554
2   98579            7      124989   16916
3  122015            2       55249   20967
4  152827            3       73775   19576


In [2]:
# specify regression model
my_model = str('sales ~ competition + population + income')

# fit the model to the data
my_model_fit = smf.ols(my_model, data = restdata).fit()
# summary of model fit to the training set
print(my_model_fit.summary())

                            OLS Regression Results                            
Dep. Variable:                  sales   R-squared:                       0.618
Model:                            OLS   Adj. R-squared:                  0.579
Method:                 Least Squares   F-statistic:                     15.65
Date:                Sun, 17 Feb 2019   Prob (F-statistic):           3.06e-06
Time:                        14:55:47   Log-Likelihood:                -360.99
No. Observations:                  33   AIC:                             730.0
Df Residuals:                      29   BIC:                             736.0
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
Intercept    1.022e+05   1.28e+04      7.984      

In [3]:
# predictions from the model fit to the data for current stores
restdata['predict_sales'] = my_model_fit.fittedvalues

# compute the proportion of response variance accounted for
print('\nProportion of Test Set Variance Accounted for: ',\
    round(np.power(restdata['sales'].corr(restdata['predict_sales']),2),3))


Proportion of Test Set Variance Accounted for:  0.618


In [4]:
# define DataFrame of sites for new restaurants
sites_data = {'sales': [0,0,0],
             'competition': [2, 3, 5],
             'population': [50000, 200000, 220000],
             'income': [25000, 22000, 19000]}

sites = pd.DataFrame(sites_data)
# obtain predicted sales for the new restaurants
# rounding to the nearest dollar
sites['sales_pred'] = my_model_fit.predict(sites)
print('\nNew sites with predicted sales', sites, '\n')


New sites with predicted sales    sales  competition  population  income     sales_pred
0      0            2       50000   25000  133974.582027
1      0            3      200000   22000  174236.392570
2      0            5      220000   19000  159316.640948 

