In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [2]:
# Import, convert 'Date' col to datetime
liquor=pd.read_csv('../Assets/Iowa_Liquor_sample.csv',parse_dates=['Date'],infer_datetime_format=True)

# format column names
import re

liquor.columns = [re.sub("[^a-zA-Z]+", "", x) for x in liquor.columns]

# remove '$' in values and convert to numeric
adjust_cols = ['StateBottleCost','StateBottleRetail','SaleDollars']

for col in adjust_cols:
    liquor[col] = pd.to_numeric(liquor[col].str.replace('$',''),errors='coerce')

liquor.dropna(inplace=True)

In [3]:
# calculate profit per sales transaction
liquor['Profit'] = liquor.SaleDollars - (liquor.BottlesSold * liquor.StateBottleCost)

In [4]:
# Filter to only 2015 
liquor2015 = liquor[liquor.Date.dt.year == 2015]

In [5]:
liquor2015.head()

Unnamed: 0,Date,StoreNumber,City,ZipCode,CountyNumber,County,Category,CategoryName,VendorNumber,ItemNumber,ItemDescription,BottleVolumeml,StateBottleCost,StateBottleRetail,BottlesSold,SaleDollars,VolumeSoldLiters,VolumeSoldGallons,Profit
0,2015-11-04,3717,SUMNER,50674,9.0,Bremer,1051100.0,APRICOT BRANDIES,55,54436,Mr. Boston Apricot Brandy,750,4.5,6.75,12,81.0,9.0,2.38,27.0
4,2015-08-18,3654,BELMOND,50421,99.0,Wright,1031080.0,VODKA 80 PROOF,297,35918,Five O'clock Vodka,1750,7.2,10.8,12,129.6,21.0,5.55,43.2
5,2015-04-20,2569,CEDAR RAPIDS,52402,57.0,Linn,1041100.0,AMERICAN DRY GINS,205,31473,New Amsterdam Gin,1750,13.32,19.98,6,119.88,10.5,2.77,39.96
6,2015-08-05,2596,OTTUMWA,52501,90.0,Wapello,1051010.0,AMERICAN GRAPE BRANDIES,85,52806,Korbel Brandy,750,6.66,9.99,3,29.97,2.25,0.59,9.99
7,2015-06-25,3456,CLEAR LAKE,50428,17.0,Cerro Gordo,1012100.0,CANADIAN WHISKIES,65,10628,Canadian Club Whisky,1750,15.75,23.63,2,47.26,3.5,0.92,15.76


In [6]:
# Create a dictionary called agg (what is this for?)
agg_dict = {'StateBottleCost':[np.mean,np.sum],
'SaleDollars':[np.sum,np.mean],
'Profit':[np.median,np.mean,np.sum]}

In [7]:
# Pass this dictionary to the groupby statement
liquor2015_stores = liquor2015.groupby('StoreNumber').agg(agg_dict)
liquor2015_stores.head()

Unnamed: 0_level_0,Profit,Profit,Profit,StateBottleCost,StateBottleCost,SaleDollars,SaleDollars
Unnamed: 0_level_1,median,mean,sum,mean,sum,sum,mean
StoreNumber,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
2106,57.0,92.665779,48742.2,10.298783,5417.16,146038.7,277.640114
2113,12.39,21.149932,3109.04,10.870544,1597.97,9310.22,63.33483
2130,51.0,95.215652,37229.32,9.822762,3840.7,111583.91,285.380844
2152,7.62,18.351277,2587.53,8.565035,1207.67,7721.08,54.759433
2178,27.0,34.45443,8165.7,9.689831,2296.49,24324.18,102.633671


In [8]:
# this comes out with StoreNumber as my index, I'll reset it
liquor2015_stores = liquor2015_stores.reset_index()

In [10]:
# My column names are split. I'll process through them to get single feature names
liquor2015_stores.columns = [['_'.join(col).rstrip('_') for col in liquor2015_stores.columns.values]]
liquor2015_stores.head()

Unnamed: 0,StoreNumber,Profit_median,Profit_mean,Profit_sum,StateBottleCost_mean,StateBottleCost_sum,SaleDollars_sum,SaleDollars_mean
0,2106,57.0,92.665779,48742.2,10.298783,5417.16,146038.7,277.640114
1,2113,12.39,21.149932,3109.04,10.870544,1597.97,9310.22,63.33483
2,2130,51.0,95.215652,37229.32,9.822762,3840.7,111583.91,285.380844
3,2152,7.62,18.351277,2587.53,8.565035,1207.67,7721.08,54.759433
4,2178,27.0,34.45443,8165.7,9.689831,2296.49,24324.18,102.633671


In [13]:
# Merge my features and my target on the 'StoreNumber' column
liquor2015_combined = pd.merge(liquor2015_stores,liquor2015,on=['StoreNumber'])

In [19]:
# define my features
feature_cols = ['StoreNumber', 'Profit_median', 'StateBottleCost_mean', 'SaleDollars_sum']
feature_cols

['StoreNumber', 'Profit_median', 'StateBottleCost_mean', 'SaleDollars_sum']

In [22]:
liquor2015_combined.head()

Unnamed: 0,StoreNumber,Profit_median,Profit_mean,Profit_sum,StateBottleCost_mean,StateBottleCost_sum,SaleDollars_sum,SaleDollars_mean,Date,City,...,ItemNumber,ItemDescription,BottleVolumeml,StateBottleCost,StateBottleRetail,BottlesSold,SaleDollars,VolumeSoldLiters,VolumeSoldGallons,Profit
0,2106,57.0,92.665779,48742.2,10.298783,5417.16,146038.7,277.640114,2015-08-20,CEDAR FALLS,...,34007,Absolut Swedish Vodka 80 Prf,1000,14.99,22.49,36,809.64,36.0,9.51,270.0
1,2106,57.0,92.665779,48742.2,10.298783,5417.16,146038.7,277.640114,2015-03-12,CEDAR FALLS,...,22216,Woodford Reserve Bourbon,750,21.02,31.53,6,189.18,4.5,1.19,63.06
2,2106,57.0,92.665779,48742.2,10.298783,5417.16,146038.7,277.640114,2015-07-30,CEDAR FALLS,...,69947,Rumple Minze Peppermint Schnapps Liqueur,1000,15.75,23.63,12,283.56,12.0,3.17,94.56
3,2106,57.0,92.665779,48742.2,10.298783,5417.16,146038.7,277.640114,2015-01-22,CEDAR FALLS,...,30527,Hawkeye Gin,1000,4.23,6.35,12,76.2,12.0,3.17,25.44
4,2106,57.0,92.665779,48742.2,10.298783,5417.16,146038.7,277.640114,2015-08-13,CEDAR FALLS,...,82637,Dekuyper Grape Pucker,1000,7.62,11.43,12,137.16,12.0,3.17,45.72


In [24]:
# Define x and y
X = liquor2015_combined[feature_cols]
y = liquor2015_combined.Profit

In [25]:
# instantiate & fit my model
linreg = LinearRegression()
linreg.fit(X, y)
print linreg.coef_

[ -8.16820365e-04   1.71400640e+00   6.16474388e-01   7.43003359e-05]


In [26]:
# pair the feature names with the coefficients using a command called 'zip'
print(pd.Series(list(zip(feature_cols, linreg.coef_))))

0         (StoreNumber, -0.000816820365469)
1            (Profit_median, 1.71400640228)
2    (StateBottleCost_mean, 0.616474387925)
3       (SaleDollars_sum, 7.4300335858e-05)
dtype: object


In [28]:
# perform the sale aggregation steps on the Q1 2016 data
lq_2016 = liquor[liquor.Date.dt.year == 2016]
lq_2016['Profit'] = lq_2016.SaleDollars - (lq_2016.BottlesSold * lq_2016.StateBottleCost)
lq_2016 = lq_2016.groupby('StoreNumber').agg(agg_dict)
lq_2016.columns = ['_'.join(tup).rstrip('_') for tup in lq_2016.columns.values]
lq_2016.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0_level_0,Profit_median,Profit_mean,Profit_sum,StateBottleCost_mean,StateBottleCost_sum,SaleDollars_sum,SaleDollars_mean
StoreNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2106,51.96,80.751429,10174.68,10.363413,1305.79,30478.75,241.894841
2113,12.6,18.742973,693.49,10.709459,396.25,2065.9,55.835135
2130,51.0,79.387094,9288.29,10.620085,1242.55,27856.11,238.08641
2152,8.38,15.204,304.08,8.2565,165.13,904.66,45.233
2178,24.96,32.22,1868.76,9.266379,537.45,5588.5,96.353448


In [38]:
# this comes out with StoreNumber as my index, I'll reset it
lq_2016 = lq_2016.reset_index()

In [41]:
# 2016 predictions
X_all = liquor2015_combined[feature_cols]
y_all = liquor2015_combined.Profit
X_2016 = lq_2016[feature_cols]

In [44]:
# train on all method
linreg.fit(X_all,y_all)
lq_2016['prediction'] = linreg.predict(X_2016)

In [45]:
# Show your 2016 year-end prediction for each store
lq_2016[['prediction']].reset_index()

Unnamed: 0,index,prediction
0,0,86.138190
1,1,16.771424
2,2,84.436509
3,3,7.907994
4,4,37.275559
5,5,22.233786
6,6,60.362284
7,7,17.085271
8,8,7.994119
9,9,54.553510
