In [15]:
import pandas as pd
import numpy as np

In [2]:
# Import, convert 'Date' col to datetime
liquor=pd.read_csv('../Assets/Iowa_Liquor_sample.csv',parse_dates=['Date'],infer_datetime_format=True)

# format column names
import re

liquor.columns = [re.sub("[^a-zA-Z]+", "", x) for x in liquor.columns]

# remove '$' in values and convert to numeric
adjust_cols = ['StateBottleCost','StateBottleRetail','SaleDollars']

for col in adjust_cols:
    liquor[col] = pd.to_numeric(liquor[col].str.replace('$',''),errors='coerce')

liquor.dropna(inplace=True)

In [3]:
liquor.head()

Unnamed: 0,Date,StoreNumber,City,ZipCode,CountyNumber,County,Category,CategoryName,VendorNumber,ItemNumber,ItemDescription,BottleVolumeml,StateBottleCost,StateBottleRetail,BottlesSold,SaleDollars,VolumeSoldLiters,VolumeSoldGallons
0,2015-11-04,3717,SUMNER,50674,9.0,Bremer,1051100.0,APRICOT BRANDIES,55,54436,Mr. Boston Apricot Brandy,750,4.5,6.75,12,81.0,9.0,2.38
1,2016-03-02,2614,DAVENPORT,52807,82.0,Scott,1011100.0,BLENDED WHISKIES,395,27605,Tin Cup,750,13.75,20.63,2,41.26,1.5,0.4
2,2016-02-11,2106,CEDAR FALLS,50613,7.0,Black Hawk,1011200.0,STRAIGHT BOURBON WHISKIES,65,19067,Jim Beam,1000,12.59,18.89,24,453.36,24.0,6.34
3,2016-02-03,2501,AMES,50010,85.0,Story,1071100.0,AMERICAN COCKTAILS,395,59154,1800 Ultimate Margarita,1750,9.5,14.25,6,85.5,10.5,2.77
4,2015-08-18,3654,BELMOND,50421,99.0,Wright,1031080.0,VODKA 80 PROOF,297,35918,Five O'clock Vodka,1750,7.2,10.8,12,129.6,21.0,5.55


In [6]:
# Filter to only 2015:
liquor2015 = liquor[liquor.Date.dt.year == 2015]

In [38]:
# Calculate the sum of sales for each store in 2015 by grouping the full year data
liquor2015_fy = liquor2015.groupby('StoreNumber').sum().reset_index()[['StoreNumber','SaleDollars']]
liquor2015_fy.head()

Unnamed: 0,StoreNumber,SaleDollars
0,2106,146038.7
1,2113,9310.22
2,2130,111583.91
3,2152,7721.08
4,2178,24324.18


In [7]:
# Filter all my columns to Q1
liquor2015_q1 = liquor2015[liquor2015.Date.dt.month <= 3]

In [8]:
# calculate profit per sales transaction
liquor2015_q1['Profit'] = liquor2015_q1.SaleDollars - (liquor2015_q1.BottlesSold * liquor2015_q1.StateBottleCost)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [22]:
liquor2015_q1.StoreNumber.head()

12    3858
19    2614
20    2590
21    3942
22    3981
Name: StoreNumber, dtype: int64

In [21]:
# group the q1 data by store
liquor2015_q1_gp = liquor2015.groupby('StoreNumber').sum().reset_index()[['StoreNumber','SaleDollars']]
liquor2015_q1_gp.head()

Unnamed: 0,StoreNumber,SaleDollars
0,2106,146038.7
1,2113,9310.22
2,2130,111583.91
3,2152,7721.08
4,2178,24324.18


In [28]:
# Create a dictionary called agg (what is this for?)
agg_dict = {'StateBottleCost':[np.mean,np.sum],
'SaleDollars':[np.sum,np.mean],
'Profit':[np.median,np.mean,np.sum]}

In [30]:
# Pass this dictionary to the groupby statement
lq_2015_q1_gp = liquor2015_q1.groupby('StoreNumber').agg(agg_dict)
lq_2015_q1_gp.SaleDollars.head()

Unnamed: 0_level_0,sum,mean
StoreNumber,Unnamed: 1_level_1,Unnamed: 2_level_1
2106,39287.29,304.552636
2113,2833.25,67.458333
2130,24272.57,278.995057
2152,2003.46,62.608125
2178,5856.41,122.008542


In [32]:
# this comes out with StoreNumber as my index, I'll reset it
features_q1 = lq_2015_q1_gp.reset_index()
features_q1.head()

Unnamed: 0_level_0,StoreNumber,Profit,Profit,Profit,StateBottleCost,StateBottleCost,SaleDollars,SaleDollars
Unnamed: 0_level_1,Unnamed: 1_level_1,median,mean,sum,mean,sum,sum,mean
0,2106,47.16,101.615271,13108.37,10.04155,1295.36,39287.29,304.552636
1,2113,13.495,22.493333,944.72,10.54619,442.94,2833.25,67.458333
2,2130,46.8,93.203218,8108.68,10.26046,892.66,24272.57,278.995057
3,2152,11.01,20.89375,668.6,9.624062,307.97,2003.46,62.608125
4,2178,38.34,40.86,1961.28,9.816458,471.19,5856.41,122.008542


In [33]:
# My column names are split. I'll process through them to get single feature names
features_q1.columns = [['_'.join(col).rstrip('_') for col in features_q1.columns.values]]
features_q1.head()

Unnamed: 0,StoreNumber,Profit_median,Profit_mean,Profit_sum,StateBottleCost_mean,StateBottleCost_sum,SaleDollars_sum,SaleDollars_mean
0,2106,47.16,101.615271,13108.37,10.04155,1295.36,39287.29,304.552636
1,2113,13.495,22.493333,944.72,10.54619,442.94,2833.25,67.458333
2,2130,46.8,93.203218,8108.68,10.26046,892.66,24272.57,278.995057
3,2152,11.01,20.89375,668.6,9.624062,307.97,2003.46,62.608125
4,2178,38.34,40.86,1961.28,9.816458,471.19,5856.41,122.008542


In [41]:
# Merge my features and my target on the 'StoreNumber' column
liquor2015_combined = pd.merge(features_q1,liquor2015_fy,on=['StoreNumber'])
liquor2015_combined.head()

Unnamed: 0,StoreNumber,Profit_median,Profit_mean,Profit_sum,StateBottleCost_mean,StateBottleCost_sum,SaleDollars_sum,SaleDollars_mean,SaleDollars
0,2106,47.16,101.615271,13108.37,10.04155,1295.36,39287.29,304.552636,146038.7
1,2113,13.495,22.493333,944.72,10.54619,442.94,2833.25,67.458333,9310.22
2,2130,46.8,93.203218,8108.68,10.26046,892.66,24272.57,278.995057,111583.91
3,2152,11.01,20.89375,668.6,9.624062,307.97,2003.46,62.608125,7721.08
4,2178,38.34,40.86,1961.28,9.816458,471.19,5856.41,122.008542,24324.18
