# Classification Model for Utah

In [15]:
# packages used
import pandas as pd
import numpy as np
from sklearn import tree
#from sklearn.ensemble import BaggingClassifier
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


### Data Loading

In [2]:
# Read the two CSV extracts from the data folder; dtype=str keeps every column as text
data_dir = 'Data/'
Stores, StoreType = (
    pd.read_csv(f"{data_dir}{csv_name}", dtype=str)
    for csv_name in ('AllStoresUtah.csv', 'NonLowPointStoresUtah.csv')
)

In [3]:
# Preview the first five rows of the store table
Stores.head(n=5)

Unnamed: 0,RTL_STORE_CD,RTL_FIPS_COUNTY_DSC,RTL_PREMISE_TYPE_CD,RTL_CHANNEL_DSC,RTL_SUBCHANNEL_DSC,RTL_BEER_FLAG,RTL_LIQUOR_FLG
0,101856127,GRAND,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N
1,101948415,GARFIELD,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N
2,105604679,CACHE,ON,DINING,CASUAL DINING,Y,Y
3,100437548,SALT LAKE,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N
4,101943195,SALT LAKE,OFF,EXTENDED MASTER OFF-PREMISE,OTHER OFF-PREMISE,N,N


In [4]:
# Preview the first five rows of the store-type table
StoreType.head(n=5)

Unnamed: 0,RTL_STORE_CD,BEERTYPE
0,101415037,NonLowPoint
1,201880474,NonLowPoint
2,101415040,NonLowPoint
3,101415045,NonLowPoint
4,103755213,NonLowPoint


In [5]:
# Left-join the store-type table onto the stores by store code; every row of Stores
# is kept, and stores with no match in StoreType get a missing BEERTYPE
FullData = Stores.merge(StoreType, how="left", on="RTL_STORE_CD")

In [6]:
# Rows whose BEERTYPE is missing are labelled "LowPoint"
FullData['BEERTYPE'] = FullData['BEERTYPE'].fillna("LowPoint")

In [7]:
# One-hot encode each categorical store attribute.
# Maps source column -> prefix used for the generated indicator columns.
DUMMY_PREFIXES = {
    'RTL_FIPS_COUNTY_DSC': 'COUNTY',
    'RTL_PREMISE_TYPE_CD': 'PREMISE',
    'RTL_CHANNEL_DSC': 'CHANNEL',
    'RTL_SUBCHANNEL_DSC': 'SUBCHANNEL',
    'RTL_BEER_FLAG': 'BEER_LICENSE',
    'RTL_LIQUOR_FLG': 'LIQUOR_LICENSE',
}

# Build every indicator frame first and concatenate once, instead of re-copying the
# growing frame after each attribute. Indicator columns are appended after the
# existing columns, in the order of the mapping above.
# NOTE: FullData is rebuilt from itself, so running this cell twice appends a second
# copy of the indicator columns.
FullData = pd.concat(
    [FullData]
    + [
        pd.get_dummies(FullData[column], prefix=prefix)
        for column, prefix in DUMMY_PREFIXES.items()
    ],
    axis=1,
)

In [8]:
# Preview the first five rows of the merged, one-hot encoded table
FullData.head(n=5)

Unnamed: 0,RTL_STORE_CD,RTL_FIPS_COUNTY_DSC,RTL_PREMISE_TYPE_CD,RTL_CHANNEL_DSC,RTL_SUBCHANNEL_DSC,RTL_BEER_FLAG,RTL_LIQUOR_FLG,BEERTYPE,COUNTY_BEAVER,COUNTY_BOX ELDER,...,SUBCHANNEL_THEATER,SUBCHANNEL_TRADING POST,SUBCHANNEL_UNKNOWN,SUBCHANNEL_WINE SPECIALTY STORE,BEER_LICENSE_N,BEER_LICENSE_U,BEER_LICENSE_Y,LIQUOR_LICENSE_N,LIQUOR_LICENSE_U,LIQUOR_LICENSE_Y
0,101856127,GRAND,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N,LowPoint,0,0,...,0,0,0,0,0,0,1,1,0,0
1,101948415,GARFIELD,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N,LowPoint,0,0,...,0,0,0,0,0,0,1,1,0,0
2,105604679,CACHE,ON,DINING,CASUAL DINING,Y,Y,LowPoint,0,0,...,0,0,0,0,0,0,1,0,0,1
3,100437548,SALT LAKE,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N,LowPoint,0,0,...,0,0,0,0,0,0,1,1,0,0
4,101943195,SALT LAKE,OFF,EXTENDED MASTER OFF-PREMISE,OTHER OFF-PREMISE,N,N,LowPoint,0,0,...,0,0,0,0,1,0,0,1,0,0


### Prepare Data

In [9]:
# Assemble the model inputs.
# The store identifier, the target and the raw categorical columns are kept out of
# the feature list; every other column of FullData is used as a feature.
non_feature_columns = {
    'RTL_STORE_CD',
    'BEERTYPE',
    'RTL_FIPS_COUNTY_DSC',
    'RTL_PREMISE_TYPE_CD',
    'RTL_CHANNEL_DSC',
    'RTL_SUBCHANNEL_DSC',
    'RTL_BEER_FLAG',
    "RTL_LIQUOR_FLG",
}
X_labels = [name for name in FullData.columns if name not in non_feature_columns]
X = FullData[X_labels]
Y = FullData.loc[:, 'BEERTYPE']

# Hold out 40% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, random_state=0, test_size=0.4
)

### Decision Tree Classifier

In [10]:
# Fit a decision tree with default hyperparameters on the training split.
# random_state is pinned because scikit-learn randomly permutes the candidate features
# at each split; without a seed, ties between equally good splits can yield a
# different tree (and a different score) on every run.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(X_train, Y_train)

In [11]:
# Mean accuracy of the fitted tree on the held-out test split
clf.score(X=X_test, y=Y_test)

0.99895941727367321

In [12]:
# Sweep max_depth from 1 to 7 and record the 5-fold cross-validated accuracy
# (mean and standard deviation across folds) for each depth.
results = []
for d in range(1, 8):
    # Pin the seed: scikit-learn permutes features at each split, so equally good
    # splits would otherwise be tie-broken differently from run to run.
    clf = tree.DecisionTreeClassifier(max_depth=d, random_state=0)
    # cross_val_score fits its own clones of the estimator on each fold, so this fit
    # does not influence the scores. It is kept so that clf leaves the loop fitted
    # on all of X, Y, as in the original cell.
    # NOTE(review): confirm whether later cells rely on this fitted clf.
    clf = clf.fit(X, Y)
    scores = cross_val_score(clf, X, Y, cv=5)
    results.append((d, scores.mean(), scores.std()))

df_tree_accuracy = pd.DataFrame(data=results, columns=['depth', 'mean', 'std'])
df_tree_accuracy

Unnamed: 0,depth,mean,std
0,1,0.997501,0.001559
1,2,0.999168,0.00102
2,3,0.999168,0.00102
3,4,0.999168,0.00102
4,5,0.999168,0.00102
5,6,0.999168,0.00102
6,7,0.999168,0.00102


### Rules