In [37]:
import os
import pickle

import cx_Oracle
from IPython.display import IFrame    #to display pdf file
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
from py2casefold import casefold
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# successful code:

In [38]:
#define a function to get the subset of dataframe
def classification_subset_32(dataframe, state):
    """Return the classification columns of ``dataframe`` for a single state.

    The column names are read from ``classification_columns.p`` in the
    current working directory.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain every pickled column name, including ``RTL_STATE_DSC``.
    state : object
        State name; it is converted with ``str(state).upper()`` before being
        compared with ``RTL_STATE_DSC``.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only the pickled columns and only the
        rows of the requested state (original index labels are kept).
    """
    #load the columns for subset from pickle; the with-block closes the file
    #NOTE(review): pickle.load can execute arbitrary code - only load trusted files
    with open('classification_columns.p', 'rb') as columns_file:
        subset_columns = pickle.load(columns_file)

    #subset the dataframe:
    classification_df = pd.DataFrame(dataframe[subset_columns])

    #keep the requested state; copy so later column assignments on the
    #result do not raise SettingWithCopyWarning
    state_mask = classification_df['RTL_STATE_DSC'] == str(state).upper()
    return classification_df[state_mask].copy()

In [39]:
#define a function for business_rule_pickle
def business_rule_pickle_32(classification_df, state):
    """Apply the pickled business rule of ``state`` to ``classification_df``.

    The rule is loaded from ``<State>_business_rule.p`` (state name
    title-cased) in the current working directory and called with the frame.

    Parameters
    ----------
    classification_df : pandas.DataFrame
        Stores to classify; it is not modified.
    state : object
        State name used to build the pickle file name.

    Returns
    -------
    pandas.DataFrame
        A copy of ``classification_df`` whose ``LowPoint_Y`` column holds
        whatever the rule function returned.
    """
    #load the business rule from pickle for that State (file closed on exit)
    #NOTE(review): pickle.load can execute arbitrary code - only load trusted files
    filename = str(state).title()+'_business_rule.p'
    with open(filename, 'rb') as rule_file:
        rule_func = pickle.load(rule_file)

    #make the prediction on a copy so the caller's dataframe is left untouched
    result_df = classification_df.copy()
    result_df['LowPoint_Y'] = rule_func(result_df)

    return result_df

In [40]:
#business rule pickle Utah
def Rules_Utah(dataframe):
    """Return the Utah ``LowPoint_Y`` flag for every row of ``dataframe``.

    A row is ``False`` when ``RTL_CHANNEL_DSC`` equals ``'LIQUOR'`` or
    contains ``'MILITARY'``; every other row, including a missing channel,
    is ``True``.

    Rows are evaluated by position, so the result is correct for any index
    (for example a frame that was filtered by state and kept its labels).

    Returns
    -------
    list of bool
        One entry per row, in row order.
    """
    channel = dataframe['RTL_CHANNEL_DSC']
    #regex=False: plain substring test; na=False: a missing channel is not excluded
    excluded = (channel == 'LIQUOR') | channel.str.contains('MILITARY', regex=False, na=False)
    return (~excluded).tolist()

#save the Utah business rule into pickle; the with-block flushes and closes the file
#NOTE: pickle stores a function by reference (module and name), so whoever
#loads this file must have Rules_Utah defined under the same name
with open('Utah_business_rule.p', 'wb') as rule_file:
    pickle.dump(Rules_Utah, rule_file)

In [41]:
#business rule pickle Oklahoma
def Rules_Oklahoma(dataframe):
    """Return the Oklahoma ``LowPoint_Y`` flag for every row of ``dataframe``.

    A row is ``False`` when ``RTL_CHANNEL_DSC`` is
    ``'DISTRIBUTOR/SUB-DISTRIBUTOR'`` or ``'EXTENDED MASTER OFF-PREMISE'``
    and ``True`` otherwise.

    Rows are evaluated by position, so the result is correct for any index.

    Returns
    -------
    list of bool
        One entry per row, in row order.
    """
    excluded_channels = ['DISTRIBUTOR/SUB-DISTRIBUTOR', 'EXTENDED MASTER OFF-PREMISE']
    excluded = dataframe['RTL_CHANNEL_DSC'].isin(excluded_channels)
    return (~excluded).tolist()

#save the Oklahoma business rule into pickle; the with-block flushes and closes the file
#NOTE: pickle stores a function by reference (module and name), so whoever
#loads this file must have Rules_Oklahoma defined under the same name
with open('Oklahoma_business_rule.p', 'wb') as rule_file:
    pickle.dump(Rules_Oklahoma, rule_file)

In [42]:
#business rule pickle Colorado
def Rules_Colorado(dataframe):
    """Return the Colorado ``LowPoint_Y`` flag for every row of ``dataframe``.

    A row is ``True`` only when all of the following hold:

    * ``RTL_PREMISE_TYPE_CD`` is ``'OFF'``;
    * ``RTL_CHANNEL_DSC`` is not ``'LIQUOR'``, ``'DRUG'`` or
      ``'MILITARY OFF-PREMISE'``;
    * it is not a ``'GROCERY'`` row whose ``RTL_LIQUOR_FLG`` is ``'Y'``.

    Rows are evaluated by position, so the result is correct for any index.
    The three columns named above must all be present.

    Returns
    -------
    list of bool
        One entry per row, in row order.
    """
    channel = dataframe['RTL_CHANNEL_DSC']
    is_off_premise = dataframe['RTL_PREMISE_TYPE_CD'] == 'OFF'
    excluded_channel = channel.isin(['LIQUOR', 'DRUG', 'MILITARY OFF-PREMISE'])
    grocery_with_liquor = (channel == 'GROCERY') & (dataframe['RTL_LIQUOR_FLG'] == 'Y')
    low_point = is_off_premise & ~excluded_channel & ~grocery_with_liquor
    return low_point.tolist()

#save the Colorado business rule into pickle; the with-block flushes and closes the file
#NOTE: pickle stores a function by reference (module and name), so whoever
#loads this file must have Rules_Colorado defined under the same name
with open('Colorado_business_rule.p', 'wb') as rule_file:
    pickle.dump(Rules_Colorado, rule_file)

In [43]:
def decision_tree_pickle_32(classification_df, state):
    """Predict ``LowPoint_Y`` with the pickled decision tree of ``state``.

    Every column except ``RTL_STORE_CD``, ``LowPoint_Y`` and
    ``RTL_STATE_DSC`` is one-hot encoded with a ``B_<column>`` prefix, the
    model is loaded from ``<State>_decision_tree_model.p`` (state name
    title-cased) and its predictions are compared with ``'LowPoint'``.

    Parameters
    ----------
    classification_df : pandas.DataFrame
        Stores to classify.
    state : object
        State name used to build the pickle file name.

    Returns
    -------
    pandas.DataFrame
        ``classification_df`` plus the dummy columns, with ``LowPoint_Y``
        set to ``True`` where the model predicts ``'LowPoint'``.
    """
    #binarize: turn the categorical columns into 0/1 dummy columns
    columns = [c for c in classification_df.columns if c not in ['RTL_STORE_CD','LowPoint_Y', 'RTL_STATE_DSC']]
    result_df = pd.concat([classification_df, pd.get_dummies(classification_df[columns] , prefix = ['B_'+c for c in columns])] , axis = 1)

    #load the model from pickle for that state (file closed on exit)
    #NOTE(review): pickle.load can execute arbitrary code - only load trusted files
    filename = str(state).title()+'_decision_tree_model.p'
    with open(filename, 'rb') as model_file:
        decision_tree_model = pickle.load(model_file)

    #prepare independent variables: everything not listed in the
    #notebook-level no_train_list
    #NOTE(review): the features built here must line up with the columns the
    #pickled model was fitted on; the training cells in this notebook build
    #their dummies with other prefixes (COUNTY, PREMISE, ...) - confirm
    X_labels = [c for c in result_df.columns if c not in no_train_list]
    X = result_df.loc[:,X_labels]

    #dependent variable:
    result_df['LowPoint_Y'] = decision_tree_model.predict(X) == 'LowPoint'

    return result_df

In [8]:
#Kansas Decision tree Pickle
# Load the store list and the NonLowPointStores file
data_dir = 'Data/'
Stores = pd.read_csv(data_dir + 'AllStoresKansas.csv', dtype = str)
StoreType = pd.read_csv(data_dir + 'NonLowPointStoresKansas.csv', dtype = str)
FullData = pd.merge(Stores, StoreType, on="RTL_STORE_CD", how="left")
# stores without a match in the NonLowPointStores file get the LowPoint label
FullData.loc[FullData['BEERTYPE'].isnull(),"BEERTYPE"] = "LowPoint"

# one-hot encode the categorical columns: (source column, dummy prefix) pairs,
# appended to FullData in this order
dummy_specs = [
    ('RTL_FIPS_COUNTY_DSC', 'COUNTY'),
    ('RTL_PREMISE_TYPE_CD', 'PREMISE'),
    ('RTL_CHANNEL_DSC', 'CHANNEL'),
    ('RTL_SUBCHANNEL_DSC', 'SUBCHANNEL'),
    ('RTL_BEER_FLAG', 'BEER_LICENSE'),
    ('RTL_LIQUOR_FLG', 'LIQUOR_LICENSE'),
]
dummy_frames = [pd.get_dummies(FullData[source], prefix=prefix) for source, prefix in dummy_specs]
FullData = pd.concat([FullData] + dummy_frames, axis=1)

# prepare data to fit model: the features are the dummy columns only
raw_columns = ['RTL_STORE_CD', 'BEERTYPE'] + [source for source, prefix in dummy_specs]
X_labels = [c for c in FullData.columns if c not in raw_columns]
X = FullData.loc[:,X_labels]
Y = FullData['BEERTYPE']

# grid-search the tree size; random_state is pinned so runs are repeatable
parameters = {'max_depth':range(1,21), 'min_samples_leaf':range(3,21,3), 'min_samples_split':range(3,21,3), 'random_state': [0]}
clf = tree.DecisionTreeClassifier()
clf = GridSearchCV(clf, parameters, n_jobs = -2)
clf.fit(X, Y)
accuracy = clf.best_score_ 
best_params = clf.best_params_
#the result: with the best tree depth and accuracy
best_depth = best_params['max_depth']
best_leaf = best_params['min_samples_leaf']
best_split = best_params['min_samples_split']

# refit a plain classifier with the winning parameters on all the data
clf = tree.DecisionTreeClassifier(max_depth=best_depth, min_samples_leaf= best_leaf, min_samples_split = best_split, random_state=0)
clf = clf.fit(X,Y)
# save the model to disk; the with-block flushes and closes the file
filename = 'Kansas_decision_tree_model.p'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file, protocol=2)


In [9]:
# Minnesota Decision tree Pickle
# Load the store list and the NonLowPointStores file
data_dir = 'Data/'
Stores = pd.read_csv(data_dir + 'AllStoresMinnesota.csv', dtype = str)
StoreType = pd.read_csv(data_dir + 'NonLowPointStoresMinnesota.csv', dtype = str)

FullData = pd.merge(Stores, StoreType, on="RTL_STORE_CD", how="left")
# stores without a match in the NonLowPointStores file get the LowPoint label
FullData.loc[FullData['BEERTYPE'].isnull(),"BEERTYPE"] = "LowPoint"

# one-hot encode the categorical columns: (source column, dummy prefix) pairs,
# appended to FullData in this order
dummy_specs = [
    ('RTL_FIPS_COUNTY_DSC', 'COUNTY'),
    ('RTL_PREMISE_TYPE_CD', 'PREMISE'),
    ('RTL_CHANNEL_DSC', 'CHANNEL'),
    ('RTL_SUBCHANNEL_DSC', 'SUBCHANNEL'),
    ('RTL_BEER_FLAG', 'BEER_LICENSE'),
    ('RTL_LIQUOR_FLG', 'LIQUOR_LICENSE'),
]
dummy_frames = [pd.get_dummies(FullData[source], prefix=prefix) for source, prefix in dummy_specs]
FullData = pd.concat([FullData] + dummy_frames, axis=1)

# prepare data to fit model: the features are the dummy columns only
raw_columns = ['RTL_STORE_CD', 'BEERTYPE'] + [source for source, prefix in dummy_specs]
X_labels = [c for c in FullData.columns if c not in raw_columns]
X = FullData.loc[:,X_labels]
Y = FullData['BEERTYPE']

# grid-search the tree size; random_state is pinned so runs are repeatable
parameters = {'max_depth':range(1,21), 'min_samples_leaf':range(3,21,3), 'min_samples_split':range(3,21,3), 'random_state': [0]}
clf = tree.DecisionTreeClassifier()
clf = GridSearchCV(clf, parameters, n_jobs = -2)
clf.fit(X, Y)
accuracy = clf.best_score_ 
best_params = clf.best_params_
best_depth = best_params['max_depth']
best_leaf = best_params['min_samples_leaf']
best_split = best_params['min_samples_split']

# refit a plain classifier with the winning parameters on all the data
clf = tree.DecisionTreeClassifier(max_depth=best_depth, min_samples_leaf= best_leaf, min_samples_split = best_split, random_state=0)
clf = clf.fit(X,Y)
# save the model to Pickle; the with-block flushes and closes the file
with open('Minnesota_decision_tree_model.p', 'wb') as model_file:
    pickle.dump(clf, model_file, protocol=2)

In [44]:
#TODO: only the query half of this function is implemented; the prediction step still has to be added.
def read_query_and_predict_32(classification_df, state):
    """Query the Oracle EDW for the BEERTYPE label of the stores of ``state``.

    Parameters
    ----------
    classification_df : pandas.DataFrame
        Currently unused; kept so existing callers keep working.
    state : object
        State name; matched case-insensitively (via ``str(state).title()``)
        against Kansas, Utah, Minnesota, Colorado and Oklahoma.

    Returns
    -------
    pandas.DataFrame
        One row per ``(RTL_STORE_CD, BEERTYPE)`` pair returned by the query,
        with column names taken from the cursor description.

    Raises
    ------
    ValueError
        If ``state`` is not one of the supported states.
    RuntimeError
        If the ``EDW_DB_USER`` / ``EDW_DB_PASSWORD`` environment variables
        are not set.
    """
    #value compared with STATE_PROVINCE_COUNTRY_CD for each supported state
    state_codes = {
        'Kansas': '162',
        'Utah': '239',
        'Minnesota': '177',
        'Colorado': '125',
        'Corolado': '125',   #misspelling the previous lookup matched; kept for existing callers
        'Oklahoma': '201',
    }
    state_name = str(state).title()
    if state_name not in state_codes:
        raise ValueError('Unsupported state %r; expected one of: %s'
                         % (state, ', '.join(sorted(state_codes))))
    state_code = state_codes[state_name]

    #credentials are read from the environment so they never live in the notebook
    try:
        user = os.environ['EDW_DB_USER']
        password = os.environ['EDW_DB_PASSWORD']
    except KeyError:
        raise RuntimeError('Set the EDW_DB_USER and EDW_DB_PASSWORD environment '
                           'variables before querying the Oracle EDW')

    #state_code comes from the fixed table above (never from user text), so
    #it is spliced into the SQL as a literal exactly as before
    query = '''select /*+ parallel(6) */ a15.STORE_CD  RTL_STORE_CD, 'NonLowPoint' as "BEERTYPE"
                    from EDW.BI_CRN_F_RETAIL_DEPL_MVW_VW a11
                    join EDW.BI_CRN_D_ITEM_VW a12
                       on  (a11.ITEM_ID = a12.ITEM_ID)
                     join EDW.BI_CRN_D_REL_TIME_DETAIL_VW a13
                       on  (a11.DATE_ID = a13.REL_DATE_ID)
                     join EDW.BI_CRN_D_DISTRIBUTOR_VW a14
                       on  (a11.DIST_ID = a14.DIST_ID)
                     join EDW.BI_CRN_D_RETAILER_VW a15
                       on  (a11.RETAILER_ID = a15.RETAILER_ID)
                    where (a13.REL_TIME_CD in ('L3_TY')
                     and a14.STATE_PROVINCE_COUNTRY_CD in (''' + "'" + state_code + "')" + ''' and a12.MASTER_SKU_CD not in ('80013450', '80013452', '80013455', '80013979', '80013456', '80013475', '80061325', '80013477', '80013982', '80013980', '80059842', '80058839', '80014020', '80014022', '80014023', '80014024', '80014025', '80013992', '80013991', '80013464', '80013466', '80013469', '80013471', '80013472', '80013473', '80013478', '80031998', '80013485', '80056926', '80058838', '80019746', '80031994', '80015966', '80014017', '80012704', '80014015', '80014009', '80014011', '80024732', '80013994', '80056172', '80014010'))
                    group by a15.STORE_CD
                    UNION 
                    select /*+ parallel(6) */ a15.STORE_CD  RTL_STORE_CD,
                     'LowPoint' as "BEERTYPE"
                    from EDW.BI_CRN_F_RETAIL_DEPL_MVW_VW a11
                     join EDW.BI_CRN_D_ITEM_VW a12
                       on  (a11.ITEM_ID = a12.ITEM_ID)
                     join EDW.BI_CRN_D_REL_TIME_DETAIL_VW a13
                       on  (a11.DATE_ID = a13.REL_DATE_ID)
                     join EDW.BI_CRN_D_DISTRIBUTOR_VW a14
                       on  (a11.DIST_ID = a14.DIST_ID)
                     join EDW.BI_CRN_D_RETAILER_VW a15
                       on  (a11.RETAILER_ID = a15.RETAILER_ID)
                    where (a13.REL_TIME_CD in ('L3_TY')
                     and a14.STATE_PROVINCE_COUNTRY_CD in (''' +"'" + state_code + "')" + ''' and a12.MASTER_SKU_CD not in ('80029020', '80060325', '80032234', '80032233', '80023100', '80031072', '80013435', '80013437', '80013439', '80057078', '80014016', '80013442', '80026873', '80056799', '80013444', '80014014', '80013447', '80013460', '80013457', '80013458', '80059543', '80013461', '80059542', '80059571', '80059572', '80059573', '80013982', '80013980', '80056922', '80014002', '80013968', '80029050', '80013515', '80013516', '80013517', '80013518', '80018933', '80056908', '80013520', '80014006', '80013993', '80013522', '80014026', '80013984', '80013978', '80013970', '80014012', '80014060', '80013986', '80013977', '80060330', '80060331', '80060328', '999', '80014008', '80013995', '80013972', '80056887', '80014001', '80031989', '80013985', '80013971', '80013983', '80014007', '80029021', '80015214', '80015215', '80014005', '80014004', '80013976', '80013990', '80013998', '80013989', '80013975', '80013997', '80027630', '80027631', '80027632', '80062390', '80062550', '80013996', '80014013', '80013988', '80013987', '80013974', '80013973', '80013999', '80014000', 'UNK', '80013981'))
                    group by a15.STORE_CD '''

    #Access to Oracle DB
    host = 'tncluster6.cbi.net'
    port = 1521
    s_name = 'EDW1TS_EX.cbi.net'
    dsn_tns = cx_Oracle.makedsn(host, port, service_name = s_name)
    db_ts = cx_Oracle.connect(user, password, dsn_tns)
    try:
        cursor = db_ts.cursor()
        try:
            cursor.execute(query)
            names = [x[0] for x in cursor.description]
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        #release the connection as well as the cursor
        db_ts.close()

    StoreType = pd.DataFrame(rows, columns=names)
    return StoreType

In [45]:
#columns that are left out of the one-hot (binarize) step
no_binary_list = [
    'RTL_STORE_CD',
    'RTL_STATE_DSC',
    'LowPoint_Y',
    'BEERTYPE',
]

#columns that are never used as model features
no_train_list = [
    'RTL_STORE_CD',
    'RTL_FIPS_COUNTY_DSC',
    'RTL_PREMISE_TYPE_CD',
    'RTL_CHANNEL_DSC',
    'RTL_SUBCHANNEL_DSC',
    'RTL_BEER_FLAG',
    'RTL_LIQUOR_FLG',
    'RTL_STATE_DSC',
    'LowPoint_Y',
    'BEERTYPE',
]

In [46]:
# Read the Kansas store file, keeping every column as text
data_dir = 'Data/'
Stores = pd.read_csv('{0}AllStoresKansas.csv'.format(data_dir), dtype=str)

In [47]:
# Add constant columns (same order as before) to exercise the subset function
for _column, _value in [('test1', 'TEST1'),
                        ('test2', 'TEST2'),
                        ('LowPoint_Y', False),
                        ('RTL_STATE_DSC', 'KANSAS')]:
    Stores[_column] = _value
Stores.tail(5)

Unnamed: 0,RTL_STORE_CD,RTL_FIPS_COUNTY_DSC,RTL_PREMISE_TYPE_CD,RTL_CHANNEL_DSC,RTL_SUBCHANNEL_DSC,RTL_BEER_FLAG,RTL_LIQUOR_FLG,test1,test2,LowPoint_Y,RTL_STATE_DSC
2630,107493514,JOHNSON,ON,RECREATION,OTHER ENTERTAINMENT,Y,N,TEST1,TEST2,False,KANSAS
2631,100696166,SHERMAN,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N,TEST1,TEST2,False,KANSAS
2632,101403713,JOHNSON,OFF,LIQUOR,LIQUOR SUPER STORE,Y,Y,TEST1,TEST2,False,KANSAS
2633,201404077,WYANDOTTE,OFF,LIQUOR,CONVENTIONAL LIQUOR,Y,Y,TEST1,TEST2,False,KANSAS
2634,102631767,SEDGWICK,OFF,EXTENDED MASTER OFF-PREMISE,OTHER OFF-PREMISE,N,N,TEST1,TEST2,False,KANSAS


In [48]:
# Keep only the classification columns of the Kansas rows
clsDF = classification_subset_32(dataframe=Stores, state='Kansas')
clsDF.tail(5)

Unnamed: 0,RTL_STORE_CD,RTL_FIPS_COUNTY_DSC,RTL_PREMISE_TYPE_CD,RTL_CHANNEL_DSC,RTL_SUBCHANNEL_DSC,RTL_BEER_FLAG,RTL_LIQUOR_FLG,RTL_STATE_DSC,LowPoint_Y
2630,107493514,JOHNSON,ON,RECREATION,OTHER ENTERTAINMENT,Y,N,KANSAS,False
2631,100696166,SHERMAN,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N,KANSAS,False
2632,101403713,JOHNSON,OFF,LIQUOR,LIQUOR SUPER STORE,Y,Y,KANSAS,False
2633,201404077,WYANDOTTE,OFF,LIQUOR,CONVENTIONAL LIQUOR,Y,Y,KANSAS,False
2634,102631767,SEDGWICK,OFF,EXTENDED MASTER OFF-PREMISE,OTHER OFF-PREMISE,N,N,KANSAS,False


# Problem: the Oracle query returns no rows, so BEERTYPE is empty for every store and the model fit fails

In [49]:
#TODO: delete the debugging cells that follow once the query returns rows

In [50]:
#Access to Oracle DB
host = 'tncluster6.cbi.net'
port = 1521
s_name = 'EDW1TS_EX.cbi.net'
dsn_tns = cx_Oracle.makedsn(host, port, service_name = s_name)
#credentials are read from the EDW_DB_USER / EDW_DB_PASSWORD environment
#variables so they never live in the notebook (KeyError if they are unset)
db_ts = cx_Oracle.connect(os.environ['EDW_DB_USER'], os.environ['EDW_DB_PASSWORD'], dsn_tns)
cursor = db_ts.cursor()

state = 'Kansas'
#value compared with STATE_PROVINCE_COUNTRY_CD for each supported state
state_codes = {
    'Kansas': '162',
    'Utah': '239',
    'Minnesota': '177',
    'Colorado': '125',
    'Corolado': '125',   #misspelling the previous lookup matched; kept for compatibility
    'Oklahoma': '201',
}
#an unsupported state now fails here with a KeyError instead of silently
#leaving state_code unset (or stale from an earlier run)
state_code = state_codes[str(state).title()]

# Build the SQL text: a UNION of two selects over the same joins, one labelled
# 'NonLowPoint' and one labelled 'LowPoint', each filtered to REL_TIME_CD
# 'L3_TY', to the state code held in state_code, and excluding its own list of
# MASTER_SKU_CD values. state_code is spliced in as a quoted literal.
query = '''select /*+ parallel(6) */ a15.STORE_CD  RTL_STORE_CD, 'NonLowPoint' as "BEERTYPE"
                    from EDW.BI_CRN_F_RETAIL_DEPL_MVW_VW a11
                    join EDW.BI_CRN_D_ITEM_VW a12
                       on  (a11.ITEM_ID = a12.ITEM_ID)
                     join EDW.BI_CRN_D_REL_TIME_DETAIL_VW a13
                       on  (a11.DATE_ID = a13.REL_DATE_ID)
                     join EDW.BI_CRN_D_DISTRIBUTOR_VW a14
                       on  (a11.DIST_ID = a14.DIST_ID)
                     join EDW.BI_CRN_D_RETAILER_VW a15
                       on  (a11.RETAILER_ID = a15.RETAILER_ID)
                    where (a13.REL_TIME_CD in ('L3_TY')
                     and a14.STATE_PROVINCE_COUNTRY_CD in (''' +"'" + state_code + "')" + ''' and a12.MASTER_SKU_CD not in ('80013450', '80013452', '80013455', '80013979', '80013456', '80013475', '80061325', '80013477', '80013982', '80013980', '80059842', '80058839', '80014020', '80014022', '80014023', '80014024', '80014025', '80013992', '80013991', '80013464', '80013466', '80013469', '80013471', '80013472', '80013473', '80013478', '80031998', '80013485', '80056926', '80058838', '80019746', '80031994', '80015966', '80014017', '80012704', '80014015', '80014009', '80014011', '80024732', '80013994', '80056172', '80014010'))
                    group by a15.STORE_CD
                    UNION 
                    select /*+ parallel(6) */ a15.STORE_CD  RTL_STORE_CD,
                     'LowPoint' as "BEERTYPE"
                    from EDW.BI_CRN_F_RETAIL_DEPL_MVW_VW a11
                     join EDW.BI_CRN_D_ITEM_VW a12
                       on  (a11.ITEM_ID = a12.ITEM_ID)
                     join EDW.BI_CRN_D_REL_TIME_DETAIL_VW a13
                       on  (a11.DATE_ID = a13.REL_DATE_ID)
                     join EDW.BI_CRN_D_DISTRIBUTOR_VW a14
                       on  (a11.DIST_ID = a14.DIST_ID)
                     join EDW.BI_CRN_D_RETAILER_VW a15
                       on  (a11.RETAILER_ID = a15.RETAILER_ID)
                    where (a13.REL_TIME_CD in ('L3_TY')
                     and a14.STATE_PROVINCE_COUNTRY_CD in (''' +"'" + state_code + "')" + ''' and a12.MASTER_SKU_CD not in ('80029020', '80060325', '80032234', '80032233', '80023100', '80031072', '80013435', '80013437', '80013439', '80057078', '80014016', '80013442', '80026873', '80056799', '80013444', '80014014', '80013447', '80013460', '80013457', '80013458', '80059543', '80013461', '80059542', '80059571', '80059572', '80059573', '80013982', '80013980', '80056922', '80014002', '80013968', '80029050', '80013515', '80013516', '80013517', '80013518', '80018933', '80056908', '80013520', '80014006', '80013993', '80013522', '80014026', '80013984', '80013978', '80013970', '80014012', '80014060', '80013986', '80013977', '80060330', '80060331', '80060328', '999', '80014008', '80013995', '80013972', '80056887', '80014001', '80031989', '80013985', '80013971', '80013983', '80014007', '80029021', '80015214', '80015215', '80014005', '80014004', '80013976', '80013990', '80013998', '80013989', '80013975', '80013997', '80027630', '80027631', '80027632', '80062390', '80062550', '80013996', '80014013', '80013988', '80013987', '80013974', '80013973', '80013999', '80014000', 'UNK', '80013981'))
                    group by a15.STORE_CD '''


In [51]:
# Run the query and build StoreType, printing what came back for debugging
try:
    cursor.execute(query)
    names = [x[0] for x in cursor.description]
    #print() with one argument gives the same output on Python 2 and 3
    print(names)
    rows = cursor.fetchall()
    #show only the first rows so a large result does not flood the output
    print(rows[:5])
    StoreType = pd.DataFrame(rows, columns=names)

finally:
    if cursor is not None:
        cursor.close()
    #the cursor cannot be reused once closed, so release the connection too
    db_ts.close()

['RTL_STORE_CD', 'BEERTYPE']
[]


In [36]:
# Preview the first rows of the query result
StoreType.head()

Unnamed: 0,RTL_STORE_CD,BEERTYPE


In [None]:
#TODO: delete the debugging cells that precede this marker once the query returns rows

In [15]:
# Fetch the BEERTYPE label of the Kansas stores from the database
StoreType = read_query_and_predict_32(classification_df=clsDF, state='Kansas')
StoreType.head(5)

Unnamed: 0,RTL_STORE_CD,BEERTYPE


In [16]:
# Left-join the labels onto the stores; stores without a label get a missing BEERTYPE
result_df = clsDF.merge(StoreType, on="RTL_STORE_CD", how="left")
result_df.tail(5)

Unnamed: 0,RTL_STORE_CD,RTL_FIPS_COUNTY_DSC,RTL_PREMISE_TYPE_CD,RTL_CHANNEL_DSC,RTL_SUBCHANNEL_DSC,RTL_BEER_FLAG,RTL_LIQUOR_FLG,RTL_STATE_DSC,LowPoint_Y,BEERTYPE
2630,107493514,JOHNSON,ON,RECREATION,OTHER ENTERTAINMENT,Y,N,KANSAS,False,
2631,100696166,SHERMAN,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N,KANSAS,False,
2632,101403713,JOHNSON,OFF,LIQUOR,LIQUOR SUPER STORE,Y,Y,KANSAS,False,
2633,201404077,WYANDOTTE,OFF,LIQUOR,CONVENTIONAL LIQUOR,Y,Y,KANSAS,False,
2634,102631767,SEDGWICK,OFF,EXTENDED MASTER OFF-PREMISE,OTHER OFF-PREMISE,N,N,KANSAS,False,


In [17]:
# Columns to one-hot encode: everything not listed in no_binary_list
columns = [name for name in clsDF.columns if name not in no_binary_list]
columns

['RTL_FIPS_COUNTY_DSC',
 'RTL_PREMISE_TYPE_CD',
 'RTL_CHANNEL_DSC',
 'RTL_SUBCHANNEL_DSC',
 'RTL_BEER_FLAG',
 'RTL_LIQUOR_FLG']

In [18]:
# Append the one-hot (B_<column>_<value>) columns to the merged frame.
# NOTE: re-running this cell appends the dummy columns a second time;
# re-run the merge cell first.
result_df = pd.concat([result_df, pd.get_dummies(result_df[columns], prefix = ['B_'+c for c in columns])] , axis = 1)
# show a sample instead of dumping the whole frame into the notebook
result_df.head(10)

Unnamed: 0,RTL_STORE_CD,RTL_FIPS_COUNTY_DSC,RTL_PREMISE_TYPE_CD,RTL_CHANNEL_DSC,RTL_SUBCHANNEL_DSC,RTL_BEER_FLAG,RTL_LIQUOR_FLG,RTL_STATE_DSC,LowPoint_Y,BEERTYPE,...,B_RTL_SUBCHANNEL_DSC_SUPERMARKET-NATURAL/GOURMET FOODS,B_RTL_SUBCHANNEL_DSC_THEATER,B_RTL_SUBCHANNEL_DSC_UNKNOWN,B_RTL_SUBCHANNEL_DSC_WINE SPECIALTY STORE,B_RTL_BEER_FLAG_N,B_RTL_BEER_FLAG_U,B_RTL_BEER_FLAG_Y,B_RTL_LIQUOR_FLG_N,B_RTL_LIQUOR_FLG_U,B_RTL_LIQUOR_FLG_Y
0,101403888,SEDGWICK,OFF,LIQUOR,CONVENTIONAL LIQUOR,Y,Y,KANSAS,False,,...,0,0,0,0,0,0,1,0,0,1
1,100091604,SALINE,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N,KANSAS,False,,...,0,0,0,0,0,0,1,1,0,0
2,200697901,SEDGWICK,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N,KANSAS,False,,...,0,0,0,0,0,0,1,1,0,0
3,100091143,HARPER,OFF,GROCERY,SUPERMARKET-CONVENTIONAL,Y,N,KANSAS,False,,...,0,0,0,0,0,0,1,1,0,0
4,107381463,DOUGLAS,OFF,GROCERY,SUPERMARKET-NATURAL/GOURMET FOODS,Y,N,KANSAS,False,,...,1,0,0,0,0,0,1,1,0,0
5,201953385,NEOSHO,ON,DINING,CASUAL DINING,Y,Y,KANSAS,False,,...,0,0,0,0,0,0,1,0,0,1
6,100091590,LABETTE,OFF,GROCERY,SUPERMARKET-CONVENTIONAL,Y,N,KANSAS,False,,...,0,0,0,0,0,0,1,1,0,0
7,201403873,PHILLIPS,OFF,LIQUOR,CONVENTIONAL LIQUOR,Y,Y,KANSAS,False,,...,0,0,0,0,0,0,1,0,0,1
8,201644381,STEVENS,ON,BAR/NIGHTCLUB,NEIGHBORHOOD BAR,Y,Y,KANSAS,False,,...,0,0,0,0,0,0,1,0,0,1
9,201852636,SEDGWICK,OFF,CONVENIENCE STORE,GAS STATION/KIOSK,Y,N,KANSAS,False,,...,0,0,0,0,0,0,1,1,0,0


In [19]:
#subset the training dataset
train_df = result_df.loc[result_df['BEERTYPE'] != '']  
train_df.tail()

Unnamed: 0,RTL_STORE_CD,RTL_FIPS_COUNTY_DSC,RTL_PREMISE_TYPE_CD,RTL_CHANNEL_DSC,RTL_SUBCHANNEL_DSC,RTL_BEER_FLAG,RTL_LIQUOR_FLG,RTL_STATE_DSC,LowPoint_Y,BEERTYPE,...,B_RTL_SUBCHANNEL_DSC_SUPERMARKET-NATURAL/GOURMET FOODS,B_RTL_SUBCHANNEL_DSC_THEATER,B_RTL_SUBCHANNEL_DSC_UNKNOWN,B_RTL_SUBCHANNEL_DSC_WINE SPECIALTY STORE,B_RTL_BEER_FLAG_N,B_RTL_BEER_FLAG_U,B_RTL_BEER_FLAG_Y,B_RTL_LIQUOR_FLG_N,B_RTL_LIQUOR_FLG_U,B_RTL_LIQUOR_FLG_Y
2630,107493514,JOHNSON,ON,RECREATION,OTHER ENTERTAINMENT,Y,N,KANSAS,False,,...,0,0,0,0,0,0,1,1,0,0
2631,100696166,SHERMAN,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N,KANSAS,False,,...,0,0,0,0,0,0,1,1,0,0
2632,101403713,JOHNSON,OFF,LIQUOR,LIQUOR SUPER STORE,Y,Y,KANSAS,False,,...,0,0,0,0,0,0,1,0,0,1
2633,201404077,WYANDOTTE,OFF,LIQUOR,CONVENTIONAL LIQUOR,Y,Y,KANSAS,False,,...,0,0,0,0,0,0,1,0,0,1
2634,102631767,SEDGWICK,OFF,EXTENDED MASTER OFF-PREMISE,OTHER OFF-PREMISE,N,N,KANSAS,False,,...,0,0,0,0,1,0,0,1,0,0


In [20]:
#fit the model using traning data
#independent variables:
X_labels = [c for c in train_df.columns if c not in no_train_list]
X_train = train_df.loc[:,X_labels]

In [21]:
#dependent variable:
Y_train = train_df['BEERTYPE'] 
#train the model, and find the best parameter
parameters = {'max_depth':range(1,21), 'min_samples_leaf':range(3,21,3), 'min_samples_split':range(3,21,3), 'random_state': [0]}
clf = tree.DecisionTreeClassifier()
clf = GridSearchCV(clf, parameters, n_jobs = -2)
clf.fit(X_train, Y_train)

JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\runpy.py in _run_module_as_main(mod_name='ipykernel_launcher', alter_argv=1)
    169     pkg_name = mod_name.rpartition('.')[0]
    170     main_globals = sys.modules["__main__"].__dict__
    171     if alter_argv:
    172         sys.argv[0] = fname
    173     return _run_code(code, main_globals, None,
--> 174                      "__main__", fname, loader, pkg_name)
        fname = r'C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\ipykernel_launcher.py'
        loader = <pkgutil.ImpLoader instance>
        pkg_name = ''
    175 
    176 def run_module(mod_name, init_globals=None,
    177                run_name=None, alter_sys=False):
    178     """Execute a module's code without importing it

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\runpy.py in _run_code(code=<code object <module> at 00000000022DAF30, file ...lib\site-packages\ipykernel_launcher.py", line 5>, run_globals={'__builtins__': <module '__builtin__' (built-in)>, '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': r'C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\ipykernel_launcher.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': '', 'app': <module 'ipykernel.kernelapp' from 'C:\Users\qgu...onda2\lib\site-packages\ipykernel\kernelapp.pyc'>, 'sys': <module 'sys' (built-in)>}, init_globals=None, mod_name='__main__', mod_fname=r'C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\ipykernel_launcher.py', mod_loader=<pkgutil.ImpLoader instance>, pkg_name='')
     67         run_globals.update(init_globals)
     68     run_globals.update(__name__ = mod_name,
     69                        __file__ = mod_fname,
     70                        __loader__ = mod_loader,
     71                        __package__ = pkg_name)
---> 72     exec code in run_globals
        code = <code object <module> at 00000000022DAF30, file ...lib\site-packages\ipykernel_launcher.py", line 5>
        run_globals = {'__builtins__': <module '__builtin__' (built-in)>, '__doc__': 'Entry point for launching an IPython kernel.\n\nTh...orts until\nafter removing the cwd from sys.path.\n', '__file__': r'C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\ipykernel_launcher.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': '', 'app': <module 'ipykernel.kernelapp' from 'C:\Users\qgu...onda2\lib\site-packages\ipykernel\kernelapp.pyc'>, 'sys': <module 'sys' (built-in)>}
     73     return run_globals
     74 
     75 def _run_module_code(code, init_globals=None,
     76                     mod_name=None, mod_fname=None,

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\ipykernel_launcher.py in <module>()
     11     # This is added back by InteractiveShellApp.init_path()
     12     if sys.path[0] == '':
     13         del sys.path[0]
     14 
     15     from ipykernel import kernelapp as app
---> 16     app.launch_new_instance()
     17 
     18 
     19 
     20 

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\traitlets\config\application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\ipykernel\kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    472             return self.subapp.start()
    473         if self.poller is not None:
    474             self.poller.start()
    475         self.kernel.start()
    476         try:
--> 477             ioloop.IOLoop.instance().start()
    478         except KeyboardInterrupt:
    479             pass
    480 
    481 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\zmq\eventloop\ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    172             )
    173         return loop
    174     
    175     def start(self):
    176         try:
--> 177             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    178         except ZMQError as e:
    179             if e.errno == ETERM:
    180                 # quietly return on ETERM
    181                 pass

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\tornado\ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    883                 self._events.update(event_pairs)
    884                 while self._events:
    885                     fd, events = self._events.popitem()
    886                     try:
    887                         fd_obj, handler_func = self._handlers[fd]
--> 888                         handler_func(fd_obj, events)
        handler_func = <function null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 5
    889                     except (OSError, IOError) as e:
    890                         if errno_from_exception(e) == errno.EPIPE:
    891                             # Happens when the client closes the connection
    892                             pass

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\tornado\stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 5), **kwargs={})
    272         # Fast path when there are no active contexts.
    273         def null_wrapper(*args, **kwargs):
    274             try:
    275                 current_state = _state.contexts
    276                 _state.contexts = cap_contexts[0]
--> 277                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 5)
        kwargs = {}
    278             finally:
    279                 _state.contexts = current_state
    280         null_wrapper._wrapped = True
    281         return null_wrapper

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\zmq\eventloop\zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=5)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\zmq\eventloop\zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\zmq\eventloop\zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\tornado\stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    272         # Fast path when there are no active contexts.
    273         def null_wrapper(*args, **kwargs):
    274             try:
    275                 current_state = _state.contexts
    276                 _state.contexts = cap_contexts[0]
--> 277                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    278             finally:
    279                 _state.contexts = current_state
    280         null_wrapper._wrapped = True
    281         return null_wrapper

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\ipykernel\kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    278         if self.control_stream:
    279             self.control_stream.on_recv(self.dispatch_control, copy=False)
    280 
    281         def make_dispatcher(stream):
    282             def dispatcher(msg):
--> 283                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    284             return dispatcher
    285 
    286         for s in self.shell_streams:
    287             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\ipykernel\kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {u'allow_stdin': True, u'code': u"#dependent variable:\nY_train = train_df['BEER...ameters, n_jobs = -2)\nclf.fit(X_train, Y_train)", u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {'date': datetime.datetime(2017, 6, 28, 12, 55, 40, 717000, tzinfo=tzutc()), u'msg_id': u'1AD82674D21448C3841A7C5AD81BFE3E', u'msg_type': u'execute_request', u'session': u'1921610C555D45909DD94CD367832E8E', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'1AD82674D21448C3841A7C5AD81BFE3E', 'msg_type': u'execute_request', 'parent_header': {}})
    230             self.log.warn("Unknown message type: %r", msg_type)
    231         else:
    232             self.log.debug("%s: %s", msg_type, msg)
    233             self.pre_handler_hook()
    234             try:
--> 235                 handler(stream, idents, msg)
        handler = <bound method IPythonKernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = ['1921610C555D45909DD94CD367832E8E']
        msg = {'buffers': [], 'content': {u'allow_stdin': True, u'code': u"#dependent variable:\nY_train = train_df['BEER...ameters, n_jobs = -2)\nclf.fit(X_train, Y_train)", u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {'date': datetime.datetime(2017, 6, 28, 12, 55, 40, 717000, tzinfo=tzutc()), u'msg_id': u'1AD82674D21448C3841A7C5AD81BFE3E', u'msg_type': u'execute_request', u'session': u'1921610C555D45909DD94CD367832E8E', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'1AD82674D21448C3841A7C5AD81BFE3E', 'msg_type': u'execute_request', 'parent_header': {}}
    236             except Exception:
    237                 self.log.error("Exception in message handler:", exc_info=True)
    238             finally:
    239                 self.post_handler_hook()

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\ipykernel\kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=['1921610C555D45909DD94CD367832E8E'], parent={'buffers': [], 'content': {u'allow_stdin': True, u'code': u"#dependent variable:\nY_train = train_df['BEER...ameters, n_jobs = -2)\nclf.fit(X_train, Y_train)", u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {'date': datetime.datetime(2017, 6, 28, 12, 55, 40, 717000, tzinfo=tzutc()), u'msg_id': u'1AD82674D21448C3841A7C5AD81BFE3E', u'msg_type': u'execute_request', u'session': u'1921610C555D45909DD94CD367832E8E', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'1AD82674D21448C3841A7C5AD81BFE3E', 'msg_type': u'execute_request', 'parent_header': {}})
    394         if not silent:
    395             self.execution_count += 1
    396             self._publish_execute_input(code, parent, self.execution_count)
    397 
    398         reply_content = self.do_execute(code, silent, store_history,
--> 399                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    400 
    401         # Flush output before sending the reply.
    402         sys.stdout.flush()
    403         sys.stderr.flush()

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\ipykernel\ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code=u"#dependent variable:\nY_train = train_df['BEER...ameters, n_jobs = -2)\nclf.fit(X_train, Y_train)", silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    191 
    192         self._forward_input(allow_stdin)
    193 
    194         reply_content = {}
    195         try:
--> 196             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = u"#dependent variable:\nY_train = train_df['BEER...ameters, n_jobs = -2)\nclf.fit(X_train, Y_train)"
        store_history = True
        silent = False
    197         finally:
    198             self._restore_input()
    199 
    200         if res.error_before_exec is not None:

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\ipykernel\zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=(u"#dependent variable:\nY_train = train_df['BEER...ameters, n_jobs = -2)\nclf.fit(X_train, Y_train)",), **kwargs={'silent': False, 'store_history': True})
    528             )
    529         self.payload_manager.write_payload(payload)
    530 
    531     def run_cell(self, *args, **kwargs):
    532         self._last_traceback = None
--> 533         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = (u"#dependent variable:\nY_train = train_df['BEER...ameters, n_jobs = -2)\nclf.fit(X_train, Y_train)",)
        kwargs = {'silent': False, 'store_history': True}
    534 
    535     def _showtraceback(self, etype, evalue, stb):
    536         # try to preserve ordering of tracebacks and print statements
    537         sys.stdout.flush()

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\IPython\core\interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell=u"#dependent variable:\nY_train = train_df['BEER...ameters, n_jobs = -2)\nclf.fit(X_train, Y_train)", store_history=True, silent=False, shell_futures=True)
   2712                 self.displayhook.exec_result = result
   2713 
   2714                 # Execute the user code
   2715                 interactivity = "none" if silent else self.ast_node_interactivity
   2716                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2717                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler instance>
   2718                 
   2719                 self.last_execution_succeeded = not has_raised
   2720 
   2721                 # Reset this so later displayed values do not modify the

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\IPython\core\interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Expr object>], cell_name='<ipython-input-21-162323613337>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler instance>, result=<ExecutionResult object at c6fac50, execution_co..._before_exec=None error_in_exec=None result=None>)
   2822                     return True
   2823 
   2824             for i, node in enumerate(to_run_interactive):
   2825                 mod = ast.Interactive([node])
   2826                 code = compiler(mod, cell_name, "single")
-> 2827                 if self.run_code(code, result):
        self.run_code = <bound method ZMQInteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 000000000C80B030, file "<ipython-input-21-162323613337>", line 7>
        result = <ExecutionResult object at c6fac50, execution_co..._before_exec=None error_in_exec=None result=None>
   2828                     return True
   2829 
   2830             # Flush softspace
   2831             if softspace(sys.stdout, 0):

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\IPython\core\interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 000000000C80B030, file "<ipython-input-21-162323613337>", line 7>, result=<ExecutionResult object at c6fac50, execution_co..._before_exec=None error_in_exec=None result=None>)
   2876         outflag = 1  # happens in more places, so it's easier as default
   2877         try:
   2878             try:
   2879                 self.hooks.pre_run_code_hook()
   2880                 #rprint('Running code', repr(code_obj)) # dbg
-> 2881                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 000000000C80B030, file "<ipython-input-21-162323613337>", line 7>
        self.user_global_ns = {'FullData':      RTL_STORE_CD RTL_FIPS_COUNTY_DSC RTL_PREMIS... 0                 1  

[5738 rows x 185 columns], 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'IFrame': <class 'IPython.lib.display.IFrame'>, 'In': ['', u"import cx_Oracle\nfrom IPython.display import ... sklearn.model_selection import train_test_split", u"#define a function to get the subset of datafr...tr(state).upper()]\n    return classification_df", u"#define a function for business_rule_pickle\nd...rule_func(result_df)\n    \n    return result_df", u"#business rule pickle Utah\ndef Rules_Utah(dat...(Rules_Utah, open('Utah_business_rule.p', 'wb'))", u"#business rule pickle Oklahoma\ndef Rules_Okla...klahoma, open('Oklahoma_business_rule.p', 'wb'))", u"#business rule pickle Colorado\ndef Rules_Colo...olorado, open('Colorado_business_rule.p', 'wb'))", u"def decision_tree_pickle_32(classification_df,...ict(X) == 'LowPoint'\n    \n    return result_df", u'#Kansas Decision tree Pickle\n# Load the data\...le.dump(clf, open(filename, \'wb\'), protocol=2)', u'# Minnesota Decision tree Pickle\ndata_dir = \...ta_decision_tree_model.p\', \'wb\'), protocol=2)', u'#this block works fine, but only half part of ...rsor.close()\n            \n    return StoreType', u"#list of columns that don't include in binariz...FLG', 'RTL_STATE_DSC', 'LowPoint_Y', 'BEERTYPE']", u"# Load the data\ndata_dir = 'Data/'\nStores = ...v(data_dir + 'AllStoresKansas.csv', dtype = str)", u"Stores['test1'] = 'TEST1'\nStores['test2'] = '...tores['RTL_STATE_DSC'] = 'KANSAS'\nStores.tail()", u"clsDF = classification_subset_32(Stores, 'Kansas')\nclsDF.tail()", u"StoreType = read_query_and_predict_32(clsDF, 'Kansas')\nStoreType.head()", u'result_df = pd.merge(clsDF, StoreType, on="RTL_STORE_CD", how="left")  \nresult_df.tail()', u'columns = [c for c in clsDF.columns if c not in no_binary_list]\ncolumns', u"result_df = pd.concat([result_df, pd.get_dummi...B_'+c for c in columns])] , axis = 1)\nresult_df", u"#subset the training dataset\ntrain_df = resul...[result_df['BEERTYPE'] != '']  
\ntrain_df.tail()", ...], 'Out': {13:      RTL_STORE_CD RTL_FIPS_COUNTY_DSC RTL_PREMIS...      N  TEST1  TEST2       False        KANSAS  , 14:      RTL_STORE_CD RTL_FIPS_COUNTY_DSC RTL_PREMIS... 
2634              N        KANSAS       False  , 15: Empty DataFrame
Columns: [RTL_STORE_CD, BEERTYPE]
Index: [], 16:      RTL_STORE_CD RTL_FIPS_COUNTY_DSC RTL_PREMIS...           N        KANSAS       False      NaN  , 17: ['RTL_FIPS_COUNTY_DSC', 'RTL_PREMISE_TYPE_CD', 'RTL_CHANNEL_DSC', 'RTL_SUBCHANNEL_DSC', 'RTL_BEER_FLAG', 'RTL_LIQUOR_FLG'], 18:      RTL_STORE_CD RTL_FIPS_COUNTY_DSC RTL_PREMIS...                   0  

[2635 rows x 211 columns], 19:      RTL_STORE_CD RTL_FIPS_COUNTY_DSC RTL_PREMIS...  0                   0  

[5 rows x 211 columns]}, 'Rules_Colorado': <function Rules_Colorado>, 'Rules_Oklahoma': <function Rules_Oklahoma>, 'Rules_Utah': <function Rules_Utah>, 'StoreType': Empty DataFrame
Columns: [RTL_STORE_CD, BEERTYPE]
Index: [], 'Stores':      RTL_STORE_CD RTL_FIPS_COUNTY_DSC RTL_PREMIS...  False        KANSAS  

[2635 rows x 11 columns], ...}
        self.user_ns = {'FullData':      RTL_STORE_CD RTL_FIPS_COUNTY_DSC RTL_PREMIS... 0                 1  

[5738 rows x 185 columns], 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'IFrame': <class 'IPython.lib.display.IFrame'>, 'In': ['', u"import cx_Oracle\nfrom IPython.display import ... sklearn.model_selection import train_test_split", u"#define a function to get the subset of datafr...tr(state).upper()]\n    return classification_df", u"#define a function for business_rule_pickle\nd...rule_func(result_df)\n    \n    return result_df", u"#business rule pickle Utah\ndef Rules_Utah(dat...(Rules_Utah, open('Utah_business_rule.p', 'wb'))", u"#business rule pickle Oklahoma\ndef Rules_Okla...klahoma, open('Oklahoma_business_rule.p', 'wb'))", u"#business rule pickle Colorado\ndef Rules_Colo...olorado, open('Colorado_business_rule.p', 'wb'))", u"def decision_tree_pickle_32(classification_df,...ict(X) == 'LowPoint'\n    \n    return result_df", u'#Kansas Decision tree Pickle\n# Load the data\...le.dump(clf, open(filename, \'wb\'), protocol=2)', u'# Minnesota Decision tree Pickle\ndata_dir = \...ta_decision_tree_model.p\', \'wb\'), protocol=2)', u'#this block works fine, but only half part of ...rsor.close()\n            \n    return StoreType', u"#list of columns that don't include in binariz...FLG', 'RTL_STATE_DSC', 'LowPoint_Y', 'BEERTYPE']", u"# Load the data\ndata_dir = 'Data/'\nStores = ...v(data_dir + 'AllStoresKansas.csv', dtype = str)", u"Stores['test1'] = 'TEST1'\nStores['test2'] = '...tores['RTL_STATE_DSC'] = 'KANSAS'\nStores.tail()", u"clsDF = classification_subset_32(Stores, 'Kansas')\nclsDF.tail()", u"StoreType = read_query_and_predict_32(clsDF, 'Kansas')\nStoreType.head()", u'result_df = pd.merge(clsDF, StoreType, on="RTL_STORE_CD", how="left")  \nresult_df.tail()', u'columns = [c for c in clsDF.columns if c not in no_binary_list]\ncolumns', u"result_df = pd.concat([result_df, pd.get_dummi...B_'+c for c in columns])] , axis = 1)\nresult_df", u"#subset the training dataset\ntrain_df = resul...[result_df['BEERTYPE'] != '']  
\ntrain_df.tail()", ...], 'Out': {13:      RTL_STORE_CD RTL_FIPS_COUNTY_DSC RTL_PREMIS...      N  TEST1  TEST2       False        KANSAS  , 14:      RTL_STORE_CD RTL_FIPS_COUNTY_DSC RTL_PREMIS... 
2634              N        KANSAS       False  , 15: Empty DataFrame
Columns: [RTL_STORE_CD, BEERTYPE]
Index: [], 16:      RTL_STORE_CD RTL_FIPS_COUNTY_DSC RTL_PREMIS...           N        KANSAS       False      NaN  , 17: ['RTL_FIPS_COUNTY_DSC', 'RTL_PREMISE_TYPE_CD', 'RTL_CHANNEL_DSC', 'RTL_SUBCHANNEL_DSC', 'RTL_BEER_FLAG', 'RTL_LIQUOR_FLG'], 18:      RTL_STORE_CD RTL_FIPS_COUNTY_DSC RTL_PREMIS...                   0  

[2635 rows x 211 columns], 19:      RTL_STORE_CD RTL_FIPS_COUNTY_DSC RTL_PREMIS...  0                   0  

[5 rows x 211 columns]}, 'Rules_Colorado': <function Rules_Colorado>, 'Rules_Oklahoma': <function Rules_Oklahoma>, 'Rules_Utah': <function Rules_Utah>, 'StoreType': Empty DataFrame
Columns: [RTL_STORE_CD, BEERTYPE]
Index: [], 'Stores':      RTL_STORE_CD RTL_FIPS_COUNTY_DSC RTL_PREMIS...  False        KANSAS  

[2635 rows x 11 columns], ...}
   2882             finally:
   2883                 # Reset our crash handler in place
   2884                 sys.excepthook = old_excepthook
   2885         except SystemExit as e:

...........................................................................
C:\Users\qguo\Desktop\Snow\Python\test_github-master\test_github-master\<ipython-input-21-162323613337> in <module>()
      2 Y_train = train_df['BEERTYPE'] 
      3 #train the model, and find the best parameter
      4 parameters = {'max_depth':range(1,21), 'min_samples_leaf':range(3,21,3), 'min_samples_split':range(3,21,3), 'random_state': [0]}
      5 clf = tree.DecisionTreeClassifier()
      6 clf = GridSearchCV(clf, parameters, n_jobs = -2)
----> 7 clf.fit(X_train, Y_train)
      8 
      9 
     10 
     11 

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\model_selection\_search.py in fit(self=GridSearchCV(cv=None, error_score='raise',
     ...train_score=True,
       scoring=None, verbose=0), X=      B_RTL_FIPS_COUNTY_DSC_ALLEN  B_RTL_FIPS_CO...                   0  

[2635 rows x 201 columns], y=0       NaN
1       NaN
2       NaN
3       NaN
...  NaN
Name: BEERTYPE, Length: 2635, dtype: object, groups=None)
    940 
    941         groups : array-like, with shape (n_samples,), optional
    942             Group labels for the samples used while splitting the dataset into
    943             train/test set.
    944         """
--> 945         return self._fit(X, y, groups, ParameterGrid(self.param_grid))
        self._fit = <bound method GridSearchCV._fit of GridSearchCV(...rain_score=True,
       scoring=None, verbose=0)>
        X =       B_RTL_FIPS_COUNTY_DSC_ALLEN  B_RTL_FIPS_CO...                   0  

[2635 rows x 201 columns]
        y = 0       NaN
1       NaN
2       NaN
3       NaN
...  NaN
Name: BEERTYPE, Length: 2635, dtype: object
        groups = None
        self.param_grid = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 'min_samples_leaf': [3, 6, 9, 12, 15, 18], 'min_samples_split': [3, 6, 9, 12, 15, 18], 'random_state': [0]}
    946 
    947 
    948 class RandomizedSearchCV(BaseSearchCV):
    949     """Randomized search on hyper parameters.

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\model_selection\_search.py in _fit(self=GridSearchCV(cv=None, error_score='raise',
     ...train_score=True,
       scoring=None, verbose=0), X=      B_RTL_FIPS_COUNTY_DSC_ALLEN  B_RTL_FIPS_CO...                   0  

[2635 rows x 201 columns], y=0       NaN
1       NaN
2       NaN
3       NaN
...  NaN
Name: BEERTYPE, Length: 2635, dtype: object, groups=None, parameter_iterable=<sklearn.model_selection._search.ParameterGrid object>)
    559                                   fit_params=self.fit_params,
    560                                   return_train_score=self.return_train_score,
    561                                   return_n_test_samples=True,
    562                                   return_times=True, return_parameters=True,
    563                                   error_score=self.error_score)
--> 564           for parameters in parameter_iterable
        parameters = undefined
        parameter_iterable = <sklearn.model_selection._search.ParameterGrid object>
    565           for train, test in cv_iter)
    566 
    567         # if one choose to see train score, "out" will contain train score info
    568         if self.return_train_score:

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=Parallel(n_jobs=-2), iterable=<generator object <genexpr>>)
    763             if pre_dispatch == "all" or n_jobs == 1:
    764                 # The iterable was consumed all at once by the above for loop.
    765                 # No need to wait for async callbacks to trigger to
    766                 # consumption.
    767                 self._iterating = False
--> 768             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=-2)>
    769             # Make sure that we get a last message telling us we are done
    770             elapsed_time = time.time() - self._start_time
    771             self._print('Done %3i out of %3i | elapsed: %s finished',
    772                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Wed Jun 28 07:55:42 2017
PID: 13476Python 2.7.13: C:\Users\qguo\AppData\Local\Continuum\Anaconda2\python.exe
...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (DecisionTreeClassifier(class_weight=None, criter...  presort=False, random_state=0, splitter='best'),       B_RTL_FIPS_COUNTY_DSC_ALLEN  B_RTL_FIPS_CO...                   0  

[2635 rows x 201 columns], 0       NaN
1       NaN
2       NaN
3       NaN
...  NaN
Name: BEERTYPE, Length: 2635, dtype: object, <function _passthrough_scorer>, array([ 879,  880,  881, ..., 2632, 2633, 2634]), array([  0,   1,   2,   3,   4,   5,   6,   7,  ...,
       871, 872, 873, 874, 875, 876, 877, 878]), 0, {'max_depth': 1, 'min_samples_leaf': 3, 'min_samples_split': 3, 'random_state': 0})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': True, 'return_times': True, 'return_train_score': True}
        self.items = [(<function _fit_and_score>, (DecisionTreeClassifier(class_weight=None, criter...  presort=False, random_state=0, splitter='best'),       B_RTL_FIPS_COUNTY_DSC_ALLEN  B_RTL_FIPS_CO...                   0  

[2635 rows x 201 columns], 0       NaN
1       NaN
2       NaN
3       NaN
...  NaN
Name: BEERTYPE, Length: 2635, dtype: object, <function _passthrough_scorer>, array([ 879,  880,  881, ..., 2632, 2633, 2634]), array([  0,   1,   2,   3,   4,   5,   6,   7,  ...,
       871, 872, 873, 874, 875, 876, 877, 878]), 0, {'max_depth': 1, 'min_samples_leaf': 3, 'min_samples_split': 3, 'random_state': 0}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': True, 'return_times': True, 'return_train_score': True})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator=DecisionTreeClassifier(class_weight=None, criter...  presort=False, random_state=0, splitter='best'), X=      B_RTL_FIPS_COUNTY_DSC_ALLEN  B_RTL_FIPS_CO...                   0  

[2635 rows x 201 columns], y=0       NaN
1       NaN
2       NaN
3       NaN
...  NaN
Name: BEERTYPE, Length: 2635, dtype: object, scorer=<function _passthrough_scorer>, train=array([ 879,  880,  881, ..., 2632, 2633, 2634]), test=array([  0,   1,   2,   3,   4,   5,   6,   7,  ...,
       871, 872, 873, 874, 875, 876, 877, 878]), verbose=0, parameters={'max_depth': 1, 'min_samples_leaf': 3, 'min_samples_split': 3, 'random_state': 0}, fit_params={}, return_train_score=True, return_parameters=True, return_n_test_samples=True, return_times=True, error_score='raise')
    233 
    234     try:
    235         if y_train is None:
    236             estimator.fit(X_train, **fit_params)
    237         else:
--> 238             estimator.fit(X_train, y_train, **fit_params)
        estimator.fit = <bound method DecisionTreeClassifier.fit of Deci... presort=False, random_state=0, splitter='best')>
        X_train =       B_RTL_FIPS_COUNTY_DSC_ALLEN  B_RTL_FIPS_CO...                   0  

[1756 rows x 201 columns]
        y_train = 879     NaN
880     NaN
881     NaN
882     NaN
...  NaN
Name: BEERTYPE, Length: 1756, dtype: object
        fit_params = {}
    239 
    240     except Exception as e:
    241         # Note fit time as time until error
    242         fit_time = time.time() - start_time

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\tree\tree.py in fit(self=DecisionTreeClassifier(class_weight=None, criter...  presort=False, random_state=0, splitter='best'), X=      B_RTL_FIPS_COUNTY_DSC_ALLEN  B_RTL_FIPS_CO...                   0  

[1756 rows x 201 columns], y=879     NaN
880     NaN
881     NaN
882     NaN
...  NaN
Name: BEERTYPE, Length: 1756, dtype: object, sample_weight=None, check_input=True, X_idx_sorted=None)
    734 
    735         super(DecisionTreeClassifier, self).fit(
    736             X, y,
    737             sample_weight=sample_weight,
    738             check_input=check_input,
--> 739             X_idx_sorted=X_idx_sorted)
        X_idx_sorted = None
    740         return self
    741 
    742 
    743     def predict_proba(self, X, check_input=True):

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\tree\tree.py in fit(self=DecisionTreeClassifier(class_weight=None, criter...  presort=False, random_state=0, splitter='best'), X=array([[ 0.,  0.,  0., ...,  0.,  0.,  1.],
    ....,  0.,  0., ...,  1.,  0.,  0.]], dtype=float32), y=array([[nan],
       [nan],
       [nan],
      ...[nan],
       [nan],
       [nan]], dtype=object), sample_weight=None, check_input=True, X_idx_sorted=None)
    141             y = np.reshape(y, (-1, 1))
    142 
    143         self.n_outputs_ = y.shape[1]
    144 
    145         if is_classification:
--> 146             check_classification_targets(y)
        y = array([[nan],
       [nan],
       [nan],
      ...[nan],
       [nan],
       [nan]], dtype=object)
    147             y = np.copy(y)
    148 
    149             self.classes_ = []
    150             self.n_classes_ = []

...........................................................................
C:\Users\qguo\AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\utils\multiclass.py in check_classification_targets(y=array([[nan],
       [nan],
       [nan],
      ...[nan],
       [nan],
       [nan]], dtype=object))
    167     y : array-like
    168     """
    169     y_type = type_of_target(y)
    170     if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
    171             'multilabel-indicator', 'multilabel-sequences']:
--> 172         raise ValueError("Unknown label type: %r" % y_type)
        y_type = 'unknown'
    173 
    174 
    175 
    176 def type_of_target(y):

ValueError: Unknown label type: 'unknown'
___________________________________________________________________________

In [None]:
# Read the outcome of the fitted grid search: the best cross-validated score
# and the hyper-parameter combination that achieved it.
best_params = clf.best_params_
accuracy = clf.best_score_

# Unpack the three tuned tree settings in one step.
best_depth, best_leaf, best_split = [
    best_params[key]
    for key in ('max_depth', 'min_samples_leaf', 'min_samples_split')
]



In [None]:
# Grid-search the decision-tree hyper-parameters with cross-validation.
#
# Rows whose label is missing (NaN) cannot be used as classification targets:
# scikit-learn rejects them with "ValueError: Unknown label type: 'unknown'".
# Keep only the labelled rows for fitting, and fail early with a clear message
# when there is nothing to train on.
labeled_mask = Y_train.notnull().values
if not labeled_mask.any():
    raise ValueError(
        "Y_train contains no non-null labels, so the decision tree cannot be "
        "trained; check that the step producing the labels returned any rows."
    )
X_fit = X_train.loc[labeled_mask]
Y_fit = Y_train.loc[labeled_mask]

# Candidate values for each hyper-parameter; random_state is pinned so the
# search is reproducible. Lists (not range objects) keep the grid identical
# on Python 2 and Python 3.
parameters = {
    'max_depth': list(range(1, 21)),
    'min_samples_leaf': list(range(3, 21, 3)),
    'min_samples_split': list(range(3, 21, 3)),
    'random_state': [0],
}

# n_jobs = -2 uses all CPU cores but one.
clf = GridSearchCV(tree.DecisionTreeClassifier(), parameters, n_jobs = -2)
clf.fit(X_fit, Y_fit)

# Best cross-validated score and the parameter combination that produced it.
accuracy = clf.best_score_
best_params = clf.best_params_
best_depth = best_params['max_depth']
best_leaf = best_params['min_samples_leaf']
best_split = best_params['min_samples_split']


In [None]:
#using the best parameter to train and fit the model
# Only rows with a known label can be used as classification targets; NaN
# labels make scikit-learn raise "Unknown label type: 'unknown'". Filter them
# out here and stop with a clear message if no labelled rows remain.
labeled_mask = Y_train.notnull().values
if not labeled_mask.any():
    raise ValueError(
        "Y_train contains no non-null labels, so the decision tree cannot be "
        "trained; check that the step producing the labels returned any rows."
    )

clf = tree.DecisionTreeClassifier(
    max_depth=best_depth,
    min_samples_leaf=best_leaf,
    min_samples_split=best_split,
    random_state=0,  # fixed seed so the fitted tree is reproducible
)
clf = clf.fit(X_train.loc[labeled_mask], Y_train.loc[labeled_mask])

#Prepare for prediction dataset
X_pred = result_df.loc[:, X_labels]

#make prediction for whole dataset.
# The boolean column is True where the predicted class is 'LowPoint'.
result_df['LowPoint_Y'] = clf.predict(X_pred) == 'LowPoint'

#delete the column 'BEERTYPE'
#del result_df['BEERTYPE']

     