In [1]:
import cx_Oracle
from IPython.display import IFrame    #to display pdf file
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import pickle
from py2casefold import casefold
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [None]:
#run all the small functions first, and then run the rest(load data)

In [2]:
#save this one 
def classify_32(dataframe, state, isPickle):
    
    '''
    This function takes in dataframe, and classify the store type to either it is a LowPoint store or not.
        LowPoint_Y = True: stores that can only sell 3.2% beer.
        LowPoint_Y = False: stores that can sell any beer.
    
    Parameter:
    ---------
        state: takes a string. Indicates the state.
        isPickle: takes a boolean. Indicates classification method. 'True' for pickle, 'False' for grid search
        
    Return:
    -------
        the dataframe with prediction result under 'LowPoint_Y' column.
        
    '''
    #define a function to get the subset of dataframe
    def classification_subset_32(dataframe, state):
        
        '''
        This function subsets the dataframe to prepare classification.

        parameter:
        ---------
            state: Takes a string. The name of state to subset dataframe. 
            
        return:
        -------
            The subsetted dataframe.        
        '''
        #load the columns for subset from pickle:
        subset_columns = pickle.load(open('classification_columns.p', 'rb'))

        #subset the dataframe:
        classification_df = pd.DataFrame(dataframe[subset_columns])

        classification_df = classification_df[classification_df['RTL_STATE_DSC'] == str(state).upper()]
        return classification_df    
    
    def decision_tree_pickle_32(classification_df, state):
        '''
        This function takes the name of the state, and using the subsetted dataframe from function 'classification_subset' 
            to classify the stores using pickled decision tree model. 

        parameter:
        ----------
            classification_df: Takes a dataframe. The subsetted dataframe after excuting the function "classification_subset"
            state: Takes a string. The name of the State

        return:
        -------
            The dataframe with prediction result.        
        '''

        #call the binarize function to transform the columns into binary
        columns = [c for c in classification_df.columns if c not in ['RTL_STORE_CD','LowPoint_Y', 'RTL_STATE_DSC']]
        result_df = pd.concat([classification_df, pd.get_dummies(classification_df[columns] , prefix = ['B_'+c for c in columns])] , axis = 1)

        #load the model from pickle for that state
        filename = str(state).title()+'_decision_tree_model.p'
        decision_tree_model = pickle.load(open(filename, 'rb'))

        #prepare independent variables:
        X_labels = [c for c in result_df.columns if c not in list(classification_df.columns)]
        X = result_df.loc[:,X_labels]

        #dependent variable:
        result_df['LowPoint_Y'] = decision_tree_model.predict(X) == 'LowPoint'

        return result_df

        
    #define a function for business_rule_pickle
    def business_rule_pickle_32(classification_df, state):  
        '''
        This function takes the name of the state, and using the subsetted dataframe from function 'classification_subset' 
            to classify the stores using pickled business rule.

        parameter:
        ----------
            state: Takes a string. The name of the State
            classification_df: Takes a dataframe. The subsetted dataframe after excuting the function "classification_subset"

        return:
        -------
            The dataframe with prediction result.        
        '''    
        #load the business rule from pickle for that State
        filename = str(state).title()+'_business_rule.p'
        rule_func = pickle.load(open(filename, 'rb'))

        #make the prediction
        result_df = classification_df
        result_df['LowPoint_Y'] = rule_func(result_df)

        return result_df
    

    def read_query_and_predict_32(self, classification_df, state):
    
        '''
        This function queries from Oracle SQL for specific state and store type and save the data in a dataframe. Then, use decision
            tree classifier to train the labeled data, and make prediction on whole dataset.

        parameter:
        ---------
            classification_df: Takes a dataframe. The subsetted dataframe after excuting the function "classification_subset"
            state: Takes a string. The state name

        return:
        -------
            a dataframe with prediction result. 
        '''


        #Access to Oracle DB
        host = 'tncluster6.cbi.net'
        port = 1521
        s_name = 'EDW1TS_EX.cbi.net'
        dsn_tns = cx_Oracle.makedsn(host, port, service_name = s_name)
        db_ts = cx_Oracle.connect('amartinezcotto', 'Kohlia#743', dsn_tns)
        cursor = db_ts.cursor()

        if str(state) == 'Kansas'.casefold():
            state_code = str(162)
        elif str(state) == 'Utah'.casefold():
            state_code = str(239)
        elif str(state) == 'Minnesota'.casefold():
            state_code = str(177)
        elif str(state) == 'Corolado'.casefold():
            state_code = str(125)
        elif str(state) == 'Oklahoma'.casefold():
            state_code = str(201)

        query = '''select /*+ parallel(6) */ a15.STORE_CD  RTL_STORE_CD, 'NonLowPoint' as "BEERTYPE"
                    from EDW.BI_CRN_F_RETAIL_DEPL_MVW_VW a11
                    join EDW.BI_CRN_D_ITEM_VW a12
                       on  (a11.ITEM_ID = a12.ITEM_ID)
                     join EDW.BI_CRN_D_REL_TIME_DETAIL_VW a13
                       on  (a11.DATE_ID = a13.REL_DATE_ID)
                     join EDW.BI_CRN_D_DISTRIBUTOR_VW a14
                       on  (a11.DIST_ID = a14.DIST_ID)
                     join EDW.BI_CRN_D_RETAILER_VW a15
                       on  (a11.RETAILER_ID = a15.RETAILER_ID)
                    where (a13.REL_TIME_CD in ('L3_TY')
                     and a14.STATE_PROVINCE_COUNTRY_CD in (''' + "'" + state_code + "')" + ''' and a12.MASTER_SKU_CD not in ('80013450', '80013452', '80013455', '80013979', '80013456', '80013475', '80061325', '80013477', '80013982', '80013980', '80059842', '80058839', '80014020', '80014022', '80014023', '80014024', '80014025', '80013992', '80013991', '80013464', '80013466', '80013469', '80013471', '80013472', '80013473', '80013478', '80031998', '80013485', '80056926', '80058838', '80019746', '80031994', '80015966', '80014017', '80012704', '80014015', '80014009', '80014011', '80024732', '80013994', '80056172', '80014010'))
                    group by a15.STORE_CD
                    UNION 
                    select /*+ parallel(6) */ a15.STORE_CD  RTL_STORE_CD,
                     'LowPoint' as "BEERTYPE"
                    from EDW.BI_CRN_F_RETAIL_DEPL_MVW_VW a11
                     join EDW.BI_CRN_D_ITEM_VW a12
                       on  (a11.ITEM_ID = a12.ITEM_ID)
                     join EDW.BI_CRN_D_REL_TIME_DETAIL_VW a13
                       on  (a11.DATE_ID = a13.REL_DATE_ID)
                     join EDW.BI_CRN_D_DISTRIBUTOR_VW a14
                       on  (a11.DIST_ID = a14.DIST_ID)
                     join EDW.BI_CRN_D_RETAILER_VW a15
                       on  (a11.RETAILER_ID = a15.RETAILER_ID)
                    where (a13.REL_TIME_CD in ('L3_TY')
                     and a14.STATE_PROVINCE_COUNTRY_CD in (''' +"'" + state_code + "')" + ''' and a12.MASTER_SKU_CD not in ('80029020', '80060325', '80032234', '80032233', '80023100', '80031072', '80013435', '80013437', '80013439', '80057078', '80014016', '80013442', '80026873', '80056799', '80013444', '80014014', '80013447', '80013460', '80013457', '80013458', '80059543', '80013461', '80059542', '80059571', '80059572', '80059573', '80013982', '80013980', '80056922', '80014002', '80013968', '80029050', '80013515', '80013516', '80013517', '80013518', '80018933', '80056908', '80013520', '80014006', '80013993', '80013522', '80014026', '80013984', '80013978', '80013970', '80014012', '80014060', '80013986', '80013977', '80060330', '80060331', '80060328', '999', '80014008', '80013995', '80013972', '80056887', '80014001', '80031989', '80013985', '80013971', '80013983', '80014007', '80029021', '80015214', '80015215', '80014005', '80014004', '80013976', '80013990', '80013998', '80013989', '80013975', '80013997', '80027630', '80027631', '80027632', '80062390', '80062550', '80013996', '80014013', '80013988', '80013987', '80013974', '80013973', '80013999', '80014000', 'UNK', '80013981'))
                    group by a15.STORE_CD '''

        try:
            cursor.execute(query)
            names = [ x[0] for x in cursor.description]
            rows = cursor.fetchall()
            StoreType = pd.DataFrame( rows, columns=names)

        finally:
            if cursor is not None:
                cursor.close()

        #If the above code runs sucessfully, it will return a dataframe which columns: store number and store type(LowPoint or NonLowPoint)



        #left join the store type data with subsetted dataframe
        result_df = pd.merge(classification_df, StoreType, on="RTL_STORE_CD", how="left")  

        #call the binarize function to transform the columns into binary
        result_df = self.binarize([c for c in classification_df.columns if c not in ['RTL_STORE_CD',"LowPoint_Y"]])

        #subset the training dataset
        train_df = result_df.loc[result_df['BEERTYPE'] == 'LowPoint' or result_df['BEERTYPE'] == 'NonLowPoint']  

        #fit the model using traning data
        #independent variables:
        X_labels = [c for c in train_df.columns if c not in [classification_df.columns]]
        X_train = train_df.loc[:,X_labels]
        #dependent variable:
        Y_train = train_df.loc['BEERTYPE'] 

        #train the model, and find the best parameter
        parameters = {'max_depth':range(1,21), 'min_samples_leaf':range(3,21,3), 'min_samples_split':range(3,21,3), 'random_state': [0]}
        clf = tree.DecisionTreeClassifier()
        clf = GridSearchCV(clf, parameters, n_jobs = -2)
        clf.fit(X_train, Y_train)
        accuracy = clf.best_score_ 
        best_params = clf.best_params_
        best_depth = best_params['max_depth']
        best_leaf = best_params['min_samples_leaf']
        best_split = best_params['min_samples_split']

        #using the best parameter to train and fit the model
        clf = tree.DecisionTreeClassifier(max_depth=best_depth, min_samples_leaf= best_leaf, min_samples_split = best_split, random_state=0)
        clf = clf.fit(X_train,Y_train)

        #Prepare for prediction dataset
        X_pred = result_df.loc[:,X_labels]

        #make prediction for whole dataset.
        result_df['LowPoint_Y'] = clf.predict(X_pred) == 'LowPoint_Y'        

        #delete the column 'BEERTYPE'
        del result_df['BEERTYPE']    
     
    
    
    ###############################excute code########################
    
    #create a LowPoint_Y column for entire dataset. 
    dataframe['LowPoint_Y'] = ''
    
    #take a subset of the data classification_subset 
    classification_df = self.classification_subset(self.data)
    
    #if using Pickle
    if isPickle:
        
        #Decision tree model for Kansas and Minnesota
        #If the state is Kansas:
        if state == 'Kansas'.casefold():
            self.decision_tree_pickle(classification_df, 'Kansas')

        #If the state is Minnesota:   
        elif state == 'Minnesota'.casefold():
            self.decision_tree_pickle(classification_df, 'Minnesota')
      
    
        #Business rule for Utah, Oklahoma, and Colorado
        #If the state is Utah:
        elif state == 'Utah'.casefold():
            self.business_rule_pickle(classification_df, 'Utah')
            
        #If the state is Oklahoma:   
        elif state == 'Oklahoma'.casefold():
            self.business_rule_pickle(classification_df, 'Oklahoma')


        #If the state is Colorado:  
        elif state == 'Colorado'.casefold():
            self.business_rule_pickle(classification_df, 'Colorado')

        #If enter other States:
        else:
            print('wrong state')
            break
    
    #if using grid search
    else:
        
        #If the state is Kansas:
        if state == 'Kansas'.casefold():
            #get the query result as dataframe and get the prediction result
            self.read_query_and_predict(classification_df, 'Kansas')
        
        #If the state is Minnesota:    
        elif state == 'Minnesota'.casefold():
            #get the query result as dataframe and get the prediction result
            self.read_query_and_predict(classification_df, 'Minnesota')

        else:
            print('wrong state')
            break 
    
    

SyntaxError: 'break' outside loop (<ipython-input-2-4d6e100550d8>, line 260)

In [50]:
#list of columns that don't include in binarize
no_binary_list = ['RTL_STORE_CD', 'RTL_STATE_DSC', 'LowPoint_Y', 'BEERTYPE']

#list of columns that don't include in train
no_train_list = ['RTL_STORE_CD','RTL_FIPS_COUNTY_DSC', 'RTL_PREMISE_TYPE_CD', 'RTL_CHANNEL_DSC', 'RTL_SUBCHANNEL_DSC', 'RTL_BEER_FLAG', 'RTL_LIQUOR_FLG', 'RTL_STATE_DSC', 'LowPoint_Y', 'BEERTYPE']

In [51]:
# Load the data
data_dir = 'Data/'
Stores = pd.read_csv(data_dir + 'AllStoresKansas.csv', dtype = str)

In [52]:
Stores['test1'] = 'TEST1'
Stores['test2'] = 'TEST2'
Stores['LowPoint_Y'] = False
Stores['RTL_STATE_DSC'] = 'KANSAS'
Stores.tail()

Unnamed: 0,RTL_STORE_CD,RTL_FIPS_COUNTY_DSC,RTL_PREMISE_TYPE_CD,RTL_CHANNEL_DSC,RTL_SUBCHANNEL_DSC,RTL_BEER_FLAG,RTL_LIQUOR_FLG,test1,test2,LowPoint_Y,RTL_STATE_DSC
2630,107493514,JOHNSON,ON,RECREATION,OTHER ENTERTAINMENT,Y,N,TEST1,TEST2,False,KANSAS
2631,100696166,SHERMAN,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N,TEST1,TEST2,False,KANSAS
2632,101403713,JOHNSON,OFF,LIQUOR,LIQUOR SUPER STORE,Y,Y,TEST1,TEST2,False,KANSAS
2633,201404077,WYANDOTTE,OFF,LIQUOR,CONVENTIONAL LIQUOR,Y,Y,TEST1,TEST2,False,KANSAS
2634,102631767,SEDGWICK,OFF,EXTENDED MASTER OFF-PREMISE,OTHER OFF-PREMISE,N,N,TEST1,TEST2,False,KANSAS


In [53]:
clsDF = classification_subset_32(Stores, 'Kansas')
clsDF.tail()

Unnamed: 0,RTL_STORE_CD,RTL_FIPS_COUNTY_DSC,RTL_PREMISE_TYPE_CD,RTL_CHANNEL_DSC,RTL_SUBCHANNEL_DSC,RTL_BEER_FLAG,RTL_LIQUOR_FLG,RTL_STATE_DSC,LowPoint_Y
2630,107493514,JOHNSON,ON,RECREATION,OTHER ENTERTAINMENT,Y,N,KANSAS,False
2631,100696166,SHERMAN,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N,KANSAS,False
2632,101403713,JOHNSON,OFF,LIQUOR,LIQUOR SUPER STORE,Y,Y,KANSAS,False
2633,201404077,WYANDOTTE,OFF,LIQUOR,CONVENTIONAL LIQUOR,Y,Y,KANSAS,False
2634,102631767,SEDGWICK,OFF,EXTENDED MASTER OFF-PREMISE,OTHER OFF-PREMISE,N,N,KANSAS,False


In [34]:
#need to change this block to automate 
def read_query_and_predict_32(classification_df, state):
    
    #Access to Oracle DB
    host = 'tncluster6.cbi.net'
    port = 1521
    s_name = 'EDW1TS_EX.cbi.net'
    dsn_tns = cx_Oracle.makedsn(host, port, service_name = s_name)
    db_ts = cx_Oracle.connect('amartinezcotto', 'Kohlia#743', dsn_tns)
    cursor = db_ts.cursor()

    if str(state).title() == 'Kansas'.title():
        state_code = str(162)
    elif str(state).title() == 'Utah'.title():
        state_code = str(239)
    elif str(state).title() == 'Minnesota'.title():
        state_code = str(177)
    elif str(state).title() == 'Corolado'.title():
        state_code = str(125)
    elif str(state).title() == 'Oklahoma'.title():
        state_code = str(201)

    query = '''select /*+ parallel(6) */ a15.STORE_CD  RTL_STORE_CD, 'NonLowPoint' as "BEERTYPE"
                    from EDW.BI_CRN_F_RETAIL_DEPL_MVW_VW a11
                    join EDW.BI_CRN_D_ITEM_VW a12
                       on  (a11.ITEM_ID = a12.ITEM_ID)
                     join EDW.BI_CRN_D_REL_TIME_DETAIL_VW a13
                       on  (a11.DATE_ID = a13.REL_DATE_ID)
                     join EDW.BI_CRN_D_DISTRIBUTOR_VW a14
                       on  (a11.DIST_ID = a14.DIST_ID)
                     join EDW.BI_CRN_D_RETAILER_VW a15
                       on  (a11.RETAILER_ID = a15.RETAILER_ID)
                    where (a13.REL_TIME_CD in ('L3_TY')
                     and a14.STATE_PROVINCE_COUNTRY_CD in (''' + "'" + state_code + "')" + ''' and a12.MASTER_SKU_CD not in ('80013450', '80013452', '80013455', '80013979', '80013456', '80013475', '80061325', '80013477', '80013982', '80013980', '80059842', '80058839', '80014020', '80014022', '80014023', '80014024', '80014025', '80013992', '80013991', '80013464', '80013466', '80013469', '80013471', '80013472', '80013473', '80013478', '80031998', '80013485', '80056926', '80058838', '80019746', '80031994', '80015966', '80014017', '80012704', '80014015', '80014009', '80014011', '80024732', '80013994', '80056172', '80014010'))
                    group by a15.STORE_CD
                    UNION 
                    select /*+ parallel(6) */ a15.STORE_CD  RTL_STORE_CD,
                     'LowPoint' as "BEERTYPE"
                    from EDW.BI_CRN_F_RETAIL_DEPL_MVW_VW a11
                     join EDW.BI_CRN_D_ITEM_VW a12
                       on  (a11.ITEM_ID = a12.ITEM_ID)
                     join EDW.BI_CRN_D_REL_TIME_DETAIL_VW a13
                       on  (a11.DATE_ID = a13.REL_DATE_ID)
                     join EDW.BI_CRN_D_DISTRIBUTOR_VW a14
                       on  (a11.DIST_ID = a14.DIST_ID)
                     join EDW.BI_CRN_D_RETAILER_VW a15
                       on  (a11.RETAILER_ID = a15.RETAILER_ID)
                    where (a13.REL_TIME_CD in ('L3_TY')
                     and a14.STATE_PROVINCE_COUNTRY_CD in (''' +"'" + state_code + "')" + ''' and a12.MASTER_SKU_CD not in ('80029020', '80060325', '80032234', '80032233', '80023100', '80031072', '80013435', '80013437', '80013439', '80057078', '80014016', '80013442', '80026873', '80056799', '80013444', '80014014', '80013447', '80013460', '80013457', '80013458', '80059543', '80013461', '80059542', '80059571', '80059572', '80059573', '80013982', '80013980', '80056922', '80014002', '80013968', '80029050', '80013515', '80013516', '80013517', '80013518', '80018933', '80056908', '80013520', '80014006', '80013993', '80013522', '80014026', '80013984', '80013978', '80013970', '80014012', '80014060', '80013986', '80013977', '80060330', '80060331', '80060328', '999', '80014008', '80013995', '80013972', '80056887', '80014001', '80031989', '80013985', '80013971', '80013983', '80014007', '80029021', '80015214', '80015215', '80014005', '80014004', '80013976', '80013990', '80013998', '80013989', '80013975', '80013997', '80027630', '80027631', '80027632', '80062390', '80062550', '80013996', '80014013', '80013988', '80013987', '80013974', '80013973', '80013999', '80014000', 'UNK', '80013981'))
                    group by a15.STORE_CD '''

    try:
        cursor.execute(query)
        names = [ x[0] for x in cursor.description]
        rows = cursor.fetchall()
        StoreType = pd.DataFrame(rows, columns=names)

    finally:
        if cursor is not None:
            cursor.close()

    #If the above code runs sucessfully, it will return a dataframe which columns: store number and store type(LowPoint or NonLowPoint)



    #left join the store type data with subsetted dataframe
    result_df = pd.merge(classification_df, StoreType, on="RTL_STORE_CD", how="left")  

    #call the binarize function to transform the columns into binary
    columns = [c for c in classification_df.columns if c not in ['RTL_STORE_CD','LowPoint_Y', 'RTL_STATE_DSC']]
    result_df = pd.concat([classification_df, pd.get_dummies(classification_df[columns] , prefix = ['B_'+c for c in columns])] , axis = 1)
        
    #subset the training dataset
    train_df = result_df.loc[result_df['BEERTYPE'] == 'LowPoint' or result_df['BEERTYPE'] == 'NonLowPoint']  

    #fit the model using traning data
    #independent variables:
    X_labels = [c for c in train_df.columns if c not in [classification_df.columns]]
    X_train = train_df.loc[:,X_labels]
    #dependent variable:
    Y_train = train_df.loc['BEERTYPE'] 

    #train the model, and find the best parameter
    parameters = {'max_depth':range(1,21), 'min_samples_leaf':range(3,21,3), 'min_samples_split':range(3,21,3), 'random_state': [0]}
    clf = tree.DecisionTreeClassifier()
    clf = GridSearchCV(clf, parameters, n_jobs = -2)
    clf.fit(X_train, Y_train)
    accuracy = clf.best_score_ 
    best_params = clf.best_params_
    best_depth = best_params['max_depth']
    best_leaf = best_params['min_samples_leaf']
    best_split = best_params['min_samples_split']

    #using the best parameter to train and fit the model
    clf = tree.DecisionTreeClassifier(max_depth=best_depth, min_samples_leaf= best_leaf, min_samples_split = best_split, random_state=0)
    clf = clf.fit(X_train,Y_train)

    #Prepare for prediction dataset
    X_pred = result_df.loc[:,X_labels]

    #make prediction for whole dataset.
    result_df['LowPoint_Y'] = clf.predict(X_pred) == 'LowPoint_Y'        

    #delete the column 'BEERTYPE'
    #del result_df['BEERTYPE']    
    
    return result_df
     

In [54]:
#this block works fine, but only half part of the function, need to add the later ones.
def read_query_and_predict_32(classification_df, state):
    
    #Access to Oracle DB
    host = 'tncluster6.cbi.net'
    port = 1521
    s_name = 'EDW1TS_EX.cbi.net'
    dsn_tns = cx_Oracle.makedsn(host, port, service_name = s_name)
    db_ts = cx_Oracle.connect('amartinezcotto', 'Kohlia#743', dsn_tns)
    cursor = db_ts.cursor()

    if str(state).title() == 'Kansas'.title():
        state_code = str(162)
    elif str(state).title() == 'Utah'.title():
        state_code = str(239)
    elif str(state).title() == 'Minnesota'.title():
        state_code = str(177)
    elif str(state).title() == 'Corolado'.title():
        state_code = str(125)
    elif str(state).title() == 'Oklahoma'.title():
        state_code = str(201)

    query = '''select /*+ parallel(6) */ a15.STORE_CD  RTL_STORE_CD, 'NonLowPoint' as "BEERTYPE"
                    from EDW.BI_CRN_F_RETAIL_DEPL_MVW_VW a11
                    join EDW.BI_CRN_D_ITEM_VW a12
                       on  (a11.ITEM_ID = a12.ITEM_ID)
                     join EDW.BI_CRN_D_REL_TIME_DETAIL_VW a13
                       on  (a11.DATE_ID = a13.REL_DATE_ID)
                     join EDW.BI_CRN_D_DISTRIBUTOR_VW a14
                       on  (a11.DIST_ID = a14.DIST_ID)
                     join EDW.BI_CRN_D_RETAILER_VW a15
                       on  (a11.RETAILER_ID = a15.RETAILER_ID)
                    where (a13.REL_TIME_CD in ('L3_TY')
                     and a14.STATE_PROVINCE_COUNTRY_CD in (''' + "'" + state_code + "')" + ''' and a12.MASTER_SKU_CD not in ('80013450', '80013452', '80013455', '80013979', '80013456', '80013475', '80061325', '80013477', '80013982', '80013980', '80059842', '80058839', '80014020', '80014022', '80014023', '80014024', '80014025', '80013992', '80013991', '80013464', '80013466', '80013469', '80013471', '80013472', '80013473', '80013478', '80031998', '80013485', '80056926', '80058838', '80019746', '80031994', '80015966', '80014017', '80012704', '80014015', '80014009', '80014011', '80024732', '80013994', '80056172', '80014010'))
                    group by a15.STORE_CD
                    UNION 
                    select /*+ parallel(6) */ a15.STORE_CD  RTL_STORE_CD,
                     'LowPoint' as "BEERTYPE"
                    from EDW.BI_CRN_F_RETAIL_DEPL_MVW_VW a11
                     join EDW.BI_CRN_D_ITEM_VW a12
                       on  (a11.ITEM_ID = a12.ITEM_ID)
                     join EDW.BI_CRN_D_REL_TIME_DETAIL_VW a13
                       on  (a11.DATE_ID = a13.REL_DATE_ID)
                     join EDW.BI_CRN_D_DISTRIBUTOR_VW a14
                       on  (a11.DIST_ID = a14.DIST_ID)
                     join EDW.BI_CRN_D_RETAILER_VW a15
                       on  (a11.RETAILER_ID = a15.RETAILER_ID)
                    where (a13.REL_TIME_CD in ('L3_TY')
                     and a14.STATE_PROVINCE_COUNTRY_CD in (''' +"'" + state_code + "')" + ''' and a12.MASTER_SKU_CD not in ('80029020', '80060325', '80032234', '80032233', '80023100', '80031072', '80013435', '80013437', '80013439', '80057078', '80014016', '80013442', '80026873', '80056799', '80013444', '80014014', '80013447', '80013460', '80013457', '80013458', '80059543', '80013461', '80059542', '80059571', '80059572', '80059573', '80013982', '80013980', '80056922', '80014002', '80013968', '80029050', '80013515', '80013516', '80013517', '80013518', '80018933', '80056908', '80013520', '80014006', '80013993', '80013522', '80014026', '80013984', '80013978', '80013970', '80014012', '80014060', '80013986', '80013977', '80060330', '80060331', '80060328', '999', '80014008', '80013995', '80013972', '80056887', '80014001', '80031989', '80013985', '80013971', '80013983', '80014007', '80029021', '80015214', '80015215', '80014005', '80014004', '80013976', '80013990', '80013998', '80013989', '80013975', '80013997', '80027630', '80027631', '80027632', '80062390', '80062550', '80013996', '80014013', '80013988', '80013987', '80013974', '80013973', '80013999', '80014000', 'UNK', '80013981'))
                    group by a15.STORE_CD '''

    try:
        cursor.execute(query)
        names = [ x[0] for x in cursor.description]
        rows = cursor.fetchall()
        StoreType = pd.DataFrame(rows, columns=names)

    finally:
        if cursor is not None:
            cursor.close()
            
    return StoreType

In [55]:
StoreType = read_query_and_predict_32(clsDF, 'Kansas')
StoreType.tail()

Unnamed: 0,RTL_STORE_CD,BEERTYPE
3536,302174106,LowPoint
3537,302216623,NonLowPoint
3538,302255841,LowPoint
3539,303771642,NonLowPoint
3540,VIP-24253737,LowPoint


In [56]:
result_df = pd.merge(clsDF, StoreType, on="RTL_STORE_CD", how="left")  
result_df.tail()

Unnamed: 0,RTL_STORE_CD,RTL_FIPS_COUNTY_DSC,RTL_PREMISE_TYPE_CD,RTL_CHANNEL_DSC,RTL_SUBCHANNEL_DSC,RTL_BEER_FLAG,RTL_LIQUOR_FLG,RTL_STATE_DSC,LowPoint_Y,BEERTYPE
3538,101403713,JOHNSON,OFF,LIQUOR,LIQUOR SUPER STORE,Y,Y,KANSAS,False,NonLowPoint
3539,101403713,JOHNSON,OFF,LIQUOR,LIQUOR SUPER STORE,Y,Y,KANSAS,False,LowPoint
3540,201404077,WYANDOTTE,OFF,LIQUOR,CONVENTIONAL LIQUOR,Y,Y,KANSAS,False,NonLowPoint
3541,201404077,WYANDOTTE,OFF,LIQUOR,CONVENTIONAL LIQUOR,Y,Y,KANSAS,False,LowPoint
3542,102631767,SEDGWICK,OFF,EXTENDED MASTER OFF-PREMISE,OTHER OFF-PREMISE,N,N,KANSAS,False,LowPoint


In [57]:
columns = [c for c in clsDF.columns if c not in no_binary_list]
columns

['RTL_FIPS_COUNTY_DSC',
 'RTL_PREMISE_TYPE_CD',
 'RTL_CHANNEL_DSC',
 'RTL_SUBCHANNEL_DSC',
 'RTL_BEER_FLAG',
 'RTL_LIQUOR_FLG']

In [58]:
result_df = pd.concat([result_df, pd.get_dummies(result_df[columns], prefix = ['B_'+c for c in columns])] , axis = 1)
result_df

Unnamed: 0,RTL_STORE_CD,RTL_FIPS_COUNTY_DSC,RTL_PREMISE_TYPE_CD,RTL_CHANNEL_DSC,RTL_SUBCHANNEL_DSC,RTL_BEER_FLAG,RTL_LIQUOR_FLG,RTL_STATE_DSC,LowPoint_Y,BEERTYPE,...,B_RTL_SUBCHANNEL_DSC_SUPERMARKET-NATURAL/GOURMET FOODS,B_RTL_SUBCHANNEL_DSC_THEATER,B_RTL_SUBCHANNEL_DSC_UNKNOWN,B_RTL_SUBCHANNEL_DSC_WINE SPECIALTY STORE,B_RTL_BEER_FLAG_N,B_RTL_BEER_FLAG_U,B_RTL_BEER_FLAG_Y,B_RTL_LIQUOR_FLG_N,B_RTL_LIQUOR_FLG_U,B_RTL_LIQUOR_FLG_Y
0,101403888,SEDGWICK,OFF,LIQUOR,CONVENTIONAL LIQUOR,Y,Y,KANSAS,False,LowPoint,...,0,0,0,0,0,0,1,0,0,1
1,101403888,SEDGWICK,OFF,LIQUOR,CONVENTIONAL LIQUOR,Y,Y,KANSAS,False,NonLowPoint,...,0,0,0,0,0,0,1,0,0,1
2,100091604,SALINE,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N,KANSAS,False,LowPoint,...,0,0,0,0,0,0,1,1,0,0
3,200697901,SEDGWICK,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N,KANSAS,False,LowPoint,...,0,0,0,0,0,0,1,1,0,0
4,100091143,HARPER,OFF,GROCERY,SUPERMARKET-CONVENTIONAL,Y,N,KANSAS,False,LowPoint,...,0,0,0,0,0,0,1,1,0,0
5,107381463,DOUGLAS,OFF,GROCERY,SUPERMARKET-NATURAL/GOURMET FOODS,Y,N,KANSAS,False,LowPoint,...,1,0,0,0,0,0,1,1,0,0
6,201953385,NEOSHO,ON,DINING,CASUAL DINING,Y,Y,KANSAS,False,LowPoint,...,0,0,0,0,0,0,1,0,0,1
7,100091590,LABETTE,OFF,GROCERY,SUPERMARKET-CONVENTIONAL,Y,N,KANSAS,False,LowPoint,...,0,0,0,0,0,0,1,1,0,0
8,201403873,PHILLIPS,OFF,LIQUOR,CONVENTIONAL LIQUOR,Y,Y,KANSAS,False,LowPoint,...,0,0,0,0,0,0,1,0,0,1
9,201403873,PHILLIPS,OFF,LIQUOR,CONVENTIONAL LIQUOR,Y,Y,KANSAS,False,NonLowPoint,...,0,0,0,0,0,0,1,0,0,1


In [59]:
#subset the training dataset
train_df = result_df.loc[result_df['BEERTYPE'] != '']  
train_df.tail()

Unnamed: 0,RTL_STORE_CD,RTL_FIPS_COUNTY_DSC,RTL_PREMISE_TYPE_CD,RTL_CHANNEL_DSC,RTL_SUBCHANNEL_DSC,RTL_BEER_FLAG,RTL_LIQUOR_FLG,RTL_STATE_DSC,LowPoint_Y,BEERTYPE,...,B_RTL_SUBCHANNEL_DSC_SUPERMARKET-NATURAL/GOURMET FOODS,B_RTL_SUBCHANNEL_DSC_THEATER,B_RTL_SUBCHANNEL_DSC_UNKNOWN,B_RTL_SUBCHANNEL_DSC_WINE SPECIALTY STORE,B_RTL_BEER_FLAG_N,B_RTL_BEER_FLAG_U,B_RTL_BEER_FLAG_Y,B_RTL_LIQUOR_FLG_N,B_RTL_LIQUOR_FLG_U,B_RTL_LIQUOR_FLG_Y
3538,101403713,JOHNSON,OFF,LIQUOR,LIQUOR SUPER STORE,Y,Y,KANSAS,False,NonLowPoint,...,0,0,0,0,0,0,1,0,0,1
3539,101403713,JOHNSON,OFF,LIQUOR,LIQUOR SUPER STORE,Y,Y,KANSAS,False,LowPoint,...,0,0,0,0,0,0,1,0,0,1
3540,201404077,WYANDOTTE,OFF,LIQUOR,CONVENTIONAL LIQUOR,Y,Y,KANSAS,False,NonLowPoint,...,0,0,0,0,0,0,1,0,0,1
3541,201404077,WYANDOTTE,OFF,LIQUOR,CONVENTIONAL LIQUOR,Y,Y,KANSAS,False,LowPoint,...,0,0,0,0,0,0,1,0,0,1
3542,102631767,SEDGWICK,OFF,EXTENDED MASTER OFF-PREMISE,OTHER OFF-PREMISE,N,N,KANSAS,False,LowPoint,...,0,0,0,0,1,0,0,1,0,0


In [60]:
#fit the model using traning data
#independent variables:
X_labels = [c for c in train_df.columns if c not in no_train_list]
X_train = train_df.loc[:,X_labels]

In [61]:
#dependent variable:
Y_train = train_df['BEERTYPE'] 
#train the model, and find the best parameter
parameters = {'max_depth':range(1,21), 'min_samples_leaf':range(3,21,3), 'min_samples_split':range(3,21,3), 'random_state': [0]}
clf = tree.DecisionTreeClassifier()
clf = GridSearchCV(clf, parameters, n_jobs = -2)

In [62]:
clf.fit(X_train, Y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=-2,
       param_grid={'min_samples_split': [3, 6, 9, 12, 15, 18], 'random_state': [0], 'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 'min_samples_leaf': [3, 6, 9, 12, 15, 18]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [63]:
accuracy = clf.best_score_ 
best_params = clf.best_params_
best_depth = best_params['max_depth']
best_leaf = best_params['min_samples_leaf']
best_split = best_params['min_samples_split']



In [64]:
parameters = {'max_depth':range(1,21), 'min_samples_leaf':range(3,21,3), 'min_samples_split':range(3,21,3), 'random_state': [0]}
clf = tree.DecisionTreeClassifier()
clf = GridSearchCV(clf, parameters, n_jobs = -2)
clf.fit(X_train, Y_train)
accuracy = clf.best_score_ 
best_params = clf.best_params_
best_depth = best_params['max_depth']
best_leaf = best_params['min_samples_leaf']
best_split = best_params['min_samples_split']


In [72]:
#using the best parameter to train and fit the model
clf = tree.DecisionTreeClassifier(max_depth=best_depth, min_samples_leaf= best_leaf, min_samples_split = best_split, random_state=0)
clf = clf.fit(X_train,Y_train)

#Prepare for prediction dataset
X_pred = result_df.loc[:,X_labels]

#make prediction for whole dataset.
result_df['LowPoint_Y'] = clf.predict(X_pred) == 'LowPoint'        

#delete the column 'BEERTYPE'
#del result_df['BEERTYPE']    

     

In [None]:
#until here is all correct, need to rerun

In [None]:
#need to combine the function read_query_and_predict_32

# successful code:

In [8]:
#define a function to get the subset of dataframe
def classification_subset_32(dataframe, state):


    #load the columns for subset from pickle:
    subset_columns = pickle.load(open('classification_columns.p', 'rb'))
    
    #subset the dataframe:
    classification_df = pd.DataFrame(dataframe[subset_columns])

    classification_df = classification_df[classification_df['RTL_STATE_DSC'] == str(state).upper()]
    return classification_df

In [9]:
#define a function for business_rule_pickle
def business_rule_pickle_32(classification_df, state):  

    #load the business rule from pickle for that State
    filename = str(state).title()+'_business_rule.p'
    rule_func = pickle.load(open(filename, 'rb'))

    #make the prediction
    result_df = classification_df
    result_df['LowPoint_Y'] = rule_func(result_df)
    
    return result_df

In [10]:
#business rule pickle Utah
def Rules_Utah(dataframe):
    result_list = []
    for row in range(len(dataframe)):
        if dataframe.loc[row,'RTL_CHANNEL_DSC'] == 'LIQUOR' or 'MILITARY' in dataframe.loc[row,'RTL_CHANNEL_DSC']:
            result_list.append(False)
        else:
            result_list.append(True)  
    return result_list

#save the Utah business rule into pickle
pickle.dump(Rules_Utah, open('Utah_business_rule.p', 'wb'))

In [11]:
#business rule pickle Oklahoma
def Rules_Oklahoma(dataframe):
    result_list = []
    for row in range(len(dataframe)):
        if dataframe.loc[row,'RTL_CHANNEL_DSC'] == 'DISTRIBUTOR/SUB-DISTRIBUTOR' or dataframe.loc[row,'RTL_CHANNEL_DSC'] == 'EXTENDED MASTER OFF-PREMISE':
            result_list.append(False)
        else:
            result_list.append(True)
    return result_list

#save the Oklahoma business rule into pickle
pickle.dump(Rules_Oklahoma, open('Oklahoma_business_rule.p', 'wb'))

In [12]:
#business rule pickle Colorado
def Rules_Colorado(dataframe):
    result_list = []
    for row in range(len(dataframe)):
        if dataframe.loc[row,'RTL_PREMISE_TYPE_CD'] == 'OFF':
            if dataframe.loc[row,'RTL_CHANNEL_DSC'] == 'LIQUOR' or dataframe.loc[row,'RTL_CHANNEL_DSC'] == 'DRUG' or dataframe.loc[row,'RTL_CHANNEL_DSC'] == 'MILITARY OFF-PREMISE':
                result_list.append(False)
            elif dataframe.loc[row,'RTL_CHANNEL_DSC'] == 'GROCERY':
                if dataframe.loc[row,'RTL_LIQUOR_FLG'] == 'Y':
                    result_list.append(False)
                else:
                    result_list.append(True)
            else:
                result_list.append(True)
        else:
             result_list.append(False)
    return result_list

#save the Colorado business rule into pickle
pickle.dump(Rules_Colorado, open('Colorado_business_rule.p', 'wb'))

In [13]:
def decision_tree_pickle_32(classification_df, state):

    #call the binarize function to transform the columns into binary
    columns = [c for c in classification_df.columns if c not in ['RTL_STORE_CD','LowPoint_Y', 'RTL_STATE_DSC']]
    result_df = pd.concat([classification_df, pd.get_dummies(classification_df[columns] , prefix = ['B_'+c for c in columns])] , axis = 1)
        
    #load the model from pickle for that state
    filename = str(state).title()+'_decision_tree_model.p'
    decision_tree_model = pickle.load(open(filename, 'rb'))

    #prepare independent variables:
    X_labels = [c for c in result_df.columns if c not in no_train_list]
    X = result_df.loc[:,X_labels]

    #dependent variable:
    result_df['LowPoint_Y'] = decision_tree_model.predict(X) == 'LowPoint'
    
    return result_df

In [37]:
#Kansas Decision tree Pickle
# Load the data
data_dir = 'Data/'
Stores = pd.read_csv(data_dir + 'AllStoresKansas.csv', dtype = str)
StoreType = pd.read_csv(data_dir + 'NonLowPointStoresKansas.csv', dtype = str)
FullData = pd.merge(Stores, StoreType, on="RTL_STORE_CD", how="left")
FullData.loc[FullData['BEERTYPE'].isnull(),"BEERTYPE"] = "LowPoint"
FullData = pd.concat([FullData, pd.get_dummies(FullData['RTL_FIPS_COUNTY_DSC'],prefix='COUNTY')], axis=1)
FullData = pd.concat([FullData, pd.get_dummies(FullData['RTL_PREMISE_TYPE_CD'],prefix='PREMISE')], axis=1)
FullData = pd.concat([FullData, pd.get_dummies(FullData['RTL_CHANNEL_DSC'],prefix='CHANNEL')], axis=1)
FullData = pd.concat([FullData, pd.get_dummies(FullData['RTL_SUBCHANNEL_DSC'],prefix='SUBCHANNEL')], axis=1)
FullData = pd.concat([FullData, pd.get_dummies(FullData['RTL_BEER_FLAG'],prefix='BEER_LICENSE')], axis=1)
FullData = pd.concat([FullData, pd.get_dummies(FullData['RTL_LIQUOR_FLG'],prefix='LIQUOR_LICENSE')], axis=1)

# prepare data to fit model
X_labels = [c for c in FullData.columns if c not in ['RTL_STORE_CD','BEERTYPE','RTL_FIPS_COUNTY_DSC','RTL_PREMISE_TYPE_CD','RTL_CHANNEL_DSC','RTL_SUBCHANNEL_DSC','RTL_BEER_FLAG',"RTL_LIQUOR_FLG"]]
X = FullData.loc[:,X_labels]
Y = FullData['BEERTYPE']

parameters = {'max_depth':range(1,21), 'min_samples_leaf':range(3,21,3), 'min_samples_split':range(3,21,3), 'random_state': [0]}
clf = tree.DecisionTreeClassifier()
clf = GridSearchCV(clf, parameters, n_jobs = -2)
clf.fit(X, Y)
accuracy = clf.best_score_ 
best_params = clf.best_params_
#the result: with the best tree depth and accuracy
best_depth = best_params['max_depth']
best_leaf = best_params['min_samples_leaf']
best_split = best_params['min_samples_split']

clf = tree.DecisionTreeClassifier(max_depth=best_depth, min_samples_leaf= best_leaf, min_samples_split = best_split, random_state=0)
clf = clf.fit(X,Y)
# save the model to disk
filename = 'Kansas_decision_tree_model.p'
pickle.dump(clf, open(filename, 'wb'), protocol=2)


In [56]:
# Minnesota Decision tree Pickle
data_dir = 'Data/'
Stores = pd.read_csv(data_dir + 'AllStoresMinnesota.csv', dtype = str)
StoreType = pd.read_csv(data_dir + 'NonLowPointStoresMinnesota.csv', dtype = str)

FullData = pd.merge(Stores, StoreType, on="RTL_STORE_CD", how="left")
FullData.loc[FullData['BEERTYPE'].isnull(),"BEERTYPE"] = "LowPoint"
FullData = pd.concat([FullData, pd.get_dummies(FullData['RTL_FIPS_COUNTY_DSC'],prefix='COUNTY')], axis=1)
FullData = pd.concat([FullData, pd.get_dummies(FullData['RTL_PREMISE_TYPE_CD'],prefix='PREMISE')], axis=1)
FullData = pd.concat([FullData, pd.get_dummies(FullData['RTL_CHANNEL_DSC'],prefix='CHANNEL')], axis=1)
FullData = pd.concat([FullData, pd.get_dummies(FullData['RTL_SUBCHANNEL_DSC'],prefix='SUBCHANNEL')], axis=1)
FullData = pd.concat([FullData, pd.get_dummies(FullData['RTL_BEER_FLAG'],prefix='BEER_LICENSE')], axis=1)
FullData = pd.concat([FullData, pd.get_dummies(FullData['RTL_LIQUOR_FLG'],prefix='LIQUOR_LICENSE')], axis=1)
# prepare data to fit model
X_labels = [c for c in FullData.columns if c not in ['RTL_STORE_CD','BEERTYPE','RTL_FIPS_COUNTY_DSC','RTL_PREMISE_TYPE_CD','RTL_CHANNEL_DSC','RTL_SUBCHANNEL_DSC','RTL_BEER_FLAG',"RTL_LIQUOR_FLG"]]
X = FullData.loc[:,X_labels]
Y = FullData['BEERTYPE']
parameters = {'max_depth':range(1,21), 'min_samples_leaf':range(3,21,3), 'min_samples_split':range(3,21,3), 'random_state': [0]}
clf = tree.DecisionTreeClassifier()
clf = GridSearchCV(clf, parameters, n_jobs = -2)
clf.fit(X, Y)
accuracy = clf.best_score_ 
best_params = clf.best_params_
best_params = clf.best_params_
best_depth = best_params['max_depth']
best_leaf = best_params['min_samples_leaf']
best_split = best_params['min_samples_split']

clf = tree.DecisionTreeClassifier(max_depth=best_depth, min_samples_leaf= best_leaf, min_samples_split = best_split, random_state=0)
clf = clf.fit(X,Y)
# save the model to Pickle
pickle.dump(clf, open('Minnesota_decision_tree_model.p', 'wb'), protocol=2)