In [1]:
import cx_Oracle
from IPython.display import IFrame    #to display pdf file
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import pickle
from py2casefold import casefold
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [2]:
# Columns that are never used as model features: the store id, the state code,
# the raw categorical columns (only their one-hot versions are used), and the
# label / prediction columns.
no_train_list = [
    'RTL_STORE_CD',
    'RTL_FIPS_COUNTY_DSC',
    'RTL_PREMISE_TYPE_CD',
    'RTL_CHANNEL_DSC',
    'RTL_SUBCHANNEL_DSC',
    'RTL_BEER_FLAG',
    'RTL_LIQUOR_FLG',
    'RTL_STATE_CD',
    'LowPoint_Y',
    'BEERTYPE',
]

# Mini-Functions:

In [3]:
#define a function to get the subset of dataframe
def classification_subset_32(dataframe, state):
    '''
    Subset the dataframe to the classification columns of a single state.

    The list of columns is read from 'classification_columns.p' in the
    current working directory.

    Parameter:
    ---------
        dataframe: takes a dataframe. Must contain every pickled column,
            including 'RTL_STATE_CD'.
        state: takes a string. Upper-cased and compared with 'RTL_STATE_CD'.

    Return:
    -------
        a new dataframe (independent copy, original index labels kept) with
        only the pickled columns and only the rows of that state.
    '''
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # The context manager closes the file handle (the original leaked it).
    with open('classification_columns.p', 'rb') as columns_file:
        subset_columns = pickle.load(columns_file)

    #subset the dataframe:
    classification_df = pd.DataFrame(dataframe[subset_columns])
    in_state = classification_df['RTL_STATE_CD'] == str(state).upper()

    # Return an explicit copy so that columns added later (e.g. 'LowPoint_Y')
    # are written to this frame without a SettingWithCopyWarning.
    return classification_df[in_state].copy()

In [4]:
#define a function for business_rule_pickle
def business_rule_pickle_32(classification_df, state):
    '''
    Classify stores with the pickled business rule of a state.

    Parameter:
    ---------
        classification_df: takes a dataframe. It is passed unchanged to the
            rule function.
        state: takes a string. Upper-cased and used to build the file name
            '<STATE>_business_rule.p'.

    Return:
    -------
        a copy of classification_df with the rule's output stored in the
        'LowPoint_Y' column. The input dataframe is not modified.
    '''
    #load the business rule from pickle for that State
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    filename = str(state).upper()+'_business_rule.p'
    with open(filename, 'rb') as rule_file:
        rule_func = pickle.load(rule_file)

    #make the prediction on a copy, so the caller's dataframe is left untouched
    result_df = classification_df.copy()
    result_df['LowPoint_Y'] = rule_func(result_df)

    return result_df

In [5]:
#business rule pickle Utah
def Rules_Utah(dataframe):
    '''
    Business rule for Utah.

    A store is flagged False when its 'RTL_CHANNEL_DSC' is exactly 'LIQUOR'
    or contains the text 'MILITARY'; every other store is flagged True.
    A missing channel is treated as "no match" (True).

    Parameter:
    ---------
        dataframe: takes a dataframe with an 'RTL_CHANNEL_DSC' column.

    Return:
    -------
        a list of booleans, one per row, in row order.
    '''
    # Iterate over the values by position. The previous `.loc[row, ...]` with
    # row in range(len(dataframe)) treated positions as index labels, which
    # fails on a state-filtered frame whose index is not 0..n-1.
    # fillna('') keeps `in` from raising TypeError on missing values.
    channels = dataframe['RTL_CHANNEL_DSC'].fillna('').tolist()
    return [not (channel == 'LIQUOR' or 'MILITARY' in channel) for channel in channels]

#save the Utah business rule into pickle
# NOTE: pickle stores a function by reference (module and name), not its code,
# so Rules_Utah must also be defined wherever this file is loaded.
# The context manager flushes and closes the file (the original never closed it).
with open('UT_business_rule.p', 'wb') as rule_file:
    pickle.dump(Rules_Utah, rule_file)

In [6]:
#business rule pickle Oklahoma
def Rules_Oklahoma(dataframe):
    '''
    Business rule for Oklahoma.

    A store is flagged False when its 'RTL_CHANNEL_DSC' is
    'DISTRIBUTOR/SUB-DISTRIBUTOR' or 'EXTENDED MASTER OFF-PREMISE';
    every other store is flagged True.

    Parameter:
    ---------
        dataframe: takes a dataframe with an 'RTL_CHANNEL_DSC' column.

    Return:
    -------
        a list of booleans, one per row, in row order.
    '''
    # Column-wise test instead of `.loc[row, ...]` with row in range(len(...)):
    # that treated positions as index labels and fails on a state-filtered
    # frame whose index is not 0..n-1.
    excluded_channel = dataframe['RTL_CHANNEL_DSC'].isin(
        ['DISTRIBUTOR/SUB-DISTRIBUTOR', 'EXTENDED MASTER OFF-PREMISE'])
    return (~excluded_channel).tolist()

#save the Oklahoma business rule into pickle
# NOTE: pickle stores a function by reference (module and name), not its code,
# so Rules_Oklahoma must also be defined wherever this file is loaded.
# The context manager flushes and closes the file (the original never closed it).
with open('OK_business_rule.p', 'wb') as rule_file:
    pickle.dump(Rules_Oklahoma, rule_file)

In [7]:
#business rule pickle Colorado
def Rules_Colorado(dataframe):
    '''
    Business rule for Colorado.

    A store is flagged True only when all of the following hold:
        - 'RTL_PREMISE_TYPE_CD' is 'OFF';
        - 'RTL_CHANNEL_DSC' is not 'LIQUOR', 'DRUG' or 'MILITARY OFF-PREMISE';
        - it is not a 'GROCERY' store whose 'RTL_LIQUOR_FLG' is 'Y'.
    Every other store is flagged False.

    Parameter:
    ---------
        dataframe: takes a dataframe with the three columns named above.

    Return:
    -------
        a list of booleans, one per row, in row order.
    '''
    # Column-wise tests instead of `.loc[row, ...]` with row in range(len(...)):
    # that treated positions as index labels and fails on a state-filtered
    # frame whose index is not 0..n-1.
    channel = dataframe['RTL_CHANNEL_DSC']
    off_premise = dataframe['RTL_PREMISE_TYPE_CD'] == 'OFF'
    excluded_channel = channel.isin(['LIQUOR', 'DRUG', 'MILITARY OFF-PREMISE'])
    grocery_with_liquor = (channel == 'GROCERY') & (dataframe['RTL_LIQUOR_FLG'] == 'Y')

    flagged = off_premise & ~excluded_channel & ~grocery_with_liquor
    return flagged.tolist()

#save the Colorado business rule into pickle
# NOTE: pickle stores a function by reference (module and name), not its code,
# so Rules_Colorado must also be defined wherever this file is loaded.
# The context manager flushes and closes the file (the original never closed it).
with open('CO_business_rule.p', 'wb') as rule_file:
    pickle.dump(Rules_Colorado, rule_file)

In [8]:
def decision_tree_pickle_32(classification_df, state):
    '''
    Classify stores with the pickled decision tree model of a state.

    Parameter:
    ---------
        classification_df: takes a dataframe. Must contain the six categorical
            columns listed in binarize_columns below.
        state: takes a string. Upper-cased and used to build the file name
            '<STATE>_decision_tree_model.p'.

    Return:
    -------
        a new dataframe holding the input columns, the one-hot encoded
        'B_<column>_<value>' columns and a boolean 'LowPoint_Y' column that is
        True where the model predicts 'LowPoint'.
    '''
    #one-hot encode the categorical columns
    binarize_columns = ['RTL_FIPS_COUNTY_DSC', 'RTL_PREMISE_TYPE_CD', 'RTL_CHANNEL_DSC', 'RTL_SUBCHANNEL_DSC', 'RTL_BEER_FLAG', 'RTL_LIQUOR_FLG']
    dummies = pd.get_dummies(classification_df[binarize_columns], prefix = ['B_'+c for c in binarize_columns])
    result_df = pd.concat([classification_df, dummies], axis = 1)

    #load the model from pickle for that state
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # The context manager closes the file handle (the original leaked it).
    filename = str(state).upper()+'_decision_tree_model.p'
    with open(filename, 'rb') as model_file:
        decision_tree_model = pickle.load(model_file)

    #prepare independent variables: every column not listed in no_train_list
    # NOTE(review): these columns presumably have to match, in number and
    # order, the features the pickled model was trained on - confirm against
    # the cell that produced the model file.
    X_labels = [c for c in result_df.columns if c not in no_train_list]
    X = result_df.loc[:,X_labels]

    #dependent variable:
    result_df['LowPoint_Y'] = decision_tree_model.predict(X) == 'LowPoint'

    return result_df

In [17]:
#Kansas Decision tree Pickle
# NOTE(review): this cell is a near copy of the Minnesota training cell; a
# shared function taking the state would remove the duplication.

# Load the data
data_dir = 'Data/'
Stores = pd.read_csv(data_dir + 'AllStoresKansas.csv', dtype = str)
StoreType = pd.read_csv(data_dir + 'NonLowPointStoresKansas.csv', dtype = str)

# Stores without a match in StoreType have no BEERTYPE after the left join;
# they are labelled 'LowPoint'.
FullData = pd.merge(Stores, StoreType, on="RTL_STORE_CD", how="left")
FullData.loc[FullData['BEERTYPE'].isnull(),"BEERTYPE"] = "LowPoint"

# One-hot encode the categorical columns with the same 'B_<column>' prefixes
# that decision_tree_pickle_32 / grid_search_32 use at prediction time.
# The previous single 'B' prefix produced duplicate column names whenever two
# columns shared a value (e.g. 'B_Y' from both RTL_BEER_FLAG and
# RTL_LIQUOR_FLG) and feature names that differ from the ones built when the
# pickled model is used.
binarize_columns = ['RTL_FIPS_COUNTY_DSC', 'RTL_PREMISE_TYPE_CD', 'RTL_CHANNEL_DSC', 'RTL_SUBCHANNEL_DSC', 'RTL_BEER_FLAG', 'RTL_LIQUOR_FLG']
FullData = pd.concat([FullData, pd.get_dummies(FullData[binarize_columns], prefix = ['B_'+c for c in binarize_columns])], axis=1)

# prepare data to fit model: drop the store id, the label and the raw categoricals
X_labels = [c for c in FullData.columns if c not in ['RTL_STORE_CD','BEERTYPE','RTL_FIPS_COUNTY_DSC','RTL_PREMISE_TYPE_CD','RTL_CHANNEL_DSC','RTL_SUBCHANNEL_DSC','RTL_BEER_FLAG',"RTL_LIQUOR_FLG"]]
X = FullData.loc[:,X_labels]
Y = FullData['BEERTYPE']

# grid search over the tree size parameters (random_state fixed for reproducibility)
parameters = {'max_depth':range(1,21), 'min_samples_leaf':range(3,21,3), 'min_samples_split':range(3,21,3), 'random_state': [0]}
clf = tree.DecisionTreeClassifier()
clf = GridSearchCV(clf, parameters, n_jobs = -2)
clf.fit(X, Y)
accuracy = clf.best_score_
best_params = clf.best_params_
#the result: with the best tree depth and accuracy
best_depth = best_params['max_depth']
best_leaf = best_params['min_samples_leaf']
best_split = best_params['min_samples_split']

# refit a plain decision tree on all rows with the best parameters
clf = tree.DecisionTreeClassifier(max_depth=best_depth, min_samples_leaf= best_leaf, min_samples_split = best_split, random_state=0)
clf = clf.fit(X,Y)

# save the model to disk; the context manager flushes and closes the file
filename = 'KS_decision_tree_model.p'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file, protocol=2)


In [18]:
# Minnesota Decision tree Pickle
# NOTE(review): this cell is a near copy of the Kansas training cell; a
# shared function taking the state would remove the duplication.

# Load the data
data_dir = 'Data/'
Stores = pd.read_csv(data_dir + 'AllStoresMinnesota.csv', dtype = str)
StoreType = pd.read_csv(data_dir + 'NonLowPointStoresMinnesota.csv', dtype = str)

# Stores without a match in StoreType have no BEERTYPE after the left join;
# they are labelled 'LowPoint'.
FullData = pd.merge(Stores, StoreType, on="RTL_STORE_CD", how="left")
FullData.loc[FullData['BEERTYPE'].isnull(),"BEERTYPE"] = "LowPoint"

# One-hot encode the categorical columns with the same 'B_<column>' prefixes
# that decision_tree_pickle_32 / grid_search_32 use at prediction time.
# The previous single 'B' prefix produced duplicate column names whenever two
# columns shared a value (e.g. 'B_Y' from both RTL_BEER_FLAG and
# RTL_LIQUOR_FLG) and feature names that differ from the ones built when the
# pickled model is used.
binarize_columns = ['RTL_FIPS_COUNTY_DSC', 'RTL_PREMISE_TYPE_CD', 'RTL_CHANNEL_DSC', 'RTL_SUBCHANNEL_DSC', 'RTL_BEER_FLAG', 'RTL_LIQUOR_FLG']
FullData = pd.concat([FullData, pd.get_dummies(FullData[binarize_columns], prefix = ['B_'+c for c in binarize_columns])], axis=1)

# prepare data to fit model: drop the store id, the label and the raw categoricals
X_labels = [c for c in FullData.columns if c not in ['RTL_STORE_CD','BEERTYPE','RTL_FIPS_COUNTY_DSC','RTL_PREMISE_TYPE_CD','RTL_CHANNEL_DSC','RTL_SUBCHANNEL_DSC','RTL_BEER_FLAG',"RTL_LIQUOR_FLG"]]
X = FullData.loc[:,X_labels]
Y = FullData['BEERTYPE']

# grid search over the tree size parameters (random_state fixed for reproducibility)
parameters = {'max_depth':range(1,21), 'min_samples_leaf':range(3,21,3), 'min_samples_split':range(3,21,3), 'random_state': [0]}
clf = tree.DecisionTreeClassifier()
clf = GridSearchCV(clf, parameters, n_jobs = -2)
clf.fit(X, Y)
accuracy = clf.best_score_
best_params = clf.best_params_
best_depth = best_params['max_depth']
best_leaf = best_params['min_samples_leaf']
best_split = best_params['min_samples_split']

# refit a plain decision tree on all rows with the best parameters
clf = tree.DecisionTreeClassifier(max_depth=best_depth, min_samples_leaf= best_leaf, min_samples_split = best_split, random_state=0)
clf = clf.fit(X,Y)

# save the model to Pickle; the context manager flushes and closes the file
with open('MN_decision_tree_model.p', 'wb') as model_file:
    pickle.dump(clf, model_file, protocol=2)

In [11]:
def _fetch_dataframe_32(connection, sql):
    '''Run sql on connection and return every row as a dataframe named after the cursor columns.'''
    cursor = connection.cursor()
    try:
        cursor.execute(sql)
        names = [column[0] for column in cursor.description]
        rows = cursor.fetchall()
        return pd.DataFrame(rows, columns=names)
    finally:
        cursor.close()


def read_query_32(state):
    '''
    Query the Oracle warehouse for the store types of one state.

    Two queries are run on a single connection:
        - 'NonLowPoint': stores with at least one row whose MASTER_SKU_CD is
          not in the fixed SKU list below;
        - 'LowPoint': the remaining stores of that state.

    The database user and password are read from the environment variables
    EDW_DB_USER and EDW_DB_PASSWORD; credentials must not be stored in the
    notebook.

    Parameter:
    ---------
        state: takes a string. One of 'KS', 'UT', 'MN', 'CO', 'OK'
            (case-insensitive).

    Return:
    -------
        a dataframe with the columns 'RTL_STORE_CD' and 'BEERTYPE'
        ('NonLowPoint' rows first, then 'LowPoint' rows).

    Raises:
    -------
        ValueError: the state is not one of the supported codes.
        RuntimeError: the credential environment variables are not set.
    '''
    import os  # local import: only this function needs it

    # Validate the state before touching the database. The original left
    # state_code unassigned for unknown states and failed with an
    # UnboundLocalError after a connection had already been opened.
    state_codes = {'KS': '162', 'UT': '239', 'MN': '177', 'CO': '125', 'OK': '201'}
    state_key = str(state).upper()
    if state_key not in state_codes:
        raise ValueError('unsupported state %r; expected one of %s' % (state, sorted(state_codes)))
    # state_code always comes from the fixed mapping above, so building the
    # SQL text with it does not expose the query to injection.
    state_code = state_codes[state_key]

    user = os.environ.get('EDW_DB_USER')
    password = os.environ.get('EDW_DB_PASSWORD')
    if not user or not password:
        raise RuntimeError('set the EDW_DB_USER and EDW_DB_PASSWORD environment variables before querying')

    # SKU list shared by both queries.
    # NOTE(review): presumably the low point (3.2%) SKUs - confirm with the data owner.
    sku_list = "'80013450', '80013452', '80013455', '80013979', '80013456', '80013475', '80061325', '80013477', '80013982', '80013980', '80059842', '80058839', '80014020', '80014022', '80014023', '80014024', '80014025', '80013992', '80013991', '80013464', '80013466', '80013469', '80013471', '80013472', '80013473', '80013478', '80031998', '80013485', '80056926', '80058838', '80019746', '80031994', '80015966', '80014017', '80012704', '80014015', '80014009', '80014011', '80024732', '80013994', '80056172', '80014010'"

    joins = '''
                    from EDW.BI_CRN_F_RETAIL_DEPL_MVW_VW a11
                     join EDW.BI_CRN_D_ITEM_VW a12
                       on  (a11.ITEM_ID = a12.ITEM_ID)
                     join EDW.BI_CRN_D_REL_TIME_DETAIL_VW a13
                       on  (a11.DATE_ID = a13.REL_DATE_ID)
                     join EDW.BI_CRN_D_DISTRIBUTOR_VW a14
                       on  (a11.DIST_ID = a14.DIST_ID)
                     join EDW.BI_CRN_D_RETAILER_VW a15
                       on  (a11.RETAILER_ID = a15.RETAILER_ID)
                    '''
    period_and_state = "a13.REL_TIME_CD in ('L3_TY') and a14.STATE_PROVINCE_COUNTRY_CD in ('" + state_code + "')"
    other_sku_filter = period_and_state + " and a12.MASTER_SKU_CD not in (" + sku_list + ")"

    # stores of the state with rows outside the SKU list
    non_low_point_stores = ("select /*+ parallel(6) */ a15.STORE_CD  RTL_STORE_CD"
                            + joins
                            + "where (" + other_sku_filter + ") group by a15.STORE_CD")

    query_NonLowPoint = ("select /*+ parallel(6) */ a15.STORE_CD  RTL_STORE_CD, 'NonLowPoint' as \"BEERTYPE\""
                         + joins
                         + "where (" + other_sku_filter + ") group by a15.STORE_CD")

    # every other store of the state
    query_LowPoint = ("select /*+ parallel(6) */ a15.STORE_CD  RTL_STORE_CD, 'LowPoint' as \"BEERTYPE\""
                      + joins
                      + "where (" + period_and_state
                      + " and a15.STORE_CD not in (" + non_low_point_stores + "))"
                      + " group by a15.STORE_CD")

    #Access to Oracle DB
    host = 'tncluster6.cbi.net'
    port = 1521
    s_name = 'EDW1TS_EX.cbi.net'
    dsn_tns = cx_Oracle.makedsn(host, port, service_name = s_name)
    connection = cx_Oracle.connect(user, password, dsn_tns)
    try:
        NonLowPoint_DF = _fetch_dataframe_32(connection, query_NonLowPoint)
        LowPoint_DF = _fetch_dataframe_32(connection, query_LowPoint)
    finally:
        # the original closed only the cursors and leaked both connections
        connection.close()

    #concat two beer types
    StoreType = pd.concat([NonLowPoint_DF, LowPoint_DF])

    return StoreType

In [12]:
def grid_search_32(classification_df, StoreType, state):
    '''
    Train a decision tree on the stores with a known beer type and predict
    the beer type of every store in classification_df.

    Parameter:
    ---------
        classification_df: takes a dataframe. Must contain 'RTL_STORE_CD' and
            the six categorical columns listed in binarize_columns below.
        StoreType: takes a dataframe with the columns 'RTL_STORE_CD' and
            'BEERTYPE' ('LowPoint' / 'NonLowPoint').
        state: takes a string. Not used by the function; kept so existing
            calls keep working.

    Return:
    -------
        a new dataframe: classification_df left-joined with StoreType, plus
        the one-hot encoded columns and a boolean 'LowPoint_Y' column that is
        True where the model predicts 'LowPoint'.
    '''
    #list of columns that don't include in train
    excluded_columns = ['RTL_STORE_CD','RTL_FIPS_COUNTY_DSC', 'RTL_PREMISE_TYPE_CD', 'RTL_CHANNEL_DSC', 'RTL_SUBCHANNEL_DSC', 'RTL_BEER_FLAG', 'RTL_LIQUOR_FLG', 'RTL_STATE_CD', 'LowPoint_Y', 'BEERTYPE']

    #left join the store type data with subsetted dataframe(classification_df)
    result_df = pd.merge(classification_df, StoreType, on="RTL_STORE_CD", how="left")

    #binarize the columns:
    binarize_columns = ['RTL_FIPS_COUNTY_DSC', 'RTL_PREMISE_TYPE_CD', 'RTL_CHANNEL_DSC', 'RTL_SUBCHANNEL_DSC', 'RTL_BEER_FLAG', 'RTL_LIQUOR_FLG']
    dummies = pd.get_dummies(result_df[binarize_columns], prefix = ['B_'+c for c in binarize_columns])
    result_df = pd.concat([result_df, dummies], axis = 1)

    #training rows: labelled stores only, 'LowPoint' rows first
    # (the row order is kept because it decides the cross-validation folds)
    LowPoint = result_df.loc[result_df['BEERTYPE'] == 'LowPoint']
    NonLowPoint = result_df.loc[result_df['BEERTYPE'] == 'NonLowPoint']
    train_df = pd.concat([LowPoint, NonLowPoint])

    #independent variables and dependent variable
    X_labels = [c for c in train_df.columns if c not in excluded_columns]
    X_train = train_df.loc[:,X_labels]
    Y_train = train_df['BEERTYPE']

    #train the model, and find the best parameter
    parameters = {'max_depth':range(1,21), 'min_samples_leaf':range(3,21,3), 'min_samples_split':range(3,21,3), 'random_state': [0]}
    search = GridSearchCV(tree.DecisionTreeClassifier(), parameters, n_jobs = -2)
    search.fit(X_train, Y_train)

    # GridSearchCV refits the best parameter set on the whole training data
    # (refit=True is the default), so best_estimator_ is already the tree the
    # original code rebuilt and fitted a second time by hand.
    best_tree = search.best_estimator_

    #make prediction for whole dataset, labelled or not
    X_pred = result_df.loc[:,X_labels]
    result_df['LowPoint_Y'] = best_tree.predict(X_pred) == 'LowPoint'

    return result_df

# Big function: classify_32

In [13]:
def classify_32(dataframe, state, isPickle = True):
    '''
    This function takes in dataframe, and classify the store type to either it is a LowPoint store or not.
        LowPoint_Y = True: stores that can only sell 3.2% beer.
        LowPoint_Y = False: stores that can sell any beer.

    Side effect: a 'LowPoint_Y' column set to False is added to (or reset on)
    the dataframe that is passed in.

    Parameter:
    ---------
        dataframe: takes a dataframe. Must contain the columns listed in
            'classification_columns.p'.
        state: takes a string. Indicates the state (case-insensitive).
            Pickle method: 'KS', 'MN' (decision tree), 'UT', 'OK', 'CO'
            (business rule). Grid search method: 'KS', 'MN'.
        isPickle: takes a boolean. Indicates classification method. 'True' for pickle, 'False' for grid search

    Return:
    -------
        the dataframe with prediction result under 'LowPoint_Y' column, or
        None (after printing 'wrong state') when the state is not supported
        by the chosen method.

    '''
    import os  # needed by read_query_32 only; imported here so the function is self-contained

    no_train_list = ['RTL_STORE_CD','RTL_FIPS_COUNTY_DSC', 'RTL_PREMISE_TYPE_CD', 'RTL_CHANNEL_DSC', 'RTL_SUBCHANNEL_DSC', 'RTL_BEER_FLAG', 'RTL_LIQUOR_FLG', 'RTL_STATE_CD', 'LowPoint_Y', 'BEERTYPE']
    binarize_columns = ['RTL_FIPS_COUNTY_DSC', 'RTL_PREMISE_TYPE_CD', 'RTL_CHANNEL_DSC', 'RTL_SUBCHANNEL_DSC', 'RTL_BEER_FLAG', 'RTL_LIQUOR_FLG']
    state_key = str(state).upper()

    def one_hot_32(frame):
        '''Return frame plus the one-hot encoded 'B_<column>_<value>' columns of binarize_columns.'''
        dummies = pd.get_dummies(frame[binarize_columns], prefix = ['B_'+c for c in binarize_columns])
        return pd.concat([frame, dummies], axis = 1)

    #define a function to get the subset of dataframe
    def classification_subset_32(dataframe, state):
        '''
        This function subsets the dataframe to prepare for classification.

        parameter:
        ---------
            dataframe: Takes a dataframe containing the pickled columns.
            state: Takes a string. The code of the state to subset dataframe.

        return:
        -------
            The subsetted dataframe (an independent copy).
        '''
        #load the columns for subset from pickle:
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open('classification_columns.p', 'rb') as columns_file:
            subset_columns = pickle.load(columns_file)

        #subset the dataframe:
        classification_df = pd.DataFrame(dataframe[subset_columns])
        in_state = classification_df['RTL_STATE_CD'] == str(state).upper()

        # explicit copy: later column assignments must not hit a view
        return classification_df[in_state].copy()

    #define a function for decision_tree model saved in Pickle
    def decision_tree_pickle_32(classification_df, state):
        '''
        This function takes the code of the state, and using the subsetted dataframe from function 'classification_subset'
            to classify the stores using pickled decision tree model.

        parameter:
        ----------
            classification_df: Takes a dataframe. The subsetted dataframe.
            state: Takes a string. The code of the State

        return:
        -------
            The dataframe with prediction result.
        '''
        result_df = one_hot_32(classification_df)

        #load the model from pickle for that state
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        filename = str(state).upper()+'_decision_tree_model.p'
        with open(filename, 'rb') as model_file:
            decision_tree_model = pickle.load(model_file)

        #prepare independent variables:
        # NOTE(review): these columns presumably have to match the features
        # the pickled model was trained on - confirm against the training cell.
        X_labels = [c for c in result_df.columns if c not in no_train_list]
        X = result_df.loc[:,X_labels]

        #dependent variable:
        result_df['LowPoint_Y'] = decision_tree_model.predict(X) == 'LowPoint'

        return result_df

    #define a function for business_rule_pickle
    def business_rule_pickle_32(classification_df, state):
        '''
        This function takes the code of the state, and using the subsetted dataframe from function 'classification_subset'
            to classify the stores using pickled business rule. This method only works for UTAH, OKLAHOMA, and Colorado.

        parameter:
        ----------
            state: Takes a string. The code of the State
            classification_df: Takes a dataframe. The subsetted dataframe after executing the function "classification_subset"

        return:
        -------
            The dataframe with prediction result.
        '''
        #load the business rule from pickle for that State
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        filename = str(state).upper()+'_business_rule.p'
        with open(filename, 'rb') as rule_file:
            rule_func = pickle.load(rule_file)

        #make the prediction on a copy so the input frame is not modified
        result_df = classification_df.copy()
        result_df['LowPoint_Y'] = rule_func(result_df)

        return result_df

    def read_query_32(state):
        '''
        This function queries from Oracle SQL for specific state and store type and save the data in a dataframe.
        The database user and password are read from the environment
        variables EDW_DB_USER and EDW_DB_PASSWORD.

        parameter:
        ---------
            state: Takes a string. One of 'KS', 'UT', 'MN', 'CO', 'OK'.

        return:
        -------
            a dataframe with store number in that state and the type of the beers that the stores sell.

        raises:
        -------
            ValueError for an unsupported state, RuntimeError when the
            credential environment variables are not set.
        '''
        # validate before opening any connection
        state_codes = {'KS': '162', 'UT': '239', 'MN': '177', 'CO': '125', 'OK': '201'}
        query_state = str(state).upper()
        if query_state not in state_codes:
            raise ValueError('unsupported state %r; expected one of %s' % (state, sorted(state_codes)))
        # state_code always comes from the fixed mapping above, so building
        # the SQL text with it does not expose the query to injection.
        state_code = state_codes[query_state]

        user = os.environ.get('EDW_DB_USER')
        password = os.environ.get('EDW_DB_PASSWORD')
        if not user or not password:
            raise RuntimeError('set the EDW_DB_USER and EDW_DB_PASSWORD environment variables before querying')

        # SKU list shared by both queries.
        # NOTE(review): presumably the low point (3.2%) SKUs - confirm with the data owner.
        sku_list = "'80013450', '80013452', '80013455', '80013979', '80013456', '80013475', '80061325', '80013477', '80013982', '80013980', '80059842', '80058839', '80014020', '80014022', '80014023', '80014024', '80014025', '80013992', '80013991', '80013464', '80013466', '80013469', '80013471', '80013472', '80013473', '80013478', '80031998', '80013485', '80056926', '80058838', '80019746', '80031994', '80015966', '80014017', '80012704', '80014015', '80014009', '80014011', '80024732', '80013994', '80056172', '80014010'"

        joins = '''
                        from EDW.BI_CRN_F_RETAIL_DEPL_MVW_VW a11
                         join EDW.BI_CRN_D_ITEM_VW a12
                           on  (a11.ITEM_ID = a12.ITEM_ID)
                         join EDW.BI_CRN_D_REL_TIME_DETAIL_VW a13
                           on  (a11.DATE_ID = a13.REL_DATE_ID)
                         join EDW.BI_CRN_D_DISTRIBUTOR_VW a14
                           on  (a11.DIST_ID = a14.DIST_ID)
                         join EDW.BI_CRN_D_RETAILER_VW a15
                           on  (a11.RETAILER_ID = a15.RETAILER_ID)
                        '''
        period_and_state = "a13.REL_TIME_CD in ('L3_TY') and a14.STATE_PROVINCE_COUNTRY_CD in ('" + state_code + "')"
        other_sku_filter = period_and_state + " and a12.MASTER_SKU_CD not in (" + sku_list + ")"

        # stores of the state with rows outside the SKU list
        non_low_point_stores = ("select /*+ parallel(6) */ a15.STORE_CD  RTL_STORE_CD"
                                + joins
                                + "where (" + other_sku_filter + ") group by a15.STORE_CD")

        query_NonLowPoint = ("select /*+ parallel(6) */ a15.STORE_CD  RTL_STORE_CD, 'NonLowPoint' as \"BEERTYPE\""
                             + joins
                             + "where (" + other_sku_filter + ") group by a15.STORE_CD")

        # every other store of the state
        query_LowPoint = ("select /*+ parallel(6) */ a15.STORE_CD  RTL_STORE_CD, 'LowPoint' as \"BEERTYPE\""
                          + joins
                          + "where (" + period_and_state
                          + " and a15.STORE_CD not in (" + non_low_point_stores + "))"
                          + " group by a15.STORE_CD")

        def fetch_dataframe(connection, sql):
            '''Run sql and return every row as a dataframe named after the cursor columns.'''
            cursor = connection.cursor()
            try:
                cursor.execute(sql)
                names = [column[0] for column in cursor.description]
                return pd.DataFrame(cursor.fetchall(), columns=names)
            finally:
                cursor.close()

        #Access to Oracle DB
        host = 'tncluster6.cbi.net'
        port = 1521
        s_name = 'EDW1TS_EX.cbi.net'
        dsn_tns = cx_Oracle.makedsn(host, port, service_name = s_name)
        connection = cx_Oracle.connect(user, password, dsn_tns)
        try:
            NonLowPoint_DF = fetch_dataframe(connection, query_NonLowPoint)
            LowPoint_DF = fetch_dataframe(connection, query_LowPoint)
        finally:
            # the original closed only the cursors and leaked both connections
            connection.close()

        #concat two beer types
        return pd.concat([NonLowPoint_DF, LowPoint_DF])

    def grid_search_32(classification_df, StoreType, state):
        '''
        This function train the data uses decision tree model, and predict the types of beer the stores sell for whole dataset.

        parameter:
        ---------
            classification_df: Takes a dataframe. The subsetted dataframe.
            StoreType: Takes a dataframe. The dataframe you get after running function: read_query_32.
            state: Takes a string. The state code (not used by the function).

        return:
        -------
            a dataframe with prediction result.
        '''
        #left join the store type data with subsetted dataframe(classification_df)
        result_df = pd.merge(classification_df, StoreType, on="RTL_STORE_CD", how="left")

        #binarize the columns:
        result_df = one_hot_32(result_df)

        #training rows: labelled stores only, 'LowPoint' rows first
        # (the row order is kept because it decides the cross-validation folds)
        LowPoint = result_df.loc[result_df['BEERTYPE'] == 'LowPoint']
        NonLowPoint = result_df.loc[result_df['BEERTYPE'] == 'NonLowPoint']
        train_df = pd.concat([LowPoint, NonLowPoint])

        #independent variables and dependent variable
        X_labels = [c for c in train_df.columns if c not in no_train_list]
        X_train = train_df.loc[:,X_labels]
        Y_train = train_df['BEERTYPE']

        #train the model, and find the best parameter
        parameters = {'max_depth':range(1,21), 'min_samples_leaf':range(3,21,3), 'min_samples_split':range(3,21,3), 'random_state': [0]}
        search = GridSearchCV(tree.DecisionTreeClassifier(), parameters, n_jobs = -2)
        search.fit(X_train, Y_train)

        # GridSearchCV refits the best parameter set on the whole training
        # data (refit=True is the default), so best_estimator_ is already the
        # tree the original code rebuilt and fitted a second time by hand.
        best_tree = search.best_estimator_

        #make prediction for whole dataset.
        X_pred = result_df.loc[:,X_labels]
        result_df['LowPoint_Y'] = best_tree.predict(X_pred) == 'LowPoint'

        return result_df

################################################################################################################
    #run the whole function

    #create a LowPoint_Y column for entire dataset.
    dataframe['LowPoint_Y'] = False

    #take a subset of the data classification_subset
    classification_df = classification_subset_32(dataframe, state)

    #if using Pickle
    if isPickle:

        #Decision tree model for Kansas and Minnesota
        if state_key in ('KS', 'MN'):
            result_df = decision_tree_pickle_32(classification_df, state_key)

        #Business rule for Utah, Oklahoma, and Colorado
        # The state code is passed for all three. The original passed 'Utah',
        # which looked for 'UTAH_business_rule.p' while the rule is saved as
        # 'UT_business_rule.p'.
        elif state_key in ('UT', 'OK', 'CO'):
            result_df = business_rule_pickle_32(classification_df, state_key)

        #If enter other States:
        else:
            print('wrong state')
            return

    #if using grid search
    else:

        #only Kansas and Minnesota are supported
        if state_key in ('KS', 'MN'):
            #get the query for store type
            StoreType = read_query_32(state_key)
            #get the prediction result
            result_df = grid_search_32(classification_df, StoreType, state_key)

        else:
            print('wrong state')
            return

    result_cols = ['RTL_STORE_CD', 'RTL_FIPS_COUNTY_DSC', 'RTL_PREMISE_TYPE_CD', 'RTL_CHANNEL_DSC', 'RTL_SUBCHANNEL_DSC', 'RTL_BEER_FLAG', 'RTL_LIQUOR_FLG', 'LowPoint_Y']
    return result_df[result_cols]
    
    

In [None]:
#put each state into a loop.
# NOTE(review): `dataframe` is not defined in any visible cell; it has to be
# loaded before this cell is run.

state_list = ['KS', 'MN', 'OK', 'UT', 'CO']

# Keep every state's result. The original loop overwrote result_df on each
# pass, so only the last state's result survived.
results_by_state = {}
for each in state_list:
    result_df = classify_32(dataframe, each)
    results_by_state[each] = result_df

In [15]:
#Try 

In [19]:
# Read the Minnesota store extract (all columns as text) and tag every row
# with the state code, which the extract itself is given here explicitly.
data_dir = 'Data/'
Stores = pd.read_csv(data_dir + 'AllStoresMinnesota.csv', dtype = str).assign(RTL_STATE_CD = 'MN')
Stores.head()

Unnamed: 0,RTL_STORE_CD,RTL_FIPS_COUNTY_DSC,RTL_PREMISE_TYPE_CD,RTL_CHANNEL_DSC,RTL_SUBCHANNEL_DSC,RTL_BEER_FLAG,RTL_LIQUOR_FLG,RTL_STATE_CD
0,105531125,DAKOTA,ON,DINING,CASUAL DINING,Y,Y,MN
1,205646301,MORRISON,ON,BAR/NIGHTCLUB,NEIGHBORHOOD BAR,Y,Y,MN
2,105646889,AITKIN,ON,LODGING,FULL SERVICE LODGING,Y,Y,MN
3,102004427,FILLMORE,OFF,LIQUOR,CONVENTIONAL LIQUOR,Y,Y,MN
4,105645122,WINONA,ON,DINING,CASUAL DINING,Y,Y,MN


In [20]:
# Classify the Minnesota stores with the pickled model and preview the result.
result_df = classify_32(Stores, 'MN', isPickle = True)
result_df.head()

Unnamed: 0,RTL_STORE_CD,RTL_FIPS_COUNTY_DSC,RTL_PREMISE_TYPE_CD,RTL_CHANNEL_DSC,RTL_SUBCHANNEL_DSC,RTL_BEER_FLAG,RTL_LIQUOR_FLG,LowPoint_Y
3624,100077508,STEELE,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N,True
673,100077538,WASHINGTON,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N,True
2908,100077541,DAKOTA,OFF,LIQUOR,CONVENTIONAL LIQUOR,Y,Y,False
3658,100077550,DAKOTA,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N,True
4650,100077574,DAKOTA,OFF,CONVENIENCE STORE,CONVENTIONAL CONVENIENCE,Y,N,True


In [None]:
# Fetch the Kansas store types from the warehouse and preview them.
sql_df = read_query_32(state = 'KS')
sql_df.head()

In [None]:
# Order the queried stores by store code and preview them.
sql_df = sql_df.sort_values(by = 'RTL_STORE_CD')
sql_df.head()

In [None]:
# Spot check: the queried beer type of store 100763567.
sql_df[sql_df['RTL_STORE_CD'] == '100763567']

In [None]:
# Spot check: the predicted row of the same store 100763567.
result_df[result_df['RTL_STORE_CD'] == '100763567']