In [1]:
import pandas as pd 
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
import xgboost as xgb
import pickle
import joblib
import os 
import numpy as np
from multipledispatch import dispatch
from sklearn.metrics import f1_score 

In [2]:
def read_pickle_dictionary(filename):
    '''
    Load a pickled object from disk and return it.

    filename : path of the pickle file to read
    returns  : whatever object was serialized in the file
               (this notebook stores dictionaries of preprocessing objects)
    '''
    pickle_path = os.path.join(filename)
    # NOTE: unpickling can execute arbitrary code - only load files you trust.
    with open(pickle_path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

def deserialize_model( filename):
    """
    Deserialize a trained model that was saved with joblib.

    filename : path of the serialized model file
    returns  : the object returned by joblib.load for that file
    """
    model = joblib.load(filename)
    return model


def standardize(X_train, X_test ,test_preprocessing_object , flag ='train'):
    """
    Standardize a fixed set of numeric columns, one scaler per column.

    With flag == 'train' a new StandardScaler is fitted on every column of
    X_train, applied to both X_train and X_test, and the fitted scalers are
    stored under test_preprocessing_object['std'] as a list of {column: scaler}.
    With flag == 'test' the scalers already stored under
    test_preprocessing_object['std'] are applied to X_test only.
    Any other flag value leaves the data unscaled.

    The input frames are not modified; scaled copies are returned.

    X_train : train data (may be None when flag is not 'train')
    X_test  : test data
    test_preprocessing_object : dictionary holding the per-column scaler objects
    flag : string, either 'train' or 'test'

    returns : (X_train, X_test, test_preprocessing_object) when flag is 'train',
              otherwise (X_test, test_preprocessing_object)
    """
    std_columns =['ur_pr_reordered','order_number','ttl_cnt_product_user','Avg_no_prod_perOrder','days_since_prior_order','usr_ro_ratio','product_name_length']

    def _scale_column(frame, col, scaler):
        # Replace the whole column instead of writing through .loc so the
        # scaled floats never have to fit into the column's previous dtype.
        column_2d = frame[col].to_numpy().reshape(-1, 1)
        frame[col] = np.ravel(scaler.transform(column_2d))

    # Work on copies so the caller's frames (often slices) are never mutated.
    X_test = X_test.copy()

    if flag == 'train':
        X_train = X_train.copy()
        scaler_objects = []
        for col in std_columns:
            scaler = StandardScaler()
            # Fit on the raw train values before they are overwritten below.
            scaler.fit(X_train[col].to_numpy().reshape(-1, 1))
            _scale_column(X_train, col, scaler)
            _scale_column(X_test, col, scaler)
            # The scaler stays referenced by this list, so there is nothing to
            # free per iteration (the old del + gc.collect() only raised
            # NameError because gc was never imported).
            scaler_objects.append({col: scaler})
        test_preprocessing_object['std'] = scaler_objects
        return X_train, X_test, test_preprocessing_object

    if flag == 'test':
        for item in test_preprocessing_object['std']:
            for col, scaler in item.items():
                _scale_column(X_test, col, scaler)

    return X_test, test_preprocessing_object
            
def response_code_test( X_test ,response_dict):
    """
    Apply the stored, already-fitted encoders to the response-coded columns.

    Each of 'max_hour_of_day', 'reordered_last' and 'max_dow' is passed through
    the encoder stored under the same key in response_dict; only transform is
    called, nothing is fitted here.

    X_test : test data (the encoded values are written into this frame)
    response_dict : dictionary mapping column name to its encoder object
    returns : a copy of the transformed test data
    """
    for col in ('max_hour_of_day', 'reordered_last', 'max_dow'):
        X_test.loc[:, col] = response_dict[col].transform(X_test.loc[:, col])

    return X_test.copy()


def merge_products(x):
    """
    Collapse one group of values into a single space-separated string.

    x : pandas Series holding the values of one group (e.g. product ids)
    returns : one string with every value cast to str and joined by " "
    """
    as_text = x.astype('str')
    return " ".join(as_text.tolist())

            

In [3]:
def suggestProduct(test_sub, pred , thresh):
    """
    Suggest products for which the predicted class is reordered (= 1).

    A row counts as a suggestion when its probability is >= thresh. The kept
    rows are grouped by order_id and their product ids are joined into one
    space-separated string per order.

    The input frame is not modified (no helper column is written into it), so
    it is safe to pass a slice of a larger frame.

    test_sub : dataframe containing order_id and product_id, one row per prediction

    pred : prediction probability of the positive class, aligned by position with test_sub

    thresh : threshold used to convert a probability into a class

    returns : numpy array with one product-id string per order that has at
              least one suggestion (empty when nothing clears the threshold)
    """
    # Boolean mask instead of a temporary "Pred" column on the caller's frame.
    is_reorder = np.asarray(pred) >= thresh
    suggested = test_sub.loc[is_reorder, ["order_id", "product_id"]]

    # group by order_id and join the product ids of every order
    per_order = suggested.groupby("order_id")["product_id"].aggregate(merge_products).reset_index()
    per_order.columns = ["order_id", "products"]

    return per_order['products'].values

@dispatch(list,int)
def validate_order_id( orderIds,id_):
    """
    Check whether a single queried order id occurs in the list of order ids.

    orderIds : list of known order ids
    id_ : queried order id (int)
    returns : True if the order id is present, False otherwise
    """
    return any(id_ == known_id for known_id in orderIds)


@dispatch(list,list)
def validate_order_id( orderIds,id_list):
    """
    Keep only the queried order ids that occur in the list of order ids.

    orderIds : list of known order ids
    id_list : list of order ids queried by the user
    returns : list of the queried ids that are present, in query order
              (a queried id that is repeated in id_list is repeated in the result)
    """
    return [
        querry_id
        for querry_id in id_list
        if any(order_id == querry_id for order_id in orderIds)
    ]
            

In [4]:
@dispatch(int)
def final(orderNumber):
    """
    Suggest the products a user is most likely to reorder for one order.

    Reads the test data, keeps the rows of the queried order, applies the
    stored standardization and response encoding, scores the rows with the
    pickled XGBoost model and returns the product ids whose probability
    reaches the threshold.

    orderNumber : integer order id
    returns : string of product ids separated by spaces, or the string 'None'
              when the order id is not in the test data or no product reaches
              the threshold
    """
    # Read data set
    test = pd.read_parquet('data/test.gzip')
    # get all the order_id in dataset
    orderIds = list(test.order_id.values)
    product_suggestion = 'None'

    # check if the queried order id is valid
    if validate_order_id(orderIds, orderNumber):
        # Explicit copy: the filtered rows are modified below, and working on a
        # slice of `test` is what triggers pandas' SettingWithCopyWarning.
        order_rows = test[test.order_id == orderNumber].copy()

        # keep order_id / product_id of every candidate row for the final lookup
        test_temp = order_rows[["order_id", "product_id"]].copy()

        # drop the columns that are not model features (returns a new frame)
        features = order_rows.drop(
            columns=["order_id", 'ur_pr_count', "user_id", 'product_id', 'department_id', 'aisle_id']
        )

        # standardizing test data with the stored scalers
        # NOTE: both .pkl files are unpickled - only use trusted artifacts.
        test_preprocessing_object = read_pickle_dictionary(os.path.join('final_model_pkl', 'test_preprocessing_object_dict.pkl'))
        features, test_preprocessing_object = standardize(None, features, test_preprocessing_object, flag='test')

        # target (response) encoding with the stored encoders
        reponse_dict = read_pickle_dictionary(os.path.join('final_model_pkl', 'reponse_dict.pkl'))
        features = response_code_test(features, reponse_dict)

        # read pickled model for prediction
        xgboost = deserialize_model('final_model_pkl/xgboost.pkl')
        # predicted probability of the positive (reordered) class
        predict_xg_test = xgboost.predict_proba(features)[:, 1]

        # threshold for prediction
        # NOTE(review): presumably the cut-off tuned during training - confirm.
        threshold = 0.692886
        product_name = suggestProduct(test_temp, predict_xg_test, threshold)

        # a single order yields at most one joined string; keep 'None' if empty
        if product_name.shape[0] > 0:
            product_suggestion = product_name[0]

    return product_suggestion
    
    

In [5]:

@dispatch(list)
def final(orderNumber_list):
    """
    Compute the F1 score of the model on the rows of the given orders.

    Reads the train data (it carries the `reordered` target), keeps the rows
    whose order_id is in the queried list, applies the stored standardization
    and response encoding, scores the rows with the pickled XGBoost model and
    compares the thresholded predictions with the true target.

    orderNumber_list : list of integer order ids
    returns : f1_score(y_true, y_pred) over all rows of the valid order ids
    """
    # Read data set
    train = pd.read_parquet('data/train.gzip')
    # get all the order_id in dataset
    orderIds = list(train.order_id.values)

    # keep only the queried order ids that exist in the data
    validated_order_id = validate_order_id(orderIds, orderNumber_list)

    # Explicit copy: avoids modifying a slice of `train`
    # (pandas' SettingWithCopyWarning).
    order_rows = train.loc[train.order_id.isin(validated_order_id)].copy()

    # true target values
    y_orignal = order_rows.reordered

    # drop the columns that are not model features, plus the target
    features = order_rows.drop(
        columns=["order_id", 'ur_pr_count', "user_id", 'product_id', 'department_id', 'aisle_id', 'reordered']
    )

    # standardizing the data with the stored scalers
    # NOTE: both .pkl files are unpickled - only use trusted artifacts.
    test_preprocessing_object = read_pickle_dictionary(os.path.join('final_model_pkl', 'test_preprocessing_object_dict.pkl'))
    features, test_preprocessing_object = standardize(None, features, test_preprocessing_object, flag='test')

    # target (response) encoding with the stored encoders
    reponse_dict = read_pickle_dictionary(os.path.join('final_model_pkl', 'reponse_dict.pkl'))
    features = response_code_test(features, reponse_dict)

    # read pickled model for prediction
    xgboost = deserialize_model('final_model_pkl/xgboost.pkl')
    # predicted probability of the positive (reordered) class
    predict_xg_test = xgboost.predict_proba(features)[:, 1]

    # threshold for prediction
    # NOTE(review): presumably the cut-off tuned during training - confirm.
    threshold = 0.692886
    y_predicted = np.where(predict_xg_test >= threshold, 1, 0)

    return f1_score(y_orignal, y_predicted)
    

## 1. The `final` function is overloaded <br>
## final(int) - Takes an order number and returns a string containing the suggested product_ids separated by spaces <br>
## final(list[int]) - Takes a list of order numbers and returns F1_score(y_true , y_pred) <br>


In [6]:
# Distinct order ids of the test split
test = pd.read_parquet('data/test.gzip')
orderIds_te = set(test.order_id.values)

# Distinct order ids of the train split
train = pd.read_parquet('data/train.gzip')
orderIds_tr = set(train.order_id.values)

# Prints the order ids shared by both splits; an empty set means there is no overlap.
print("Train and test order id id mutually exclusive:", orderIds_te & orderIds_tr)

Train and test order id id mutually exclusive: set()


# final(int) 
#### Filter all the rows by order_id, then pass the data through standardization and target encoding 
#### Load the pickled model and make predictions 
#### For every prediction marked 1 (reordered), use the associated product_id as a suggestion

In [7]:
# loading test data to get order_id

In [8]:
# Reload the test split and keep every order id once (de-duplicated through a set).
test = pd.read_parquet('data/test.gzip')
orderIds_te = [*set(test.order_id.values)]

In [9]:
# Draw a random order id 20 times and print the suggestion for each draw
for trial in range(20):
    order_id = np.random.choice(orderIds_te, 1)
    suggestion = final(int(order_id[0]))
    print("Order id: {} suggested product: {}  ".format(order_id, suggestion))

Order id: [1884719] suggested product: 21137 19048 48679 46969 28986 9018  
Order id: [2729022] suggested product: 13176 18963 34335 27548 28226 5120  
Order id: [1381670] suggested product: 13176 7781 40310 15902 24759 1194 41488 29387  
Order id: [2245252] suggested product: 24184 24852 22935 30406 22963  
Order id: [2537860] suggested product: 42265 27398 43772 46069 48370 24713  
Order id: [1138659] suggested product: 24852 8518 21616 35417  
Order id: [2910428] suggested product: 13176 21137 22825 42265 24184 34126 47209 1025 24964 45007 19057 22963 24489 20574 20842 28289 10749 5015 19820 48110 34584 11193 18594 40064 44815 10337 18993 39676 36267  
Order id: [2337867] suggested product: 39928 13176 27845 44449 49478 30492 25487 13776 9689  
Order id: [98749] suggested product: 43154 13575  
Order id: [2231023] suggested product: 28535 31506 4367 35163 13225  
Order id: [1486478] suggested product: 21137 22298 22025  
Order id: [892226] suggested product: 30908  
Order id: [12975

# final([int]) 

#### Using the train data here, because the test data does not have the target value needed to calculate the F1 score 
#### Filter all the rows by the list of order_ids, then pass the data through standardization and target encoding 
#### Load the pickled model and make predictions 
#### Use the predicted and original target values to calculate the F1 score

In [10]:
# Read data set
train = pd.read_parquet('data/train.gzip')
# get all the distinct order_id values in the dataset
orderIds_tr = list(set(train.order_id.values))

# Score 20 random batches; each batch holds 1 to 4 order ids drawn with replacement.
# (The extra np.random.randint(1,5) call whose result was thrown away is removed.)
for i in range(20):
    order_ids = list(np.random.choice(orderIds_tr, np.random.randint(1, 5)))
    print("Order list : ", order_ids)
    print("F1_score(y_true , y_pred) : ", final(order_ids))

Order list :  [2279114, 2732458]
F1_score(y_true , y_pred) :  0.35714285714285715
Order list :  [337549, 355082]
F1_score(y_true , y_pred) :  0.32
Order list :  [989351, 164154, 1444993]
F1_score(y_true , y_pred) :  0.44897959183673475
Order list :  [1848007, 1964314, 254280, 583469]
F1_score(y_true , y_pred) :  0.5
Order list :  [58135]
F1_score(y_true , y_pred) :  0.4
Order list :  [276061]
F1_score(y_true , y_pred) :  0.0
Order list :  [74542, 754657]
F1_score(y_true , y_pred) :  0.3333333333333333
Order list :  [569915, 1327561, 1215855, 909335]
F1_score(y_true , y_pred) :  0.22950819672131145
Order list :  [3296061, 1353800, 888633]
F1_score(y_true , y_pred) :  0.7368421052631577
Order list :  [287761, 1883209, 3196189, 73765]
F1_score(y_true , y_pred) :  0.1142857142857143
Order list :  [830177, 779724]
F1_score(y_true , y_pred) :  0.40625000000000006
Order list :  [3248564, 3198252, 2187619]
F1_score(y_true , y_pred) :  0.5783132530120482
Order list :  [1740268, 3393288, 202476]