### Rappi Challenge - Model Training

#### Lib Setup

In [None]:
import numpy as np
import pandas as pd
import datetime as dt

import math
import pickle
import os.path
import sklearn

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score, precision_score, recall_score, precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample

Current version of imported libraries

In [None]:
# Record the library versions this notebook was run with, so that the pickled
# artifacts written further down can be matched to a compatible environment.
print(f'Numpy version: {np.version.version}')
print(f'Pandas version: {pd.__version__}')
print(f'Sci-kit learn version: {sklearn.__version__}')

#### Read csv file

In [None]:
# Load the raw orders table from the working directory and preview the first rows.
orders_data = pd.read_csv('orders.csv')
orders_data.head()

#### Value distribution

In [None]:
# Absolute number of rows per value of the "taken" target column.
taken = orders_data["taken"].value_counts()
taken

In [None]:
# Class balance of the target: absolute counts, total number of orders and the
# relative frequency of each class. `counts` and `total` are defined here
# because the summary line below reads them.
counts = orders_data["taken"].value_counts()
total = counts.sum()
current_rate = orders_data["taken"].value_counts(normalize=True)
# Index 1 is a label lookup: the share / count of rows whose "taken" value is 1.
print(f'Taza de aprobación del {current_rate[1]}. {counts[1]} Aceptados de {total} órdenes')

#### Preprocessing

Auxiliary function for dropping order_id and categorizing store_id. It was assumed that the prefix of store_id up to "end" could give a hint about its type. In the end, all store_id values were passed through OneHotEncoding.

In [None]:
def prepare_ids(data, end):
    """Return a copy of ``data`` without ``order_id`` and with shortened store ids.

    Args:
        data: DataFrame containing at least the ``store_id`` and ``order_id``
            columns.
        end: Number of leading characters of the stringified ``store_id`` to
            keep.

    Returns:
        A new DataFrame in which ``store_id`` holds the string prefix of each
        original value and ``order_id`` has been dropped. ``data`` itself is
        left untouched.

    Raises:
        KeyError: If ``store_id`` or ``order_id`` is missing from ``data``.
    """
    # Work on a copy: assigning the column on the caller's frame would silently
    # rewrite its store_id values.
    n_data = data.copy()
    n_data["store_id"] = n_data["store_id"].apply(lambda x: str(x)[0:end])
    n_data = n_data.drop("order_id", axis=1)

    return n_data

Extract cyclic features from the date field

In [None]:
def prepare_created_at(data):
    """Swap the ``created_at`` column for sine/cosine encodings of its parts.

    Every timestamp string (format ``%Y-%m-%dT%H:%M:%SZ``) is parsed and five
    calendar components are projected onto the unit circle, so that values at
    the two ends of a cycle end up close together:

    * month (period 12)            -> ``xmes`` / ``ymes``
    * day of year (period 365.2425) -> ``xdia`` / ``ydia``
    * day of week (period 7)       -> ``xsem`` / ``ysem``
    * hour (period 24)             -> ``xhora`` / ``yhora``
    * minute (period 60)           -> ``xmin`` / ``ymin``

    Args:
        data: DataFrame with a ``created_at`` column of timestamp strings.

    Returns:
        A new DataFrame: ``data`` without ``created_at``, followed by the ten
        encoded columns in the order listed above.
    """
    timestamps = data["created_at"].apply(
        lambda raw: dt.datetime.strptime(raw, '%Y-%m-%dT%H:%M:%SZ')
    )
    parts = timestamps.dt

    avg_days_year = 365.2425

    # (sine column, cosine column, component values, length of one full cycle)
    cycles = [
        ("xmes", "ymes", parts.month, 12),
        ("xdia", "ydia", parts.dayofyear, avg_days_year),
        ("xsem", "ysem", parts.dayofweek, 7),
        ("xhora", "yhora", parts.hour, 24),
        ("xmin", "ymin", parts.minute, 60),
    ]

    encoded = []
    for sin_name, cos_name, component, period in cycles:
        # `p=period` freezes the current period inside each lambda.
        sin_part = component.apply(lambda v, p=period: math.sin(2 * math.pi * v / p))
        cos_part = component.apply(lambda v, p=period: math.cos(2 * math.pi * v / p))
        encoded.append(sin_part.rename(sin_name))
        encoded.append(cos_part.rename(cos_name))

    n_columns = pd.concat(encoded, axis=1)
    without_timestamp = data.drop("created_at", axis=1)

    return pd.concat([without_timestamp, n_columns], axis=1)

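Fits the OneHotEncoder (on the store_id values of the whole dataset) and the StandardScaler (on the remaining columns of the training data). Additionally, it dumps them into pickle files for the API to load and use; if those files already exist, they are loaded instead of being refitted.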

In [None]:
def encode_scale(x_train, _dataset):
    """Return the one-hot encoder and standard scaler, fitting them if needed.

    Each transformer is cached on disk as a pickle in the current working
    directory (``_encoder.pkl`` / ``_scaler.pkl``). When the file exists it is
    loaded as-is and the corresponding input is not used for fitting; otherwise
    the transformer is fitted and written to that file.

    Args:
        x_train: Training features; every column except ``store_id`` is used
            to fit the scaler.
        _dataset: Frame whose ``store_id`` column is used to fit the encoder.

    Returns:
        Tuple ``(encoder, scaler)``.

    Raises:
        KeyError: If ``store_id`` is missing from either input.
    """
    base_encode = _dataset["store_id"].values.reshape(-1, 1)
    base_scale = x_train.drop(["store_id"], axis=1)

    # NOTE: a cached pickle is reused without checking that it matches the
    # current data; delete the file to force a refit. Only load pickles that
    # were produced locally -- unpickling untrusted files can execute code.
    if os.path.isfile("_encoder.pkl"):
        with open("_encoder.pkl", "rb") as encoder_file:
            _ohencoder = pickle.load(encoder_file)
    else:
        # NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output`;
        # switch the keyword when the pinned version is upgraded.
        _ohencoder = OneHotEncoder(sparse=False, handle_unknown="ignore").fit(base_encode)
        with open("_encoder.pkl", "wb") as encoder_file:
            pickle.dump(_ohencoder, encoder_file)

    if os.path.isfile("_scaler.pkl"):
        with open("_scaler.pkl", "rb") as scaler_file:
            _stdscaler = pickle.load(scaler_file)
    else:
        _stdscaler = StandardScaler().fit(base_scale)
        with open("_scaler.pkl", "wb") as scaler_file:
            pickle.dump(_stdscaler, scaler_file)

    return _ohencoder, _stdscaler

#### Data preprocessing and cross validation

Applies all the methods defined previously

In [None]:
def preprocess_data(dataset):
    """Turn the raw orders frame into encoded, scaled train/test matrices.

    Steps: shorten ``store_id`` and drop ``order_id``, expand ``created_at``
    into cyclic features, split off the ``taken`` target, make a stratified
    70/30 train/test split, then one-hot encode ``store_id`` and standardize
    the remaining columns.

    Args:
        dataset: Raw orders DataFrame with ``order_id``, ``store_id``,
            ``created_at`` and ``taken`` columns.

    Returns:
        Tuple ``(ohencoder, stdscaler, x_train, x_test, y_train, y_test)``.
        ``x_train`` / ``x_test`` are DataFrames with positional integer column
        names (one-hot columns first, scaled columns after); ``y_train`` /
        ``y_test`` keep the index of the rows they were split from.
    """
    store_id_limit = 10
    # Use the frame that was passed in, not the notebook-level global.
    prep_id = prepare_ids(dataset, store_id_limit)
    dataset = prepare_created_at(prep_id)
    taken = dataset["taken"]
    dataset = dataset.drop("taken", axis=1)

    # No random_state is given, so the split differs from run to run.
    x_train, x_test, y_train, y_test = train_test_split(dataset, taken, test_size=0.3, stratify=taken)

    # The encoder sees every store_id of the full frame; the scaler only the
    # training rows.
    ohencoder, stdscaler = encode_scale(x_train, dataset)

    x_train_ohe = ohencoder.transform(x_train["store_id"].values.reshape(-1, 1))
    x_train_std = stdscaler.transform(x_train.drop("store_id", axis=1))

    x_train = pd.DataFrame(np.concatenate([x_train_ohe, x_train_std], axis=1))

    x_test_ohe = ohencoder.transform(x_test["store_id"].values.reshape(-1, 1))
    x_test_std = stdscaler.transform(x_test.drop("store_id", axis=1))

    x_test = pd.DataFrame(np.concatenate([x_test_ohe, x_test_std], axis=1))

    return (ohencoder, stdscaler, x_train, x_test, y_train, y_test)

Gets all the outputs from the preprocess method

In [None]:
# Run the full preprocessing pipeline once and keep every artifact it returns:
# the fitted transformers plus the train/test features and targets.
(
    ohencoder,
    stdscaler,
    x_train,
    x_test,
    y_train,
    y_test,
) = preprocess_data(orders_data)

#### Upsampling

In the end training data was not upsampled for the training of the model

In [None]:
# Re-index the training labels positionally so they line up with x_train
# (whose rows are numbered 0..n-1), then attach them as a "taken" column.
_y_train = pd.Series(y_train.values, name="taken")
x_log_reg = x_train.join(_y_train)

# Split the labelled rows by class.
taken_0 = x_log_reg.loc[x_log_reg["taken"] == 0]
taken_1 = x_log_reg.loc[x_log_reg["taken"] == 1]

# Resample class 0 (with replacement) up to the size of class 1.
up_taken_0 = resample(taken_0, n_samples=taken_1.shape[0])
# Alternative: shrink class 1 down to the size of class 0 instead.
# dw_taken_1 = resample(taken_1, n_samples=taken_0.shape[0])

x_log_reg = pd.concat([up_taken_0, taken_1])
# x_log_reg = pd.concat([taken_0, dw_taken_1])

# Separate the balanced frame back into target and features.
y_log_reg = x_log_reg.pop("taken")

x_log_reg

#### Model Training

Logistic Regression with class_weight

In [None]:
# Per-class weights: errors on class 0 are weighted 0.9, errors on class 1 0.1.
_class_weight = {0: 0.9, 1: 0.1}

log_reg = LogisticRegression(
    solver='liblinear',
    C=1,
    max_iter=50000,
    class_weight=_class_weight,
)

# Fit on the plain arrays of the (non-upsampled) training split.
log_reg.fit(x_train.values, y_train.values)

Accuracy can improve with more exact class weights at the cost of ROC AUC, which could indicate overfitting. It was deliberately left like this, in favor of a higher ROC AUC

In [None]:
# Hard class predictions drive the threshold-dependent metrics (accuracy,
# confusion matrix, recall).
y_pred = log_reg.predict(x_test.values)
# ROC AUC is a ranking metric and needs continuous scores: take the predicted
# probability of the positive class (column 1, i.e. label 1 of the 0/1 target).
# Feeding it the 0/1 predictions would evaluate a single threshold only.
y_score = log_reg.predict_proba(x_test.values)[:, 1]

print(f'Score: {log_reg.score(x_test.values, y_test.values)}')
print(f'Accuracy Score: {accuracy_score(y_test.values, y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test.values, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_test.values, y_score)}')
print(f'Recall score: {recall_score(y_test.values, y_pred)}')

In [None]:
# Alternative model, reusing the class weights of the logistic regression.
# It is only instantiated here; the fit call below is intentionally disabled.
rnd_forest = RandomForestClassifier(
    class_weight=_class_weight,
)
# rnd_forest.fit(x_train.values, y_train.values)

Use this if we want to completely overfit the model: high accuracy, low ROC AUC. It ends up with a pickle file size of 200+ MB if no max_depth is specified, which makes the logistic_regression approach more worth using

In [None]:
# Random-forest metrics, disabled by wrapping the code in a string literal
# (the forest above is never fitted). Nothing inside the quotes is executed.
'''
y_pred = rnd_forest.predict(x_test.values)

print(f'Score: {rnd_forest.score(x_test.values, y_test.values)}')
print(f'Accuracy Score: {accuracy_score(y_test.values, y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test.values, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_test.values, y_pred)}')
print(f'Recall score: {recall_score(y_test.values, y_pred)}')
'''

#### Evaluation

In [None]:
def save_models():
    """Pickle the fitted logistic regression to ``_log_reg.pkl``.

    Reads the notebook-level ``log_reg`` and writes it to the current working
    directory, overwriting any existing file of that name.
    """
    # The context manager guarantees the file is flushed and closed even if
    # pickling fails part-way.
    with open("_log_reg.pkl", "wb") as model_file:
        pickle.dump(log_reg, model_file)
    # pickle.dump(rnd_forest, open("_rnd_forest.pkl", "wb"))

In [None]:
# Persist the trained model to disk.
save_models()

In [None]:
def prepare_eval(dataset, ohencoder, stdscaler):
    """Build the model input matrix for ``dataset`` with fitted transformers.

    Applies the same id and timestamp preparation as training, then encodes
    ``store_id`` with ``ohencoder`` and scales the remaining columns with
    ``stdscaler``. Neither transformer is refitted.

    Args:
        dataset: Raw orders DataFrame including the ``taken`` column.
        ohencoder: Fitted encoder exposing ``transform``.
        stdscaler: Fitted scaler exposing ``transform``.

    Returns:
        Tuple ``(x_eval, taken)``: the feature DataFrame (one-hot columns
        first, scaled columns after, integer column names) and the target
        Series.
    """
    store_id_limit = 10
    prepared = prepare_created_at(prepare_ids(dataset, store_id_limit))

    taken = prepared["taken"]
    features = prepared.drop("taken", axis=1)

    # One column vector of store ids for the encoder, everything else for the
    # scaler.
    store_ids = features["store_id"].values.reshape(-1, 1)
    x_eval_ohe = ohencoder.transform(store_ids)
    x_eval_std = stdscaler.transform(features.drop("store_id", axis=1))

    x_eval = pd.DataFrame(np.concatenate([x_eval_ohe, x_eval_std], axis=1))

    return x_eval, taken

In [None]:
# Build the evaluation matrix with the already-fitted transformers.
# NOTE: this is the same `orders_data` frame that was passed to
# `preprocess_data`, so it includes the rows the model was trained on.
x_eval, y_eval = prepare_eval(
    dataset=orders_data,
    ohencoder=ohencoder,
    stdscaler=stdscaler,
)

In [None]:
# Sum of the predicted labels over the evaluation set; with a 0/1 target this
# is the number of orders predicted as taken.
log_reg.predict(x_eval).sum()