# Preprocessing


---



> Importing libraries and loading data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train_data.csv', 'test_data.csv'))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


> Looking at the train and test sets

In [3]:
# Preview the first five rows of the raw training data.
train.head(n=5)

Unnamed: 0,order_time,order_id,order_date,allot_time,accept_time,pickup_time,delivered_time,rider_id,first_mile_distance,last_mile_distance,alloted_orders,delivered_orders,cancelled,undelivered_orders,lifetime_order_count,reassignment_method,reassignment_reason,reassigned_order,session_time,cancelled_time
0,2021-01-26 02:21:35,556753,2021-01-26 00:00:00,2021-01-26 02:21:59,2021-01-26 02:22:08,2021-01-26 02:32:51,2021-01-26 02:49:47,11696,1.5666,2.65,46.0,46.0,0,0.0,621.0,,,,,
1,2021-01-26 02:33:16,556754,2021-01-26 00:00:00,2021-01-26 02:33:57,2021-01-26 02:34:45,2021-01-26 02:50:25,2021-01-26 03:11:15,18117,2.5207,2.76,8.0,8.0,0,0.0,105.0,,,,3.266667,
2,2021-01-26 02:39:49,556755,2021-01-26 00:00:00,2021-01-26 02:39:57,2021-01-26 02:40:13,2021-01-26 02:56:00,2021-01-26 03:12:46,18623,2.2074,4.8,1.0,1.0,0,0.0,66.0,,,,9.816667,
3,2021-01-26 02:47:53,556756,2021-01-26 00:00:00,2021-01-26 02:48:25,2021-01-26 02:49:06,2021-01-26 03:21:51,2021-01-26 03:41:05,15945,2.1894,6.38,1.0,1.0,0,0.0,127.0,,,,17.533333,
4,2021-01-26 03:06:30,556757,2021-01-26 00:00:00,2021-01-26 03:07:21,2021-01-26 03:07:57,2021-01-26 03:31:38,2021-01-26 04:00:15,17589,2.787,4.01,34.0,34.0,0,0.0,84.0,,,,1.35,


In [4]:
# Preview the first five rows of the raw test data.
test.head(n=5)

Unnamed: 0,order_time,order_id,order_date,allot_time,accept_time,rider_id,first_mile_distance,last_mile_distance,alloted_orders,delivered_orders,undelivered_orders,lifetime_order_count,reassignment_method,reassignment_reason,reassigned_order,session_time
0,2021-02-06 10:03:24,130231,2021-02-06 00:00:00,2021-02-06 10:03:49,2021-02-06 10:04:15,12884,1.6585,4.54,216.0,215.0,1.0,747.0,,,,273.4
1,2021-02-06 10:03:26,130232,2021-02-06 00:00:00,2021-02-06 10:03:27,2021-02-06 10:03:36,3541,2.0709,5.84,52.0,52.0,0.0,75.0,,,,252.1
2,2021-02-06 10:03:27,130233,2021-02-06 00:00:00,2021-02-06 10:04:14,2021-02-06 10:05:34,603,1.3884,0.99,289.0,289.0,0.0,2214.0,,,,241.383333
3,2021-02-06 10:03:29,130234,2021-02-06 00:00:00,2021-02-06 10:03:30,2021-02-06 10:03:53,3414,1.9039,2.59,125.0,122.0,3.0,1020.0,,,,291.933333
4,2021-02-06 10:03:35,130235,2021-02-06 00:00:00,2021-02-06 10:03:43,2021-02-06 10:04:43,1426,0.8275,0.94,352.0,350.0,2.0,7284.0,,,,247.133333


> Delete pickup_time, delivered_time and cancelled_time from train set because they are missing in test set

In [5]:
# These three columns are absent from the test set, so they cannot be used as features.
train.drop(columns=['pickup_time', 'delivered_time', 'cancelled_time'], inplace=True)

> Convert date & time columns into datetime format

In [6]:
datetime_format = '%Y-%m-%d %H:%M:%S'
error_type = 'coerce'  # values that do not match the format become NaT instead of raising

# Parse the timestamp columns of both splits with the same explicit format.
for col in ['order_date', 'order_time', 'allot_time', 'accept_time']:
    for frame in (train, test):
        frame[col] = pd.to_datetime(frame[col], format=datetime_format, errors=error_type)

> Extract the day of the week from order_date, and the hour and minute from order_time

In [7]:
# Derive calendar features on both splits: weekday number (Monday=0 ... Sunday=6) from the
# order date, and hour / minute from the order timestamp.
for frame in (train, test):
    frame['order_date_dayofweek'] = frame['order_date'].dt.dayofweek
    frame['order_time_hour'] = frame['order_time'].dt.hour
    frame['order_time_minute'] = frame['order_time'].dt.minute

> Since order_time, allot_time and accept_time differ only by seconds, we find the differences as new features

In [8]:
# Elapsed time in seconds (as floats) between order -> allotment and allotment -> acceptance.
# .dt.total_seconds() is used rather than .astype('timedelta64[s]'): from pandas 2.0 on, that
# cast keeps a timedelta dtype instead of producing numbers, which breaks the numeric
# "< 0" filter and the model fitting further down. Missing timestamps (NaT) become NaN.
train['diff_allot-order'] = (train.allot_time - train.order_time).dt.total_seconds()
train['diff_accept-allot'] = (train.accept_time - train.allot_time).dt.total_seconds()

test['diff_allot-order'] = (test.allot_time - test.order_time).dt.total_seconds()
test['diff_accept-allot'] = (test.accept_time - test.allot_time).dt.total_seconds()

> An order is either reassigned or not, hence the NaNs in reassigned_order were filled with 0s

In [9]:
# A missing reassigned_order flag is treated as "not reassigned" (0) in both splits.
for frame in (train, test):
    frame['reassigned_order'] = frame['reassigned_order'].fillna(0)

> Saving test_order_id for submission later, dropping unnecessary columns, placing target variable at the end 

In [10]:
# Keep the test order ids aside; order_id is removed from the feature columns below.
test_order_id = test[['order_id']].copy()

# Raw timestamps, identifiers and reassignment details are dropped from both splits.
cols_dropped = ['order_time', 'order_id', 'order_date', 'allot_time', 'accept_time',
                'rider_id', 'reassignment_method', 'reassignment_reason']
for frame in (train, test):
    frame.drop(columns=cols_dropped, inplace=True)

# Reorder the training columns so that the target, 'cancelled', comes last.
feature_cols = [name for name in train.columns if name != 'cancelled']
target_cols = ['cancelled'] if 'cancelled' in train.columns else []
train = train[feature_cols + target_cols]

> Taking a look at the train and test sets again, we see that they have the same features

In [11]:
# Preview the first five rows of the processed training data.
train.head(n=5)

Unnamed: 0,first_mile_distance,last_mile_distance,alloted_orders,delivered_orders,undelivered_orders,lifetime_order_count,reassigned_order,session_time,order_date_dayofweek,order_time_hour,order_time_minute,diff_allot-order,diff_accept-allot,cancelled
0,1.5666,2.65,46.0,46.0,0.0,621.0,0.0,,1,2,21,24.0,9.0,0
1,2.5207,2.76,8.0,8.0,0.0,105.0,0.0,3.266667,1,2,33,41.0,48.0,0
2,2.2074,4.8,1.0,1.0,0.0,66.0,0.0,9.816667,1,2,39,8.0,16.0,0
3,2.1894,6.38,1.0,1.0,0.0,127.0,0.0,17.533333,1,2,47,32.0,41.0,0
4,2.787,4.01,34.0,34.0,0.0,84.0,0.0,1.35,1,3,6,51.0,36.0,0


In [12]:
# Preview the first five rows of the processed test data.
test.head(n=5)

Unnamed: 0,first_mile_distance,last_mile_distance,alloted_orders,delivered_orders,undelivered_orders,lifetime_order_count,reassigned_order,session_time,order_date_dayofweek,order_time_hour,order_time_minute,diff_allot-order,diff_accept-allot
0,1.6585,4.54,216.0,215.0,1.0,747.0,0.0,273.4,5,10,3,25.0,26.0
1,2.0709,5.84,52.0,52.0,0.0,75.0,0.0,252.1,5,10,3,1.0,9.0
2,1.3884,0.99,289.0,289.0,0.0,2214.0,0.0,241.383333,5,10,3,47.0,80.0
3,1.9039,2.59,125.0,122.0,3.0,1020.0,0.0,291.933333,5,10,3,1.0,23.0
4,0.8275,0.94,352.0,350.0,2.0,7284.0,0.0,247.133333,5,10,3,8.0,60.0


> Data matrix and target column were extracted as X and y respectively

In [13]:
# Split the training frame into the feature matrix X and a one-column target frame y.
X = train.drop(columns='cancelled')
y = train[['cancelled']].copy()

> Describing the train set

In [14]:
# Summary statistics of the training features (the default quartiles, spelled out).
X.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,first_mile_distance,last_mile_distance,alloted_orders,delivered_orders,undelivered_orders,lifetime_order_count,reassigned_order,session_time,order_date_dayofweek,order_time_hour,order_time_minute,diff_allot-order,diff_accept-allot
count,450000.0,450000.0,433052.0,432659.0,432659.0,449947.0,450000.0,446325.0,450000.0,450000.0,450000.0,450000.0,449843.0
mean,1.229889,2.968873,104.620909,103.950448,0.764165,853.640664,0.030562,220.474779,2.885242,12.7238,30.595551,73.199969,61.005662
std,0.846183,1.884124,90.135492,89.639646,1.066473,1502.976162,0.172129,176.713853,1.742036,3.619623,17.191479,275.048691,107.45466
min,0.000134,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-28534.0
25%,0.539575,1.47,36.0,36.0,0.0,165.0,0.0,84.1,1.0,9.0,16.0,1.0,15.0
50%,1.1387,2.67,81.0,81.0,0.0,396.0,0.0,175.55,3.0,14.0,32.0,15.0,37.0
75%,1.853,4.22,147.0,146.0,1.0,948.0,0.0,316.766667,4.0,16.0,45.0,48.0,63.0
max,42.0381,22.41,567.0,562.0,9.0,30469.0,1.0,1298.966667,6.0,21.0,59.0,32334.0,930.0


> We see that diff_accept-allot has negative values, which are meaningless (an order cannot be accepted before it is allotted). We drop those rows.

In [15]:
# Rows whose accept-minus-allot difference is negative are removed from X and y together,
# so features and target stay aligned.
negative_rows = X.index[X['diff_accept-allot'] < 0]
y.drop(index=negative_rows, inplace=True)
X.drop(index=negative_rows, inplace=True)

# Training and Validation


---


> The hyperparameters were obtained through trial and error, with 5-fold stratified CV to handle the class imbalance



## Model 1: XGBoost


In [16]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle

# Stratified 5-fold splitter; only used by the optional cross-validation check below.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

xgbc1_params = {
    'learning_rate': 0.1,
    'max_depth': 5,
    'gamma': 0.1,
    'min_child_weight': 7,
    # Extra weight on the positive class; presumably close to the negative/positive
    # ratio of the training target -- TODO confirm how 85.24 was derived.
    'scale_pos_weight': 85.24,
    'random_state': 1,
    'n_jobs': -1,
}
xgbc1 = XGBClassifier(**xgbc1_params)

# Optional check: mean ROC AUC over the stratified folds.
# scores = cross_val_score(xgbc1, X, y.values.ravel(), scoring='roc_auc', cv=kfold)
# print(scores.mean())

# Shuffle features and target together, then fit on the full training set.
X_sh, y_sh = shuffle(X, y, random_state=1)
xgbc1.fit(X_sh, y_sh.values.ravel())

# Per-class probabilities for the test set (weekday is still a single numeric column here).
y_pred_1 = xgbc1.predict_proba(test)

## Model 2: LightGBM
> Probabilities calibrated to handle the class imbalance

In [17]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# Stratified 5-fold splitter; used for the probability calibration below and by the
# optional cross-validation check.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
lgbc1 = LGBMClassifier(num_leaves=16,
                       max_depth=5,
                       learning_rate=0.05,
                       n_estimators=100,
                       subsample_for_bin=200000,
                       is_unbalance=True,
                       min_split_gain=0.,
                       min_child_weight=1e-3,
                       min_child_samples=20,
                       subsample=1.,
                       subsample_freq=0,
                       colsample_bytree=1.,
                       reg_alpha=0.,
                       # L2 regularisation; the keyword is 'reg_lambda' (it was misspelled
                       # 'reg_lamba', which LightGBM does not recognise as a parameter).
                       reg_lambda=0.,
                       random_state=1,
                       n_jobs=-1)
# Optional check: mean ROC AUC over the stratified folds.
# scores = cross_val_score(lgbc1, X, y.values.ravel(), scoring='roc_auc', cv=kfold)
# print(scores.mean())

# Shuffle features and target together before fitting.
from sklearn.utils import shuffle
X_sh, y_sh = shuffle(X, y, random_state=1)

# Wrap the classifier so its probabilities are calibrated across the stratified folds
# (CalibratedClassifierCV's default calibration method is used).
from sklearn.calibration import CalibratedClassifierCV
lgbc1_cal = CalibratedClassifierCV(lgbc1, cv=kfold, n_jobs=-1)
lgbc1_cal.fit(X_sh, y_sh.values.ravel())

# Per-class calibrated probabilities for the test set.
y_pred_2 = lgbc1_cal.predict_proba(test)

## Splitting order_date_dayofweek Into Separate Features

In [18]:
def extract_day_info(df) :
    """One-hot encode ``order_date_dayofweek`` in place.

    Adds seven 0/1 columns named 'Monday' ... 'Sunday' (weekday number 0 maps to
    'Monday', 6 to 'Sunday') and then removes the original ``order_date_dayofweek``
    column. The frame is modified in place and nothing is returned.
    """
    weekday_codes = df['order_date_dayofweek']
    day_names = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')

    for code, day_name in enumerate(day_names):
        # 1 where the row falls on this weekday, 0 otherwise.
        df[day_name] = np.where(weekday_codes == code, 1, 0)

    df.drop(columns=['order_date_dayofweek'], inplace=True)

# One-hot encode the weekday in the training features and in the test set
# (both frames are modified in place).
for frame in (X, test):
    extract_day_info(frame)

## Model 3: LightGBM
> After week day splitting; probabilities calibrated to handle the class imbalance




In [19]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# Stratified 5-fold splitter; used for the probability calibration below and by the
# optional cross-validation check.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
lgbc2 = LGBMClassifier(num_leaves=5,
                       max_depth=6,
                       learning_rate=0.1,
                       n_estimators=150,
                       subsample_for_bin=200000,
                       # Extra weight on the positive class; presumably the negative/positive
                       # ratio of the training target -- TODO confirm how it was derived.
                       scale_pos_weight=85.63361571043512,
                       min_split_gain=0,
                       min_child_weight=0.001,
                       min_child_samples=21,
                       subsample=1,
                       subsample_freq=0,
                       colsample_bytree=0.9,
                       reg_alpha=0,
                       # L2 regularisation; the keyword is 'reg_lambda' (it was misspelled
                       # 'reg_lamba', which LightGBM does not recognise as a parameter).
                       reg_lambda=0,
                       random_state=1,
                       n_jobs=-1)
# Optional check: mean ROC AUC over the stratified folds.
# scores = cross_val_score(lgbc2, X, y.values.ravel(), scoring='roc_auc', cv=kfold)
# print(scores.mean())

# Shuffle features and target together before fitting.
from sklearn.utils import shuffle
X_sh, y_sh = shuffle(X, y, random_state=1)

# Wrap the classifier so its probabilities are calibrated across the stratified folds
# (CalibratedClassifierCV's default calibration method is used).
from sklearn.calibration import CalibratedClassifierCV
lgbc2_cal = CalibratedClassifierCV(lgbc2, cv=kfold, n_jobs=-1)
lgbc2_cal.fit(X_sh ,y_sh.values.ravel())

# Per-class calibrated probabilities for the test set (with one-hot weekday columns).
y_pred_3 = lgbc2_cal.predict_proba(test)

## Imputing Missing Values

In [None]:
from sklearn.impute import SimpleImputer

# Replace missing feature values with the per-column mean of the training data.
# Note: with scikit-learn's default output settings the result is a NumPy array,
# so X is no longer a DataFrame after this cell.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

In [None]:
from sklearn.impute import SimpleImputer

# Fill missing test values with the column means learned from the TRAINING features,
# by reusing the imputer fitted in the previous cell. Fitting a second imputer on the
# test set would fill it with different statistics than the training data was filled with.
# Note: with scikit-learn's default output settings, test becomes a NumPy array here.
test = imputer.transform(test)

## Model 4: XGBoost
> After week day splitting and imputing

In [22]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle

# Stratified 5-fold splitter; only used by the optional cross-validation check below.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

xgbc2_params = {
    'learning_rate': 0.1,
    'max_depth': 5,
    'gamma': 0.125,
    'min_child_weight': 7,
    # Extra weight on the positive class; presumably close to the negative/positive
    # ratio of the training target -- TODO confirm how 84.88 was derived.
    'scale_pos_weight': 84.88,
    'random_state': 1,
    'n_jobs': -1,
}
xgbc2 = XGBClassifier(**xgbc2_params)

# Optional check: mean ROC AUC over the stratified folds.
# scores = cross_val_score(xgbc2, X, y.values.ravel(), scoring='roc_auc', cv=kfold)
# print(scores.mean())

# Shuffle the imputed features and the target together, then fit on the full training set.
X_sh, y_sh = shuffle(X, y, random_state=1)
xgbc2.fit(X_sh, y_sh.values.ravel())

# Per-class probabilities for the imputed test set.
y_pred_4 = xgbc2.predict_proba(test)

# Final Prediction: Ensemble of Four Models


---



In [23]:
# Take column 1 of each model's predict_proba output and average the four with equal weights.
positive_probs = [pred[:, 1] for pred in (y_pred_1, y_pred_2, y_pred_3, y_pred_4)]

df = test_order_id[['order_id']].copy()
df['cancelled'] = sum(positive_probs) / 4.0

# Write the submission file: order_id plus the ensembled probability, without the index.
df.to_csv("./predictions_proba_ensembled_best4.csv", header=True, index=False)