<h1>eBay Machine Learning Challenge: <span style="color: blue">Vilqueso</span></h1>

<p> <b>Goal:</b> Predict delivery date </p>

Dataset Columns and Starting Assumptions

<ul>
    <li><i>payment_datetime:</i> Can predict delivery days instead of date and then add prediction to this date <b>[KEEP]</b></li>
    <li><i>acceptance_scan_timestamp:</i> Can be used in order to calculate a synthetic feature "actual_handling_days"<b> [KEEP]</b></li>
    <li><i>b2c_c2c:</i> This is only relevant to predicting handling part of delivery <b>[DROP]</b></li>
    <li><i>seller_id:</i> This is only relevant to predicting handling part of delivery <b>[DROP]</b></li>
    <li><i>declared_handling_days:</i> We already established an "actual_handling_days" feature <b>[DROP]</b></li>
    <li><i>shipment_method_id:</i> This could be useful for shipping days part of delivery <b>[KEEP]</b></li>
    <li><i>shipping_fee:</i> Given that seller decides this fee, it is irrelevant to carrier <b>[DROP]</b></li>
    <li><i>carrier_min_estimate:</i> This could be useful for shipping days part of delivery <b>[KEEP]</b></li>
    <li><i>carrier_max_estimate:</i> This could be useful for shipping days part of delivery <b>[KEEP]</b></li>
    <li><i>item_zip:</i> This could be useful for shipping days part of delivery if used with buyer_zip <b>[KEEP]</b></li>
    <li><i>buyer_zip:</i> This could be useful for shipping days part of delivery if used with item_zip <b>[KEEP]</b></li>
    <li><i>category_id:</i> It is possible that some categories such as furniture, take longer on average <b>[KEEP]</b></li>
    <li><i>item_price:</i> Price is irrelevant to delivery day as a phone could be worth more than a sofa <b>[DROP]</b></li>
    <li><i>quantity:</i> Quantity is only relevant to handling days. <b>[DROP]</b></li>
    <li><i>delivery_date:</i> Labels <b>[KEEP]</b></li>
    <li><i>weight:</i> Can be crossed with weight_units to get all instances in same units <b>[KEEP]</b></li>
    <li><i>weight_units:</i> Change cat 2 to 2.2 in order to perform a feature cross that converts kg to lbs <b>[KEEP]</b></li>
    <li><i>package_size:</i> Smaller packages are usually easier to ship <b>[KEEP]</b></li>
    <li><i>record_number:</i> Only needed for quiz set <b>[KEEP FOR QUIZ]</b></li>
</ul> 

In [None]:
# General imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Cleaning & Exploration

In [None]:
# Retrieving data
# Columns shared by the training and quiz files, read with compact dtypes.
# Dates and zips stay as strings here; the pipeline transformers parse them.
# (weight, weight_units and package_size were dropped after exploration.)
relevant_features = {
    'payment_datetime': 'object',
    'acceptance_scan_timestamp': 'object',
    'shipment_method_id': 'int16',
    'carrier_min_estimate': 'int16',
    'carrier_max_estimate': 'int16',
    'item_zip': 'object',
    'buyer_zip': 'object',
}

# The label column is only requested for the training file; the training
# pipeline needs it to derive `delivery_days`.
label_features = {'delivery_date': 'object'}
training_features = {**relevant_features, **label_features}

# Both frames are loaded so that every later cell finds the variable it uses
# when the notebook is run top to bottom.
data = pd.read_csv('train.tsv', sep='\t', usecols=list(training_features), dtype=training_features)
quiz = pd.read_csv('quiz.tsv', sep='\t', usecols=list(relevant_features), dtype=relevant_features)

In [None]:
# Function to change time objects to just date of datetime
from sklearn.base import BaseEstimator, TransformerMixin

class DateConverter(BaseEstimator, TransformerMixin):
    """Truncate timestamp columns to their calendar date and parse them.

    Each handled column is cast to string, cut to its first 10 characters
    (the ``YYYY-MM-DD`` part) and converted with ``pd.to_datetime``.

    Parameters
    ----------
    have_labels : bool, default=True
        When True the label column ``delivery_date`` is converted as well;
        set to False for data without labels (the quiz set).

    Notes
    -----
    ``transform`` writes the converted columns back into the frame it
    receives and returns that same frame.
    """

    _FEATURE_DATE_COLS = ('acceptance_scan_timestamp', 'payment_datetime')
    _LABEL_DATE_COL = 'delivery_date'

    def __init__(self, have_labels=True):
        # Stored under the parameter's own name so that BaseEstimator's
        # get_params()/clone()/repr can read it back.
        self.have_labels = have_labels

    @property
    def cols(self):
        """Columns converted by ``transform`` (label column last, if any)."""
        selected = list(self._FEATURE_DATE_COLS)
        if self.have_labels:
            selected.append(self._LABEL_DATE_COL)
        return selected

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``. ``y`` is accepted and ignored."""
        return self

    def transform(self, X):
        """Convert the date columns of ``X`` in place and return ``X``."""
        for col in self.cols:
            date_text = X[col].astype(str).str[:10]
            X[col] = pd.to_datetime(date_text)

        return X

In [None]:
# Function to add actual_handling_days / delivery_days
class DatesToDays(BaseEstimator, TransformerMixin):
    """Turn date columns into whole-day differences.

    Adds ``actual_handling_days`` (acceptance scan minus payment). When
    labels are available it also adds ``delivery_days`` (delivery minus
    payment) and drops ``delivery_date`` and ``payment_datetime``.

    Parameters
    ----------
    have_labels : bool, default=True
        Whether ``X`` contains ``delivery_date``. When False,
        ``payment_datetime`` is kept in the output.

    Notes
    -----
    The input columns must already be datetimes. The int16 cast raises if
    a difference is missing (NaT). ``X`` is modified in place and returned.
    """

    def __init__(self, have_labels=True):
        self.have_labels = have_labels

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``. ``y`` is accepted and ignored."""
        return self

    @staticmethod
    def _whole_days(later, earlier):
        """Return ``later - earlier`` in whole days as an int16 Series."""
        return (later - earlier).dt.days.astype('int16')

    def transform(self, X):
        """Add the day-count columns to ``X`` and return it."""
        X['actual_handling_days'] = self._whole_days(
            X['acceptance_scan_timestamp'], X['payment_datetime'])

        if self.have_labels:
            X['delivery_days'] = self._whole_days(
                X['delivery_date'], X['payment_datetime'])
            X.drop(columns=['delivery_date', 'payment_datetime'], inplace=True)

        return X

In [None]:
# Function to convert all weight to lbs
class NormalizeWeights(BaseEstimator, TransformerMixin):
    """Convert ``weight`` to pounds and min-max scale it into ``weight_lbs``.

    ``weight`` is multiplied by a per-row factor derived from
    ``weight_units``: unit code 2 becomes 2.2 (kg to lbs); any other code is
    used as the factor unchanged, so code 1 leaves the weight as is. The two
    source columns are then dropped and ``weight_lbs`` is rescaled to
    [0, 1] using the minimum and maximum of the frame being transformed.

    Notes
    -----
    ``X`` is modified in place and returned. The scaling bounds come from
    the data passed to ``transform``, so different frames are scaled with
    different bounds.
    """

    _KG_UNIT_CODE = 2
    _LBS_PER_KG = 2.2

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``. ``y`` is accepted and ignored."""
        return self

    def transform(self, X):
        """Replace ``weight``/``weight_units`` with a scaled ``weight_lbs``."""
        # Series.replace returns a new object; the original discarded that
        # result, so kilograms were never converted. Build the factor explicitly.
        units = X['weight_units'].astype('float64')
        lbs_per_unit = units.mask(units == self._KG_UNIT_CODE, self._LBS_PER_KG)

        X['weight_lbs'] = X['weight'] * lbs_per_unit
        X.drop(columns=['weight', 'weight_units'], inplace=True)

        lowest = X['weight_lbs'].min()
        spread = X['weight_lbs'].max() - lowest
        shifted = X['weight_lbs'] - lowest
        # A constant column has zero spread; keep the (all-zero) shifted
        # values instead of dividing by zero.
        scaled = shifted / spread if spread != 0 else shifted
        X['weight_lbs'] = scaled.astype('float32')
        return X

In [None]:
# Function to label zips based on their first digit
class ZipsToBin(BaseEstimator, TransformerMixin):
    """Bucket ``item_zip`` and ``buyer_zip`` by their first digit.

    First digits 0-4 map to ``'east'``, 5-7 to ``'central'``, 8 to
    ``'mountain'`` and 9 to ``'pacific'``; both columns come back as
    categoricals with those labels.

    Parameters
    ----------
    can_drop_unacceptable : bool, default=True
        What to do with zips whose first character is not a digit. True
        drops those rows (training); False keeps the row and substitutes
        the digit ``'4'``, which falls in the ``'east'`` bucket.

    Notes
    -----
    The first-character extraction is written into the frame passed in.
    When rows are dropped, the returned frame is an independent copy.
    """

    _DIGITS = tuple('0123456789')
    _FALLBACK_DIGIT = '4'
    _BIN_EDGES = [-1, 4, 7, 8, 9]
    _REGIONS = ['east', 'central', 'mountain', 'pacific']
    _ZIP_COLS = ('item_zip', 'buyer_zip')

    def __init__(self, can_drop_unacceptable=True):
        # Stored under the parameter's own name so that BaseEstimator's
        # get_params()/clone()/repr can read it back.
        self.can_drop_unacceptable = can_drop_unacceptable

    @property
    def can_drop(self):
        """Backward-compatible alias for ``can_drop_unacceptable``."""
        return self.can_drop_unacceptable

    @property
    def acceptable(self):
        """First characters that count as a valid zip digit."""
        return list(self._DIGITS)

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``. ``y`` is accepted and ignored."""
        return self

    def transform(self, X):
        """Return ``X`` with both zip columns replaced by region categories."""
        for col in self._ZIP_COLS:
            X[col] = X[col].astype(str).str[0]

        if self.can_drop_unacceptable:
            valid_rows = X['item_zip'].isin(self._DIGITS) & X['buyer_zip'].isin(self._DIGITS)
            # Copy so the assignments below write to an independent frame
            # rather than to a slice of the caller's data.
            X = X[valid_rows].copy()
        else:
            for col in self._ZIP_COLS:
                X.loc[~X[col].isin(self._DIGITS), col] = self._FALLBACK_DIGIT

        for col in self._ZIP_COLS:
            first_digit = X[col].astype('int8')
            X[col] = pd.cut(first_digit, self._BIN_EDGES, labels=self._REGIONS)

        return X

In [None]:
# Function to convert all package_size to ordinal labels
class PackageSizeToNum(BaseEstimator, TransformerMixin):
    """Replace ``package_size`` names with ordinal codes.

    ``'NONE'`` is given code 3, the same code as ``'LARGE_PACKAGE'``.
    Values that are not listed in the mapping are left untouched.

    Notes
    -----
    Only ``package_size`` is rewritten; the other columns keep their
    dtypes. The input frame is not modified: a copy is returned.
    """

    _SIZE_CODES = {
        'LETTER': 0,
        'PACKAGE_THICK_ENVELOPE': 1,
        'LARGE_ENVELOPE': 2,
        'LARGE_PACKAGE': 3,
        'EXTRA_LARGE_PACKAGE': 4,
        'VERY_LARGE_PACKAGE': 5,
        'NONE': 3,
    }

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``. ``y`` is accepted and ignored."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``package_size`` mapped to codes."""
        X = X.copy()
        # Cast just this column to object (it may be categorical) so that
        # integer codes can replace the string labels. The original cast the
        # entire frame, which discarded every other column's dtype.
        size_names = X['package_size'].astype('object')
        X['package_size'] = size_names.map(lambda name: self._SIZE_CODES.get(name, name))

        return X

In [None]:
# Transformer that one-hot encodes shipment_method_id and derives timezone_difference from the zip regions
class LabelEncodeCat(BaseEstimator, TransformerMixin):
    """One-hot encode the shipment method and derive ``timezone_difference``.

    * ``shipment_method_id`` is expanded with ``pd.get_dummies`` into
      ``shipment_method_id_<id>`` columns, one per id present in ``X``.
    * ``timezone_difference`` is the number of region steps between
      ``item_zip`` and ``buyer_zip`` along east, central, mountain, pacific
      (0 for the same region, 3 for east/pacific). Rows whose region is
      missing or unknown get 0.

    ``shipment_method_id``, ``item_zip`` and ``buyer_zip`` are dropped from
    the result. The input frame itself is not modified.
    """

    # Regions in geographic order; the distance between two regions is the
    # absolute difference of their positions.
    _REGION_ORDER = {'east': 0, 'central': 1, 'mountain': 2, 'pacific': 3}

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``. ``y`` is accepted and ignored."""
        return self

    def transform(self, X):
        """Return a new frame with the encoded columns added."""
        method_dummies = pd.get_dummies(
            X['shipment_method_id'], prefix='shipment_method_id', drop_first=False)
        X = pd.concat([X, method_dummies], axis=1)

        # Go through object dtype so the mapping yields plain numbers even
        # when the zip columns are categoricals.
        origin = X['item_zip'].astype(object).map(self._REGION_ORDER).astype('float64')
        destination = X['buyer_zip'].astype(object).map(self._REGION_ORDER).astype('float64')
        X['timezone_difference'] = (origin - destination).abs().fillna(0).astype('int64')

        X.drop(columns=['shipment_method_id', 'item_zip', 'buyer_zip'], inplace=True)

        return X

In [None]:
# Function that uses max and min estimate to add bound features
class AddBounds(BaseEstimator, TransformerMixin):
    """Replace the carrier estimates with bounds on total delivery days.

    ``top_bound`` is ``carrier_max_estimate + actual_handling_days`` and
    ``bot_bound`` is ``carrier_min_estimate + actual_handling_days``. The
    two carrier estimate columns are dropped afterwards.

    Notes
    -----
    ``X`` is modified in place and returned. ``actual_handling_days`` must
    already be present.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``. ``y`` is accepted and ignored."""
        return self

    def transform(self, X):
        """Add ``top_bound``/``bot_bound`` to ``X`` and return it."""
        handling = X['actual_handling_days']
        X['top_bound'] = X['carrier_max_estimate'] + handling
        X['bot_bound'] = X['carrier_min_estimate'] + handling

        X.drop(columns=['carrier_min_estimate', 'carrier_max_estimate'], inplace=True)

        return X

In [None]:
# Function to remove invalid dates from set
class RemoveInvalidDates(BaseEstimator, TransformerMixin):
    """Keep only rows whose day counts are strictly positive.

    A row survives when both ``delivery_days`` and ``actual_handling_days``
    are greater than zero. It needs the label-derived ``delivery_days``
    column, so it only fits in a pipeline that has labels.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``. ``y`` is accepted and ignored."""
        return self

    def transform(self, X):
        """Return an independent frame holding the valid rows of ``X``."""
        # NOTE(review): `> 0` also removes zero-day rows (e.g. a parcel
        # scanned on the payment date), not only negative ones - confirm
        # that this is intended.
        valid_rows = (X['delivery_days'] > 0) & (X['actual_handling_days'] > 0)

        # Copy so that later steps which add columns write to their own
        # frame instead of to a slice of the input.
        return X[valid_rows].copy()

In [None]:
# Function to add calendar days onto carrier estimate
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
class OffDaysFromCarrier(BaseEstimator, TransformerMixin):
    """Count non-working days inside each carrier estimate window.

    For every row two inclusive date windows are considered, both starting
    on ``acceptance_scan_timestamp`` and ending ``carrier_min_estimate`` /
    ``carrier_max_estimate`` days later. ``min_off_days`` and
    ``max_off_days`` hold the number of days in the respective window that
    are a Saturday, a Sunday or a US federal holiday. A negative estimate
    gives an empty window and therefore a count of 0.

    Notes
    -----
    ``X`` is modified in place and returned. Dates are compared at day
    resolution. Missing acceptance dates (NaT) are not supported.
    """

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``. ``y`` is accepted and ignored."""
        return self

    @staticmethod
    def _count_off_days(first_day, estimate_days, holiday_days):
        """Vectorised count of weekend/holiday days per window.

        ``first_day`` is a ``datetime64[D]`` array, ``estimate_days`` an
        integer array of the same length, ``holiday_days`` a
        ``datetime64[D]`` array of holidays.
        """
        # Number of calendar days in the window, both ends included.
        window_len = np.maximum(estimate_days.astype('int64') + 1, 0)
        end_exclusive = first_day + window_len.astype('timedelta64[D]')
        # busday_count counts Mon-Fri non-holidays in [first_day, end_exclusive);
        # whatever remains of the window is a weekend day or a holiday.
        working_days = np.busday_count(
            first_day, end_exclusive, weekmask='1111100', holidays=holiday_days)
        return window_len - working_days

    def transform(self, X):
        """Add ``min_off_days`` and ``max_off_days`` to ``X`` and return it."""
        if len(X) == 0:
            X['min_off_days'] = np.zeros(0, dtype='int64')
            X['max_off_days'] = np.zeros(0, dtype='int64')
            return X

        accepted = X['acceptance_scan_timestamp']
        min_estimate = X['carrier_min_estimate'].to_numpy()
        max_estimate = X['carrier_max_estimate'].to_numpy()

        # The windows reach past the last acceptance date, so the holiday
        # calendar has to cover the longest estimate as well.
        longest_estimate = max(int(min_estimate.max()), int(max_estimate.max()), 0)
        holidays = calendar().holidays(
            start=accepted.min(),
            end=accepted.max() + pd.Timedelta(days=longest_estimate))
        holiday_days = holidays.to_numpy().astype('datetime64[D]')
        first_day = accepted.to_numpy().astype('datetime64[D]')

        X['min_off_days'] = self._count_off_days(first_day, min_estimate, holiday_days)
        X['max_off_days'] = self._count_off_days(first_day, max_estimate, holiday_days)

        return X

In [None]:
# Pipelines
from sklearn.pipeline import Pipeline

def _build_pipeline(for_training):
    """Assemble the preprocessing pipeline.

    With ``for_training=True`` the steps expect labels, drop rows with
    unusable zips and remove rows with invalid day counts. With
    ``for_training=False`` no row is dropped and no label is required.
    The weight, package-size and bound steps are currently not part of
    either pipeline.
    """
    steps = [
        ('date_converter', DateConverter(have_labels=for_training)),
        ('date_to_days', DatesToDays(have_labels=for_training)),
        ('bin_zip', ZipsToBin(can_drop_unacceptable=for_training)),
        ('label_encode_cat', LabelEncodeCat()),
    ]
    if for_training:
        steps.append(('remove_invalid_dates', RemoveInvalidDates()))
    steps.append(('weekends_holidays', OffDaysFromCarrier()))
    return Pipeline(steps)


training_pipeline = _build_pipeline(for_training=True)
prediction_pipeline = _build_pipeline(for_training=False)

In [None]:
# Transforming training data
# The pipeline steps consume columns (e.g. `payment_datetime` is dropped once
# `delivery_days` exists), so this cell cannot be re-run on the transformed
# frame; reload the raw data first.
data = training_pipeline.fit_transform(data)
data.head()

In [None]:
# Transforming quiz data
# The prediction pipeline keeps every row (no filtering steps), so the result
# stays row-aligned with quiz.tsv. `item_zip`/`buyer_zip` are consumed, so
# this cell cannot be re-run on the transformed frame.
quiz = prediction_pipeline.fit_transform(quiz)
quiz.head()

In [None]:
# Data null count
# Missing values per column in the training frame
data.isna().sum()

In [None]:
# Quiz null count
# Missing values per column in the quiz frame
quiz.isna().sum()

In [None]:
# Correlation matrix
# Pairwise correlation between the columns of the transformed training frame
data.corr()

In [None]:
# Data describe
# Summary statistics (count, mean, spread, quartiles) per column
data.describe()

In [None]:
# Unique values
# Distinct values per column; a missing value counts as one distinct value
data.nunique(dropna=False)

In [None]:
# Seeing correlation between actual_handling_days and delivery_days
# One point per training row: handling time on x, total delivery time on y
sns.scatterplot(x='actual_handling_days', y='delivery_days', data=data)

In [None]:
# Seeing correlation between carrier estimate and delivery_days
# sns.scatterplot(x='carrier_max_estimate', y='delivery_days', data=data)

In [None]:
# Seeing correlation between weight_lbs and normalized delivery_days
# sns.scatterplot(x='weight_lbs', y='delivery_scaled', data=data)

In [None]:
# Seeing if delivery_days vary between different categories
# for i in range(0, 33):
#     median = data[data[f'category_id_{i}'] == 1]['delivery_days'].median()
#     mean = data[data[f'category_id_{i}'] == 1]['delivery_days'].mean()
#     print(f'{i} mean: {mean}, median: {median}')

In [None]:
# Seeing if delivery_days vary between different shipment methods
# for i in range(0, 27):
#     median = data[data[f'shipment_method_id_{i}'] == 1]['delivery_days'].median()
#     mean = data[data[f'shipment_method_id_{i}'] == 1]['delivery_days'].mean()
#     print(f'{i} mean: {mean}, median: {median}')

In [None]:
# Seeing if delivery_days vary between shipping distance
# data['distance'] = abs(data['item_zip'] - data['buyer_zip'])
# for i in range(0, 10):
#     median = data[data['distance'] == i]['delivery_days'].median()
#     mean = data[data['distance'] == i]['delivery_days'].mean()
#     print(f'{i} mean: {mean}, median: {median}')

In [None]:
# Seeing if delivery_days vary between package_size
# for i in range(0, 6):
#     median = data[data['package_size'] == i]['delivery_days'].median()
#     mean = data[data['package_size'] == i]['delivery_days'].mean()
#     print(f'{i} mean: {mean}, median: {median}')

<span><b>Notes From Exploring Data</b></span>
<ul>
    <li>Need to remove negative days from data <b>[DONE]</b></li>
    <li><i>category_id</i> seems to have similar values across the different categories, drop attribute <b>[DONE]</b></li>
    <li><i>shipment_id</i> is still useful besides 23-25 (nulls), drop those <b>[DONE]</b></li>
    <li><i>delivery_days</i> grows linearly as the difference between zips increases; this can be made more specific later, since zip code 7XXXX and zip code 3XXXX are neighbors but have a distance of 4 (too far) in my approach, OHE <b>[DONE]</b></li>
    <li>Huge outlier in <i>weight_lbs</i>, need to remove <b>[DONE]</b></li>
    <li>Way too many 0 values in <i>weight_lbs</i>, need to remove entire feature <b>[DONE]</b></li>
    <li><i>package_size</i> takes on similar values besides label 3, which is because I gave null package sizes the median label, can later bucket the sizes but for now will drop feature entirely <b>[DONE]</b></li>
    <li><i>carrier_min_estimate/carrier_max_estimate</i> add no new info, will instead use synthetic features called <i>top_bound</i> and <i>bot_bound</i> which will be used as bounds, if a prediction goes past these bounds in either direction, the bound will be used as the prediction instead<b> [DONE]</b></li>
<li>Might need to scale features later on <b>[DONE]</b></li>
<li> Create one hot encoding from zips 1 time zone away, 2 time zones away, 3 time zones away <b>[DONE]</b></li>
</ul>

In [None]:
# Saving current data
# Only the transformed quiz frame is written here; the training export below
# is disabled, while the loading cell further down reads 'data_3', so that
# file has to exist from an earlier run.
#data.to_csv('data_3', index=False)
quiz.to_csv('quiz_3', index=False)

In [None]:
# General imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Loading current data
# Compact dtypes for the numeric columns of the saved training frame.
dtype = {
    'carrier_min_estimate': 'int16',
    'carrier_max_estimate': 'int16',
    'actual_handling_days': 'int16',
    'delivery_days': 'int16',
    'timezone_difference': 'int8',
    'min_off_days': 'int16',
    'max_off_days': 'int16',
}

# One-hot shipment method columns; ids 23-25 are not loaded.
for i in range(0, 27):
    if i in [23, 24, 25]:
        continue
    dtype[f'shipment_method_id_{i}'] = 'int8'

# read_csv rejects datetime dtypes in `dtype`; date columns must be requested
# through `parse_dates` instead.
date_cols = ['acceptance_scan_timestamp']

data = pd.read_csv('data_3', usecols=[*dtype, *date_cols], dtype=dtype, parse_dates=date_cols)
data.head()

# Training Time :)

In [None]:
# Splitting features and labels
# Labels stay a one-column DataFrame; the acceptance date is not a model
# input, so it is removed from the features together with the label.
label_cols = ['delivery_days']
non_feature_cols = label_cols + ['acceptance_scan_timestamp']

X = data.drop(columns=non_feature_cols)
y = data[label_cols]
del data

In [None]:
# Splitting data into training and test set
from sklearn.model_selection import train_test_split
# First hold out 10% of all rows as the test set, then take 20% of the
# remainder as the validation set (72% train / 18% validation / 10% test).
# Fixed seeds keep both splits reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)
# The full frames are no longer needed; free them.
del X, y

## Scikit-learn

In [None]:
# Importing and choosing models
from sklearn.linear_model import LinearRegression, Ridge, HuberRegressor, ElasticNetCV, SGDRegressor, Lasso, ElasticNet
# Candidate linear models, all with their default hyper-parameters; each one
# is fitted and scored by `validate` below.
models = [
          LinearRegression(),
          Ridge(),
          ElasticNetCV(),
         ]

In [None]:
# Quick way to test error from different sklearn models
def validate(model, N):
    """Fit ``model`` on the training split and score it on the validation split.

    The score is the asymmetric error used in this notebook: predictions are
    rounded to whole days, then

        error = (0.4 * sum(|y - y'| where y <= y')
                 + 0.6 * sum(y - y' where y > y')) / N

    i.e. predicting too late costs 0.4 per day and too early 0.6 per day.

    Parameters
    ----------
    model : estimator
        Object with ``fit(X, y)`` and ``predict(X)``.
    N : int
        Divisor of the summed error (the number of validation rows).

    Returns
    -------
    float
        The error rounded to 5 decimals. It is also printed next to the
        model's repr.

    Notes
    -----
    Reads the module-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    """
    model.fit(X_train, y_train)

    # Flatten so that both (n,) and (n, 1) prediction shapes are handled.
    predicted = np.round(np.asarray(model.predict(X_val), dtype=float)).ravel()
    # Compare by position, not by index label: y_val keeps the row labels of
    # the original frame unless it was reset, and label alignment would turn
    # every residual into NaN (and the error into 0).
    actual = np.asarray(y_val['delivery_days'], dtype=float).ravel()
    residual = actual - predicted

    early_error = 0.4 * abs(residual[residual <= 0].sum())
    late_error = 0.6 * residual[residual > 0].sum()

    error = round(float((early_error + late_error) / N), 5)

    print(f'{model}: {error}')
    return error

In [None]:
# Trying out models
# Give the validation labels a fresh 0..n-1 index so they line up row by row
# with the predictions built inside `validate`.
y_val = y_val.reset_index(drop=True)
N = len(y_val)
for candidate in models:
    validate(candidate, N)

<span><b>Notes From Sklearn</b></span>
<ul>
    <li><i>LinearRegression</i> error without using bound data: .4881, get score for bound data<b> [DONE]</b></li>
    <li><i>LinearRegression</i> error on bound approach: .4843, get score for min/max scaled data<b> [DONE]</b></li>
    <li><i>LinearRegression</i> error on min/max scaled data doesn't change: .4843, get score for standard scaled data<b> [DONE]</b></li>
    <li><i>LinearRegression</i> error on standard scaled data doesn't change: .4843, try neural net<b> [DONE]</b></li>
</ul>

## Tensorflow and Keras

In [None]:
# Data scaling for neural nets
from sklearn.preprocessing import MinMaxScaler
mm_scale = MinMaxScaler()

# 'zip_distance' is not produced by the current feature pipeline, so only the
# requested columns that actually exist are scaled.
scale_cols = [col for col in ['actual_handling_days', 'zip_distance'] if col in X_train.columns]

# Learn the scaling on the training split only, then apply the same scaling
# to the validation and test splits so all three share one feature space.
X_train[scale_cols] = mm_scale.fit_transform(X_train[scale_cols])
X_val[scale_cols] = mm_scale.transform(X_val[scale_cols])
X_test[scale_cols] = mm_scale.transform(X_test[scale_cols])

y_train['delivery_days'] = y_train['delivery_days'].astype('float32')
X_train.head()

In [3]:
# Deep learning imports
import tensorflow as tf
from tensorflow import keras

$$custom \ loss = \frac{1}{m} \ [ \ ( \ 0.4 \ * \sum_{overest.} \ |y-y'| \ )+ ( \ 0.6 \ * \sum_{underest.} \ (y-y') \ ) \ ]$$

In [None]:
# Custom loss from eBay
def custom_loss(y, y_pred):
    """Asymmetric absolute error, averaged over the batch.

    Over-estimates (``y_pred`` above ``y``) are weighted 0.4 and
    under-estimates (``y_pred`` below ``y``) 0.6:

        loss = (0.4 * sum(|y - y'| where y < y')
                + 0.6 * sum(y - y' where y > y')) / size(y)

    Parameters
    ----------
    y : tensor
        True values; cast to float32.
    y_pred : tensor
        Predicted values.

    Returns
    -------
    Scalar float32 tensor.
    """
    y = tf.cast(y, tf.float32)
    pure = tf.subtract(y, y_pred)

    # Split the residual into its negative and positive parts. The outer
    # bounds are the float32 limits so that no residual is ever truncated;
    # a finite cap would flatten the loss (and zero the gradient) for large
    # errors.
    over_est = tf.abs(tf.reduce_sum(
        tf.clip_by_value(pure, clip_value_min=tf.float32.min, clip_value_max=0.0)))
    under_est = tf.reduce_sum(
        tf.clip_by_value(pure, clip_value_min=0.0, clip_value_max=tf.float32.max))

    weighted_sum = tf.add(tf.multiply(0.4, over_est), tf.multiply(0.6, under_est))
    return tf.divide(weighted_sum, tf.cast(tf.size(y), tf.float32))

In [None]:
# Creating neural network
# A single Dense unit with no activation argument: one output computed from
# all input features. The input width is taken from the training features.
model = keras.models.Sequential([
    keras.layers.Dense(1, input_shape=X_train.shape[1:]),
])

In [None]:
# Fitting neural network to data
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
opt = keras.optimizers.SGD(learning_rate=0.01, momentum=0.5)
model.compile(optimizer=opt, loss=custom_loss)
# One pass over the training split in batches of 400 rows, reporting the
# custom loss on the validation split at the end of the epoch.
hist = model.fit(X_train, y_train, batch_size=400, epochs=1, validation_data=(X_val, y_val))

In [None]:
# Saving model weights
# Only the weights are stored; the same architecture has to be rebuilt
# before they can be loaded again.
model.save_weights('./checkpoints/my_checkpoint')

In [None]:
# Evaluating on test set
# Reports the compiled loss (custom_loss) on the held-out test split
model.evaluate(X_test, y_test)

<span><b>Notes From TF / Keras</b></span>
<ul>
    <li><i>Neural Network</i> MAE doesn't go below .95, create custom_loss using eBay function<b> [DONE] </b></li>
    <li><i>Neural Network</i> custom_loss slows around .49, tune optimizer for speed<b> [DONE] </b></li>
    <li><i>Neural Network</i> accidentally included bounds, val loss is way better, check val loss without scaling actual_handling_days and zip_distance<b> [DONE] </b></li>
    <li><i>Neural Network</i> loss is better with bigger batch_size, try increasing and checking<b> [DONE] </b></li>
    <li>Check if <i>Neural Network</i> loss is better with bounds as features, it is not, drop features<b> [DONE] </b></li>
    <li><i>Neural Network</i> loss is not any better with OHE zips, bin into time zones assuming that on average if source and destination are in the same time zone, faster delivery<b> [DONE] </b></li>
    <li><i>Neural Network</i> loss is better with shipment id than without; find more clever features from the timestamps. carrier_min_estimate and carrier_max_estimate can be revived as relevant features: for example, the estimates are probably only business days, so add the count of weekends and holidays between the acceptance date and the estimate as features for more accuracy<b> [DONE] </b></li>
    <li> Figure out the best layers in order to output less loss on <i>Neural Network</i><b> [ND] </b></li>
</ul>

# Output Center

In [1]:
import pandas as pd
import numpy as np

In [5]:
# Loading quiz
# Column dtypes for the transformed quiz frame saved earlier as 'quiz_3'.
# The two date columns are read as plain strings.
dtype = {
        'acceptance_scan_timestamp': 'object',
        'carrier_min_estimate': 'int16',
        'carrier_max_estimate': 'int16',
        'actual_handling_days': 'int8',
        'payment_datetime': 'object',
        'timezone_difference': 'int8',
        'min_off_days': 'int8',
        'max_off_days': 'int8',
       }

# Only one-hot columns for shipment method ids 0-18 are read from the file.
for i in range(0, 19):
    dtype[f'shipment_method_id_{i}'] = 'int8'
    
quiz = pd.read_csv('quiz_3', usecols=dtype.keys(), dtype=dtype)

# Keep the payment date aside: the predicted day count is added to it later.
submission = quiz['payment_datetime']
submission = pd.DataFrame(data=submission, columns=['payment_datetime'])

# The date columns are not model inputs.
quiz.drop(columns=['payment_datetime', 'acceptance_scan_timestamp'], inplace=True)
# Add all-zero one-hot columns for ids 19, 20, 21, 22, 24 and 26. Inserting
# at a fixed position while counting down leaves them in ascending id order.
# NOTE(review): position 22 presumably puts them right after
# shipment_method_id_18 - this depends on the column order of 'quiz_3'.
# NOTE(review): the training loading cell skips ids 23, 24 AND 25, while id 24
# is added here - confirm the feature count and order match the trained model.
for i in range(26, 18, -1):
    if i in [23, 25]:
        continue
    quiz.insert(22, f'shipment_method_id_{i}', 0)
    quiz[f'shipment_method_id_{i}'] = quiz[f'shipment_method_id_{i}'].astype('int8')
    
submission['payment_datetime'] = pd.to_datetime(submission['payment_datetime'])

In [6]:
# Creating neural network
# Rebuild the single-unit network so the saved weights can be loaded into it.
# The input width comes from the quiz frame, so its columns have to match the
# features used for training. `keras` is not imported in this section; it
# relies on the earlier deep-learning import cell having been run.
model = keras.models.Sequential([
    keras.layers.Dense(1, input_shape=quiz.shape[1:]),
])

In [7]:
# Loading weights from training data
# Restores the weights written by the save_weights cell of the training section
model.load_weights('./checkpoints/my_checkpoint')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x25e0ec7bf70>

In [8]:
# Getting prediction as day into dataframe
# Predicted delivery time in days, rounded to whole days
raw_days = model.predict(quiz)
pred = np.around(raw_days).astype(int)
submission['pred'] = pred

In [15]:
# Getting prediction as a date
# Turn each whole-day prediction into a time offset, add it to the payment
# date, and keep only the resulting delivery date.
day_offsets = submission['pred'].map(lambda days: pd.Timedelta(days, unit='D'))
submission['pred'] = submission['payment_datetime'] + day_offsets
submission.drop(columns=['payment_datetime'], inplace=True)

In [22]:
# Loading record number
# Only the record identifiers are needed from the raw quiz file
records = pd.read_csv('quiz.tsv', sep='\t', usecols=['record_number'])

In [24]:
# Getting predictions
# The assignment aligns on the index, so `submission` must have the same rows
# in the same order as quiz.tsv; the prediction pipeline keeps every quiz row.
records['predicted_delivery_date'] = submission['pred']
del submission

In [27]:
# Exporting
# Tab-separated output with a header row and without the index column
records.to_csv('submission_2', sep='\t', index=False)