In [1]:
import pickle
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime, timedelta
import joblib
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE

In [2]:
# import cleaned dataframe
cwd = os.getcwd()
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# files that come from a trusted source.
# The context manager closes the file handle as soon as the frame is loaded.
with open('data/data.pkl', "rb") as pkl_file:
    cleaned_df = pickle.load(pkl_file)

# 2. Add month of year

In [3]:
# 2. Add features
# Work on a shallow copy so the new column is attached to cleaned_df_1 only.
cleaned_df_1 = cleaned_df.copy(deep=False)
start_rows, start_col = cleaned_df_1.shape
print(f"Starting with {start_rows} rows, {start_col} columns")

## Create month of year feature
# Calendar month number taken from the shipment date.
cleaned_df_1['month'] = cleaned_df_1.shipment_date.dt.month

end_rows, end_col = cleaned_df_1.shape
print(f"Ending with {end_rows} rows, {end_col} columns")

Starting with 11515017 rows, 42 columns
Ending with 11515017 rows, 43 columns


# 3. Create target variable Y i.e. time window

In [4]:
### Keep rows from -1 to 5 days in transit
# NOTE(review): np.arange(-1, max_days_to_keep) stops *before* max_days_to_keep,
# so the values actually kept are -1, 0, ..., max_days_to_keep - 1. Rows with
# exactly 5 days in transit are dropped even though the messages say "-1 to 5".
# Confirm which of the two is intended before changing either.
start_rows, start_col = cleaned_df_1.shape
print(f"Starting with {start_rows} rows, {start_col} columns")

max_days_to_keep = 5
days_kept = np.arange(-1, max_days_to_keep)
cleaned_df_1 = cleaned_df_1[cleaned_df_1['days_in_transit'].isin(days_kept)]
days_in_transit_rows = cleaned_df_1.shape[0]
print(f"Keep -1 to {max_days_to_keep} days in transit: Removed {start_rows - days_in_transit_rows} rows")

end_rows, end_col = cleaned_df_1.shape
print(f"Ending with {end_rows} rows, {end_col} columns")

Starting with 11515017 rows, 43 columns
Keep -1 to 5 days in transit: Removed 100697 rows
Ending with 11414320 rows, 43 columns


In [8]:
## Create time window logic
### Window percentage of day
def _fraction_of_day(hours, minutes=0):
    """Return the clock time hours:minutes expressed as a fraction of a 24-hour day."""
    return timedelta(hours=hours, minutes=minutes).total_seconds() / timedelta(days=1).total_seconds()


eight_am = _fraction_of_day(8)            #### 8.00am
ten_thirty_am = _fraction_of_day(10, 30)  #### 10.30am
three_pm = _fraction_of_day(15)           #### 3.00pm
four_thirty_pm = _fraction_of_day(16, 30) #### 4.30pm
six_thirty_pm = _fraction_of_day(18, 30)  #### 6.30pm

# Same cut-offs collected in chronological order.
percentage_of_day_list = [eight_am, ten_thirty_am, three_pm, four_thirty_pm, six_thirty_pm]


print(f"8.00am: {eight_am} day, \
10.30am: {ten_thirty_am} day, \
3.00pm: {three_pm} day, \
4.30pm: {four_thirty_pm} day, \
6.30pm: {six_thirty_pm} day")

8.00am: 0.3333333333333333 day, 10.30am: 0.4375 day, 3.00pm: 0.625 day, 4.30pm: 0.6875 day, 6.30pm: 0.7708333333333334 day


In [9]:
### Create time window thresholds
# Upper bounds of the time windows, in days and in ascending order: the integer
# part is the whole-day offset, the fractional part the time-of-day cut-off.
time_window_thresholds = [
    # up to 1 day
    eight_am, ten_thirty_am, three_pm, 1,
    # between 1 and 2 days
    1 + ten_thirty_am, 1 + four_thirty_pm, 1 + six_thirty_pm, 2,
    # between 2 and 3 days
    2 + four_thirty_pm, 2 + six_thirty_pm, 3,
    # whole-day windows
    4, 5,
]
  
### Create time window assignment function
#### One window per threshold plus Window 0:
#### time <= 0 -> Window 0,
#### time in (previous threshold, i-th threshold] -> Window i (1-based)
def assign_time_window(time, thresholds=None):
    """Map a delivery time (in days) to the index of its time window.

    Parameters
      - time: elapsed time in days (may be fractional).
      - thresholds: ascending upper bounds of windows 1..n. Defaults to the
        module-level ``time_window_thresholds``.

    Returns
      - 0 when ``time <= 0``;
      - the 1-based position of the first threshold that is >= ``time``;
      - None when ``time`` is NaN or larger than the last threshold, i.e. no
        window applies.
    """
    if thresholds is None:
        thresholds = time_window_thresholds
    # If 0 (or fewer) days, assign to window 0
    if time <= 0:
        return 0
    # Thresholds are ascending, so the first upper bound that is not exceeded
    # identifies the window; counting starts at 1 since Window 0 is occupied.
    for window, upper_bound in enumerate(thresholds, start=1):
        if time <= upper_bound:
            return window
    # No window matched (NaN input or beyond the last threshold).
    return None

# The reported count is len(time_window_thresholds) + 1 because the implicit
# lower bound 0 is counted alongside the listed thresholds.
print(f"No. of thresholds: {len(time_window_thresholds)+1}, Thresholds: 0 and {time_window_thresholds}")

No. of thresholds: 14, Thresholds: 0 and [0.3333333333333333, 0.4375, 0.625, 1, 1.4375, 1.6875, 1.7708333333333335, 2, 2.6875, 2.7708333333333335, 3, 4, 5]


In [10]:
### Assign time window
start_rows = len(cleaned_df_1)
start_col = len(cleaned_df_1.columns)
print(f"Starting with {start_rows} rows, {start_col} columns")

# Register tqdm's progress_apply on pandas objects, then map every delivery
# time to its window index (the target variable Y).
tqdm.pandas(desc="Assign time window")
cleaned_df_1['Y'] = cleaned_df_1['days_taken_float'].progress_apply(assign_time_window)

end_rows = len(cleaned_df_1)
end_col = len(cleaned_df_1.columns)
print(f"Ending with {end_rows} rows, {end_col} columns")

# The window distribution is visualised in its own cell below: a Styler built
# in the middle of a cell is never rendered (only the last expression is), so
# computing it here was wasted work.

## Reset index since no more removing of rows
cleaned_df_1 = cleaned_df_1.reset_index(drop=True)

Assign time window:   0%|          | 0/11414320 [00:00<?, ?it/s]

Starting with 11414320 rows, 43 columns


Assign time window: 100%|██████████| 11414320/11414320 [00:18<00:00, 604052.88it/s]


Ending with 11414320 rows, 43 columns


In [11]:
# Spot-check the first 20 rows: raw delivery time next to the assigned window.
cleaned_df_1[['days_taken_float', 'Y']].head(20)

Unnamed: 0,days_taken_float,Y
0,4.688889,13
1,3.526389,12
2,0.557639,3
3,0.557639,3
4,0.557639,3
5,3.517361,12
6,3.6875,12
7,3.495833,12
8,4.615972,13
9,3.65,12


In [12]:
### Visualize time window assignment
# Row count per window, rendered as an in-cell bar chart.
cleaned_df_1['Y'].value_counts().to_frame().style.bar()

Unnamed: 0,Y
6,2667608
9,2117051
3,1791890
12,1769008
5,677888
13,675174
2,557137
4,509351
7,241163
10,169003


# 5.1 Clean data - remove unneeded columns 

In [13]:
# List every available column before choosing which ones to keep as features.
cleaned_df_1.columns

Index(['year_week', 'business_sid', 'industry', 'sub_industry', 'shipper',
       'service_type', 'std_service_type', 'package_count', 'weight',
       'std_weight', 'shipment_date', 'delivery_date', 'delivery_time',
       'freight_charges', 'freight_discount_amount', 'misc_charges',
       'misc_discount_amount', 'net_charge_amount', 'zone', 'sender_city',
       'sender_state', 'sender_zip', 'recipient_city', 'recipient_state',
       'recipient_zip', 'week_number', 'day_of_week', 'days_in_transit',
       'days_taken_float', 'Y', 'same_MSA', 'sender_in_MSA', 'rec_in_MSA',
       'sender_MSA_num', 'rec_MSA_num', 'distance', 'sender_pop',
       'sender_pop_density', 'sender_houses', 'recipient_pop',
       'recipient_pop_density', 'recipient_houses', 'month'],
      dtype='object')

In [14]:
# Start the feature frame from a shallow copy of the labelled data.
cleaned_df_2 = cleaned_df_1.copy(deep=False)
start_rows, start_col = cleaned_df_2.shape
print(f"Starting with {start_rows} rows, {start_col} columns")

# Columns to keep
columns_kept = [
    'shipper', 'std_service_type', 'std_weight', 'freight_charges', 'zone',
    'sender_state', 'recipient_state', 'distance',
    'sender_pop', 'sender_pop_density', 'sender_houses',
    'recipient_pop', 'recipient_pop_density', 'recipient_houses',
    'same_MSA', 'sender_in_MSA', 'rec_in_MSA', 'sender_MSA_num', 'rec_MSA_num',
    'week_number', 'day_of_week', 'month',
    'Y',
]

cleaned_df_2 = cleaned_df_2[columns_kept]

end_rows, end_col = cleaned_df_2.shape
print(f"Removed unneeded columns: Removed {start_col-end_col} columns")
print(f"Ending with {end_rows} rows, {end_col} columns")

Starting with 11414320 rows, 43 columns
Removed unneeded columns: Removed 20 columns
Ending with 11414320 rows, 23 columns


# 5.2 Clean data - categorize

In [15]:
# Features that will be cast to the pandas 'category' dtype.
cat_cols = ['shipper','std_service_type','zone',
            'sender_state','recipient_state',
            'same_MSA', 'sender_in_MSA', 'rec_in_MSA', 'sender_MSA_num', 'rec_MSA_num',
            'week_number','day_of_week','month']

# Features that will be cast to float64.
float_cols = ['std_weight','freight_charges','distance',
                'sender_pop', 'sender_pop_density', 'sender_houses', 
                'recipient_pop', 'recipient_pop_density', 'recipient_houses']

# Every feature column must be in exactly one of the two lists. Comparing names
# (not just counts) also catches duplicated or misspelled entries.
assert not set(cat_cols) & set(float_cols), "a column is listed as both categorical and float"
assert set(cat_cols) | set(float_cols) == set(cleaned_df_2.columns) - {'Y'}, \
    "cat_cols + float_cols must cover every column except Y"
assert len(cat_cols) + len(float_cols) == len(cleaned_df_2.columns) - 1 # -1 for Y variable

In [16]:
# Cast every feature in one pass: categorical columns to 'category', numeric
# columns to float64. Columns not named in the mapping keep their dtype.
feature_dtypes = {col: 'category' for col in cat_cols}
feature_dtypes.update({col: 'float64' for col in float_cols})
cleaned_df_2 = cleaned_df_2.astype(feature_dtypes)

In [17]:
## Export to pickle
# joblib.dump returns the list of file names it wrote, which is what the cell displays.
# NOTE(review): the ".z" suffix presumably makes joblib compress the file - confirm against the joblib docs.
joblib.dump(cleaned_df_2, "./data/final_df_14_Windows.pkl.z")

['./data/final_df_14_Windows.pkl.z']

## One hot encode

In [25]:
# Reload the frame from the same path the export cell writes to
# (presumably so this section can be run without repeating the steps above).
final_df = joblib.load('data/final_df_14_Windows.pkl.z')

In [26]:
# Confirm the reloaded frame holds the expected feature columns plus the target Y.
final_df.columns

Index(['shipper', 'std_service_type', 'std_weight', 'freight_charges', 'zone',
       'sender_state', 'recipient_state', 'distance', 'sender_pop',
       'sender_pop_density', 'sender_houses', 'recipient_pop',
       'recipient_pop_density', 'recipient_houses', 'same_MSA',
       'sender_in_MSA', 'rec_in_MSA', 'sender_MSA_num', 'rec_MSA_num',
       'week_number', 'day_of_week', 'month', 'Y'],
      dtype='object')

In [27]:
def prepare_data(X, y, oversample=False, train_size=0.8, test_size=0.2, random_state=71):
    '''
    Split, scale and optionally oversample the data for classification.

    It expects the following parameters:
      - X: feature columns
      - y: target variable column
      - oversample: when truthy, oversample the training set with SMOTE
        (the test set is never resampled)
      - train_size: proportion of dataset used for training
      - test_size: proportion of dataset used for testing
      - random_state: the random seed used for the split and for SMOTE

    This function returns a dictionary with the following entries
      - X_train: the matrix of training data
      - y_train: the array of training labels
      - X_test: the matrix of testing data
      - y_test: the array of testing labels
    '''
    # Split data
    print("Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, test_size=test_size, random_state=random_state)

    # Scale the variables
    # The scaler is fitted on the training split only and then applied to the
    # test split, so the test data does not influence the scaling.
    print("Scaling data...")
    scaler = preprocessing.MinMaxScaler()

    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # SMOTE
    # Oversample training set
    if oversample:
        # NOTE(review): newer imbalanced-learn releases deprecate SMOTE's
        # n_jobs argument - confirm against the installed version.
        smote = SMOTE(n_jobs=-1, random_state=random_state)
        print("Oversampling...", X_train.shape, X_test.shape)
        # fit_resample is the current name of the former fit_sample method.
        X_train, y_train = smote.fit_resample(X_train, y_train)
        print("Oversampled: ", X_train.shape, X_test.shape)

    # return training and testing data
    out = {'X_train': X_train, 'y_train': y_train,
           'X_test': X_test, 'y_test': y_test}

    return out

In [None]:
# Separate the target from the features without rebinding final_df, so this
# cell can be re-run safely (the original dropped 'Y' from final_df itself and
# then failed on a second execution).
y = final_df['Y']
# One-hot encode the categorical feature columns.
ohe_df = pd.get_dummies(final_df.drop(columns=['Y']))
X = ohe_df

# Free up memory: if RAM is tight, `del` the intermediate frames
# (final_df, cleaned_df_1, cleaned_df_2) once X and y exist.

In [None]:
# Sanity check: X and y are expected to have the same number of rows.
print(X.shape, y.shape)

In [None]:
# Split and scale only; SMOTE oversampling is switched off for this run.
data_dict = prepare_data(X,y, oversample=False)

In [None]:
# Persist the four splits in one compressed .npz archive, keyed by split name.
npz_path = 'data/data_dict_14windows_SMOTEno_MSAyes.npz'
split_names = ('X_train', 'y_train', 'X_test', 'y_test')
np.savez_compressed(npz_path, **{name: data_dict[name] for name in split_names})