In [7]:
import pickle
import pandas as pd
import os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import joblib

## 1. Preprocess

In [65]:
import pgeocode
from uszipcode import SearchEngine, SimpleZipcode, Zipcode

# Cache of pgeocode distance calculators keyed by country code, so the
# calculator is constructed once instead of on every get_distance() call.
_GEO_DISTANCE_BY_COUNTRY = {}


def get_distance(zip1, zip2):
    """Return the pgeocode distance between two US postal codes.

    The ``pgeocode.GeoDistance('us')`` calculator is created on first use and
    reused afterwards, so repeated calls do not rebuild it.

    Parameters
    ----------
    zip1, zip2
        Postal codes, forwarded unchanged to
        ``GeoDistance.query_postal_code``.

    Returns
    -------
    Whatever ``GeoDistance.query_postal_code(zip1, zip2)`` returns.
    NOTE(review): pgeocode documents this as a distance in km -- confirm
    against the installed version.
    """
    calculator = _GEO_DISTANCE_BY_COUNTRY.get('us')
    if calculator is None:
        calculator = pgeocode.GeoDistance('us')
        _GEO_DISTANCE_BY_COUNTRY['us'] = calculator
    return calculator.query_postal_code(zip1, zip2)

# Shared uszipcode search engine; get_zip_details() reads this module-level instance.
search = SearchEngine()
def get_zip_details(zip):
    """Look up population statistics and the state for a ZIP code.

    Uses the module-level ``search`` engine. Any field that is missing or
    falsy is replaced by 0, because NaN/None values cause errors later in
    the pipeline.

    Parameters
    ----------
    zip
        ZIP code passed to ``search.by_zipcode``. (The name shadows the
        builtin ``zip`` inside this function; it is kept so existing
        callers are unaffected.)

    Returns
    -------
    tuple
        ``(population, population_density, housing_units, state)``, with 0
        substituted for every value that could not be found.
    """
    zipcode = search.by_zipcode(zip)
    # NOTE(review): depending on the uszipcode version, an unknown ZIP yields
    # either None or a record whose fields are None. getattr with a default
    # handles both instead of raising AttributeError on None.
    fields = ('population', 'population_density', 'housing_units', 'state')
    return tuple(getattr(zipcode, field, None) or 0 for field in fields)

def add_missing_dummy_columns(d, columns):
    """Add a zero-filled column to ``d`` for every name in ``columns`` it lacks.

    ``d`` is modified in place and nothing is returned. Missing columns are
    appended in the order they appear in ``columns`` so the resulting layout
    is the same on every run (iterating a set of strings is not).

    Parameters
    ----------
    d : pandas.DataFrame
        Frame to extend in place.
    columns : iterable
        Column names that must exist in ``d`` afterwards.
    """
    present = set(d.columns)
    for name in columns:
        if name not in present:
            d[name] = 0
            # Track additions so a name repeated in `columns` is handled once.
            present.add(name)

In [163]:
from datetime import datetime

def preprocess(shipment_date, shipper, std_weight, freight_charges, zone, sender_zip, recipient_zip,
               feature_names_path='data/feature_names.npz'):
    """Build the model-ready feature vector for a single shipment.

    Parameters
    ----------
    shipment_date : str
        Date in ``'%b %d %Y'`` format, e.g. ``"Jul 25 2019"``.
    shipper, zone
        Shipment attributes that are one-hot encoded.
    std_weight, freight_charges
        Shipment attributes that are cast to float64.
    sender_zip, recipient_zip
        ZIP codes passed to ``get_distance`` and ``get_zip_details``.
    feature_names_path : str, optional
        ``.npz`` archive containing the arrays ``feature_names`` (raw column
        names) and ``feature_names_dummified`` (column layout after one-hot
        encoding). Defaults to the path the function has always read.

    Returns
    -------
    numpy.ndarray
        1-D float64 array with one entry per name in
        ``feature_names_dummified``, in that order. Dummy columns this
        shipment does not produce are 0. No scaling is applied.

    Raises
    ------
    ValueError
        If ``shipment_date`` does not match ``'%b %d %Y'``.
    KeyError
        If ``feature_names`` lists a column this function does not build.
    """
    # Define standard service type
    std_service_type = "Ground"

    # Datetime features. Series.dt.week was removed in pandas 2.0; the ISO
    # calendar week from the standard library is the same number.
    parsed_date = datetime.strptime(shipment_date, '%b %d %Y')
    week_number = parsed_date.isocalendar()[1]
    day_of_week = parsed_date.weekday()  # Monday == 0, same as pandas dayofweek
    month = parsed_date.month

    # TODO(COLTON): MSA flags are still placeholders and always 0.
    sender_in_MSA = 0
    rec_in_MSA = 0
    same_MSA = 0

    # Distance between the two ZIP codes
    distance = get_distance(sender_zip, recipient_zip)

    # Population, density, no. houses, state code for recipient and sender
    recipient_pop, recipient_pop_density, recipient_houses, recipient_state = get_zip_details(recipient_zip)
    sender_pop, sender_pop_density, sender_houses, sender_state = get_zip_details(sender_zip)

    # Read both column layouts, closing the archive as soon as they are copied out.
    with np.load(feature_names_path) as feature_names:
        raw_columns = [str(name) for name in feature_names['feature_names']]
        dummified_columns = [str(name) for name in feature_names['feature_names_dummified']]

    # Categorical and float columns
    cat_cols = ['shipper', 'std_service_type', 'zone', 'week_number', 'day_of_week',
                'sender_state', 'recipient_state', 'same_MSA', 'sender_in_MSA', 'rec_in_MSA',
                'month']
    float_cols = ['std_weight', 'freight_charges', 'distance', 'sender_pop', 'sender_pop_density',
                  'sender_houses', 'recipient_pop', 'recipient_pop_density', 'recipient_houses']

    # Build the row by name so a different column order in the saved file
    # cannot silently put a value into the wrong feature.
    row = {
        'shipper': shipper,
        'std_service_type': std_service_type,
        'std_weight': std_weight,
        'freight_charges': freight_charges,
        'zone': zone,
        'sender_state': sender_state,
        'recipient_state': recipient_state,
        'distance': distance,
        'sender_pop': sender_pop,
        'sender_pop_density': sender_pop_density,
        'sender_houses': sender_houses,
        'recipient_pop': recipient_pop,
        'recipient_pop_density': recipient_pop_density,
        'recipient_houses': recipient_houses,
        'same_MSA': same_MSA,
        'sender_in_MSA': sender_in_MSA,
        'rec_in_MSA': rec_in_MSA,
        'week_number': week_number,
        'day_of_week': day_of_week,
        'month': month,
    }
    unknown = [name for name in raw_columns if name not in row]
    if unknown:
        raise KeyError(f"feature_names contains columns preprocess() does not build: {unknown}")

    df = pd.DataFrame([row], columns=raw_columns)
    df = df.astype({**dict.fromkeys(cat_cols, 'category'),
                    **dict.fromkeys(float_cols, 'float64')})

    # Dummify, then conform to the exact column layout the model was trained
    # on: unseen dummy columns become 0, dummies the model never saw are dropped.
    df = pd.get_dummies(df)
    df = df.reindex(columns=dummified_columns, fill_value=0)

    # Single row as a numeric array (dummy columns may be bool/uint8 depending
    # on the pandas version, so the dtype is made explicit).
    X_test = df.iloc[0].to_numpy(dtype='float64')

    # NOTE: the saved min-max scaler ('model/scaler') is intentionally not
    # applied here; the original code had that step commented out.
    return X_test

In [166]:
# Example run: build the feature vector for one sample shipment and display it.
# Positional arguments follow preprocess(): shipment date, shipper, weight,
# freight charges, zone, sender zip, recipient zip.
X_test = preprocess("Jul 25 2019", "fedex", 10, 20, 3, 15206, 15211)
X_test

array([1.00000000e+01, 2.00000000e+01, 9.79671146e+00, 2.86150000e+04,
       5.99100000e+03, 1.55850000e+04, 1.10810000e+04, 7.08300000e+03,
       6.48900000e+03, 1.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

## 2. Predict

In [16]:
'''
Features to be created:
    1. sender_MSA_num
    2. sender_in_MSA (binary 1 for MSA, 0 for MICRO)
    3. recipient_MSA_num
    4. recipient_in_MSA (binary 1 for MSA, 0 for MICRO) 
    
These features will then be used to determine whether the sender and recipient
are in the same MSA. This yields one additional
feature, as shown below:
    5. send_rec_same_MSA (binary)
    
Definition of CBSA, MSA, MICRO, and CSA as stated on inc.com:
https://www.inc.com/encyclopedia/metropolitan-statistical-area-msa.html

Core-Based Statistical Area
A CBSA is one or more counties with an urbanized cluster of at least 10,000 people. 
The area as a whole is defined by the interaction between the core and the outlying areas. 
This interaction, measured by commuting, means that at least 25 percent of people in 
outlying areas are working in the core. The CBSA is a generic definition of MICROs and MSAs, 
the difference being core population size.

Micropolitan Statistical Areas
A MICRO is simply a small CBSA, i.e., a county or counties with an urbanized core of 10,000 
but fewer than 50,000 in population. Outlying areas included are, again, defined by 
commuting patterns. As of November 2004, according to the Census Bureau, there were 575 MICROs 
in the U.S. and five in Puerto Rico.

Metropolitan Statistical Areas
An MSA has an urbanized core of minimally 50,000 population and includes outlying areas determined 
by commuting measures. In 2004, the U.S. had 362 MSAs and Puerto Rico eight.

Combined Statistical Areas
CSAs are two or more adjacent CBSAs in which there is at least a 15 percent employment interchange (measured 
by commuting) between cores. If this exchange is 25 percent or higher between a pair of CBSAs, they 
are combined into a CSA automatically; if the measure is at least 15 percent but below 25, local 
opinion in both areas is used to decide on combination. The U.S. had 116 CSAs in 2004.

'''

# Import zipcode to MSA map.
# os.path.join keeps the path portable (the separators were previously
# hard-coded as Windows "\\").
# NOTE(review): `cwd` and `cleaned_df_3` are not defined in this cell; they
# must already exist from earlier cells.
zipcode_to_MSA_df = pd.read_csv(os.path.join(cwd, "data", "zip_to_MSA_numbers.csv"), dtype=object)

cleaned_df_4 = cleaned_df_3.copy(deep=False)
start_rows = len(cleaned_df_4)
start_col = len(cleaned_df_4.columns)
print(f"Starting with {start_rows} rows, {start_col} columns")

# Change names of columns for simplicity in coding
zipcode_to_MSA_df.columns = ['zipcode', 'state', 'msa_num', 'county_num', 'msa_name']

# Creating dictionaries for mapping zipcodes to MSA numbers / names
zip_msa_num_dict = zipcode_to_MSA_df.set_index('zipcode')['msa_num'].to_dict()
zip_msa_name_dict = zipcode_to_MSA_df.set_index('zipcode')['msa_name'].to_dict()

# MSA number recorded for a zipcode that is absent from the lookup table.
UNKNOWN_MSA_NUM = 0

# For debugging purposes (zipcodes that don't show up in dictionary -> occurrence count)
zips_not_in_dict = {}


def _lookup_msa(zipcodes, msa_num_by_zip, msa_name_by_zip, missing_counts):
    """Map each zipcode to its MSA number and an "in MSA" flag.

    Returns two lists aligned with ``zipcodes``: the MSA number
    (``UNKNOWN_MSA_NUM`` when the zipcode is not in ``msa_num_by_zip``) and a
    flag that is 1 when the mapped name contains 'MSA', else 0. Zipcodes that
    are not found are tallied in ``missing_counts`` (mutated in place).
    """
    msa_nums = []
    in_msa_flags = []
    for zipcode in zipcodes:
        if zipcode in msa_num_by_zip:
            msa_nums.append(msa_num_by_zip[zipcode])
            in_msa_flags.append(1 if 'MSA' in msa_name_by_zip[zipcode] else 0)
        else:
            msa_nums.append(UNKNOWN_MSA_NUM)
            in_msa_flags.append(0)
            missing_counts[zipcode] = missing_counts.get(zipcode, 0) + 1
    return msa_nums, in_msa_flags


# Lists to be converted to dataframe column features
sender_MSA_num, sender_in_MSA = _lookup_msa(
    cleaned_df_4['sender_zip'], zip_msa_num_dict, zip_msa_name_dict, zips_not_in_dict)
recipient_MSA_num, recipient_in_MSA = _lookup_msa(
    cleaned_df_4['recipient_zip'], zip_msa_num_dict, zip_msa_name_dict, zips_not_in_dict)

# Sender and recipient share an MSA only when both were found and the numbers
# match. Two unmatched zipcodes both carry UNKNOWN_MSA_NUM and must not be
# counted as "same MSA".
send_rec_same_MSA = [
    int(s != UNKNOWN_MSA_NUM and s == r)
    for s, r in zip(sender_MSA_num, recipient_MSA_num)
]

# Creating columns and adding to dataframe.
# Plain lists are assigned positionally; wrapping them in pd.Series would
# align on the index and misplace values if cleaned_df_4 does not have a
# default 0..n-1 index.
cleaned_df_4['same_MSA'] = send_rec_same_MSA
cleaned_df_4['sender_in_MSA'] = sender_in_MSA
cleaned_df_4['rec_in_MSA'] = recipient_in_MSA
cleaned_df_4['sender_MSA_num'] = sender_MSA_num
cleaned_df_4['rec_MSA_num'] = recipient_MSA_num

end_rows = len(cleaned_df_4)
end_col = len(cleaned_df_4.columns)
print(f"Ending with {end_rows} rows, {end_col} columns")

Starting with 1612557 rows, 30 columns
Ending with 1612557 rows, 35 columns
