To run the following script, install the following via a terminal:

- pip install uszipcode
- pip install --upgrade uszipcode (if needed)
- pip install python-Levenshtein
- pip install fuzzywuzzy

Levenshtein may require a Microsoft C++ build package through Visual Studio. Follow prompts/errors as required.

In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import MySQLdb as db
import credentials
import pickle
import time
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [57]:
#Establish connection to the MySQLDB
MySQLDB = db.connect(credentials.host, credentials.user, credentials.password, credentials.db)

In [67]:
# Standardized shipping methods based primarily upon what is selectable through the FedEx API here:
# https://www.fedex.com/ratefinder/home. 'Home Delivery' and 'Smartpost' are not selectable.
fedex_methods = ['Same Day', 'First Overnight', 'Priority Overnight', 'First Overnight',
                 'Priority Overnight', 'Standard Overnight', '2Day AM', '2Day', 'Express Saver',
                 'Ground','Home Delivery','Smartpost']

# Standardized shipping methods based primarily on what is selectable through the API here:
# https://wwwapps.ups.com/ctc/request?loc=en_US. 'Surepost' and 'Standard' are not selectable.
ups_methods = ['Next Day Air Early', 'Next Day Air', 'Next Day Air Saver', '2nd Day Air A.M.',
               '2nd Day Air', '3 Day Select', 'Ground', 'Surepost', 'Standard']

# Standardized state names and codes of the 48 contiguous states based upon USPS standards found here:
# https://www.ups.com/worldshiphelp/WS14/ENU/AppHelp/Codes/State_Province_Codes.htm
state_codes = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA', 'ID', 'IL', 'IN', 
               'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH',
               'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT',
               'VT', 'VA', 'WA', 'WV', 'WI', 'WY']

# Instantiates variable for searching for state from zip
search = SearchEngine(simple_zipcode=True)

# Instantiates files for reading from and writing to a pickle file
file_path = "data/data.pickle"
pkl_file = open(file_path,"wb")

In [68]:
def query(lower,upper):
    """
    Queries for and returns a sample of records that meet where clause criteria
    """
    # initializes query based upon lower and upper data inputs
    sql_query = """
    select * from 
    (select * from 
    (select * from 
    (select year_week, business_sid, upper(trim(industry)) as industry, upper(trim(sub_industry)) as sub_industry,shipper,
    trim(service_type_description) as service_type,package_count, weight,shipment_date,delivery_date, delivery_time, 
    freight_charges,freight_discount_amount,misc_charges,misc_discount_amount, 
    net_charge_amount, zone, upper(trim(sender_city)) as sender_city, upper(trim(sender_state)) as sender_state,
    left(sender_zip,5) as sender_zip, upper(trim(recipient_city)) as recipient_city,
    upper(trim(recipient_state)) as recipient_state, left(recipient_zip,5) as recipient_zip 
    from libras.shipment_details 
    where sender_country = 'US' and recipient_country = 'US' and year_week >= {} and year_week < {}
    and delivery_date is not null
    ) t1 
    where t1.shipment_date is not null) t2 
    where t2.freight_charges > 0) t3 
    where t3.zone is not null or trim(zone)!='' 
    """.format(lower,upper)
    
    # queries database and samples results
    records = pd.read_sql_query(sql_query, MySQLDB).sample(frac = 0.14, replace = False) 
    
    return records

In [69]:
def preprocess(records):
    """
    Preprocesses records to satisfy common cleansing requirements between benchmarking and delivery prediction solutions
    """    
    # sets float dtypes and standardizes zones to single digits    
    float_cols = ['freight_charges','freight_discount_amount','misc_charges','misc_discount_amount','net_charge_cmount','zone']
    records = records[records.zone.apply(lambda x: x.isnumeric())]
    records[float_cols] = records[float_cols].astype('float64')
    records.zone %= 10

    # strips leading and trailing whitespaces from all string values
    obj_columns = records.select_dtypes(include='object').columns + ['zone']
    for column in obj_columns:
        records[column] = records[column].str.strip()
    
    # converts a subset columns of dtype 'object' to dtype 'category' for memory conservation and later ml use 
    cat_columns = ['industry','sub_industry','sender_state','recipient_state','zone']
    records[cat_cols] = records[cat_cols].astype('category')
    
    # creates std_weight (weight/package_count)
    records.insert(8, 'std_weight', records['weight'] / records['package_count'])
    
    # applies fuzzy macthing to each service type relative to the standardized list per carrier.
    service_type_fuzzy_match = []
    columns = ['shipper','service_type']
    for record in records[columns].itertuples():       
        if record.shipper == 'fedex':
            service_type_fuzzy_match.append(process.extractOne(record.service_type, fedex_methods, scorer = fuzz.partial_ratio))
        else:
            service_type_fuzzy_match.append(process.extractOne(record.service_type, ups_methods, scorer = fuzz.partial_ratio))  
            
    # adds the standardized service type and drops all records with service type scores less than 70
    records.insert(6, 'std_service_type', [method for method, score in service_type_fuzzy_match])
    records = records.assign(std_service_type_score = [score for method, score in service_type_fuzzy_match])
    records = records[records.std_service_type_score >= 70]
    
    # removes records with sender or recipient states residing outside of the 48 contiguous states
    records = records[(records.recipient_state.isin(state_codes + ['']))]
    records = records[(records.sender_state.isin(state_codes + ['']))]
    
    # drops unneeded columns 
    records = records.drop(['std_service_type_score'], axis=1)
    
    return records

In [70]:
def store(records):
    """
    Stores results from each query into a pickle
    """
    try:
        temp_df = pd.DataFrame()
        new_df = pd.read_pickle(file_path)
        print("Merged pickle contains", len(new_df.index), "records before merge")
        temp_df = new_df.append(records, sort=False)
        temp_df.to_pickle(file_path)
        print("Merged pickle contains", len(temp_df.index), "records after merge")
    except EOFError:
        records.to_pickle(file_path)
        print("Merged pickle contains", len(records.index), "records with initial merge")

In [71]:
def process_batch(start_time_, batch_num_, lower_date_range_, upper_date_range_):
    batch_start_time = time.time()
    print("Initiating batch ({}): year_week {} and {} (exclusive)".format(
        batch_num_, lower_date_range_, upper_date_range_))
    
    # queries for a batch of records within a given 4-week range
    print("Total time: {} min | Batch time: {} sec | Querying records".format(
        int(round((time.time() - start_time_)/60, 2)),
        int(round(time.time() - batch_start_time))))
    queried_results = query(lower = lower_date_range_, upper = upper_date_range_)
    
    # preprocesses the record batch
    print("Total time: {} min | Batch time: {} sec | Preprocessing records".format(
        int(round((time.time() - start_time_)/60, 2)),
        int(round(time.time() - batch_start_time))))
    preprocessed_results = preprocess(queried_results)
    
    # appends the record batch
    print("Total time: {} min | Batch time: {} sec | Storing records".format(
        int(round((time.time() - start_time_)/60, 2)),
        int(round(time.time() - batch_start_time))))
    store(preprocessed_results)
    
    # reports final time
    print("Total time: {} min | Batch time: {} sec | Records stored".format(
        int(round((time.time() - start_time_)/60, 2)),
        int(round(time.time() - batch_start_time))))
    print("========================================")

In [72]:
start_time = time.time()
batch_num = 1

#Get records for 2018
min_date = 201823
max_date = 201852 #201852
lower_date_range = min_date
upper_date_range = lower_date_range + 4
while upper_date_range <=max_date:
    if upper_date_range <=201852:
        upper_date_range = lower_date_range + 4
        process_batch(start_time, batch_num, lower_date_range, upper_date_range)
        lower_date_range=upper_date_range
        batch_num += 1

#Get records for 2019        
min_date = 201901
max_date = 201925 #201925
lower_date_range = min_date
upper_date_range = lower_date_range + 4
while upper_date_range <=max_date:
    if upper_date_range <=201925:
        upper_date_range = lower_date_range + 4
        process_batch(start_time, batch_num, lower_date_range, upper_date_range)
        lower_date_range=upper_date_range
        batch_num += 1

Initiating batch (1): year_week 201823 and 201827 (exclusive)
Total time: 0 min | Batch time: 0 sec | Querying records
Total time: 1 min | Batch time: 104 sec | Preprocessing records
Total time: 2 min | Batch time: 154 sec | Storing records
Merged pickle contains 226803 records with initial merge
Total time: 2 min | Batch time: 155 sec | Records stored
Initiating batch (2): year_week 201827 and 201831 (exclusive)
Total time: 2 min | Batch time: 0 sec | Querying records
Total time: 4 min | Batch time: 97 sec | Preprocessing records
Total time: 5 min | Batch time: 143 sec | Storing records
Merged pickle contains 226803 records before merge
Merged pickle contains 431513 records after merge
Total time: 5 min | Batch time: 146 sec | Records stored
Initiating batch (3): year_week 201831 and 201835 (exclusive)
Total time: 5 min | Batch time: 0 sec | Querying records
Total time: 6 min | Batch time: 111 sec | Preprocessing records
Total time: 7 min | Batch time: 172 sec | Storing records
Merged

In [74]:
pkl_file.close()
pd.read_pickle(file_path).sample(frac=0.25, replace=False).to_pickle('data/data_sample.pickle')