To run this script, first install the following packages:

- pip install python-Levenshtein
- pip install fuzzywuzzy

python-Levenshtein may require the Microsoft C++ build tools, which are installed through Visual Studio. Follow the installer prompts as required.

In [31]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import sys
import os
import os.path

In [4]:
# Standardized FedEx shipping methods, based primarily upon what is selectable through the
# FedEx API here: https://www.fedex.com/ratefinder/home. 'Home Delivery' and 'Smartpost' are
# not selectable through the API. This is because these services are available to businesses only.
# Each method is listed exactly once so the fuzzy matcher does not score the same candidate
# twice; the order of first appearance is kept so ties between candidates resolve as before.
fedex_methods = ['Same Day', 'First Overnight', 'Priority Overnight', 'Standard Overnight',
                 '2Day AM', '2Day', 'Express Saver', 'Ground', 'Home Delivery', 'Smartpost']

# Standardized UPS shipping methods, based primarily on what is selectable through the API
# here: https://wwwapps.ups.com/ctc/request?loc=en_US. 'Surepost' and 'Standard' are not
# selectable through the API. Standard is Ground to the 48 contiguous states, whereas Ground
# includes Alaska and Hawaii.
ups_methods = [
    'Next Day Air Early',
    'Next Day Air',
    'Next Day Air Saver',
    '2nd Day Air A.M.',
    '2nd Day Air',
    '3 Day Select',
    'Ground',
    'Surepost',
    'Standard',
]

In [35]:
# Loads the merged data from a pickle file, then draws a 1% random sample of its rows
# (without replacement) to work on. The sampling step will be removed in the final solution.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
data = pd.read_pickle(
    'C:/Users/royce/Downloads/test_merged.pickle'
)
data_sample = data.sample(
    frac=0.01,
    replace=False,
)

In [7]:
# Replaces the sample's column labels with snake_case names. The labels are assigned by
# position, so this list must stay in the same order as the columns of the loaded data.
data_sample.columns = [
    'year_week',
    'business_sid',
    'industry',
    'sub_industry',
    'shipper',
    'service_type',
    'package_count',
    'weight',
    'shipment_date',
    'delivery_date',
    'delivery_time',
    'freight_charges',
    'freight_discount_amount',
    'misc_charges',
    'misc_discount_amount',
    'net_charge_amount',
    'zone',
    'sender_city',
    'sender_zip',
    'recipient_city',
    'recipient_zip',
]

In [8]:
# Fuzzy-matches each record's service_type against the standardized shipping methods of its
# carrier, using 'fuzz.partial_ratio' as the scoring algorithm. process.extractOne returns the
# standardized method with the highest score as a two-item tuple, i.e.,
# (standardized_shipping_method, score). The term 'partial' is used because the scorer looks for
# subset / superset pairs. For instance, 'Big is Better' and 'Better' would match with a score
# of 100 because 'Better' perfectly matches a subset of the words in 'Big is Better'. For
# shipping methods, 'Ground Commercial' and 'Ground' will match with a score of 100 because
# 'Ground' perfectly matches a subset of the words in 'Ground Commercial'.
# Any record whose shipper is not 'fedex' is matched against the UPS methods.

data_sample_partial_ratio = []
for _, record in data_sample[['shipper', 'service_type']].iterrows():
    # Pick the candidate list for this record's carrier, then keep the single best match.
    candidate_methods = fedex_methods if record['shipper'] == 'fedex' else ups_methods
    best_match = process.extractOne(record['service_type'],
                                    candidate_methods,
                                    scorer=fuzz.partial_ratio)
    data_sample_partial_ratio.append(best_match)

In [22]:
# Same fuzzy matching as the iterrows version, but iterating with itertuples: each record's
# service_type is scored with 'fuzz.partial_ratio' against the standardized shipping methods
# of its carrier, and process.extractOne returns the method with the highest score as a
# two-item tuple, i.e., (standardized_shipping_method, score).
# Any record whose shipper is not 'fedex' is matched against the UPS methods.

## USING ITERTUPLES ##
data_sample_partial_ratio = [
    process.extractOne(
        row.service_type,
        fedex_methods if row.shipper == 'fedex' else ups_methods,
        scorer=fuzz.partial_ratio,
    )
    for row in data_sample[['shipper', 'service_type']].itertuples()
]

In [23]:
# Splits the (standardized_shipping_method, score) pairs into two new columns on the data:
# the matched standardized method and its score. The score column is only here for review
# and should be dropped from the final solution.

data_sample = data_sample.assign(
    partial_ratio_std_shipping_method=[
        matched_method for matched_method, _ in data_sample_partial_ratio
    ],
    partial_ratio_score=[
        match_score for _, match_score in data_sample_partial_ratio
    ],
)

In [24]:
# Writes the original and standardized shipping methods, with their match scores, to a CSV
# file for manual review / comparison. This and all other cells should be removed from the
# final solution.

data_sample.to_csv(
    'C:/Users/royce/Downloads/Capstone/shipping_methods.csv',
    columns=[
        'shipper',
        'service_type',
        'partial_ratio_std_shipping_method',
        'partial_ratio_score',
    ],
)