# Delivery Prediction - Batch Prediction
This notebook covers the following:
- Getting correct input for batch prediction

In [17]:
import datetime
import os
import time

import joblib
import numpy as np
import pandas as pd
import pgeocode
import shippo
import zipcodes
from tabulate import tabulate
from tqdm import tqdm
from uszipcode import SearchEngine

import delivery_prediction_preprocess
import paths
import utilities

## Import batch CSV

In [20]:
# Import batch
batch_sample_csv = pd.read_csv('../data/delivery_prediction/input/delivery_prediction_batch_sample_cmu.csv', dtype=str)
batch_sample_csv.head()

Unnamed: 0,shipment_date,sender_zip,recipient_zip,weight,shipper,service_type,zone
0,2019-05-22,15213,33324,33,fedex,fedex_first_overnight,5
1,2019-11-08,29910,16066,33,fedex,fedex_priority_overnight,4
2,2019-03-15,29673,43015,41,fedex,fedex_standard_overnight,4
3,2019-06-15,21044,3053,3,fedex,fedex_2_day_am,4
4,2019-07-01,30019,10954,3,fedex,fedex_2_day,5


## Validate batch function

In [43]:
def validate_batch(df):
    """Validates input in CSV.

    Please see module documentation for valid input.

    Args:
        df (pandas dataframe obj): Pandas dataframe that must contain service_type

    Returns:
        True if batch passes validation
    """
    print("Validating input...")
    # Below loading edited for current ipynb file path 
    fedex_services_dict = joblib.load("../"+paths.fedex_service_types_to_time_window)
    ups_services_dict = joblib.load("../"+paths.ups_service_types_to_time_window)
    all_ok = True
    for row in df.itertuples():
        # Validate da1te input
        try:
            datetime.datetime.strptime(row.shipment_date, '%Y-%m-%d')
        except ValueError:
            print(
                f"Found incorrect date input `{row.shipment_date}` in row {row.Index}. Please make sure dates are in YYYY-MM-DD format before trying again.")
            all_ok = False
        # Validate zip codes
        ok = False
        if row.sender_zip.isdigit() and len(row.sender_zip) == 5:
            ok = zipcodes.is_real(row.sender_zip) and \
                 (zipcodes.matching(row.sender_zip)[0]['state'] != 'HI') and \
                 (zipcodes.matching(row.sender_zip)[0]['state'] != 'AK')
        if not ok:
            print(
                f"Found invalid sender zipcode `{row.sender_zip}` in row {row.Index}.\nPlease amend zipcode before trying again.")
            all_ok = False
        # Validate zip codes
        ok = False
        if row.recipient_zip.isdigit() and len(row.recipient_zip) == 5:
            ok = zipcodes.is_real(row.recipient_zip)
        if not ok:
            print(
                f"Found invalid sender zipcode `{row.recipient_zip}` in row {row.Index}. Please amend zipcode before trying again.")
            all_ok = False
        # Validate weight
        try:
            weight = float(row.weight)
        except ValueError:
            print(f"Found invalid weight `{row.weight}` in row {row.Index}. Please amend weight before trying again.")
            all_ok = False
        # Validate shipper
        ok = False
        if row.shipper.lower() in ['fedex', 'ups']:
            ok = True
        if ok == False:
            print(
                f"Found invalid shipper `{row.shipper}` in row {row.Index}. Model only takes fedex and ups now. Please amend shipper before trying again.")
            all_ok = False
        # Validate service_type
        ok = False
        if row.shipper.lower() in ['fedex']:
            if row.service_type.lower() in list(fedex_services_dict.keys()):
                ok = True
        elif row.shipper.lower() in ['ups']:
            if row.service_type.lower() in list(ups_services_dict.keys()):
                ok = True
        if ok == False:
            print(
                f"Found invalid service type `{row.service_type}` in row {row.Index}. Please amend service type before trying again.")
            all_ok = False
        # Validate zone
        ok = False
        try:
            if int(row.zone) in range(2, 9):
                ok = True
            if not ok:
                print(f"Found invalid zone `{row.zone}` in row {row.Index}. Model only takes zones between 2 and 8. Please amend zone before trying again.")
        except ValueError:
            print(f"Zone must be a number")
            all_ok = False
    return all_ok

## Validate current sample CSV

In [22]:
validate_batch(batch_sample_csv)

Validating input...


True

## What is validated?

To note: For CSV batch shipment input, CSV file must be stored in delivery_prediction/input folder and
contain the following column names with fixed format:
- shipment_date (str): YYYY-MM-DD format
- sender_zip (str): String representation of 5-digit zipcode, Non Alaska / Hawaii
- recipient_zip (str): String representation of 5-digit zipcode, Non Alaska / Hawaii
- weight (str): Shipment weight in pounds
- shipper (str): ups or fedex
- service_type (str): Shipper service type with fixed format. See below.
- zone (int): 2 to 8

In [35]:
fedex_services_dict = joblib.load("../"+paths.fedex_service_types_to_time_window)
ups_services_dict = joblib.load("../"+paths.ups_service_types_to_time_window)
print("Allowed service_type strings:")
print("FEDEX:", end=" ")
print(list(fedex_services_dict.keys()))
print("----------")
print ("UPS:", end=" ")
print(list(ups_services_dict.keys()))

Allowed service_type strings:
FEDEX: ['fedex_first_overnight', 'fedex_priority_overnight', 'fedex_standard_overnight', 'fedex_2_day_am', 'fedex_2_day', 'fedex_express_saver']
----------
UPS: ['ups_next_day_air_early_am', 'ups_next_day_air', 'ups_next_day_air_saver', 'ups_second_day_air_am', 'ups_second_day_air', 'ups_3_day_select']


## Example of invalid input in CSV

In [37]:
# Invalid zip codes
batch_sample_csv.at[1, 'sender_zip'] = "abc"
batch_sample_csv.at[2, 'sender_zip'] = "11111"
batch_sample_csv.at[10, 'recipient_zip'] = "99501" # Alaska zipcode
batch_sample_csv.at[12, 'recipient_zip'] = "96701" # Hawaii zipcode
# Invalid shipment_date
batch_sample_csv.at[11, 'shipment_date'] = "2019-02-30" 
batch_sample_csv.at[13, 'shipment_date'] = "2019" 
batch_sample_csv.at[17, 'shipment_date'] = "a" 
# Invalid weight
batch_sample_csv.at[20, 'weight'] = "a" 
# Invalid shipper
batch_sample_csv.at[2, 'shipper'] = "usps" 
# Invalid service_type
batch_sample_csv.at[0, 'service_type'] = "ups_next_day_air"  # UPS service for fedex shipment
batch_sample_csv.at[20, 'service_type'] = "abc"
# Invalid zone
batch_sample_csv.at[14, 'zone'] = "10" # Not in range 2-8
batch_sample_csv.at[19, 'zone'] = "a"
batch_sample_csv.at[14, 'zone'] = 1

In [44]:
validate_batch(batch_sample_csv)

Validating input...
Found invalid service type `ups_next_day_air` in row 0. Please amend service type before trying again.
Found invalid sender zipcode `abc` in row 1.
Please amend zipcode before trying again.
Found invalid sender zipcode `11111` in row 2.
Please amend zipcode before trying again.
Found invalid shipper `usps` in row 2. Model only takes fedex and ups now. Please amend shipper before trying again.
Found invalid service type `fedex_standard_overnight` in row 2. Please amend service type before trying again.
Found incorrect date input `2019-02-30` in row 11. Please make sure dates are in YYYY-MM-DD format before trying again.
Found incorrect date input `2019` in row 13. Please make sure dates are in YYYY-MM-DD format before trying again.
Found invalid zone `1` in row 14. Model only takes zones between 2 and 8. Please amend zone before trying again.
Found incorrect date input `a` in row 17. Please make sure dates are in YYYY-MM-DD format before trying again.
Zone must be a 

False