# Discovery File Validation Notebook
### Description: 
This notebook automates validation of discovery data files and surfaces
questions and example records to raise with the client about the files they provided.

In [None]:
import os
import numpy as np
import pandas as pd


## Read in the files from the data directory and create a DataFrame for each file

In [None]:
# Expected discovery file types.  A data file is matched to a type by the
# part of its file name that precedes the first '-'.
file_types = [
    'companies',
    'customers',
    'drivers',
    'moves',
    'orders',
    'stops',
    'tractors',
    'trailers',
]

# WRITE PATH TO DATA DIRECTORY BELOW

data_path = '../data/amx/'
# data_path = '../data/reedtms/'

files = []           # names of the regular files found directly in data_path
data = {}            # file-name prefix -> DataFrame loaded from that file
files_received = []  # expected file types for which a file was loaded

try:
    for entry in os.listdir(data_path):
        # Hidden files (e.g. macOS '.DS_Store') are not data files.
        if entry.startswith('.'):
            continue
        # Sub-directories are skipped; only regular files are loaded.
        if os.path.isfile(os.path.join(data_path, entry)):
            files.append(entry)
except (FileNotFoundError, NotADirectoryError):
    print("PLEASE MAKE SURE THE DATA PATH LEADS TO THE DATA DIRECTORY WHERE THE DATA FILES ARE LOCATED\n")
except NameError:
    # Only reachable when the data_path assignment above has been removed.
    print("PLEASE INSERT A VALID PATH STRING TO THE DATA DIRECTORY\n")

# Load every file as tab-separated text, keyed by its file-name prefix.
for file_name in files:
    file_type = file_name.split('-')[0]
    # os.path.join keeps the path valid whether or not data_path ends with a
    # separator; plain string concatenation does not.
    data[file_type] = pd.read_csv(
        os.path.join(data_path, file_name), sep='\t', low_memory=False
    )

# Report which of the expected file types were (not) delivered.
for file_type in file_types:
    if file_type in data:
        files_received.append(file_type)
    else:
        print(f'** The {file_type} file is missing from the data folder **'.upper())

if not files_received:
    print("\n!!! NO FILES RECEIVED !!!")

for file_type in files_received:
    print(f"{file_type} - {type(data[file_type])}")

### Validation for each file

#### Drivers File

In [None]:
# Validation report for the drivers file: schema overview, frequency tables
# for the categorical columns, and rows whose values look suspicious.
driver_df = data['drivers']

# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would add a stray "None" line to the output.
driver_df.info(verbose=True)
print("\n")

# Frequency tables for the categorical columns.
for column in ('type_of', 'country_hos_rules', 'group_id'):
    print(f'{column.upper()} - COUNTS')
    print(driver_df[column].value_counts())
    print("\n")

print('DRIVERS WITH ONDUTY HOURS OUTSIDE NORM')
# Flag available on-duty hours below 0 or above 14.
avl_onduty_hours_df = driver_df[(driver_df['avl_onduty_hours'] < 0) | (driver_df['avl_onduty_hours'] > 14)]
print(avl_onduty_hours_df[['avl_onduty_hours']])
print('\n')

print('DRIVERS WITH AVAILABLE DRIVER HOURS OUTSIDE NORM')
# Flag available drive hours below 0 or above 11.
avl_drive_hours_df = driver_df[(driver_df['avl_drive_hours'] < 0) | (driver_df['avl_drive_hours'] > 11)]
print(avl_drive_hours_df[['avl_drive_hours']])
print('\n')

# Compare the dates as timestamps rather than as raw column values: text
# dates would otherwise be ordered lexicographically.  Unparseable or blank
# values become NaT and never satisfy either comparison; utc=True puts both
# columns on the same timezone footing so they can always be compared.
hire_dates = pd.to_datetime(driver_df['hire_date'], errors='coerce', utc=True)
termination_dates = pd.to_datetime(driver_df['termination_date'], errors='coerce', utc=True)

print("TERMINATED DRIVERS WITH TRUE ACTIVE STATUS")
# Terminated after being hired, yet still flagged as active.
terminated_drivers_df = driver_df[termination_dates > hire_dates]
# `== True` is deliberate: the dtype of is_active is not known here, so the
# column is not assumed to be a ready-made boolean mask.
print(terminated_drivers_df[['is_active', 'termination_date', 'hire_date']].loc[terminated_drivers_df['is_active'] == True])
print('\n')

print("REHIRED DRIVERS WITH FALSE ACTIVE STATUS")
# Hired after the recorded termination (i.e. rehired), yet flagged inactive.
rehired_drivers_df = driver_df[termination_dates < hire_dates]
print(rehired_drivers_df[['is_active', 'termination_date', 'hire_date']].loc[rehired_drivers_df['is_active'] == False])

#### Moves File

In [None]:
# Validation report for the moves file: schema overview, frequency tables,
# blank order ids, negative revenue and implausible distances.
moves_df = data['moves']

# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would add a stray "None" line to the output.
moves_df.info(verbose=True)
print("\n")

for column in ('move_status', 'loaded'):
    print(f'{column.upper()} - COUNTS')
    print(moves_df[column].value_counts())
    print("\n")

print('ORDER_ID - BLANKS')
# Show only the rows whose order_id is blank (the boolean mask itself is not
# useful output), matching the other blank checks in this notebook.
order_id_df = moves_df[moves_df['order_id'].isnull()]
print(order_id_df[['order_id']])
print("\n")

print('PRORATED_REVENUE - NEGATIVE')
prorated_revenue_df = moves_df[moves_df['prorated_revenue'] < 0]
print(prorated_revenue_df[['prorated_revenue']])
print("\n")

print('BROKERAGE - COUNTS')
print(moves_df['brokerage'].value_counts())
print("\n")

print('MOVE_DISTANCE - OUT OF BOUNDS (<0 OR >4000)')
move_distance_df = moves_df[(moves_df['move_distance'] < 0) | (moves_df['move_distance'] > 4000)]
print(move_distance_df[['move_distance']])


#### Orders File

In [None]:
# Validation report for the orders file: schema overview, frequency tables,
# blank key columns, implausible distances and negative charges.
orders_df = data['orders']

# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would add a stray "None" line to the output.
orders_df.info(verbose=True)
print("\n")

for column in ('status', 'commodity_id', 'revenue_code_id'):
    print(f'{column.upper()} - COUNTS')
    print(orders_df[column].value_counts())
    print("\n")

# The checks below print the complete offending rows, not just one column.
print('CUSTOMER_ID - BLANKS')
print(orders_df[orders_df['customer_id'].isnull()])
print('\n')

print('CUSTOMER_ID - COUNTS TOP 10')
print(orders_df['customer_id'].value_counts().head(10))
print("\n")

print('ORDERED_DATE - BLANKS')
print(orders_df[orders_df['ordered_date'].isnull()])
print("\n")

print('BILL_DISTANCE - OUT OF BOUNDS (<0 OR >4000)')
print(orders_df[(orders_df['bill_distance'] < 0) | (orders_df['bill_distance'] > 4000)])
print("\n")

print('FREIGHT_CHARGE - NEGATIVE')
print(orders_df[orders_df['freight_charge'] < 0])
print("\n")

print('OTHERCHARGETOTAL - NEGATIVE')
print(orders_df[orders_df['otherchargetotal'] < 0])

#### Stops File

In [None]:
# Validation report for the stops file: schema overview, blank timestamps,
# stop-type frequencies and ZIP code format checks.
stops_df = data['stops']

# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would add a stray "None" line to the output.
stops_df.info(verbose=True)
print("\n")

print('ACTUAL_ARRIVAL - BLANKS')
actual_arrival_df = stops_df[stops_df['actual_arrival'].isnull()]
print(actual_arrival_df[['actual_arrival']])
print("\n")

print('ACTUAL_DEPARTURE - BLANKS')
actual_departure_df = stops_df[stops_df['actual_departure'].isnull()]
print(actual_departure_df[['actual_departure']])
print("\n")

print('STOP_TYPE - COUNTS')
print(stops_df['stop_type'].value_counts())
print("\n")

print('SCHED_ARRIVE_EARLY - BLANKS')
sched_arrive_early_df = stops_df[stops_df['sched_arrive_early'].isnull()]
print(sched_arrive_early_df[['sched_arrive_early']])
print("\n")

print('SCHED_ARRIVE_LATE - BLANKS')
sched_arrive_late_df = stops_df[stops_df['sched_arrive_late'].isnull()]
print(sched_arrive_late_df[['sched_arrive_late']])
print("\n")

print('ZIP_CODE - BLANKS')
zip_code_blank_df = stops_df[stops_df['zip_code'].isnull()]
print(zip_code_blank_df[['zip_code']])
print("\n")

# The format checks below need text values.  When every ZIP in the file is
# all digits, read_csv infers a numeric column and the .str accessor would
# raise, so numeric values are rendered as digit strings first (blanks stay
# blank).  stops_df itself is left untouched.
# NOTE(review): leading zeros cannot be recovered from a numeric column, so
# such ZIPs will show up under "Too Short" - confirm how the file is exported.
zip_codes = stops_df['zip_code']
if pd.api.types.is_numeric_dtype(zip_codes):
    zip_codes = zip_codes.map(
        lambda value: value if pd.isna(value) else str(int(value))
    ).astype(object)

print('ZIP_CODE - INCORRECT FORMAT - Too Short')
zip_code_short_df = stops_df[zip_codes.str.len() < 5]
print(zip_code_short_df[['zip_code']])
print("\n")

print('ZIP_CODE - INCORRECT FORMAT - Too Long')
# Only the part before the first hyphen is measured, so a ZIP+4 value such as
# '12345-6789' is not reported; the printed value is that leading part.
zip_code_partition_hyphen_df = zip_codes.str.partition('-')
zip_code_long_df = zip_code_partition_hyphen_df[zip_code_partition_hyphen_df[0].str.len() > 5]
zip_code_long_df = zip_code_long_df.rename(columns={0: 'zip_code'})
print(zip_code_long_df[['zip_code']])
print("\n")

print('ZIP_CODE - INCORRECT FORMAT - Non-numeric Characters')
# Hyphens are allowed, so strip them (as a literal, not a regex) before testing.
zip_code_no_hyphen_df = zip_codes.str.replace('-', '', regex=False)
# Blank ZIPs yield a missing value here, which does not equal False, so they
# are not reported a second time by this check.
zip_code_alpha_df = zip_code_no_hyphen_df[zip_code_no_hyphen_df.str.isnumeric() == False]
print(zip_code_alpha_df)
print("\n")

#### Tractors File

#### Trailers File