# Discovery File Validation Notebook
### Description: 
This notebook automates validation of the discovery files received from a client
and surfaces questions and example rows to raise with the client about the data they provided.

In [35]:
import os
import numpy as np
import pandas as pd


## Read in the files from the data directory and create a DataFrame for each file

In [36]:
# Entity types a discovery file is expected for; used below to report
# which ones were received and which are missing.
file_types = [
    'companies',
    'customers',
    'drivers',
    'moves',
    'orders',
    'stops',
    'tractors',
    'trailers'
    ]

data_path = '../data/amx/'

# Regular files only (sub-directories are skipped). Sorted so that, if two
# files share a type prefix, the one that ends up in `data` does not depend
# on the arbitrary order os.listdir() returns.
files = sorted(
    entry for entry in os.listdir(data_path)
    if os.path.isfile(os.path.join(data_path, entry))
)

# Map each file's type -- the part of its name before the first '-' -- to a
# DataFrame holding its tab-separated contents.
data = {}
for file in files:
    file_type = file.split('-')[0]
    # os.path.join keeps the read consistent with the isfile() check above and
    # works whether or not data_path ends with a separator.
    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk (NOTE(review): the saved output shows a
    # warning from this cell, presumably the mixed-type DtypeWarning -- confirm).
    data[file_type] = pd.read_csv(
        os.path.join(data_path, file), sep='\t', low_memory=False
    )

# Report every expected type that has no file, and remember the ones that do.
files_received = []
for file in file_types:
    if file in data:
        files_received.append(file)
    else:
        print(f'The {file} file is missing from the data folder')

for file in files_received:
    print(f"{file} - {type(data[file])}")

  exec(code_obj, self.user_global_ns, self.user_ns)


companies - <class 'pandas.core.frame.DataFrame'>
customers - <class 'pandas.core.frame.DataFrame'>
drivers - <class 'pandas.core.frame.DataFrame'>
moves - <class 'pandas.core.frame.DataFrame'>
orders - <class 'pandas.core.frame.DataFrame'>
stops - <class 'pandas.core.frame.DataFrame'>
tractors - <class 'pandas.core.frame.DataFrame'>
trailers - <class 'pandas.core.frame.DataFrame'>


  exec(code_obj, self.user_global_ns, self.user_ns)


### Validation for each file

#### Drivers File

In [40]:
# Validation report for the drivers file: schema overview, value
# distributions for a few categorical columns, and drivers that have a
# termination date after their hire date but are still flagged active.
driver_df = data['drivers']

# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line).
driver_df.info(verbose=True)
print("\n")
print("TYPE_OF - COUNTS")
print(driver_df['type_of'].value_counts())
print("\n")
print('COUNTRY_HOS_RULES - COUNTS')
print(driver_df['country_hos_rules'].value_counts())
print("\n")
print('GROUP_ID - COUNTS')
print(driver_df['group_id'].value_counts())
print("\n")

print("TERMINATED DRIVERS WITH TRUE ACTIVE STATUS")
# Both date columns are loaded as strings (object dtype), so comparing them
# directly is a lexicographic comparison. Parse them into local Series first
# (driver_df itself is left untouched); unparseable or missing values become
# NaT and compare as False, so those rows are simply excluded. utc=True keeps
# the two Series comparable even if only one of them carries a UTC offset.
hire_dates = pd.to_datetime(driver_df['hire_date'], errors='coerce', utc=True)
termination_dates = pd.to_datetime(driver_df['termination_date'], errors='coerce', utc=True)
terminated_drivers_df = driver_df[termination_dates > hire_dates]

# .eq(True) rather than using the column as a mask directly, so the filter
# still works if is_active is not loaded as a strict bool column.
still_active = terminated_drivers_df['is_active'].eq(True)
print(terminated_drivers_df.loc[still_active, ['is_active', 'termination_date', 'hire_date']])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 504 entries, 0 to 503
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   is_active            504 non-null    bool   
 1   latitude             504 non-null    float64
 2   type_of              504 non-null    object 
 3   last_home_date       24 non-null     object 
 4   __currentMovementId  504 non-null    int64  
 5   state                504 non-null    object 
 6   termination_date     288 non-null    object 
 7   hazmat_certified     504 non-null    bool   
 8   event_date           493 non-null    object 
 9   country_hos_rules    504 non-null    object 
 10  first_name           503 non-null    object 
 11  group_id             462 non-null    object 
 12  fleet_manager        442 non-null    object 
 13  company_id           504 non-null    object 
 14  license_date         504 non-null    object 
 15  hire_date            504 non-null    obj

#### Moves File

#### Orders File

In [4]:
# Validation report for the orders file: value counts for selected columns,
# followed by the rows in which selected columns are null.
orders_df = data['orders']

# (heading, column, check) for each report section, in print order.
order_checks = [
    ('STATUS - COUNTS', 'status', 'counts'),
    ('COMMODITY_ID - COUNTS', 'commodity_id', 'counts'),
    ('REVENUE_CODE_ID - COUNTS', 'revenue_code_id', 'counts'),
    ('CUSTOMER_ID - BLANKS', 'customer_id', 'blanks'),
    ('ORDERED_DATE - BLANKS', 'ordered_date', 'blanks'),
]

for section_number, (heading, column, check) in enumerate(order_checks):
    # Separator goes between sections only: none before the first one.
    if section_number > 0:
        print("\n")
    print(heading)
    if check == 'counts':
        print(orders_df[column].value_counts())
    else:
        missing_mask = orders_df[column].isnull()
        print(orders_df[missing_mask])

STATUS - COUNTS
D    119051
P       440
A       137
V         6
Name: status, dtype: int64


COMMODITY_ID - COUNTS
FOOD         17619
PA           11989
PAPERPRDS    10087
MISC          9206
BROKER        8803
FAK           6787
BLDGMTLS      6144
PKGMTLS       4686
PLASTIC       4682
TOBPRODS      2201
HVAC           875
FLOORING       806
CH             770
AUTO           739
PETSUP         677
ALCOHOL        570
PROLL          452
AIRCOND        425
TEXTILE        262
FERTILIZE      226
LUMBER         181
FURNITURE      163
BEVERAGE       147
MATTRESS       118
SCRAP          107
ANIMALFOO      103
NYLON           51
ELEC            41
RECYCLE         32
EBOTTLES        29
HEAVY           23
BEER            18
GLOVES          17
CHEM            13
COSME            8
AIR              6
FORKLIFT         6
MP               4
WIRE             2
ADIPIC           1
SNACK            1
Name: commodity_id, dtype: int64


REVENUE_CODE_ID - COUNTS
LS       23718
OTR      20399
ATL      16670
R

#### Stops File

In [5]:
# Validation report for the stops file: rows with null values in selected
# columns, plus the distribution of stop_type.
stops_df = data['stops']


def _report_blanks(heading, column):
    """Print `heading`, then every row of stops_df whose `column` is null."""
    print(heading)
    is_blank = stops_df[column].isnull()
    print(stops_df[is_blank])


_report_blanks('ACTUAL_ARRIVAL - BLANKS', 'actual_arrival')
print("\n")
_report_blanks('ACTUAL_DEPARTURE - BLANKS', 'actual_departure')
print("\n")

print('STOP_TYPE - COUNTS')
stop_type_counts = stops_df['stop_type'].value_counts()
print(stop_type_counts)
print("\n")

_report_blanks('SCHED_ARRIVE_EARLY - BLANKS', 'sched_arrive_early')
print("\n")
_report_blanks('SCHED_ARRIVE_LATE - BLANKS', 'sched_arrive_late')
print("\n")
_report_blanks('ZIP_CODE - BLANKS', 'zip_code')

ACTUAL_ARRIVAL - BLANKS
        zip_code              address state   order_id actual_arrival  \
547        35601      723OldTrinityRd    AL   297842.0            NaN   
558        39832      12551HWY273WEST    GA   296485.0            NaN   
568        46158  100SunPolymersDrive    IN  5149545.0            NaN   
569        36603    901EzraTriceBlvd.    AL  5149689.0            NaN   
571     18020600       4000MILLERCIRN    PA  5150004.0            NaN   
...          ...                  ...   ...        ...            ...   
427031     32218     1310TradeportDr.    FL  5152462.0            NaN   
427032     28079        600RadiatorRd    NC  5152506.0            NaN   
427033     51501          2849RiverRd    IA  5152506.0            NaN   
427034     95540       1000RiverRanch    CA  5152536.0            NaN   
427035     95354        671MariposaRd    CA  5152536.0            NaN   

            city_name driver_load_unload actual_departure  \
547           DECATUR                 

#### Tractors File

#### Trailers File