In [None]:
import pandas as pd
import re

#!python --version  #Python 3.8.2
#pd.__version__     #1.0.3 
#re.__version__     #2.2.1

In [None]:
# --- CUSTOM SETTINGS: adjust these for your environment ---

# Directory that holds the local data files. Keep the trailing slash:
# file names are appended to this value by plain string concatenation.
path_datafiles = 'OriginalData/'

# State names/abbreviations table from US Postal Service Publication 28:
#              https://pe.usps.com/text/pub28/28apb.htm
#        Used to verify that state abbreviations are valid.
#        The first column must be the state name, the second column the
#        state abbreviation.
# Name of that file; it is read from inside path_datafiles.
postal_file = "20200712_StateAbbreviations.txt"

# Quarterly LESO Shipment and Cancellation data file published by the
#     Defense Logistics Agency Law Enforcement Support Office (Public Information).
# The original name of the data file should have the form:
#      DISP_Shipments_Cancellations_mmddyyyy_mmddyyyy.xlsx
# Name of the local copy; it is read from inside path_datafiles.
LESO_file = 'DISP_Shipments_Cancellations_04012020_06302020.xlsx'
# Alternative quarter, kept for convenience:
#LESO_file = "DISP_Shipments_Cancellations_01012020_03312020.xlsx"

DISP_Shipments_Cancellations_mmddyyyy_mmddyyyy.xlsx is downloaded from:    
https://www.dla.mil/DispositionServices/Offers/Reutilization/LawEnforcement/PublicInformation/   
The following is an image of the relevant section of the website:

![test](Images/DISP_Shipments_CancellationsXLSX.png)

## Check DISP_Shipments_Cancellations_mmddyyyy_mmddyyyy.xlsx

This notebook checks that the schema matches previous versions of the file. It checks for null/NaN data, some unique values, and that 'State' is a valid two-letter abbreviation. It expects that the XLSX file has two sheets labeled 'SHIPMENTS' and 'CANCELLATIONS'. The two sheets have different columns.

###### Expected Columns in 'SHIPMENTS' sheet:

__State__: two-letter postal abbreviation<br>
> TYPE:str LENGTH: 2 CHARACTER_SET: [A-Z]   

__Station Name (LEA)__: descriptive name of agency requesting equipment<br>
> TYPE:str LENGTH: varies CHARACTER_SET: varies   

__Requisition ID__: ???unique??? alphanumeric string<br>
> TYPE:str LENGTH: 14 CHARACTER_SET: [A-Za-z0-9]   

__FSC__: Federal Supply Classification Group number<br>
part of NATO Stock Number, see https://en.wikipedia.org/wiki/NATO_Stock_Number<br>
> TYPE:str LENGTH:4 CHARACTER_SET: [0-9] varies (xx: FSG yy: FSC)   

__NIIN__: National Item Identification number<br>
part of NATO Stock Number, see https://en.wikipedia.org/wiki/NATO_Stock_Number<br>
> TYPE:str LENGTH:9 CHARACTER_SET: varies {xx:CC||NCB yyy-yyyy: non-standard item code}   

__Item Name__: descriptive item name<br>
> TYPE:str LENGTH: varies CHARACTER_SET: varies   

__UI__: unit increment<br>
> TYPE:str LENGTH: varies CHARACTER_SET: varies   

__Quantity__: number of items requested<br>
> TYPE:int LENGTH: varies CHARACTER_SET: [0-9]   

__Acquisition Value__: value of requested items in dollars<br>
> TYPE:float64 LENGTH: varies CHARACTER_SET: [0-9.]   

__Date Shipped__: ???date shipped???<br>
> TYPE:datetime64 LENGTH:29 CHARACTER_SET: yyyy-mm-ddT00:00:00.000000000   

__Justification__: descriptive text<br>
> TYPE:str LENGTH: varies CHARACTER_SET: varies

###### Expected Columns in 'CANCELLATIONS' sheet:

__Cancelled By__: ???cancelling agency???<br>
> TYPE:str LENGTH: varies CHARACTER_SET: varies   

__RTD Ref__: ???unique??? reference number<br>
> TYPE:int LENGTH: 6 or 7 CHARACTER_SET: [0-9]  

__State__: two-letter postal abbreviation<br>
> TYPE:str LENGTH: 2 CHARACTER_SET: [A-Z]   

__Station Name (LEA)__: descriptive name of agency requesting equipment<br>
> TYPE:str LENGTH: varies CHARACTER_SET: varies   

__FSC__: Federal Supply Classification Group number<br>
part of NATO Stock Number, see https://en.wikipedia.org/wiki/NATO_Stock_Number<br>
> TYPE:str LENGTH:4 CHARACTER_SET: [0-9] {aabb: FSG(aa),FSC(bb)}   

__NIIN__: National Item Identification number<br>
part of NATO Stock Number, see https://en.wikipedia.org/wiki/NATO_Stock_Number<br>
> TYPE:str LENGTH:9 CHARACTER_SET: [A-Z0-9] {aabbbbbbb: CC||NCB(aa),non-standard item code(bbbbbbb)}   

__Item Name__: descriptive item name<br>
> TYPE:str LENGTH: varies CHARACTER_SET: varies   

__UI__: unit increment<br>
> TYPE:str LENGTH: varies CHARACTER_SET: varies   

__Quantity__: number of items requested<br>
> TYPE:int LENGTH: varies CHARACTER_SET: [0-9]   

__Acquisition Value__: value of requested items in dollars<br>
> TYPE:float64 LENGTH: varies CHARACTER_SET: [0-9.]   

__Date Requested__: ???date requested???<br>
> TYPE:datetime64 LENGTH:29 CHARACTER_SET: yyyy-mm-ddThh:mm:ss.000000000   

__Justification__: descriptive text<br>
> TYPE:str LENGTH: varies CHARACTER_SET: varies   

__Reason Cancelled__: ???descriptive reason for cancellation???<br>
> TYPE:str LENGTH: varies CHARACTER_SET: varies

### Read Data From xlsx File:

In [None]:
# Load every sheet of the workbook in one call.
# The plain local path is passed (no "file:" prefix) so pandas opens it as a
# file on disk instead of parsing it as a URL; a relative "file:" URL is
# non-standard and breaks on names containing characters such as '#' or '%'.
ship_cancel_df = pd.read_excel(path_datafiles + LESO_file, sheet_name=None)
# With sheet_name=None, ship_cancel_df is a dictionary, not a dataframe:
#   keys are the sheet names (expected: 'SHIPMENTS', 'CANCELLATIONS')
#   each value is a single dataframe holding that sheet

### Expected Values

In [None]:
# Sheet names, taken from previous releases of the file.
expected_sheets = ['SHIPMENTS', 'CANCELLATIONS']

# Column names per sheet, taken from previous releases of the file.
expected_shipments_columns = [
    'State', 'Station Name (LEA)', 'Requisition ID',
    'FSC', 'NIIN', 'Item Name', 'UI', 'Quantity',
    'Acquisition Value', 'Date Shipped', 'Justification',
]
expected_cancellations_columns = [
    'Cancelled By', 'RTD Ref', 'State', 'Station Name (LEA)',
    'FSC', 'NIIN', 'Item Name', 'UI', 'Quantity',
    'Acquisition Value', 'Date Requested', 'Justification',
    'Reason Cancelled',
]

# Lookup built from the U.S. Postal Service file named by postal_file
# (read without a header row; values may be quoted with single quotes):
#     key: state abbreviation (file column 1), value: state name (file column 0)
expected_state_abbreviations = (
    pd.read_csv(path_datafiles + postal_file, header=None, quotechar="'")
    .set_index([1])[0]
    .to_dict()
)
#len(expected_state_abbreviations) #expect 59 U.S. states and territories

### Useful Functions for Checking the Values

In [None]:
def get_unique_values(df: dict, col: str) -> list:
    '''Collect the unique values of one column from every dataframe in a dictionary.

    Parameters
    ----------
    df : dict
        Dictionary whose values are dataframes (for example the
        sheet-name -> dataframe dictionary returned by
        pd.read_excel(..., sheet_name=None)). Every dataframe must
        contain the column `col`.
    col : str
        Label of the column to inspect.

    Returns
    -------
    list
        The unique values of `col` from each dataframe, concatenated in
        dictionary iteration order. Values are unique within a dataframe,
        but a value present in several dataframes appears once per
        dataframe. An empty dictionary yields an empty list.

    Raises
    ------
    KeyError
        If a dataframe has no column named `col`.
    '''
    unique_values_list = []
    for dict_key in df:
        # unique() is evaluated once per dataframe and appended element-wise.
        unique_values_list.extend(df[dict_key][col].unique())
    return unique_values_list

def get_unexpected_values(to_check: set, expect: set) -> set:
    '''Find the members of `to_check` that do not occur in `expect`.

    The result is a new set; it is empty when every value was expected.
    '''
    unexpected = to_check.difference(expect)
    return unexpected

### Check for 2 Sheets

In [None]:
# ship_cancel_df is a dict (sheet name -> pandas DataFrame). Both expected
# sheets must be present, with nothing extra, before they are unpacked.
unexpected_sheets = get_unexpected_values(set(ship_cancel_df.keys()),
                                          set(expected_sheets))
if len(ship_cancel_df) != len(expected_sheets) or unexpected_sheets:
    # Stop here rather than only printing: every later cell needs
    # shipments_df and cancellations_df, and continuing would either fail
    # with a NameError or silently check stale dataframes left in the kernel
    # by an earlier run.
    raise ValueError('XLSX has unexpected sheets: '
                     + str(list(ship_cancel_df.keys())))

shipments_df = ship_cancel_df[expected_sheets[0]]
cancellations_df = ship_cancel_df[expected_sheets[1]]

In [None]:
# Report the shape of each sheet's dataframe, shipments first.
for message, frame in (('shipments_df has shape:', shipments_df),
                       ('cancellations_df has shape:', cancellations_df)):
    print(message, frame.shape)

In [None]:
# Show the dtypes pandas inferred for each sheet, shipments first.
for message, frame in (('shipments_df has these default data types:', shipments_df),
                       ('cancellations_df has these default data types:', cancellations_df)):
    print(message, frame.dtypes)

### Check the Shipments dataframe

###### QUESTION: Do the 'State' abbreviations in the 'SHIPMENTS' dataframe match US postal abbreviations?

In [None]:
# Test each distinct 'State' value once instead of once per row, so an
# invalid abbreviation (or NaN) is reported a single time rather than being
# repeated for every row that carries it.
incorrect_state_abbr = [state_abbr for state_abbr in shipments_df['State'].unique()
                        if state_abbr not in expected_state_abbreviations]
if incorrect_state_abbr:
    print('These states have inconsistant state/territory abbreviations:',incorrect_state_abbr)
else:
    print('No inconsistant state abbreviations were found.')

###### QUESTION: What columns are in the 'SHIPMENTS' dataframe?

In [None]:
# Columns present in the sheet but not expected, and expected but absent.
new_columns = get_unexpected_values(set(shipments_df.columns), set(expected_shipments_columns))
missing_columns = get_unexpected_values(set(expected_shipments_columns),set(shipments_df.columns))

print('Expected columns are:',expected_shipments_columns)
# The two findings are reported independently: a renamed column shows up as
# one unexpected AND one missing column, and both must be visible.
if new_columns:
    print('These unexpected columns found:\n',new_columns)
if missing_columns:
    print('These columns are missing:\n',missing_columns)
if not new_columns and not missing_columns:
    print('\nNo column discrepancies found.')

###### QUESTION: How many unique values are in each column of the 'SHIPMENTS' dataframe, for each state?

In [None]:
# Group the rows by 'State', then count the distinct values of every column
# within each group.
shipments_by_state = shipments_df.groupby('State')
shipments_by_state.nunique()

###### QUESTION: How many null/NaN values are in the 'SHIPMENTS' dataframe?

In [None]:
# Per-column count of missing (null/NaN) cells.
shipments_df.isna().sum()

###### QUESTION: What can we learn about the 'Requisition ID' values in the 'SHIPMENTS' dataframe?

In [None]:
# Summarize 'Requisition ID': how many IDs have each string length, and how
# many distinct IDs exist.
print('Length,Count:')
requisition_ids = shipments_df['Requisition ID']
id_length_counts = requisition_ids.str.len().value_counts()
print(dict(id_length_counts))
print('Number of unique values:', requisition_ids.nunique())

### Check the Cancellations dataframe

###### QUESTION: Do the 'State' abbreviations in the 'CANCELLATIONS' dataframe match US postal abbreviations?

In [None]:
# Test each distinct 'State' value once instead of once per row, so an
# invalid abbreviation (or NaN) is reported a single time rather than being
# repeated for every row that carries it.
incorrect_state_abbr = [state_abbr for state_abbr in cancellations_df['State'].unique()
                        if state_abbr not in expected_state_abbreviations]
if incorrect_state_abbr:
    print('These states have inconsistant state/territory abbreviations:',incorrect_state_abbr)
else:
    print('No inconsistant state abbreviations were found.')

###### QUESTION: What columns are in the 'CANCELLATIONS' dataframe?

In [None]:
# Columns present in the sheet but not expected, and expected but absent.
new_columns = get_unexpected_values(set(cancellations_df.columns), set(expected_cancellations_columns))
missing_columns = get_unexpected_values(set(expected_cancellations_columns),set(cancellations_df.columns))

print('Expected columns are:',expected_cancellations_columns)
# The two findings are reported independently: a renamed column shows up as
# one unexpected AND one missing column, and both must be visible.
if new_columns:
    print('These unexpected columns found:\n',new_columns)
if missing_columns:
    print('These columns are missing:\n',missing_columns)
if not new_columns and not missing_columns:
    print('\nNo column discrepancies found.')

###### QUESTION: How many unique values are in each column of the 'CANCELLATIONS' dataframe, for each state?

In [None]:
# Group the rows by 'State', then count the distinct values of every column
# within each group.
cancellations_by_state = cancellations_df.groupby('State')
cancellations_by_state.nunique()

###### QUESTION: What unique values are found in the 'Cancelled By' column of the 'CANCELLATIONS' dataframe?

In [None]:
# Distinct entries of the 'Cancelled By' column, in order of first appearance.
cancelled_by = cancellations_df['Cancelled By']
cancelled_by.unique()

###### QUESTION: How many null/NaN values are in the 'CANCELLATIONS' dataframe?

In [None]:
# Per-column count of missing (null/NaN) cells.
cancellations_df.isna().sum()

###### QUESTION: What can we learn about the 'RTD Ref' values in the 'CANCELLATIONS' dataframe?

In [None]:
# Summarize 'RTD Ref': how many references have each length once rendered as
# text, and how many distinct references exist.
print('Length,Count:')
rtd_refs = cancellations_df['RTD Ref']
ref_length_counts = rtd_refs.astype(str).str.len().value_counts()
print(dict(ref_length_counts))
print('Number of unique values:', rtd_refs.nunique())