In [None]:
import pandas as pd
import re

#!python --version  #Python 3.8.5
#pd.__version__      #1.1.2 
#re.__version__     #2.2.1

In [None]:
# CUSTOM SETTINGS -- adjust these values to match your environment.

# Directory that holds the local data files.
path_datafiles = '../../data/'

# State names/abbreviations taken from US Postal Service Publication 28:
#     https://pe.usps.com/text/pub28/28apb.htm
# Needed to verify that state abbreviations are valid. The first column is
# expected to be the state name, the second column the state abbreviation.
# Name of that file on your system.
postal_file = "20200712_StateAbbreviations.txt"

# Quarterly LESO Shipments and Cancellations data file published by the
# Defense Logistics Agency Law Enforcement Support Office (Public Information).
# The original name of the data file has the form:
#     DISP_Shipments_Cancellations_mmddyyyy_mmddyyyy.xlsx
# Local file name to check (earlier quarters are kept commented out):
#LESO_file = "DISP_Shipments_Cancellations_01012020_03312020.xlsx"
#LESO_file = "DISP_Shipments_Cancellations_04012020_06302020.xlsx"
LESO_file = 'DISP_Shipments_Cancellations_07012020_09302020.xlsx'

DISP_Shipments_Cancellations_mmddyyyy_mmddyyyy.xlsx is downloaded from:    
https://www.dla.mil/DispositionServices/Offers/Reutilization/LawEnforcement/PublicInformation/   
The following is an image of the relevant section of the website:

![test](../Images/DISP_Shipments_CancellationsXLSX.png)

## Check DISP_Shipments_Cancellations_mmddyyyy_mmddyyyy.xlsx

This notebook checks that the schema matches previous versions of the file. It checks for null/NaN data, some unique values, and that 'State' is a valid two-letter abbreviation. It expects that the XLSX file has two sheets labeled 'SHIPMENTS' and 'CANCELLATIONS'. The two sheets have different columns.

###### Data Dictionary for Shipments sheet of Shipments_Cancellations files:

   
| Field | Data Type | Description | Length | Expected Pattern | null? |   
| ----- | ---- | ---- | ---- | ---- |---- |   
| State | string | two-letter postal abbreviation for U.S. state or territory| 2 | \[A-Z\]\[A-Z\] | no |   
| Station Name (LEA) | string | descriptive name of requesting law enforcement agency | varies | varies | no |   
| Requisition ID | string | apparently unique identifier needs further research | 14 | [A-Za-z0-9]{14} | no |   
| FSC | string | [Federal Supply Number](https://en.wikipedia.org/wiki/NATO_Stock_Number#Federal_Supply_Classification_Group_(FSCG)) consisting of the Federal Supply Group and Federal Supply Classification | 4 | \[0-9\]{4} | no |   
| NIIN | string | [National Item Identification Number](https://en.wikipedia.org/wiki/NATO_Stock_Number#National_Item_Identification_Number_(NIIN)) a Country Code followed by a 7-digit item identifier string | 9 | \[0-9\]{9} | no |   
| Item Name | string | descriptive name of requested item | varies | varies | no |   
| UI | string | units of requested item known as unit increments | varies | varies | no |   
| Quantity | integer | number of units requested | varies | [0-9]+ | no |   
| Acquisition Value | float | U.S. dollar amount paid when the item was originally purchased by the government | varies | [0-9]+.[0-9]{2} | no |   
| Date Shipped | datetime64 | date shipped maybe; needs further research | 29 | yyyy-mm-ddT00:00:00.000000000 | no |   
| Justification | string | descriptive text justifying request; needs further research | varies | varies | yes |   

###### Data Dictionary for Cancellations sheet of Shipments_Cancellations files:

   
| Field | Data Type | Description | Length | Expected Pattern | null? |   
| ----- | ---- | ---- | ---- | ---- |---- |   
| Cancelled By | string | apparently agency that cancelled request; needs further research | varies | varies | yes | 
| RTD Ref | string | apparently unique identifier; needs further research | 6 or 7 | [0-9]{6,7} | no |   
| State | string | two-letter postal abbreviation for U.S. state or territory| 2 | \[A-Z\]\[A-Z\] | no |   
| Station Name (LEA) | string | descriptive name of requesting law enforcement agency | varies | varies | no |   
| FSC | string | [Federal Supply Number](https://en.wikipedia.org/wiki/NATO_Stock_Number#Federal_Supply_Classification_Group_(FSCG)) consisting of the Federal Supply Group and Federal Supply Classification | 4 | \[0-9\]{4} | no |   
| NIIN | string | [National Item Identification Number](https://en.wikipedia.org/wiki/NATO_Stock_Number#National_Item_Identification_Number_(NIIN)) a Country Code followed by a 7-digit item identifier string | 9 | \[0-9\]{9} | no |   
| Item Name | string | descriptive name of requested item | varies | varies | no |   
| UI | string | units of requested item known as unit increments | varies | varies | no |   
| Quantity | integer | number of units requested | varies | [0-9]+ | no |   
| Acquisition Value | float | U.S. dollar amount paid when the item was originally purchased by the government | varies | [0-9]+.[0-9]{2} | no |   
| Date Requested | datetime64 | date request made; needs further research | 29 | yyyy-mm-ddT00:00:00.000000000 | no |   
| Justification | string | descriptive text justifying request; needs further research | varies | varies | yes |   
| Reason Cancelled | string | capitalized code followed by description of why request is cancelled; needs further research | varies | varies | yes |   

### Read Data From xlsx File:

In [None]:
# Load every sheet of the workbook. sheet_name=None is passed so that the
# result is a dictionary rather than a single dataframe:
#     keys:   sheet names (expected: 'SHIPMENTS', 'CANCELLATIONS')
#     values: one dataframe per sheet
# The plain relative path is passed; prefixing "file:" to a relative path
# does not form a valid file URL.
ship_cancel_df = pd.read_excel(path_datafiles + LESO_file, sheet_name=None)

### Expected Values

In [None]:
# Sheet names, taken from previous files.
expected_sheets = ['SHIPMENTS', 'CANCELLATIONS']

# Column names of each sheet, taken from previous files.
expected_shipments_columns = [
    'State', 'Station Name (LEA)', 'Requisition ID', 'FSC', 'NIIN',
    'Item Name', 'UI', 'Quantity', 'Acquisition Value', 'Date Shipped',
    'Justification',
]
expected_cancellations_columns = [
    'Cancelled By', 'RTD Ref', 'State', 'Station Name (LEA)', 'FSC', 'NIIN',
    'Item Name', 'UI', 'Quantity', 'Acquisition Value', 'Date Requested',
    'Justification', 'Reason Cancelled',
]

# Lookup built from the U.S. Postal data file (path_datafiles + postal_file):
#     key: state abbreviation (file column 1), value: state name (file column 0)
postal_table = pd.read_csv(path_datafiles + postal_file, header=None, quotechar="'")
expected_state_abbreviations = postal_table.set_index([1])[0].to_dict()
#len(expected_state_abbreviations) #expect 59 U.S. states and territories

### Useful Functions for Checking the Values

In [None]:
def get_unique_values(df: dict, col: str) -> list:
    '''Returns a list of the unique values in a column given a dictionary of dataframes.

    Parameters:
        df: dictionary whose values are dataframes (for example the result of
            pd.read_excel(..., sheet_name=None)).
        col: name of the column to inspect; it must exist in every dataframe.

    Returns:
        The distinct values of `col` pooled across all dataframes. A value
        that occurs in more than one dataframe is listed only once. An empty
        dictionary gives an empty list.

    Raises:
        KeyError: if a dataframe does not have the column `col`.
    '''
    columns = [sheet[col] for sheet in df.values()]
    if not columns:
        # pd.concat rejects an empty sequence, so answer directly.
        return []
    # Pool the column from every sheet first so duplicates that span sheets
    # are removed as well, not only duplicates within a single sheet.
    pooled = pd.concat(columns, ignore_index=True)
    return list(pooled.unique())

def get_unexpected_values(to_check: set, expect: set) -> set:
    '''Finds the members of `to_check` that do not appear in `expect`.

    Returns a set holding those unexpected values; the set is empty when
    every value in `to_check` is expected.
    '''
    unexpected = to_check.difference(expect)
    return unexpected

### Check for 2 Sheets

In [None]:
# ship_cancel_df is a dict of {sheet name: dataframe}. The workbook must hold
# exactly the sheets named in expected_sheets: nothing extra, nothing missing.
unexpected_sheets = get_unexpected_values(set(ship_cancel_df.keys()),
                                          set(expected_sheets))
missing_sheets = get_unexpected_values(set(expected_sheets),
                                       set(ship_cancel_df.keys()))
if unexpected_sheets or missing_sheets:
    # Stop here: every later cell needs both dataframes. Continuing would fail
    # with a NameError or silently reuse dataframes left over in the kernel.
    raise ValueError('XLSX has unexpected sheets: ' + str(list(ship_cancel_df.keys()))
                     + ' (expected: ' + str(expected_sheets) + ')')

shipments_df = ship_cancel_df[expected_sheets[0]]
cancellations_df = ship_cancel_df[expected_sheets[1]]

In [None]:
# Show the .shape of each sheet's dataframe.
for message, frame in (('shipments_df has shape:', shipments_df),
                       ('cancellations_df has shape:', cancellations_df)):
    print(message, frame.shape)

In [None]:
# Show the dtypes pandas assigned to each column when reading the sheets.
for message, frame in (('shipments_df has these default data types:', shipments_df),
                       ('cancellations_df has these default data types:', cancellations_df)):
    print(message, frame.dtypes)

### Check the Shipments dataframe

###### QUESTION: Do the 'State' abbreviations in the 'SHIPMENTS' dataframe match US postal abbreviations?

In [None]:
# Gather the 'State' value of every shipments row whose abbreviation is not a
# key of expected_state_abbreviations (one entry per offending row).
incorrect_state_abbr = []
for state_abbr in shipments_df['State']:
    if state_abbr in expected_state_abbreviations:
        continue
    incorrect_state_abbr.append(state_abbr)

if not incorrect_state_abbr:
    print('No inconsistant state abbreviations were found.')
else:
    print('These states have inconsistant state/territory abbreviations:',incorrect_state_abbr)

###### QUESTION: What columns are in the 'SHIPMENTS' dataframe?

In [None]:
# Compare the sheet's columns with the expected ones in both directions.
new_columns = get_unexpected_values(set(shipments_df.columns), set(expected_shipments_columns))
missing_columns = get_unexpected_values(set(expected_shipments_columns),set(shipments_df.columns))

print('Expected columns are:',expected_shipments_columns)
# Report each kind of discrepancy independently: a renamed column appears as
# one unexpected column AND one missing column, and both must be shown.
if new_columns:
    print('These unexpected columns found:\n',new_columns)
if missing_columns:
    print('These columns are missing:\n',missing_columns)
if not new_columns and not missing_columns:
    print('\nNo column discrepancies found.')

###### QUESTION: How many unique values are in each column of the 'SHIPMENTS' dataframe, grouped by 'State'?

In [None]:
# Group the shipments rows by 'State', then count distinct values per column.
shipments_by_state = shipments_df.groupby('State')
shipments_by_state.nunique()

###### QUESTION: How many null/NaN values are in the 'SHIPMENTS' dataframe?

In [None]:
# Number of null/NaN entries in each column of the shipments sheet.
shipments_df.isna().sum()

###### QUESTION: What can we learn about the 'Requisition ID' values in the 'SHIPMENTS' dataframe?

In [None]:
# Summarise 'Requisition ID': how many IDs have each string length, and how
# many distinct IDs there are.
requisition_ids = shipments_df['Requisition ID']
requisition_id_lengths = requisition_ids.str.len()
print('Length,Count:')
print(dict(requisition_id_lengths.value_counts()))
print('Number of unique values:',requisition_ids.nunique())

### Check the Cancellations dataframe

###### QUESTION: Do the 'State' abbreviations in the 'CANCELLATIONS' dataframe match US postal abbreviations?

In [None]:
# Gather the 'State' value of every cancellations row whose abbreviation is
# not a key of expected_state_abbreviations (one entry per offending row).
incorrect_state_abbr = []
for state_abbr in cancellations_df['State']:
    if state_abbr in expected_state_abbreviations:
        continue
    incorrect_state_abbr.append(state_abbr)

if not incorrect_state_abbr:
    print('No inconsistant state abbreviations were found.')
else:
    print('These states have inconsistant state/territory abbreviations:',incorrect_state_abbr)

###### QUESTION: What columns are in the 'CANCELLATIONS' dataframe?

In [None]:
# Compare the sheet's columns with the expected ones in both directions.
new_columns = get_unexpected_values(set(cancellations_df.columns), set(expected_cancellations_columns))
missing_columns = get_unexpected_values(set(expected_cancellations_columns),set(cancellations_df.columns))

print('Expected columns are:',expected_cancellations_columns)
# Report each kind of discrepancy independently: a renamed column appears as
# one unexpected column AND one missing column, and both must be shown.
if new_columns:
    print('These unexpected columns found:\n',new_columns)
if missing_columns:
    print('These columns are missing:\n',missing_columns)
if not new_columns and not missing_columns:
    print('\nNo column discrepancies found.')

###### QUESTION: How many unique values are in each column of the 'CANCELLATIONS' dataframe, grouped by 'State'?

In [None]:
# Group the cancellations rows by 'State', then count distinct values per column.
cancellations_by_state = cancellations_df.groupby('State')
cancellations_by_state.nunique()

###### QUESTION: What unique values are found in the 'Cancelled By' column of the 'CANCELLATIONS' dataframe?

In [None]:
# Distinct entries found in the 'Cancelled By' column.
cancelled_by = cancellations_df['Cancelled By']
cancelled_by.unique()

###### QUESTION: How many null/NaN values are in the 'CANCELLATIONS' dataframe?

In [None]:
# Number of null/NaN entries in each column of the cancellations sheet.
cancellations_df.isna().sum()

###### QUESTION: What can we learn about the 'RTD Ref' values in the 'CANCELLATIONS' dataframe?

In [None]:
# Summarise 'RTD Ref': how many values have each length once converted to
# str, and how many distinct values there are.
rtd_refs = cancellations_df['RTD Ref']
rtd_ref_lengths = rtd_refs.astype(str).str.len()
print('Length,Count:')
print(dict(rtd_ref_lengths.value_counts()))
print('Number of unique values:',rtd_refs.nunique())