In [None]:
import pandas as pd
import re

from math import nan

#!python --version  #Python 3.8.5
#pd.__version__     #1.1.2 
#re.__version__     #2.2.1
# math is a standard library

In [None]:
#CUSTOM SETTINGS: set these as appropriate for your environment

# Enter the path to the directory holding the local data files.
# Both file names below are appended to this path, so keep the trailing slash.
path_datafiles = "../../data/"

# Download the state names/abbreviations from US Postal Service Publication 28
#              https://pe.usps.com/text/pub28/28apb.htm
#        Required to verify that state abbreviations are valid.
#        Expects first column is state name, second column is state abbreviation.
# Enter the name of that file as saved inside path_datafiles.
postal_file = '20200712_StateAbbreviations.txt'

# Get the quarterly cumulative LESO Transferred Property data file from the
#     Defense Logistics Agency Law Enforcement Support Office Public Information page.
# Original name of the data file should be in the form:
#      DISP_AllStatesAndTerritories_mmddyyyy.xlsx  
# Enter the local file name (also located inside path_datafiles).
# Earlier quarters are kept below, commented out, so they can be re-checked.

#LESO_file = "DISP_AllStatesAndTerritories_03312020.xlsx"
#LESO_file = "DISP_AllStatesAndTerritories_06302020.xlsx"
LESO_file = "DISP_AllStatesAndTerritories_09302020.xlsx"

DISP_AllStatesAndTerritories_mmddyyyy.xlsx is downloaded from:   
https://www.dla.mil/DispositionServices/Offers/Reutilization/LawEnforcement/PublicInformation/   
The following is an image of the relevant section of the website:

![Screenshot of the DLA Public Information page section listing DISP_AllStatesAndTerritories.xlsx](../Images/DISP_AllStatesAndTerritoriesXLXS.png)

## Check DISP_AllStatesAndTerritories_mmddyyyy.xlsx

This notebook checks that the schema matches previous versions of the file. It checks for null/NaN data, some unique values, and that 'State' is a valid two-letter abbreviation. It expects that the XLSX file has up to 59 sheets labeled by full state or territory name. Each sheet should have the same columns.

###### Data Dictionary for AllStatesAndTerritories files:

   
| Field | Data Type | Description | Length | Expected Pattern | null? |   
| ----- | ---- | ---- | ---- | ---- |---- |   
| State | string | two-letter postal abbreviation for U.S. state or territory | 2 | \[A-Z\]\[A-Z\] | no |   
| Station Name (LEA) | string | descriptive name of requesting law enforcement agency | varies | varies | no |   
| NSN | string | [NATO Stock Number](https://en.wikipedia.org/wiki/NATO_Stock_Number) a government-assigned identifier for requested item | 16 (13 characters plus 3 hyphens) | \[0-9\]{4}-\[0-9\]{2}-\[A-Z0-9\]{3}-\[A-Z0-9\]{4} | no |   
| Item Name | string | descriptive name of requested item | varies | varies | no |   
| UI | string | units of requested item known as unit increments | varies | varies | no |   
| Quantity | integer | number of units requested | varies | [0-9]+ | no |   
| Acquisition Value | float | U.S. dollar amount paid when the item was originally purchased by the government | varies | [0-9]+.[0-9]{2} | no |   
| DEMIL Code | character | [demilitarization code](https://www.dla.mil/HQ/LogisticsOperations/Services/FIC/DEMILCoding/DEMILCodes/) for level of destruction required when the item leaves Department of Defense control | 1 | \[GPFDCEBQA\] | no |   
| DEMIL IC | integer | [demilitarization integrity code](https://www.dla.mil/HQ/LogisticsOperations/Services/FIC/DEMILCoding/DEMILCodes/) validity of DEMIL Code (a missing value means it has not yet been reviewed), see [FLIS manual](https://www.dla.mil/HQ/LogisticsOperations/TrainingandReference/FLISProcedures/) for more information | 1 | [0-9] or blank | yes |   
| Ship Date | datetime64 | date transferred; needs further research | 29 | yyyy-mm-ddT00:00:00.000000000 | no |   
| Station Type | string | level of government associated with requesting agency; needs further research | 5 | 'State' | no |   

### Read Data From xlsx File

In [None]:
# Read every sheet of the workbook at once (sheet_name=None), giving a dictionary:
#   key:   sheet label, expected to be the full state/territory name
#   value: one dataframe of all transfers for that state/territory, cumulative to this quarter
transfer_dict = pd.read_excel(f"file:{path_datafiles}{LESO_file}", sheet_name=None)

### Expected Values

In [None]:
# Column names, taken from the sheets of previous files
expected_columns = [
    'Complete State', 'State', 'Station Name (LEA)',
    'NSN', 'Item Name', 'Quantity', 'UI', 'Acquisition Value',
    'DEMIL Code', 'DEMIL IC', 'Ship Date', 'Station Type',
]

# Station types, taken from previous values
expected_station_types = ['State']

# DEMIL codes and DEMIL integrity codes are both based on DOD 4160.28 DEMIL Program
# or DOD 4100.39M FLIS Manual and this website:
# https://www.dla.mil/HQ/LogisticsOperations/Services/FIC/DEMILCoding/DEMILCodes/
expected_demil_codes = ['G', 'P', 'F', 'D', 'C', 'E', 'B', 'Q', 'A']
expected_demil_integritycodes = list(range(10))  # the single digits 0 through 9

# Dictionary built from the U.S. Postal data file named by 'postal_file'
#     key: state abbreviation (second column), value: state name (first column)
expected_state_abbreviations = (
    pd.read_csv(path_datafiles + postal_file, header=None, quotechar="'")
    .set_index([1])[0]
    .to_dict()
)
#len(expected_state_abbreviations) #expect 59 U.S. states and territories

### Useful Functions for Checking the Values

In [None]:
def get_unique_values(df: dict, col: str) -> list:
    '''Collects the unique values of one column from every dataframe in a dictionary.

    Parameters:
        df:  dictionary whose values are dataframes (for example one per sheet);
             despite the parameter name it is NOT a single dataframe.
        col: name of the column to inspect; it must exist in every dataframe,
             otherwise a KeyError is raised.

    Returns:
        A list made by concatenating each dataframe's unique values, in dictionary
        order. Values are unique within a dataframe only, so a value that appears
        in several dataframes is repeated; wrap the result in set() to de-duplicate.
    '''
    unique_values_list = []
    for sheet in df.values():
        # .unique() keeps first-seen order and reports NaN as a value
        unique_values_list.extend(sheet[col].unique())
    return unique_values_list
#NOTE from Nicole B: consider moving these helper functions into a shared module
# and using `from mymodule import get_unique_values` instead of repeating them in each notebook.
def get_unexpected_values(to_check: set,expect: set)-> set:
    '''Finds the members of to_check that are absent from expect.

    Returns a new set holding those values; the set is empty when every
    value in to_check is expected.
    '''
    return {value for value in to_check if value not in expect}

### Check All Sheets

In [None]:
# Number of sheets read from the workbook
print('transfer_dict has', len(transfer_dict), 'sheets')
# Total number of rows, summed over every sheet's dataframe
print('transfer_dict has', sum(map(len, transfer_dict.values())), 'rows in all sheets')

###### QUESTION: Do the state or territory names on all sheets match U.S. postal names?

In [None]:
# Sheet labels (dictionary keys) that are not among the U.S. postal state/territory names.
# The valid names are collected into a set once instead of rescanning .values() per sheet.
valid_state_names = set(expected_state_abbreviations.values())
incorrect_state_names = [state_name for state_name in transfer_dict
                         if state_name not in valid_state_names]

for sheet_name in incorrect_state_names:
    sheet_abbreviations = transfer_dict[sheet_name]['State'].unique()
    # Suggest the correct name from the first abbreviation on the sheet.
    # An empty sheet or an abbreviation missing from the postal file must not
    # stop the report, so both cases fall back to an explanatory placeholder.
    if len(sheet_abbreviations) == 0:
        correct_name = 'unknown (sheet has no State values)'
    else:
        correct_name = expected_state_abbreviations.get(
            sheet_abbreviations[0], 'unknown (abbreviation not in postal file)')
    print('Misspelled state/territory name: ',sheet_name,' abbreviated as ',sheet_abbreviations,
          ' Correct state name: ',correct_name)

###### QUESTION: Does each sheet have exactly one value for 'State'?

In [None]:
# Keep the name of every sheet whose 'State' column does not hold exactly one distinct value
inconsistant_state_abbreviations = [
    sheet_name
    for sheet_name, sheet in transfer_dict.items()
    if len(sheet['State'].unique()) != 1
]

if not inconsistant_state_abbreviations:
    print('All sheets have exactly one state/territory abbreviation.')
else:
    print('These states do not have exactly one state/territory abbreviation:',inconsistant_state_abbreviations)

###### QUESTION: Are the values of 'State' valid U.S. postal abbreviations?

In [None]:
# Every 'State' value found in any sheet, minus the abbreviations from the postal file
unexpected_state_abbreviations = get_unexpected_values(
    set(get_unique_values(transfer_dict, 'State')),
    set(expected_state_abbreviations),
)

#print('Expected state abbreviations:',list(expected_state_abbreviations.keys()))
if unexpected_state_abbreviations:
    print('\nThese state abbreviations are not valid:',list(unexpected_state_abbreviations))
else:
    print('\nOnly valid state abbreviations found.')

###### QUESTION: Do all sheets have the expected columns? (each dictionary item is a data frame)

In [None]:
# Collect the sheets whose column labels (names and order) differ from expected_columns.
# The comparison must be by value (!=): an identity test with `is` against a freshly
# built list is always False, so no discrepancy could ever be reported.
# NOTE(review): expected_columns includes 'Complete State'; if that column is not
# actually present in the sheets, every sheet will be listed here - confirm the list.
column_discrepancy = []
for state_name in transfer_dict:
    if list(transfer_dict[state_name].columns) != expected_columns:
        column_discrepancy.append(state_name)

print('Expected columns are:',expected_columns)
if len(column_discrepancy) > 0:
    print('Columns need to be checked on these states:\n',column_discrepancy)
else:
    print('\nNo column discrepancies found.')

###### QUESTION: How many unique values are in each column of each sheet?

In [None]:
# Build one record per sheet and create the summary frame in a single step.
# This avoids Series.iteritems(), which was removed in pandas 2.0, and avoids
# growing the frame one cell at a time with .loc.
unique_records = []
for state_name, state_df in transfer_dict.items():
    record = state_df.nunique().to_dict()   # column label -> number of distinct non-null values
    record['Complete State'] = state_name   # label the row with the sheet name
    unique_records.append(record)

# dtype=object keeps the counts displayed as whole numbers even when a column is
# missing from some sheet (which leaves NaN in that cell).
unique_counts = pd.DataFrame(unique_records, dtype=object)
# Expected columns first, in their usual order; any unexpected columns follow.
unique_counts = unique_counts.reindex(
    columns=expected_columns + [c for c in unique_counts.columns if c not in expected_columns])
unique_counts

###### QUESTION: How many null/NaN values are in each column of each sheet?

In [None]:
# Build one record per sheet and create the summary frame in a single step.
# This avoids Series.iteritems(), which was removed in pandas 2.0, and avoids
# growing the frame one cell at a time with .loc.
null_records = []
for state_name, state_df in transfer_dict.items():
    record = state_df.isna().sum().to_dict()   # column label -> number of null/NaN cells
    record['Complete State'] = state_name      # label the row with the sheet name
    null_records.append(record)

# dtype=object keeps the counts displayed as whole numbers even when a column is
# missing from some sheet (which leaves NaN in that cell).
null_counts = pd.DataFrame(null_records, dtype=object)
# Expected columns first, in their usual order; any unexpected columns follow.
null_counts = null_counts.reindex(
    columns=expected_columns + [c for c in null_counts.columns if c not in expected_columns])
null_counts

###### QUESTION: Are the unique values of 'Station Type' as expected?

In [None]:
# Every 'Station Type' value found in any sheet, minus the expected ones
unexpected_station_types = get_unexpected_values(
    set(get_unique_values(transfer_dict, 'Station Type')),
    set(expected_station_types),
)

#print('Expected station types:',expected_station_types)
if unexpected_station_types:
    print('\nFound these unexpected station types:',list(unexpected_station_types))
else:
    print('\nOnly expected station types found.')

###### QUESTION: Are the unique values of 'DEMIL Code' as expected?

In [None]:
# Every 'DEMIL Code' value found in any sheet, minus the expected codes
unexpected_demil_codes = get_unexpected_values(
    set(get_unique_values(transfer_dict, 'DEMIL Code')),
    set(expected_demil_codes),
)

#print('Expected DEMIL codes:',expected_demil_codes)
if unexpected_demil_codes:
    print('\nFound these unexpected DEMIL codes:',list(unexpected_demil_codes))
else:
    print('\nOnly expected DEMIL codes found.')

###### QUESTION: Are the unique values of 'DEMIL IC' as expected?

In [None]:
# Every 'DEMIL IC' value found in any sheet, minus the expected digits.
# NaN (not yet coded) is never in the expected list, so it always lands in this set.
unexpected_demil_integritycodes = get_unexpected_values(set(get_unique_values(transfer_dict,'DEMIL IC')),
                                                        set(expected_demil_integritycodes))

#print('Expected DEMIL integrity codes:',expected_demil_integritycodes,'\n')
# Only the non-NaN leftovers are genuinely unexpected codes.
non_nan_list = [ic for ic in unexpected_demil_integritycodes if pd.notna(ic)]
if len(non_nan_list) > 0:
    print('Found these unexpected DEMIL integrity codes:',non_nan_list)
else:
    print('Only expected integrity codes found.')

# Count the sheets that contain at least one NaN directly from the data. Inferring
# the count from how many NaN objects survive in the set above is unreliable,
# because identical NaN objects collapse into a single set member.
sheets_with_nan = sum(1 for sheet in transfer_dict.values() if sheet['DEMIL IC'].isna().any())
print('Found',sheets_with_nan,
      'sheets with NaN DEMIL integrity codes values. Recall that means the items have yet to be coded by DLA.')