In [4]:
import pandas as pd
import re

from math import nan

#!python --version  #Python 3.8.2
#pd.__version__     #1.0.3 
#re.__version__     #2.2.1
# math is a standard library

In [5]:
#CUSTOM SETTINGS: set these as appropriate for your environment

# Enter the path to the local data files:
path_datafiles = "OriginalData/"

# Download the state names/abbreviations from US Postal Service Publication 28
#              https://pe.usps.com/text/pub28/28apb.htm
#        Required to verify that state abbreviations are valid.
#        Expects first column is state name, second column is state abbreviation.
# Enter the path to the file on your system.
postal_file = '20200712_StateAbbreviations.txt'

# Get quarterly cumulative LESO Transferred Property data file from 
#     Defense Logicstics Agency Law Enforcement Support Office Public Information
# Orginal name of the data file should be in the form:
#      DISP_AllStatesAndTerritories_mmddyyyy.xlsx  
# Enter the local file name
LESO_file = "DISP_AllStatesAndTerritories_03312020.xlsx"

DISP_AllStatesAndTerritories_mmddyyyy.xlsx is downloaded from:   
https://www.dla.mil/DispositionServices/Offers/Reutilization/LawEnforcement/PublicInformation/   
The following is an image of the relevant section of the website:

![test](Images/DISP_AllStatesAndTerritoriesXLXS.png)

## Check DISP_AllStatesAndTerritories_mmddyyyy.xlsx

This notebook checks that the schema matches previous versions of the file. It checks for null/NaN data, some unique values, and that 'State' is a valid two-letter abbreviation. It expects that the XLSX file has up to 59 sheets labeled by full state or territory name. Each sheet should have the same columns.

###### Expected Columns in All Sheets:

__State__: two digit postal abbreviation<br>
> TYPE:str LENGTH: 2 CHARACTER_SET: [A-Z]   

__Station Name (LEA)__: descriptive name of agency requesting equipment<br>
> TYPE:str LENGTH: varies CHARACTER_SET: varies   

__NSN__: NATO Stock Number<br>
see https://en.wikipedia.org/wiki/NATO_Stock_Number<br>
> TYPE:str LENGTH:9 CHARACTER_SET: [A-Z0-9] + '-' {aabb-cc-ddd-dddd: FSG(aa),FSC(bb),CC||NCB(cc),non-standard item code(ddddddd)}   

__Item Name__: descriptive item name<br>
> TYPE:str LENGTH: varies CHARACTER_SET: varies   

__UI__: unit increment<br>
> TYPE:str LENGTH: varies CHARACTER_SET: varies   

__Quantity__: number of items requested<br>
> TYPE:int LENGTH: varies CHARACTER_SET: [0-9]   

__Acquisition Value__: value of requested items in dollars<br>
> TYPE:float64 LENGTH: varies CHARACTER_SET: [0-9.]   

__DEMIL Code__ demilitarization code<br>
https://www.dla.mil/HQ/LogisticsOperations/Services/FIC/DEMILCoding/DEMILCodes/<br>
> TYPE:char LENGTH:1 CHARACTER_SET: [GPFDCEBQA]   

__DEMIL IC__ demilitarization intgrity code<br>
https://www.dla.mil/HQ/LogisticsOperations/Services/FIC/DEMILCoding/DEMILCodes/<br>
> TYPE:int LENGTH: varies CHARACTER_SET: [0-9] & 'blank'(means not coded yet)   

__Ship Date__ ???date shipped???<br>
> TYPE:datetime64 LENGTH:29 CHARACTER_SET: yyyy-mm-ddT00:00:00.000000000   

__Station Type__ what governmental unit owns the agency<br>
> TYPE:str LENGTH:? CHARACTER_SET: 'State'

### Read Data From xlsx File

In [6]:
transfer_dict = pd.read_excel("file:" + path_datafiles + LESO_file, sheet_name=None)
#transfer_dict is a dictionary of all sheets in filename
#keys are full state/territory names
#values are a single dataframe of all transfers for that state/territory, cumulative to this quarter

### Expected Values

In [7]:
# based on columns in sheets from previous files
expected_columns = ['Complete State','State', 'Station Name (LEA)',
                    'NSN', 'Item Name', 'Quantity', 'UI', 'Acquisition Value',
                    'DEMIL Code', 'DEMIL IC', 'Ship Date','Station Type']

# based on previous values
expected_station_types = ['State']

# based on DOD 4160.28 DEMIL Program or DOD 4100.39M FLIS Manual and this website:
# https://www.dla.mil/HQ/LogisticsOperations/Services/FIC/DEMILCoding/DEMILCodes/
expected_demil_codes = ['G','P','F','D','C','E','B','Q','A']
# based on DOD 4160.28 DEMIL Program or DOD 4100.39M FLIS Manual and this website:
# https://www.dla.mil/HQ/LogisticsOperations/Services/FIC/DEMILCoding/DEMILCodes/
expected_demil_integritycodes = [0,1,2,3,4,5,6,7,8,9]

# dictionary based on U.S. Postal data from 'fullpath_postalfile'
#     key: state abbreviation, value: state name
expected_state_abbreviations = pd.read_csv(path_datafiles + postal_file,header=None,
                                           quotechar = "'").\
                                           set_index([1])[0].to_dict() 
#len(expected_state_abbreviations) #expect 59 U.S. states and territories

### Useful Functions for Checking the Values

In [8]:
def get_unique_values(df: pd.DataFrame,col: str) -> list:
    '''Returns a list of the unique values in a column given a dictionary of dataframes.    
    
    '''
    unique_values_list = []
    for dict_key in df:
        unique_values_array = df[dict_key][col].unique()
        for val in df[dict_key][col].unique():
            unique_values_list.append(val)
    return unique_values_list

def get_unexpected_values(to_check: set,expect: set)-> set:
    '''Returns a set of unexpected values, empty if none found.
    
    '''
    return to_check.difference(expect)

### Check All Sheets

In [9]:
print('transfer_dict has',len(transfer_dict),'sheets')
print('transfer_dict has',sum([len(x) for x in transfer_dict.values()]),'rows in all sheets')

transfer_dict has 53 sheets
transfer_dict has 141068 rows in all sheets


###### QUESTION: Do the state or territory names on all sheets match U.S. postal names?

In [10]:
incorrect_state_names = [state_name for state_name in transfer_dict 
                         if state_name not in expected_state_abbreviations.values()]
for i in incorrect_state_names:
    print('Misspelled state/territory name: ',i,' abbreviated as ',transfer_dict[i]['State'].unique(),
          ' Correct state name: ',expected_state_abbreviations[transfer_dict[i]['State'].unique()[0]])

Misspelled state/territory name:  Deleware  abbreviated as  ['DE']  Correct state name:  Delaware
Misspelled state/territory name:  Page1_43  abbreviated as  ['SD']  Correct state name:  South Dakota
Misspelled state/territory name:  CNMI  abbreviated as  ['MP']  Correct state name:  Northern Mariana Islands


###### QUESTION: Does each sheet have exactly one value for 'State'?

In [11]:
inconsistant_state_abbreviations = [state_name for state_name in transfer_dict
                                    if len(transfer_dict[state_name]['State'].unique()) != 1]
if len(inconsistant_state_abbreviations) > 0:
    print('These states do not have exactly one state/territory abbreviation:',inconsistant_state_abbreviations)
else:
    print('All sheets have exactly one state/territory abbreviation.')

All sheets have exactly one state/territory abbreviation.


###### QUESTION: Are the values of 'State' valid U.S. postal abbreviations?

In [12]:
unexpected_state_abbreviations = get_unexpected_values(set(get_unique_values(transfer_dict,'State')),
                                               set(expected_state_abbreviations.keys()))

#print('Expected state abbreviations:',list(expected_state_abbreviations.keys()))
if len(unexpected_state_abbreviations) == 0:
    print('\nOnly valid state abbreviations found.')
else:
    print('\nThese state abbreviations are not valid:',list(unexpected_state_abbreviations))


Only valid state abbreviations found.


###### QUESTION: Do all sheets have the expected columns? (each dictionary item is a data frame)

In [13]:
column_discrepancy = []
for state_name in transfer_dict:
    if expected_columns is list(transfer_dict[state_name]):
        column_discrepancy.append(state_name)

print('Expected columns are:',expected_columns)
if len(column_discrepancy) > 0:
    print('Columns need to be checked on these states:\n',column_discrepancy)
else:
    print('\nNo column discrepancies found.')

Expected columns are: ['Complete State', 'State', 'Station Name (LEA)', 'NSN', 'Item Name', 'Quantity', 'UI', 'Acquisition Value', 'DEMIL Code', 'DEMIL IC', 'Ship Date', 'Station Type']

No column discrepancies found.


###### QUESTION: How many unique values are in each column of each sheet?

In [14]:
unique_counts = pd.DataFrame(columns=expected_columns)
count = 0
for state_name in transfer_dict:
    for k,v in transfer_dict[state_name].nunique().iteritems():
        #print(k,v)
        unique_counts.loc[count, k] = v
    unique_counts.loc[count,'Complete State'] = state_name
    count+=1
unique_counts

Unnamed: 0,Complete State,State,Station Name (LEA),NSN,Item Name,Quantity,UI,Acquisition Value,DEMIL Code,DEMIL IC,Ship Date,Station Type
0,Alabama,1,226,1272,864,73,15,1251,7,5,1112,1
1,Alaska,1,6,51,40,20,5,46,6,2,36,1
2,Arkansas,1,126,391,288,52,13,374,6,5,461,1
3,Arizona,1,82,437,326,74,11,427,7,5,522,1
4,California,1,244,2351,1411,179,23,2122,7,5,1235,1
5,Colorado,1,136,445,300,40,7,417,7,6,566,1
6,Connecticut,1,79,162,116,33,6,162,6,4,292,1
7,Deleware,1,28,186,104,27,7,167,7,5,120,1
8,Florida,1,184,505,376,54,9,502,7,5,712,1
9,Georgia,1,291,845,551,85,9,809,7,6,1194,1


###### QUESTION: How many null/NaN values are in each column of each sheet?

In [15]:
null_counts = pd.DataFrame(columns=expected_columns)
count = 0
for state_name in transfer_dict:
    for k,v in transfer_dict[state_name].isna().sum().iteritems():
    #for k,v in transfer_df[state_name].isnull().sum().iteritems():
        #print(k,v)
        null_counts.loc[count, k] = v
    null_counts.loc[count,'Complete State'] = state_name
    count+=1
null_counts

Unnamed: 0,Complete State,State,Station Name (LEA),NSN,Item Name,Quantity,UI,Acquisition Value,DEMIL Code,DEMIL IC,Ship Date,Station Type
0,Alabama,0,0,0,0,0,0,0,0,549,0,0
1,Alaska,0,0,0,0,0,0,0,0,25,0,0
2,Arkansas,0,0,0,0,0,0,0,0,140,0,0
3,Arizona,0,0,0,0,0,0,0,0,260,0,0
4,California,0,0,0,0,0,0,0,0,738,0,0
5,Colorado,0,0,0,0,0,0,0,0,235,0,0
6,Connecticut,0,0,0,0,0,0,0,0,33,0,0
7,Deleware,0,0,0,0,0,0,0,0,49,0,0
8,Florida,0,0,0,0,0,0,0,0,133,0,0
9,Georgia,0,0,0,0,0,0,0,0,263,0,0


###### QUESTION: Are the unique values of 'Station Type' as expected?

In [16]:
unexpected_station_types = get_unexpected_values(set(get_unique_values(transfer_dict,'Station Type')),
                                                 set(expected_station_types))

#print('Expected station types:',expected_station_types)
if len(unexpected_station_types) == 0:
    print('\nOnly expected station types found.')
else:
    print('\nFound these unexpected station types:',list(unexpected_station_types))


Only expected station types found.


###### QUESTION: Are the unique values of 'DEMIL Code' as expected?

In [17]:
unexpected_demil_codes = get_unexpected_values(set(get_unique_values(transfer_dict,'DEMIL Code')),
                                               set(expected_demil_codes))

#print('Expected DEMIL codes:',expected_demil_codes)
if len(unexpected_demil_codes) == 0:
    print('\nOnly expected DEMIL codes found.')
else:
    print('\nFound these unexpected DEMIL codes:',list(unexpected_demil_codes))


Only expected DEMIL codes found.


###### QUESTION: Are the unique values of 'DEMIL IC' as expected?

In [18]:
unexpected_demil_integritycodes = get_unexpected_values(set(get_unique_values(transfer_dict,'DEMIL IC')),
                                                        set(expected_demil_integritycodes))

#print('Expected DEMIL integrity codes:',expected_demil_integritycodes,'\n')
non_nan_list = []
[non_nan_list.append(ic) for ic in unexpected_demil_integritycodes if pd.notna(ic)]
if len(non_nan_list) > 0:
    print('Found these unexpected DEMIL integrity codes:',non_nan_list)
else:
    print('Only expected integrity codes found.')
print('Found',len(unexpected_demil_integritycodes) - len(non_nan_list),
      'sheets with NaN DEMIL integrity codes values. Recall that means the items have yet to be coded by DLA.')

Only expected integrity codes found.
Found 52 sheets with NaN DEMIL integrity codes values. Recall that means the items have yet to be coded by DLA.
