In [None]:
import datetime as dt
import pandas as pd
import re

from pathlib import Path

#!python --version     #Python 3.8.5
#pd.__version__       #1.1.2
#re.__version__       #2.2.1
# datetime standard module
# pathlib standard module

In [None]:
#CUSTOM SETTINGS: set these as appropriate for your environment

# Enter the path to the local data files:
path_datafiles = "../../data/"

# Enter the path to save the clean, merged data:
path_mergedfiles = "../../data/merged/"

# This is the LESO_file from Check_DISP_AllStatesAndTerritories.ipynb
# Please check the file there before trying this notebook.
# Exactly one LESOfile_all line must be active: the OriginatingFile values
# built at the bottom of this cell raise NameError if it is undefined.
#LESOfile_all = "DISP_AllStatesAndTerritories_03312020.xlsx"
#LESOfile_all = "DISP_AllStatesAndTerritories_06302020.xlsx"
LESOfile_all = "DISP_AllStatesAndTerritories_09302020.xlsx"
# like "DISP_Shipments_Cancellations_mmddyyyy_mmddyyyy.xlsx"
# Exactly one LESOfile_qtr line must be active, for the same reason.
#LESOfile_qtr = "DISP_Shipments_Cancellations_01012020_03312020.xlsx"
#LESOfile_qtr = "DISP_Shipments_Cancellations_04012020_06302020.xlsx"
LESOfile_qtr = "DISP_Shipments_Cancellations_07012020_09302020.xlsx"

# values for OriginatingFiles column: source file name plus a tag for the sheet
originate_allstates = LESOfile_all + '[ALL]'
originate_shipments = LESOfile_qtr + '[SHIPMENTS]'
originate_cancellations = LESOfile_qtr + '[CANCELLATIONS]'

In [None]:
# Column order of the merged schema; each of the three dataframes is
# re-ordered to this list before they are concatenated.
ordered_columns_list = [
    "OriginatingFile",
    "StateAbbreviation",
    "RequestingAgency",
    "ItemDescription",
    "RecordDate",
    "AcquisitionValue",
    "Quantity",
    "UnitIncrement",
    "Item_FSG",
    "Item_FSC",
    "Item_CC",
    "Item_Code",
    "Justification",
    "NSN",
    "FSC",
    "NIIN",
    "DEMILCode",
    "DEMILIC",
    "StationType",
    "RequisitionID",
    "CancelledBy",
    "RTDRef",
    "ReasonCancelled",
]

##### PLEASE RUN Check_DISP_AllStatesAndTerritories.ipynb and Check_DISP_Shipments_Cancellations.ipynb FIRST

This notebook works from assumptions in the check notebooks. For more information about various columns and values:   
https://www.dla.mil/Portals/104/Documents/DispositionServices/LESO/DISP_QuickStartGuide_11012017_hyperlinked.pdf   
https://www.dla.mil/DispositionServices/Offers/Reutilization/LawEnforcement/ProgramFAQs.aspx

This notebook merges the AllStatesAndTerritories and Shipments_Cancellations data into a single dataframe. When you run the notebook, you can decide how to save the data (examples are given for saving everything in one tsv file or for splitting it into files by quarter or state abbreviation).

The idea is that the original data can be recreated from the merged dataframe. For analysis inside a notebook, all of these columns would not be needed.

###### Data Dictionary for Merged Data:

   
| Field | Data Type | Description | Original Column | Length | Expected Pattern | null? |   
| ----- | ---- | ---- | ---- | ---- |---- | ---- |   
||| __Constructed Fields__ |||||   
| OriginatingFile | string | file that populated this record | created from LESO filename and sheet | varies | see Custom Settings above | no |   
| Item_FSG | string | supply category the item belongs to; see [Federal Supply Group Number](https://en.wikipedia.org/wiki/List_of_NATO_Supply_Classification_Groups#References) | file dependent, digits 1&2 of \['NSN','FSC'\]| 2 | \[0-9\]{2} | no |   
| Item_FSC | string | supply class the item belongs to; see [Federal Supply Group Number](https://en.wikipedia.org/wiki/List_of_NATO_Supply_Classification_Groups#References) | file dependent, digits 3&4 of \['NSN','FSC'\] | 2 | \[0-9\]{2} | no |   
| Item_CC | string | country code for where final assembly of item occurred (a.k.a. nation code); see [National Codification Bureau](https://en.wikipedia.org/wiki/National_Codification_Bureau) | file dependent, digits 5&6 of \['NSN'\] or digits 1&2 of \['NIIN'\]| 2 | \[0-9\]{2} | no |   
| Item_Code | string | 7-digit item identifier that follows the country code; see [National Item Identification Number](https://en.wikipedia.org/wiki/NATO_Stock_Number#National_Item_Identification_Number_(NIIN)) | file dependent, last 7 digits of \['NSN','NIIN'\] | 7 | \[0-9\]{7} | no |   
||| __Fields in All Files__ |||||   
| StateAbbreviation | string | two-letter postal abbreviation for U.S. state or territory | State | 2 | \[A-Z\]\[A-Z\] | no |   
| RequestingAgency | string | descriptive name of requesting law enforcement agency | Station Name (LEA) | varies | varies | no |   
| ItemDescription | string | descriptive name of requested item | Item Name | varies | varies | no |   
| RecordDate | datetime64 | date | file dependent \['Ship Date','Date Shipped','Date Requested'\] | 29 | yyyy-mm-ddT00:00:00.000000000 | no |   
| AcquisitionValue | float | U.S. dollar amount paid when the item was originally purchased by the government | Acquisition Value | varies | [0-9]+.[0-9]{2} | no |   
| Quantity | integer | number of units requested | Quantity | varies | [0-9]+ | no |   
| UnitIncrement | string | units of requested item known as unit increments | UI | varies | varies | no |   
||| __Fields in AllStatesAndTerritories Only__ | __fill value 'not in file'__ ||||   
| NSN | string | [NATO Stock Number](https://en.wikipedia.org/wiki/NATO_Stock_Number) a government-assigned identifier for requested item | NSN | 16 (13 characters plus 3 hyphens) | \[0-9\]{4}-\[0-9\]{2}-\[A-Z0-9\]{3}-\[A-Z0-9\]{4} | no |   
| DEMILCode | character | [demilitarization code](https://www.dla.mil/HQ/LogisticsOperations/Services/FIC/DEMILCoding/DEMILCodes/) for level of destruction required when the item leaves Department of Defense control | DEMIL Code | 1 | \[GPFDCEBQA\] | no |   
| DEMILIC | integer | [demilitarization integrity code](https://www.dla.mil/HQ/LogisticsOperations/Services/FIC/DEMILCoding/DEMILCodes/) validity of DEMIL Code (a missing value means it has not yet been reviewed), see [FLIS manual](https://www.dla.mil/HQ/LogisticsOperations/TrainingandReference/FLISProcedures/) for more information | DEMIL IC | 1 | [0-9] or blank | yes |   
| StationType | string | level of government associated with requesting agency; needs further research | Station Type | 5 | 'State' | no |   
||| __Fields in Shipments_Cancellations Only__ | __fill value 'not in file'__ ||||   
| FSC | string | [Federal Supply Number](https://en.wikipedia.org/wiki/NATO_Stock_Number#Federal_Supply_Classification_Group_(FSCG)) consisting of the Federal Supply Group and Federal Supply Classification | FSC | 4 | \[0-9\]{4} | no |   
| NIIN | string | [National Item Identification Number](https://en.wikipedia.org/wiki/NATO_Stock_Number#National_Item_Identification_Number_(NIIN)) a Country Code followed by a 7-digit item identifier string | NIIN | 9 | \[0-9\]{9} | no |   
| Justification | string | descriptive text justifying request; needs further research | Justification | varies | varies | yes |   
||| __Fields in Shipments Only__ | __fill value 'not in file'__ ||||   
| RequisitionID | string | apparently unique identifier; needs further research | Requisition ID | 14 | [A-Za-z0-9]{14} | no |   
||| __Fields in Cancellations Only__ | __fill value 'not in file'__ ||||   
| CancelledBy | string | apparently agency that cancelled request; needs further research | Cancelled By | varies | varies | yes | 
| RTDRef | string | apparently unique identifier; needs further research | RTD Ref | 6 or 7 | [0-9]{6,7} | no |     
| ReasonCancelled | string | why request is cancelled; needs further research | Reason Cancelled | varies | varies | yes |   

## transfer_df created from DISP_AllStatesAndTerritories

#### Make sure you have set path_datafiles and LESOfile_all variables above.

In [None]:
# Source column header -> merged-schema column name for the
# AllStatesAndTerritories sheets.
trans_columns_dictionary = {
    "State": "StateAbbreviation",
    "Station Name (LEA)": "RequestingAgency",
    "NSN": "NSN",
    "Item Name": "ItemDescription",
    "Quantity": "Quantity",
    "UI": "UnitIncrement",
    "Acquisition Value": "AcquisitionValue",
    "DEMIL Code": "DEMILCode",
    "DEMIL IC": "DEMILIC",
    "Ship Date": "RecordDate",
    "Station Type": "StationType",
}

# Source headers, in the same order as the mapping's keys.
trans_expected_columns = list(trans_columns_dictionary)

In [None]:
# sheet_name=None makes read_excel load every sheet of the workbook and
# return them as a dictionary of dataframes.
# see Check_DISP_AllStatesAndTerritories.ipynb for a full explanation
excel_dict = pd.read_excel(f"file:{path_datafiles}{LESOfile_all}", sheet_name=None)

In [None]:
#CHECK What is the number of records in the original data?
# Add up the row counts of every sheet's dataframe.
total_transfers = sum(len(sheet_df) for sheet_df in excel_dict.values())
total_transfers

In [None]:
# Convert the dictionary to a single dataframe with the records for all states/territories.
#    rename the columns to new schema (see 'Data Dictionary for Merged Data' above)
#    strip leading/trailing white space from string values in object columns
# Stripping is done value by value: calling .str.strip() on an object column
# replaces every non-string value with NaN, which would silently lose data
# in a column that mixes strings with numbers.
transfer_df = (
    pd.concat(list(excel_dict.values()), ignore_index=True)
    .rename(columns=trans_columns_dictionary)
    .apply(lambda col: col.map(lambda v: v.strip() if isinstance(v, str) else v)
           if col.dtype == 'object' else col)
)

In [None]:
# Done with excel_dict, so release a bit of memory.
# NOTE: this empties the dictionary in place, so the cells above that read
# from excel_dict cannot be re-run until the spreadsheet is read again.
excel_dict.clear()

In [None]:
# Break 'NSN' into NATO Stock Number units. (see 'Columns in Merged Dataframe' above)
# Remove the hyphens once, then slice the remaining characters by position.
nsn_compact = transfer_df['NSN'].str.replace('-', '')
transfer_df = transfer_df.assign(
    Item_FSG=nsn_compact.str[:2].values,
    Item_FSC=nsn_compact.str[2:4].values,
    Item_CC=nsn_compact.str[4:6].values,
    Item_Code=nsn_compact.str[6:].values,
)

In [None]:
# Fill missing columns with 'not in file' value to distinguish them from NaN/null values.
for missing_column in ['FSC', 'NIIN', 'Justification', 'RequisitionID',
                       'CancelledBy', 'RTDRef', 'ReasonCancelled']:
    transfer_df[missing_column] = 'not in file'
# Fill 'OriginatingFile' column.
# The custom settings cell names this value originate_allstates; the former
# reference to originate_all raised NameError because no such name is defined.
transfer_df['OriginatingFile'] = originate_allstates

In [None]:
# Put the columns in the shared merged-schema order ahead of the merge.
transfer_df = transfer_df.loc[:, ordered_columns_list]

###### Check Dataframe

In [None]:
# Display the (rows, columns) shape of the prepared transfer dataframe.
transfer_df.shape
# Only DEMILIC should have NaN/null values; (see 'Data Dictionary for Merged Data' above)
# Uncomment the next line to count the nulls per column instead.
#transfer_df.isna().sum()

## shipments_df created from DISP_Shipments_Cancellations

#### Make sure you have set path_datafiles and LESOfile_qtr variables above.

In [None]:
# Source column header -> merged-schema column name for the SHIPMENTS sheet.
ship_columns_dictionary = {
    "State": "StateAbbreviation",
    "Station Name (LEA)": "RequestingAgency",
    "Requisition ID": "RequisitionID",
    "FSC": "FSC",
    "NIIN": "NIIN",
    "Item Name": "ItemDescription",
    "UI": "UnitIncrement",
    "Quantity": "Quantity",
    "Acquisition Value": "AcquisitionValue",
    "Date Shipped": "RecordDate",
    "Justification": "Justification",
}

# Source headers, in the same order as the mapping's keys.
ship_expected_columns = list(ship_columns_dictionary)

In [None]:
# Only 'SHIPMENTS' sheet in the original file is read into a dataframe.
# see Check_DISP_Shipments_Cancellations.ipynb for a full explanation
#    rename the columns to new schema (see 'Data Dictionary for Merged Data' above)
#    strip leading/trailing white space from string values in object columns
# Stripping is done value by value: calling .str.strip() on an object column
# replaces every non-string value with NaN, which would silently lose data
# in a column that mixes strings with numbers.
shipments_df = (
    pd.read_excel("file:" + path_datafiles + LESOfile_qtr, sheet_name='SHIPMENTS')
    .rename(columns=ship_columns_dictionary)
    .apply(lambda col: col.map(lambda v: v.strip() if isinstance(v, str) else v)
           if col.dtype == 'object' else col)
)

In [None]:
#CHECK What is the number of records in the original data?
# The first element of the displayed shape tuple is the row count.
shipments_df.shape

In [None]:
# Break 'FSC' and 'NIIN' into NATO Stock Number units. (see 'Columns in Merged Dataframe' above)
# FSC is converted to text before slicing; NIIN is sliced through .str as-is.
ship_fsc_text = shipments_df['FSC'].astype(str)
ship_niin = shipments_df['NIIN']
shipments_df = shipments_df.assign(
    Item_FSG=ship_fsc_text.str[:2],
    Item_FSC=ship_fsc_text.str[2:4],
    Item_CC=ship_niin.str[:2].values,
    Item_Code=ship_niin.str[2:].values,
)

In [None]:
# Columns that the SHIPMENTS sheet does not supply get the literal
# 'not in file' so they can be told apart from genuine NaN/null values;
# 'OriginatingFile' records which file and sheet the rows came from.
shipments_df = shipments_df.assign(
    **dict.fromkeys(['NSN', 'DEMILCode', 'DEMILIC', 'StationType',
                     'CancelledBy', 'RTDRef', 'ReasonCancelled'], 'not in file'),
    OriginatingFile=originate_shipments,
)

In [None]:
# Put the columns in the shared merged-schema order ahead of the merge.
shipments_df = shipments_df.loc[:, ordered_columns_list]

###### Check Dataframe

In [None]:
# Display the (rows, columns) shape of the prepared shipments dataframe.
shipments_df.shape
# Expect no NaN/null values; (see 'Data Dictionary for Merged Data' above)
# Uncomment the next line to count the nulls per column instead.
#shipments_df.isna().sum()

### Prepare cancellations_df From DISP_Shipments_Cancellations

#### Make sure you have set path_datafiles and LESOfile_qtr variables above.

In [None]:
# Source column header -> merged-schema column name for the CANCELLATIONS sheet.
canc_columns_dictionary = {
    "Cancelled By": "CancelledBy",
    "RTD Ref": "RTDRef",
    "State": "StateAbbreviation",
    "Station Name (LEA)": "RequestingAgency",
    "FSC": "FSC",
    "NIIN": "NIIN",
    "Item Name": "ItemDescription",
    "UI": "UnitIncrement",
    "Quantity": "Quantity",
    "Acquisition Value": "AcquisitionValue",
    "Date Requested": "RecordDate",
    "Justification": "Justification",
    "Reason Cancelled": "ReasonCancelled",
}

# Source headers, in the same order as the mapping's keys.
canc_expected_columns = list(canc_columns_dictionary)

In [None]:
# Only 'CANCELLATIONS' sheet in the original file is read into a dataframe.
# see Check_DISP_Shipments_Cancellations.ipynb for a full explanation
#    rename the columns to new schema (see 'Data Dictionary for Merged Data' above)
#    strip leading/trailing white space from string values in object columns
# Stripping is done value by value: calling .str.strip() on an object column
# replaces every non-string value with NaN, which would silently lose data
# in a column that mixes strings with numbers.
cancellations_df = (
    pd.read_excel("file:" + path_datafiles + LESOfile_qtr, sheet_name='CANCELLATIONS')
    .rename(columns=canc_columns_dictionary)
    .apply(lambda col: col.map(lambda v: v.strip() if isinstance(v, str) else v)
           if col.dtype == 'object' else col)
)

In [None]:
# Break 'FSC' and 'NIIN' into NATO Stock Number units. (see 'Columns in Merged Dataframe' above)
# FSC is converted to text before slicing; NIIN is sliced through .str as-is.
canc_fsc_text = cancellations_df['FSC'].astype(str)
canc_niin = cancellations_df['NIIN']
cancellations_df = cancellations_df.assign(
    Item_FSG=canc_fsc_text.str[:2],
    Item_FSC=canc_fsc_text.str[2:4],
    Item_CC=canc_niin.str[:2].values,
    Item_Code=canc_niin.str[2:].values,
)

In [None]:
# Columns that the CANCELLATIONS sheet does not supply get the literal
# 'not in file' so they can be told apart from genuine NaN/null values;
# 'OriginatingFile' records which file and sheet the rows came from.
cancellations_df = cancellations_df.assign(
    **dict.fromkeys(['NSN', 'DEMILCode', 'DEMILIC', 'StationType',
                     'RequisitionID'], 'not in file'),
    OriginatingFile=originate_cancellations,
)

In [None]:
# Put the columns in the shared merged-schema order ahead of the merge.
cancellations_df = cancellations_df.loc[:, ordered_columns_list]

In [None]:
# Display the (rows, columns) shape of the prepared cancellations dataframe.
cancellations_df.shape
# Found NaN/null values in 'Justification' and 'ReasonCancelled'; (see 'Data Dictionary for Merged Data' above)
# Uncomment the next line to count the nulls per column instead.
#cancellations_df.isna().sum()

### Merge All Datasets

In [None]:
#TODO per Nicole...more information for user; explain this better
# https://sandimetz.com/speaking OOP coding principles

# All three dataframes must have the same columns in the same order before
# they are concatenated. Comparing the other two against transfer_df is
# enough: if both match it, they also match each other.
reference_columns = list(transfer_df.columns)
mismatched_frames = [
    frame_name
    for frame_name, frame in (('shipments_df', shipments_df),
                              ('cancellations_df', cancellations_df))
    if list(frame.columns) != reference_columns
]
if mismatched_frames:
    # Stop here so that a mis-shaped merge is never written to the output files.
    raise ValueError('Columns do not match transfer_df in: ' + ', '.join(mismatched_frames))

In [None]:
# Stack the three prepared dataframes row-wise. ignore_index=True gives the
# merged frame a fresh 0..n-1 index; otherwise each source's own row labels
# are kept and the same label appears up to three times.
all_data_df = pd.concat([transfer_df, shipments_df, cancellations_df],
                        axis=0, ignore_index=True)

### Storing the Merged Data

In [None]:
# If you want to break the merged dataframe into files
# by state abbreviation
save_on_column = 'StateAbbreviation'

# If you want to break the merged dataframe into quarters
# and save it in tab-separated files by quarter.
#all_data_df['Quarter'] = pd.PeriodIndex(all_data_df.RecordDate, freq='Q')
#save_on_column = 'Quarter'

# Make sure the output directory exists; to_csv does not create it.
output_dir = Path(path_mergedfiles)
output_dir.mkdir(parents=True, exist_ok=True)

# Write one tab-separated file per distinct value of save_on_column.
# NOTE: an existing file is appended to (without a header row), so running
# this cell twice for the same input writes the same rows twice.
split_values = all_data_df[save_on_column]
for i in list(split_values.unique()):
    my_file = output_dir / ('LESO_' + str(i) + '.tsv')
    # A missing value never compares equal to itself, so select those rows
    # with isna(); otherwise they would be left out of every file.
    row_mask = split_values.isna() if pd.isna(i) else split_values == i
    file_exists = my_file.exists()
    all_data_df[row_mask].to_csv(my_file, header=not file_exists, index=False,
                                 mode='a' if file_exists else 'w',
                                 sep='\t', escapechar="\\")

In [None]:
# If you want the whole dataframe in one file
# (As written, an existing file is appended to without a header row, so
# running this twice for the same input writes the same rows twice.)
#my_file = Path(path_mergedfiles + 'LESO_mergeall.tsv')
#if my_file.exists():
#    all_data_df.to_csv(my_file, header=False, index=False, mode='a', sep='\t', escapechar="\\")
#else:
#    all_data_df.to_csv(my_file, index=False, mode='w', sep='\t', escapechar="\\")

In [None]:
# Example of how to read the data file into a dataframe:
# NOTE: csv.QUOTE_NONE needs `import csv`, which the import cell at the top of
# this notebook does not include, and `filename` must be set to the name of
# one of the saved .tsv files before these lines are used.
#my_file = Path(path_mergedfiles + filename)
#df = pd.read_csv(my_file,sep='\t',header=[0],index_col=None,
#                 quoting=csv.QUOTE_NONE, quotechar="",  escapechar="\\")
