In [None]:
import pandas as pd
import re

# !python --version    #Python 3.8.5
# pd.__version__       #1.1.2
# re.__version__       #2.2.1
#  datetime standard module

In [None]:
# useful functions
def make_dataframe(a_dict: dict) -> pd.DataFrame:
    """Combine every dataframe in a dictionary into one date-indexed dataframe.

    Parameters
    ----------
    a_dict : dict
        Mapping of key -> DataFrame, e.g. the result of
        ``pd.read_excel(..., sheet_name=None)``.  Every frame must contain a
        datetime-like 'Ship Date' column.

    Returns
    -------
    pd.DataFrame
        The rows of all frames stacked together.  Surrounding whitespace is
        removed from every string value.  The index is named 'Date' and holds
        'Ship Date' normalized to midnight, sorted chronologically so that
        ``.loc[start:end]`` date-range slicing is well defined.

    Raises
    ------
    ValueError
        If ``a_dict`` is empty (raised by ``pd.concat``).
    KeyError
        If the combined frame has no 'Ship Date' column.
    """
    def _strip_text(column: pd.Series) -> pd.Series:
        """Strip whitespace from the string values of a text column only."""
        is_text = (pd.api.types.is_object_dtype(column)
                   or pd.api.types.is_string_dtype(column))
        if not is_text:
            return column
        # Element-wise on purpose: `.str.strip()` would turn every non-string
        # value of a mixed-type column (e.g. numeric codes) into NaN.
        return column.map(lambda value: value.strip() if isinstance(value, str) else value)

    a_df = pd.concat(list(a_dict.values()), ignore_index=True).apply(_strip_text)
    a_df.index = pd.DatetimeIndex(a_df['Ship Date']).normalize()
    a_df.index.name = 'Date'
    # Stable sort keeps the original relative order of rows sharing a date.
    return a_df.sort_index(kind='mergesort')


def count_by_time(frame1: pd.DataFrame, frame2: pd.DataFrame,
                  start: str, end: str, time_unit: str) -> pd.DataFrame:
    """Return the time bins in which the two frames hold a different number of rows.

    Parameters
    ----------
    frame1, frame2 : pd.DataFrame
        Frames with a DatetimeIndex and a 'State' column.
    start, end : str
        Inclusive date bounds of the window to compare (e.g. '2019-01-01').
    time_unit : str
        pandas offset alias used to bin the window (e.g. 'D', 'W').

    Returns
    -------
    pd.DataFrame
        Indexed by bin, with integer columns 'count1' (frame1) and 'count2'
        (frame2), containing only the bins where the two counts differ.  A bin
        that exists in only one frame counts as 0 for the other.
    """
    def _bin_counts(frame: pd.DataFrame) -> pd.Series:
        """Count non-null 'State' values per time bin inside the window."""
        # Sort first: label slicing on an unsorted DatetimeIndex raises
        # KeyError on recent pandas when a bound is not itself in the index.
        dated = frame.loc[frame.index.notna()].sort_index()
        return dated.loc[start:end].groupby(pd.Grouper(freq=time_unit))['State'].count()

    # Building from a dict of Series aligns on the union of both bin indexes,
    # so bins present in only one frame are kept instead of being dropped.
    counts_df = pd.DataFrame({'count1': _bin_counts(frame1),
                              'count2': _bin_counts(frame2)})
    counts_df = counts_df.fillna(0).astype(int)
    return counts_df[counts_df['count1'] != counts_df['count2']]

### Prepare data

In [None]:
# Enter the path to the local copy of the original data files.
# The path must end with a separator: file names are appended to it by
# plain string concatenation when the workbooks are loaded.
path_datafiles = "../../data/"

# Get the quarterly cumulative LESO Transferred Property data files from the
#     Defense Logistics Agency Law Enforcement Support Office Public Information
# The original name of each data file should be in the form:
#      DISP_AllStatesAndTerritories_mmddyyyy.xlsx
# Enter the local file names
LESO_Q1_file = "DISP_AllStatesAndTerritories_03312020.xlsx"
LESO_Q2_file = "DISP_AllStatesAndTerritories_06302020.xlsx"
#LESO_Q3_file = "DISP_AllStatesAndTerritories_09302020.xlsx"
#LESO_Q4_file = "DISP_AllStatesAndTerritories_12312020.xlsx"
# Inclusive date window used by later cells to slice both quarters
# before comparing them.
start_date = '2017-01-01'
end_date = '2020-03-31'

In [None]:
# Load each workbook into a dictionary of states (sheet names) -> dataframes
# (sheet contents); sheet_name=None makes read_excel return every sheet.
# A plain filesystem path is passed: a "file:" URL built from a relative path
# is not the form pandas documents for file URLs (a host is expected) and
# would route the read through urllib.
q1_dict = pd.read_excel(path_datafiles + LESO_Q1_file, sheet_name=None)
q2_dict = pd.read_excel(path_datafiles + LESO_Q2_file, sheet_name=None)
#q3_dict = pd.read_excel(path_datafiles + LESO_Q3_file, sheet_name=None)
#q4_dict = pd.read_excel(path_datafiles + LESO_Q4_file, sheet_name=None)

In [None]:
# Flatten each quarter's workbook dictionary into one date-indexed dataframe,
# then report how many rows/columns fall inside the comparison window.
start_df = make_dataframe(a_dict=q1_dict)
end_df = make_dataframe(a_dict=q2_dict)
print('Shape of starting quarter:',
      start_df.loc[slice(start_date, end_date)].shape)
print('Shape of ending quarter:',
      end_df.loc[slice(start_date, end_date)].shape)

### Analyze data

In [None]:
# Coarsest view: which years of the comparison window have differing row counts?
count_by_time(
    frame1=start_df,
    frame2=end_df,
    start=start_date,
    end=end_date,
    time_unit='Y',
)
# Optional bar chart of the same result:
#ax = count_by_time(start_df, end_df, start_date, end_date, 'Y').plot.bar(rot=90,figsize=(10,5))

In [None]:
# Drill down: which months of the chosen year have differing row counts?
year_start, year_end = '2019-01-01', '2019-12-31'
count_by_time(
    frame1=start_df,
    frame2=end_df,
    start=year_start,
    end=year_end,
    time_unit='M',
)
# Optional bar chart of the same result:
#ax = count_by_time(start_df, end_df, year_start, year_end, 'M').plot.bar(rot=90,figsize=(10,5))

###### First month with discrepancies

In [None]:
# Drill down: which weeks of the chosen month have differing row counts?
month_start, month_end = '2019-05-01', '2019-05-31'
count_by_time(
    frame1=start_df,
    frame2=end_df,
    start=month_start,
    end=month_end,
    time_unit='W',
)
# Optional bar chart of the same result:
#ax = count_by_time(start_df, end_df, month_start, month_end, 'W').plot.bar(rot=90,figsize=(10,5))

In [None]:
# Drill down: which days of the chosen week have differing row counts?
week_start, week_end = '2019-06-16', '2019-06-23'
count_by_time(
    frame1=start_df,
    frame2=end_df,
    start=week_start,
    end=week_end,
    time_unit='D',
)
# Optional bar chart of the same result:
#ax = count_by_time(start_df, end_df, week_start, week_end, 'D').plot.bar(rot=90,figsize=(10,5))

In [None]:
# Which records differ between the two quarters on the day of interest?
# With no `on=` argument the outer merge joins on every column the two frames
# share; indicator=True adds a '_merge' column saying where each row was found.
day_of_interest = '2019-06-21'
merged_df = pd.merge(start_df.loc[day_of_interest], end_df.loc[day_of_interest], how='outer', indicator=True)
# Show both directions of mismatch: 'left_only' rows exist only in the
# starting quarter, 'right_only' rows exist only in the ending quarter.
merged_df[merged_df['_merge'] != 'both']

In [None]:
# If the merged data doesn't reveal the discrepancy, compare the number of
# records per law enforcement agency (LEA) on the day of interest instead.
start_lea_count = start_df.loc[day_of_interest].groupby(['Station Name (LEA)'])['Station Name (LEA)'].count()
end_lea_count = end_df.loc[day_of_interest].groupby(['Station Name (LEA)'])['Station Name (LEA)'].count()
# Label the two columns; without keys both would be named 'Station Name (LEA)'
# and the starting and ending quarter could not be told apart.
concat_count = pd.concat([start_lea_count, end_lea_count], axis=1, keys=['count1', 'count2'])
# An LEA missing from one quarter has NaN there; show it as a count of 0.
concat_count.fillna(0).astype(int)

In [None]:
# If more than one month has discrepancies, repeat the previous 4 cells as needed.