In [2]:
# Standard Setup Import
from _Setup import *

In [3]:
# Import state-level data
state_df = pd.read_csv(state_data_csv_path)
state_df.head()

Unnamed: 0,Fiscal Year,Month Grouping,Month (abbv),Land Border Region,State,Demographic,Citizenship,Title of Authority,Encounter Count
0,2022,FYTD,DEC,Northern Land Border,AK,Single Adults,CANADA,Title 8,3
1,2022,FYTD,DEC,Northern Land Border,ID,Single Adults,CANADA,Title 8,6
2,2022,FYTD,DEC,Northern Land Border,ID,Single Adults,"CHINA, PEOPLES REPUBLIC OF",Title 8,1
3,2022,FYTD,DEC,Northern Land Border,ID,Single Adults,OTHER,Title 8,2
4,2022,FYTD,DEC,Northern Land Border,ME,Accompanied Minors,CANADA,Title 42,1


In [4]:
state_df.replace(to_replace="2025 (FYTD)", value=2025, inplace=True)
state_df.replace(to_replace='2024', value=2024, inplace=True)
print(state_df['Fiscal Year'].unique())

['2022' '2023' 2024 2025 '2020' '2021']


In [5]:
# Create a dictionary mapping month abbreviations (uppercase) to month numbers
month_abbr_to_num = {
    'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4, 'MAY': 5, 'JUN': 6,
    'JUL': 7, 'AUG': 8, 'SEP': 9, 'OCT': 10, 'NOV': 11, 'DEC': 12
}

# Function to convert Fiscal Year and Month (abbv) to a Year-Date
def convert_to_fiscal_year_date(row):
    month_num = month_abbr_to_num[row['Month (abbv)'].upper()]
    fiscal_year = int(row['Fiscal Year'])  # Convert Fiscal Year to integer
    
    # Adjust fiscal year for months January through September
    if month_num >= 10:  # Jan - Sep belong to the previous calendar year
        fiscal_year -= 1
    
    # Format the fiscal year and month into a date string
    return f"{fiscal_year}-{month_num:02d}-01"

# Apply the function to create a Year-Date column in fytd_df
state_df['Year-Date'] = state_df.apply(lambda row: convert_to_fiscal_year_date(row), axis=1)

# Convert the new column to datetime format
state_df['Year-Date'] = pd.to_datetime(state_df['Year-Date'], format='%Y-%m-%d')

# Display the dataframe with the new fiscal year-based Year-Date
print(state_df['Year-Date'].unique())

<DatetimeArray>
['2021-12-01 00:00:00', '2021-11-01 00:00:00', '2021-10-01 00:00:00',
 '2022-04-01 00:00:00', '2022-08-01 00:00:00', '2022-02-01 00:00:00',
 '2022-01-01 00:00:00', '2022-07-01 00:00:00', '2022-06-01 00:00:00',
 '2022-03-01 00:00:00', '2022-05-01 00:00:00', '2022-09-01 00:00:00',
 '2022-12-01 00:00:00', '2022-11-01 00:00:00', '2022-10-01 00:00:00',
 '2023-04-01 00:00:00', '2023-08-01 00:00:00', '2023-02-01 00:00:00',
 '2023-01-01 00:00:00', '2023-07-01 00:00:00', '2023-06-01 00:00:00',
 '2023-03-01 00:00:00', '2023-05-01 00:00:00', '2023-09-01 00:00:00',
 '2023-12-01 00:00:00', '2023-11-01 00:00:00', '2023-10-01 00:00:00',
 '2024-04-01 00:00:00', '2024-08-01 00:00:00', '2024-02-01 00:00:00',
 '2024-01-01 00:00:00', '2024-07-01 00:00:00', '2024-06-01 00:00:00',
 '2024-03-01 00:00:00', '2024-05-01 00:00:00', '2024-09-01 00:00:00',
 '2024-12-01 00:00:00', '2024-11-01 00:00:00', '2024-10-01 00:00:00',
 '2020-04-01 00:00:00', '2020-08-01 00:00:00', '2019-12-01 00:00:00',
 '20

In [6]:
state_df.to_csv(state_data_csv_path_cleaned, index=False)

Seems standard to use the last full year of data as the test set, luckily for us, the data ends in December 2024, so we'll have Jan-Dec 2024 as our test set. We can designate the rest of the data to be in the training set/validation

In [7]:
train_df = state_df[state_df['Year-Date'] < '2024-01-01']
test_df = state_df[state_df['Year-Date'] >= '2024-01-01']

In [9]:
train_df.to_csv(state_data_train, index = False)
test_df.to_csv(state_data_test, index = False)