## Clear and Reformat Population Dataset

In [241]:
import openpyxl
import pandas as pd
import numpy as np

In [242]:
workbook = openpyxl.load_workbook('co-est2023-pop.xlsx')
worksheet = workbook.active

In [243]:
start_row = 6
end_row = 3149
columns = ['A', 'B', 'C', 'D', 'E', 'F']

data = {col: [] for col in columns}

for row in range(start_row, end_row + 1):
    for col in columns:
        cell_value = worksheet[col + str(row)].value
        data[col].append(cell_value)

df = pd.DataFrame(data)

In [244]:
new_column_names = ['County', 'Estimates Base', '2020', '2021', '2022', '2023']
df.columns = new_column_names
df

Unnamed: 0,County,Estimates Base,2020,2021,2022,2023
0,".Autauga County, Alabama",58809,58915,59203,59726,60342
1,".Baldwin County, Alabama",231768,233227,239439,246531,253507
2,".Barbour County, Alabama",25229,24969,24533,24700,24585
3,".Bibb County, Alabama",22301,22188,22359,21986,21868
4,".Blount County, Alabama",59130,59107,59079,59516,59816
...,...,...,...,...,...,...
3139,".Sweetwater County, Wyoming",42271,42197,41626,41374,41249
3140,".Teton County, Wyoming",23323,23379,23605,23297,23232
3141,".Uinta County, Wyoming",20445,20457,20681,20727,20745
3142,".Washakie County, Wyoming",7679,7657,7719,7724,7710


In [245]:
### reformat and clear dataset
county_names =  df['County'].apply(lambda x: (x.split(',')[0][1:]))
state_names = df['County'].apply(lambda x: (x.split(',')[1]))
county_state_names = df['County'].apply(lambda x: (x[1:]))
df['County'] = county_names
df['State'] = state_names
df['County_State'] = county_state_names

In [246]:
### add fips number to the existing dataset
county_df = pd.read_csv('county_fips_master.csv', encoding='ISO-8859-1')
state_df = pd.read_csv('state_fips_master.csv', encoding='ISO-8859-1')

In [247]:
### check non-machtcing cases
# Create a boolean mask where False indicates the county is in county_df
mask = ~df['County'].isin(county_df['county_name'])

# Sum the non-matching entries (optional, if you want the count)
non_matching_count = np.sum(mask)

# Get the indices of non-matching entries
non_matching_indices = df.index[mask]

# If you need to see which are these counties
non_matching_counties = df['County'][mask]
non_matching_counties

df.shape

(3144, 8)

In [248]:
### discard non matching cases
df_clear = df[~mask]
df_clear.shape
county_df['county_state_name'] = county_df['county_name'] + ', ' +  county_df['state_name']
df_combine = pd.merge(df_clear, county_df[['county_state_name', 'fips']], left_on='County_State', right_on='county_state_name', how='left')
df_combine.drop(columns='county_state_name', inplace=True)

In [249]:
### process fips into five-digit strings
df_combine['state_county_fips'] = df_combine['fips'].astype(str).apply(lambda x: '0' + x if len(x) < 5 else x)
df_combine.drop(columns='fips', inplace=True)

In [250]:
### drop estimates base
df_combine.drop(columns='Estimates Base', inplace=True)

In [252]:
df_combine

Unnamed: 0,County,2020,2021,2022,2023,State,County_State,state_county_fips
0,Autauga County,58915,59203,59726,60342,Alabama,"Autauga County, Alabama",01001
1,Baldwin County,233227,239439,246531,253507,Alabama,"Baldwin County, Alabama",01003
2,Barbour County,24969,24533,24700,24585,Alabama,"Barbour County, Alabama",01005
3,Bibb County,22188,22359,21986,21868,Alabama,"Bibb County, Alabama",01007
4,Blount County,59107,59079,59516,59816,Alabama,"Blount County, Alabama",01009
...,...,...,...,...,...,...,...,...
3127,Sweetwater County,42197,41626,41374,41249,Wyoming,"Sweetwater County, Wyoming",56037
3128,Teton County,23379,23605,23297,23232,Wyoming,"Teton County, Wyoming",56039
3129,Uinta County,20457,20681,20727,20745,Wyoming,"Uinta County, Wyoming",56041
3130,Washakie County,7657,7719,7724,7710,Wyoming,"Washakie County, Wyoming",56043


In [251]:
df_combine.columns = ['county', '2020', '2021', '2022', '2023', 'state', 'county_state', 'id']

ValueError: Length mismatch: Expected axis has 8 elements, new values have 7 elements

In [None]:
df_combine.to_csv('state_county_population.csv', index=False)