In [38]:
import pandas as pd

In [39]:
# Load the raw FY2023 property assessment export; df_original is kept
# unmodified so later cells can always start again from the raw data.
df_original = pd.read_csv(
    filepath_or_buffer='../data/fy2023-property-assessment-data.csv',
)

In [40]:
# Remove duplicate parcels.
# keep=False flags *every* row whose PID occurs more than once, so a repeated
# PID is removed entirely rather than reduced to a single row.
# NOTE(review): if one row per parcel should survive, keep='first' is the
# option to use instead - confirm the intent.
df_cleaned = df_original.loc[~df_original.duplicated(subset=['PID'], keep=False)]

In [41]:
# Keep only the rows whose TOTAL_VALUE is not 0
# (rows with a missing TOTAL_VALUE compare as "not equal" and are kept).
df_cleaned = df_cleaned.loc[df_cleaned['TOTAL_VALUE'].ne(0)]

In [42]:
# Drop Condo Parking Spaces: keep every row whose LU code is not 'CP'
df_cleaned = df_cleaned.loc[df_cleaned['LU'].ne('CP')]

In [43]:
# Drop the columns that are not carried into the cleaned dataset.
# drop() raises KeyError if any of these labels is missing from the frame.
df_cleaned = df_cleaned.drop(
    columns=[
        'PID', 'CM_ID', 'GIS_ID',
        'ST_NUM', 'ST_NAME', 'UNIT_NUM',
        'BLDG_SEQ', 'NUM_BLDGS',
        'OWN_OCC', 'OWNER', 'OWNER MAIL ADDRESS',
        'RES_FLOOR', 'CD_FLOOR',
        'RES_UNITS', 'COM_UNITS', 'RC_UNITS',
        'LIVING_AREA',
        'LAND_VALUE', 'BLDG_VALUE', 'GROSS_TAX',
        'BTHRM_STYLE1', 'BTHRM_STYLE2', 'BTHRM_STYLE3',
        'KITCHEN_STYLE1', 'KITCHEN_STYLE2', 'KITCHEN_STYLE3',
    ],
)

In [47]:
# Drop three further columns from the working frame
df_cleaned = df_cleaned.drop(
    columns=[
        'STRUCTURE_CLASS',
        'ORIENTATION',
        'CORNER_UNIT',
    ],
)

In [48]:
# Per-column summary of df_cleaned: how many values are missing and the dtype
meta_df = pd.DataFrame({
    'NA_Count': df_cleaned.isna().sum(),
    'Type': df_cleaned.dtypes,
})

# Display only the columns that contain at least one missing value
# (meta_df itself is left unfiltered)
meta_df.loc[meta_df['NA_Count'].ne(0)]

Unnamed: 0,NA_Count,Type


In [49]:
# Fill the remaining missing values, choosing the placeholder by column type:
#   * numeric columns      -> -1
#   * every other column   -> 'NA - Not Available or Not Applicable'
#
# The columns to fill are derived from df_cleaned itself, so this cell gives
# the right result even if the meta_df summary was computed before a later
# change to df_cleaned. The branch tests for a numeric dtype instead of
# comparing the dtype with 'object': text columns are not always stored as
# 'object' (pandas also has a dedicated string dtype), and such a column must
# not be sent to the -1 branch.
for col in df_cleaned.columns[df_cleaned.isna().any()]:
    if pd.api.types.is_numeric_dtype(df_cleaned[col]):
        df_cleaned[col] = df_cleaned[col].fillna(-1)
    else:
        df_cleaned[col] = df_cleaned[col].fillna('NA - Not Available or Not Applicable')

In [50]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file
df_cleaned.to_csv(
    path_or_buf='../data/cleaned/fy2023-property-assessment-data-cleaned.csv',
    index=False,
)