# Importing & Preprocessing

In [None]:
# Import the necessary modules.
import numpy as np
import pandas as pd

In [None]:
# Load the 311 service requests CSV, using the 'Unique Key' column as the DataFrame index.
df = pd.read_csv(
    "311_Service_Requests_2020.csv",
    index_col="Unique Key",
    low_memory=False,
)

# Inspect the column dtypes and the first rows. Note that 'Incident Zip' is reported as
# float64, which does not look right for a zip-code column.
df.info()
df.head()

In [None]:
# List the distinct 'Incident Zip' entries to see which ones look wrong.
incident_zips = df["Incident Zip"]
incident_zips.unique()

In [None]:
# Among the zip codes listed above, '83' and 'nan' are definitely not valid zip codes, but '12345' needs investigating.
# '11697' is the maximum zip code in NYC, so select the rows whose zip code exceeds it (computed once, reused below).
oversized = df[df["Incident Zip"] > 11697]
print(oversized["Incident Zip"].unique())  # The oversized zip codes themselves.
print(oversized["Incident Address"].unique())  # The addresses recorded with the oversized zip codes.

In [None]:
# From the above addresses, we see that '10030' is incorrectly coded as '12345', and we are ready to fix the zip codes.
def fix_zip(zip_code):
    """
    Correct a single 'Incident Zip' entry and return it as a string.

    Args:
        zip_code: A raw zip code entry (e.g. a float such as 10030.0, or NaN).

    Returns:
        The zip code as a string, with '12345' recoded as '10030'. Returns
        'np.nan' when the entry cannot be converted to an integer or is
        below '10001', the minimum zip code in NYC.
    """
    # Keep the 'try' body minimal: only the conversion itself can legitimately fail.
    # NaN raises ValueError, infinity raises OverflowError, and None raises TypeError.
    try:
        result = int(zip_code)
    except (TypeError, ValueError, OverflowError):
        return np.nan  # 'np.nan' is used because the 'np.NaN' alias was removed in NumPy 2.0.

    if result < 10001:  # '10001' is the minimum zip code in NYC.
        return np.nan
    if result == 12345:  # Recode '12345' as '10030'.
        return "10030"
    return str(result)  # Return a correct zip code as a string.

# Run fix_zip over every 'Incident Zip' entry and store the cleaned values back in the DataFrame.
raw_zips = df["Incident Zip"]
df["Incident Zip"] = raw_zips.apply(fix_zip)