# Importing & Preprocessing

Please refer to the previous file ***Top10.ipynb*** for detailed comments on the following procedures.

In [None]:
import numpy as np
import pandas as pd
# Load the 2020 311 service-request export, indexed by each request's "Unique Key".
# low_memory=False makes pandas infer each column's dtype from the whole file at once.
df = pd.read_csv(
    "311_Service_Requests_2020.csv",
    index_col="Unique Key",
    low_memory=False,
)
def fix_zip(zip_code):
    """Normalize one raw "Incident Zip" value.

    Args:
        zip_code: A raw cell value (string, number, NaN, None, ...).

    Returns:
        The zip code as a string of its integer value, or ``np.nan`` when the
        value cannot be converted to an integer or is below 10001. The value
        12345 is remapped to ``"10030"``.
    """
    try:
        result = int(zip_code)
    except (TypeError, ValueError, OverflowError):
        # int() raises TypeError for None/unsupported types, ValueError for
        # non-numeric strings and NaN, and OverflowError for infinities.
        # Catching only these keeps KeyboardInterrupt/SystemExit propagating.
        return np.nan  # np.nan rather than np.NaN: the alias is gone in NumPy 2.0

    if result < 10001:
        # Anything below 10001 is treated as an invalid zip code.
        return np.nan
    if result == 12345:
        # NOTE(review): 12345 looks like a placeholder entry that the earlier
        # notebook decided to map to 10030 -- confirm against Top10.ipynb.
        return "10030"
    return str(result)
# Run every raw zip value through fix_zip, element by element, and store the result back.
df["Incident Zip"] = df["Incident Zip"].map(fix_zip)

# Data Analysis

In [None]:
# As before, keep only the requests filed under Columbia University's zip code, '10027'.
df_cu = df.loc[df["Incident Zip"].eq("10027")]

In [None]:
# Scan every distinct complaint type in the full DataFrame for the word "parking",
# to see whether illegal parking is recorded under more than one label.
import re
pattern = r".*[Pp]{1}arking.*"
# One complaint type per line: '.' does not match a newline, so each match stays within one type.
re.compile(pattern).findall("\n".join(df["Complaint Type"].unique()))

In [None]:
# 'Complaint Type' needs no cleanup, so count the illegal-parking requests and the
# total number of requests, first near Columbia and then in the full data set.
parking_cu = len(df_cu.loc[df_cu["Complaint Type"] == "Illegal Parking"])
total_cu = len(df_cu)
parking_nyc = len(df.loc[df["Complaint Type"] == "Illegal Parking"])
total_nyc = len(df)

# Sanity check: both counts should agree with the tallies from value_counts().
print(df_cu["Complaint Type"].value_counts()["Illegal Parking"] == parking_cu)
print(df["Complaint Type"].value_counts()["Illegal Parking"] == parking_nyc)

In [None]:
# Compute the share of illegal-parking requests near Columbia University and in NYC
# as a whole, then print both so they can be compared.
parking_proportion_cu, parking_proportion_nyc = (
    parking_cu / total_cu,
    parking_nyc / total_nyc,
)
print(f"The fraction of illegal parking incidents near Columbia University is\n{parking_proportion_cu}\n")
print(f"The fraction of illegal parking incidents in NYC is\n{parking_proportion_nyc}")

In [None]:
# Final answer: True when the illegal-parking share near Columbia exceeds the NYC-wide share.
higher_parking_proportion = parking_proportion_nyc < parking_proportion_cu