In [13]:
# libraries
import numpy as np
import pandas as pd
import geopandas as gpd
import scipy.stats as stats 
import pickle

In [None]:
# Read the pickled nypd_geo object (per the original note: stops and CT areas).
# NOTE: pickle.load can execute arbitrary code, so only load files that this
# project produced itself.
input_filepath = './data/processed/geo/nypd_geo.pkl'

# Binary read mode is required for pickle data
with open(input_filepath, mode='rb') as pkl_file:
    nypd_geo = pickle.load(pkl_file)


In [None]:
# Read the pickled census numeric data.
# NOTE: pickle.load can execute arbitrary code, so only load files that this
# project produced itself.
input_filepath = './data/processed/numeric/census.pkl'

# Binary read mode is required for pickle data
with open(input_filepath, mode='rb') as pkl_file:
    census = pickle.load(pkl_file)

In [None]:
# Read the pickled NYPD numeric summary data (by race, per the file name).
# NOTE: pickle.load can execute arbitrary code, so only load files that this
# project produced itself.
input_filepath = './data/processed/numeric/nypd_byrace.pkl'

# Binary read mode is required for pickle data
with open(input_filepath, mode='rb') as pkl_file:
    nypd = pickle.load(pkl_file)

### Chi-Squared Goodness-of-Fit Tests on PROPORTIONS

In [17]:
# Build the census frame used in the chi-squared comparison: discard the
# 'Total' and 'BoroName' columns, then arrange the remaining columns in the
# same order as nypd. reindex fills any column that census lacks with NaN.
column_order = nypd.columns.tolist()
censusX2 = (
    census
    .drop(columns=['Total', 'BoroName'])
    .reindex(columns=column_order)
)

In [18]:
# Tract identifiers must be unique in both frames before they are used as the
# row index for tract-by-tract matching. Both checks read the 'BoroCT2020'
# column: at this point the identifier has not been moved into the index yet,
# so inspecting censusX2.index would test the frame's current row labels
# instead of the tract ids that the message below reports on.
nypd_duplicates = nypd['BoroCT2020'].duplicated().any()
censusX2_duplicates = censusX2['BoroCT2020'].duplicated().any()

if nypd_duplicates or censusX2_duplicates:
    print("Duplicate values found in 'BoroCT2020' column.")
else:
    print("No duplicate values found in 'BoroCT2020' column.")

No duplicate values found in 'BoroCT2020' column.


In [19]:
# Promote the tract identifier column to the row index of both frames so that
# rows can be looked up by tract number.
nypd = nypd.set_index(keys='BoroCT2020')
censusX2 = censusX2.set_index(keys='BoroCT2020')

In [20]:
# Category labels for the goodness-of-fit test; the number of labels is used
# to derive the test's degrees of freedom.
categories = [
    'Black',
    'white',
    'Mixed Race',
    'Asian',
    'American Indigenous',
    'Other',
]

# Accumulator for the per-tract test results (one dict per census tract)
results = list()


In [None]:
# Run one chi-squared goodness-of-fit test per census tract, comparing the
# distribution of the nypd row (observed) with the censusX2 row (expected)
# for the same tract.

# Start from an empty list so that re-running this cell does not append a
# second copy of every tract to the results.
results = []

# The degrees of freedom and the 5% critical value are the same for every
# tract, so they are computed once. chi2.ppf returns the exact 95th percentile
# of the chi-squared distribution; the earlier Monte Carlo estimate drew
# 100,000 unseeded random samples per tract and varied from run to run.
dof = len(categories) - 1
critical_chi2 = stats.chi2.ppf(0.95, dof)

for tract in nypd.index:
    # Label-based lookup of the tract's row in each frame; [0] keeps the first
    # matching row. A tract that is absent from censusX2 raises a KeyError
    # that names the missing tract.
    observed_counts = nypd.loc[[tract]].to_numpy()[0]
    expected_counts = censusX2.loc[[tract]].to_numpy()[0]

    # Convert both rows to proportions of their own totals. A row whose total
    # is zero produces NaN proportions and therefore a NaN test result.
    observed_prop = observed_counts / np.sum(observed_counts)
    expected_prop = expected_counts / np.sum(expected_counts)

    # NOTE(review): scipy.stats.chisquare is defined for observed/expected
    # frequencies (counts). Feeding it proportions divides the statistic by
    # the observed row total, which inflates the p-value. Confirm this is
    # intended; the count-based equivalent would pass observed_counts and
    # expected_prop * observed_counts.sum().
    chi2_stat, p_val = stats.chisquare(f_obs=observed_prop,
                                       f_exp=expected_prop)

    # Record the outcome for this tract
    results.append({
        'BoroCT2020': tract,
        'p-value': p_val,
        'chi2_statistic': chi2_stat,
        'critical_chi2_value': critical_chi2
    })

In [22]:
# Assemble the per-tract result dicts into a single table (one row per dict)
chi2_results = pd.DataFrame(data=results)

In [25]:
# Keep only the tracts whose p-value falls below the 0.05 significance level
is_significant = chi2_results['p-value'].lt(0.05)
tracts_significant = chi2_results.loc[is_significant]

In [None]:
# Export the significant tracts as CSV, leaving out the row index
tracts_significant.to_csv(
    path_or_buf='./data/interim/significant-tracts.csv',
    index=False,
)

In [None]:
# Destination for the pickled copy of the significant-tract table
output_filepath = './data/interim/significant-tract.pkl'

# Serialize the DataFrame to disk; pickle requires binary write mode
with open(output_filepath, mode='wb') as pkl_file:
    pickle.dump(tracts_significant, pkl_file)