In [2]:
#Arrests vs Date & Housing Units - Brooklyn

# importing necessary modules

import os
import json
import requests
import pandas as pd
import numpy

#changing the working directory

#os.chdir() change the working directory if necessary

# JSON endpoints on NYC Open Data (data.cityofnewyork.us) that the data is
# scraped from; each URL carries `$limit=10000` in its query string.

url_arrests = 'https://data.cityofnewyork.us/resource/8h9b-rp9u.json?$limit=10000'
url_affhousing = 'https://data.cityofnewyork.us/resource/hg8x-zxpr.json?$limit=10000'

# pandas display settings: printed frames may show up to 300 rows and 300 columns

pd.options.display.max_rows = 300
pd.options.display.max_columns = 300

# outlining the questions for analysis

# Each "\n" argument is a deliberate blank-line spacer around the banner text.
print(
    "\n",
    "This is a program to evaluate the following questions about New York City using data from the last 2 weeks of December 2023",
    "\n",
    sep="\n",
)
print("".join([
    " * What is the relationship between affordable housing in different neighborhoods in Brooklyn and arrests in that neighborhood? \n",
    " * Does income affect crime and the types of crimes committed? \n",
    " * Which neighborhood has the most frequent arrests? \n",
    " * Is there a relationship between arrests and whether it is a holiday or a regular day? \n",
]))

# building a function to build a pd df from json urls

def url_to_df(url_string, timeout=30):
    """Download a JSON document and flatten it into a pandas DataFrame.

    Parameters
    ----------
    url_string : str
        URL whose response body is JSON.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``; seconds to wait for the server
        before giving up. Defaults to 30 so a stalled connection cannot hang
        the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The decoded payload after ``pd.json_normalize``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Without this check an
        error body would be normalised into a meaningless frame and fail
        later with a confusing KeyError.
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    response = requests.get(url_string, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing the error payload as data.
    response.raise_for_status()
    return pd.json_normalize(response.json())

def assign_neighborhood(zip):
    """Return the neighborhood name this analysis associates with a ZIP code.

    ``zip`` is compared with ``==`` against ten ZIP code strings, in order,
    and the name paired with the first match is returned. Any value that
    matches none of them (including the same code given as an int) yields
    ``None``.
    """
    neighborhood_table = (
        ('11201', 'Brooklyn Heights'),
        ('11211', 'South Williamsburg'),
        ('11238', 'Prospect Heights'),
        ('11215', 'South Slope'),
        ('11232', 'Sunset Park'),
        ('11214', 'Bensonhurst'),
        ('11208', 'East New York'),
        ('11234', 'Bergen Beach'),
        ('11212', 'Brownsville'),
        ('11235', 'Sheepshead Bay'),
    )
    # Linear scan with == (rather than a dict lookup) so unhashable inputs
    # still fall through to None instead of raising.
    for code, neighborhood in neighborhood_table:
        if zip == code:
            return neighborhood
    return None

# performing the url to df function for arrests

arrests_df = url_to_df(url_arrests)
print("Dataframe for Arrests in Brooklyn in the Last 2 Weeks of December 2023")
print("\n")

# filtering/cleaning arrests data

# Precinct -> ZIP code relabelling used to line arrests up with the housing
# data. A single mapping drives both the precinct filter and the relabelling,
# so the two lists can no longer drift apart.
precinct_to_zip = {
    "84": "11201", "94": "11211", "77": "11238", "78": "11215", "72": "11232",
    "62": "11214", "75": "11208", "63": "11234", "73": "11212", "61": "11235",
}

# 2023-12-17 through 2023-12-31 inclusive, as 'YYYY-MM-DD' strings (the same
# set of days the hand-written list covered, without the duplicated 12-25).
analysis_dates = pd.date_range('2023-12-17', '2023-12-31').strftime('%Y-%m-%d').tolist()

# Built as one chain of new frames: assigning columns onto a boolean-filtered
# frame (the previous approach) triggers pandas' SettingWithCopyWarning.
f_arrests_df = (
    arrests_df
    .loc[:, ['arrest_date', 'ofns_desc', 'arrest_boro', 'arrest_precinct']]
    .loc[lambda frame: frame['arrest_boro'] == 'K']
    .loc[lambda frame: frame['arrest_precinct'].isin(list(precinct_to_zip))]
    .assign(
        # every remaining precinct is a key of the mapping, so map() is total here
        arrest_precinct=lambda frame: frame['arrest_precinct'].map(precinct_to_zip),
        # keep only the leading 10 characters (the 'YYYY-MM-DD' part) of the date value
        arrest_date=lambda frame: frame['arrest_date'].str.slice(0, 10),
    )
    .loc[lambda frame: frame['arrest_date'].isin(analysis_dates)]
    .dropna()
    .reset_index(drop=True)
    .rename(columns={'arrest_precinct': 'zipcode'})
    .drop(columns=['arrest_boro'])
    .assign(neighborhood=lambda frame: frame['zipcode'].apply(assign_neighborhood))
)
print(f_arrests_df)

# saving filtered arrests dataframe to csv file

f_arrests_df.to_csv('Filtered Arrests Data', index=False)

# listing neighborhoods based on zipcode: one row per distinct zipcode found
# in the filtered arrests data (rows keep the index of their first occurrence)

# "\n" arguments are blank-line spacers around the heading
print("\n", "Neighborhood by Zipcode", sep="\n")

zip_hood = (
    f_arrests_df[['zipcode']]
    .drop_duplicates()
    .assign(neighborhood=lambda frame: frame['zipcode'].apply(assign_neighborhood))
)
print("\n", zip_hood, sep="\n")

# analysing f_arrests_df: how often each offence, zipcode and date occurs.
# The "\n" arguments below are blank-line spacers around each heading.

print("\n", "The Frequency of Each Crime:", "\n", sep="\n")
crime_count = f_arrests_df['ofns_desc'].value_counts()
print(crime_count)

print("\n", "The Frequency of Arrests per Zipcode:", "\n", sep="\n")
crime_district_freq = f_arrests_df['zipcode'].value_counts()
print(crime_district_freq)

print("\n", "The Frequency of Arrests per Date:", "\n", sep="\n")
crime_date_freq = f_arrests_df['arrest_date'].value_counts()
print(crime_date_freq)

# summarising f_arrests_df with describe()

print("\n", "Overall Description of Arrests Data:", "\n", sep="\n")
f_arrests_desc = f_arrests_df.describe()
print(f_arrests_desc, "\n", sep="\n")

# downloading the affordable housing data with url_to_df

affhousing_df = url_to_df(url_affhousing)
# "\n" arguments are blank-line spacers around the heading
print("\n", "Dataframe for Affordable Housing Across 10 Zipcodes in Brooklyn:", "\n", sep="\n")

# filtering/cleaning housing data: keep the unit-count columns for Brooklyn
# rows in the ten zipcodes of interest, drop incomplete rows, then label each
# row with its neighborhood

f_affhousing_df = (
    affhousing_df
    .loc[:, [
        'borough', 'postcode',
        'extremely_low_income_units', 'very_low_income_units',
        'low_income_units', 'moderate_income_units',
        'middle_income_units', 'other_income_units',
    ]]
    .loc[lambda frame: frame['borough'] == 'Brooklyn']
    .loc[lambda frame: frame['postcode'].isin([
        "11201", "11211", "11238", "11215", "11232",
        "11214", "11208", "11234", "11212", "11235",
    ])]
    .dropna()
    .reset_index(drop=True)
    .rename(columns={'postcode': 'zipcode'})
    .drop(columns=['borough'])
    .assign(neighborhood=lambda frame: frame['zipcode'].apply(assign_neighborhood))
)
print(f_affhousing_df)

# saving filtered affordable housing dataframe to csv file

f_affhousing_df.to_csv('Filtered Affordable Housing Data', index=False)

# analysing f_affhousing_df: project counts and unit totals per zipcode.
# The "\n" arguments below are blank-line spacers around headings and tables.

print("\n", "The Frequency of Affordable Housing Projects Built per Zipcode:", "\n", sep="\n")
project_count_zip = f_affhousing_df['zipcode'].value_counts()
print(project_count_zip)

print("\n", "The Frequency of Different Income Level Housing Projects per Zipcode:", "\n", sep="\n")

# Each section converts its unit column to int, totals it per zipcode and
# lists the zipcodes from most to fewest units.

print("Number of Extremely Low Income Units per Zipcode:", "\n", sep="\n")
f_affhousing_df['extremely_low_income_units'] = f_affhousing_df['extremely_low_income_units'].astype(int)
ext_low_zip_counts = (
    f_affhousing_df
    .groupby('zipcode')['extremely_low_income_units']
    .sum()
    .sort_values(ascending=False)
)
print(ext_low_zip_counts, "\n", sep="\n")

print("Number of Low Income Units per Zipcode:", "\n", sep="\n")
f_affhousing_df['low_income_units'] = f_affhousing_df['low_income_units'].astype(int)
low_zip_counts = (
    f_affhousing_df
    .groupby('zipcode')['low_income_units']
    .sum()
    .sort_values(ascending=False)
)
print(low_zip_counts, "\n", sep="\n")

print("Number of Middle Income Units per Zipcode:", "\n", sep="\n")
f_affhousing_df['middle_income_units'] = f_affhousing_df['middle_income_units'].astype(int)
mid_zip_counts = (
    f_affhousing_df
    .groupby('zipcode')['middle_income_units']
    .sum()
    .sort_values(ascending=False)
)
print(mid_zip_counts, "\n", sep="\n")

# resetting the previous filtering of housing data: rebuild f_affhousing_df
# from the downloaded frame, which discards the int conversions made above
# and leaves out the neighborhood column

f_affhousing_df = (
    affhousing_df
    .loc[:, [
        'borough', 'postcode',
        'extremely_low_income_units', 'very_low_income_units',
        'low_income_units', 'moderate_income_units',
        'middle_income_units', 'other_income_units',
    ]]
    .loc[lambda frame: frame['borough'] == 'Brooklyn']
    .loc[lambda frame: frame['postcode'].isin([
        "11201", "11211", "11238", "11215", "11232",
        "11214", "11208", "11234", "11212", "11235",
    ])]
    .dropna()
    .reset_index(drop=True)
    .rename(columns={'postcode': 'zipcode'})
    .drop(columns=['borough'])
)

# summarising f_affhousing_df with describe()
# ("\n" arguments are blank-line spacers around the heading and table)

print("Overall Description of Housing Data:", "\n", sep="\n")
f_affhousing_desc = f_affhousing_df.describe()
print(f_affhousing_desc, "\n", sep="\n")

#merging dfs to analyze relationship between zipcodes, neighborhoods, arrests, and low income units

print("Merged Dataframe of Zip Codes, Arrests, and Housing Units:")
print("\n")

# value_counts() labels its result differently across pandas versions
# (from 2.0: name 'count' with the index named after the column; before that:
# the column's name with an unnamed index). Naming both explicitly keeps the
# merge key and the 'arrest_counts' column correct on either.
arrest_counts = (
    crime_district_freq
    .rename_axis('zipcode')
    .rename('arrest_counts')
    .reset_index()
)

# Outer joins keep a zipcode even when one of the tables has no row for it;
# the missing side shows up as NaN.
merged_df = zip_hood.merge(arrest_counts, on='zipcode', how='outer')
for unit_counts in (ext_low_zip_counts, low_zip_counts, mid_zip_counts):
    # each groupby('zipcode') result becomes a two-column frame: zipcode + unit total
    merged_df = merged_df.merge(unit_counts.reset_index(), on='zipcode', how='outer')

merged_df = (
    merged_df
    .sort_values(by='arrest_counts', ascending=False)
    .reset_index(drop=True)
)
print(merged_df)
print("\n")

#correlation between arrest counts and income units

def correlation_strength(corr_value):
    """Describe the strength of a correlation coefficient by its magnitude.

    Parameters
    ----------
    corr_value : float
        Correlation coefficient; only its absolute value is used.

    Returns
    -------
    str
        'Very Weak Correlation'   for |r| in [0.00, 0.20)
        'Weak Correlation'        for |r| in [0.20, 0.40)
        'Moderate Correlation'    for |r| in [0.40, 0.60)
        'Strong Correlation'      for |r| in [0.60, 0.80)
        'Very Strong Correlation' for |r| >= 0.80
        'Correlation Undefined (not enough data)' when the value is NaN, so
        an undefined result is reported instead of silently printing nothing.
    """
    if pd.isna(corr_value):
        return 'Correlation Undefined (not enough data)'
    magnitude = abs(corr_value)
    for upper_bound, label in (
        (0.20, 'Very Weak Correlation'),
        (0.40, 'Weak Correlation'),
        (0.60, 'Moderate Correlation'),
        (0.80, 'Strong Correlation'),
    ):
        if magnitude < upper_bound:
            return label
    return 'Very Strong Correlation'


print("Correlation Between Arrest Counts and Extremely Low Income Units per Zipcode:")
print("\n")
arrest_ext_corr = merged_df['arrest_counts'].corr(merged_df['extremely_low_income_units'])
print(arrest_ext_corr)
abs_arrest_ext_corr = abs(arrest_ext_corr)
print(correlation_strength(arrest_ext_corr))

print("\n")
print("Correlation Between Arrest Counts and Low Income Units per Zipcode:")
print("\n")
arrest_low_corr = merged_df['arrest_counts'].corr(merged_df['low_income_units'])
print(arrest_low_corr)
abs_arrest_low_corr = abs(arrest_low_corr)
print(correlation_strength(arrest_low_corr))

print("\n")
print("Correlation Between Arrest Counts and Middle Income Units per Zipcode:")
print("\n")
arrest_mid_corr = merged_df['arrest_counts'].corr(merged_df['middle_income_units'])
print(arrest_mid_corr)
abs_arrest_mid_corr = abs(arrest_mid_corr)
print(correlation_strength(arrest_mid_corr))

print("\n")
print("End of Program.")

print("\n")




This is a program to evaluate the following questions about New York City using data from the last 2 weeks of December 2023


 * What is the relationship between affordable housing in different neighborhoods in Brooklyn and arrests in that neighborhood? 
 * Does income affect crime and the types of crimes committed? 
 * Which neighborhood has the most frequent arrests? 
 * Is there a relationship between arrests and whether it is a holiday or a regular day? 

Dataframe for Arrests in Brooklyn in the Last 2 Weeks of December 2023


    arrest_date                     ofns_desc zipcode      neighborhood
0    2023-12-31                FELONY ASSAULT   11238  Prospect Heights
1    2023-12-31                 PETIT LARCENY   11201  Brooklyn Heights
2    2023-12-31  ASSAULT 3 & RELATED OFFENSES   11208     East New York
3    2023-12-31                 GRAND LARCENY   11201  Brooklyn Heights
4    2023-12-31               DANGEROUS DRUGS   11212       Brownsville
..          ...              