# Evaluating The Significance In Offense Counts Among Zip Codes
### Ho: Crime offenses are equally likely among all zip codes
### Ha: Some zip codes are more prone to crime than others

In [1]:
import pandas as pd
import scipy.stats as stats

In [5]:
# Load the combined flood / crime / school dataset.
# pd.read_csv already returns a DataFrame, so it is bound to `data` directly
# instead of being re-wrapped with pd.DataFrame(...).
data = pd.read_csv("Resources/combined_flood_crime_and_school_data.csv")
# Show the available column names so the columns of interest can be picked below.
data.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Address', 'Latitude', 'Longitude',
       'Flood Description', 'Flood Zone', 'Flood Risk', 'SITE_ADDR_1',
       'ACCOUNT', 'SITE_ADDR_3', 'NEIGHBORHOOD_CODE', 'NEIGHBORHOOD_GROUP',
       'TOTAL_BUILDING_AREA', 'TOTAL_LAND_AREA', 'ACREAGE', 'LAND_VALUE_2019',
       'TOTAL_APPRAISED_VALUE_2019', 'TOTAL_MARKET_VALUE_2019',
       'LAND_VALUE_2018', 'TOTAL_APPRAISED_VALUE_2018',
       'TOTAL_MARKET_VALUE_2018', 'pct_change_land_value_2018_2019',
       'pct_change_total_appraised_value_2018_2019',
       'pct_change_total_market_value_2018_2019', 'SQ_FT', 'NEIGHBORHOOD',
       'Offense Count', 'crime_rank', 'Elementary Public School ID',
       'Elementary Public School Name', 'Elementary Public School Rating',
       'Elementary Charter School ID', 'Elementary Charter School Name',
       'Elementary Charter School Rating', 'Public Middle School ID',
       'Public Middle School Name', 'Public Middle School Rating',
       'Charter Middle School 

In [12]:
# Keep only the two columns used in this analysis:
# 'SITE_ADDR_3' (displayed below as values such as 77002.0) and 'Offense Count'.
columns_of_interest = ["SITE_ADDR_3", "Offense Count"]
data_2 = data.loc[:, columns_of_interest]
data_2.head()

Unnamed: 0,SITE_ADDR_3,Offense Count
0,77002.0,4870
1,77002.0,4870
2,77002.0,4870
3,77002.0,4870
4,77002.0,4870


In [15]:
# Collapse the rows to one per SITE_ADDR_3 value, taking the mean of the remaining column(s).
data_2 = data_2.groupby(by="SITE_ADDR_3").agg("mean")

In [16]:
# Preview the first five rows of the grouped frame.
data_2.head(n=5)

Unnamed: 0_level_0,Offense Count
SITE_ADDR_3,Unnamed: 1_level_1
77002.0,4870
77005.0,1026
77006.0,3759
77019.0,2330
77025.0,2270


In [17]:
# Under the null hypothesis every zip code has the same offense count, so the
# expected value for each row is the total offense count divided by the number of rows.
observed_counts = data_2['Offense Count']
data_2['Expected'] = observed_counts.sum() / len(observed_counts)
data_2

Unnamed: 0_level_0,Offense Count,Expected
SITE_ADDR_3,Unnamed: 1_level_1,Unnamed: 2_level_1
77002.0,4870,2539.777778
77005.0,1026,2539.777778
77006.0,3759,2539.777778
77019.0,2330,2539.777778
77025.0,2270,2539.777778
77027.0,2014,2539.777778
77030.0,1485,2539.777778
77054.0,3053,2539.777778
77098.0,2051,2539.777778


In [None]:
# Degrees of freedom for the test: number of rows (categories) in data_2 minus one.
df = data_2.shape[0] - 1

In [20]:
# A significance level of 0.05 corresponds to a confidence level of 0.95;
# the critical value is the chi-square quantile at that level for `df` degrees of freedom.
confidence_level = 0.95
critical_value = stats.chi2.ppf(confidence_level, df)
critical_value

15.50731305586545

In [22]:
# Chi-square test comparing the observed offense counts with the
# equal-count expectation stored in the 'Expected' column.
stats.chisquare(f_obs=data_2['Offense Count'], f_exp=data_2['Expected'])

Power_divergenceResult(statistic=4416.153119258028, pvalue=0.0)

### Because the chi-square statistic (4416.15) is far greater than the critical value (15.51), we reject the null hypothesis and conclude that the differences in offense counts among the zip codes are statistically significant.