In [None]:
import pandas as pd 
import seaborn as sns
from matplotlib import pyplot as plt
import os
import geopandas as gpd
import sys
import scipy.stats as stats

In [None]:
# Locate the `data` folder relative to the directory the notebook runs from.
cwd = os.getcwd()
print(cwd)
dataPath = os.path.join(cwd, 'data')

# Read the crime/housing records and the zip-code reference table.
austin_housing, zip_codes = (
    pd.read_csv(os.path.join(dataPath, csv_name))
    for csv_name in ('crime-housing-austin-2015.csv', 'AustinZipCodes.csv')
)

In [None]:
# A notebook cell renders only its final expression, so the preview has to be
# displayed explicitly; a bare `head()` on a non-final line is computed and
# then thrown away. `display` is provided by the IPython kernel.
display(austin_housing.head())
austin_housing.columns

In [None]:
# A notebook cell renders only its final expression, so the column listing is
# displayed explicitly; the trailing `tail()` is rendered as the cell output.
# `display` is provided by the IPython kernel.
display(zip_codes.columns)
zip_codes.tail()

In [None]:
""" 
Karl's Spatial Analysis 
"""

""" 
Map of Austin TX  
https://openaustin.carto.com/u/oa-admin/tables/austin_area_zip_codes/public?redirected=true 
"""

df = austin_housing.copy()
# Date format: 01-Jan-15.  d=day, b=abbreviated month,y=2-digit year
# df['Report_Date'] = pd.to_datetime(df['Report_Date'], format='%d-%b-%y')

# Aggregate crimes by zip code: one row per zip code with the number of
# reported offenses in it, sorted from most to fewest crimes.
#
# The count is grouped by zip code ONLY. Grouping by (zip code, offense) and
# then keeping a single row per zip code would report the count of one
# arbitrary offense type instead of the zip code's total.
crime_zip = (
    df[['Zip_Code_Crime', 'Highest_Offense_Desc']]
    .rename(columns={'Zip_Code_Crime': 'Zip_Code'})
    # Rows missing either the zip code or the offense are not counted.
    .dropna()
    # Cast through int so the zip code is rendered as a plain integer string.
    .assign(Zip_Code=lambda frame: frame['Zip_Code'].astype('int').astype('str'))
    .groupby('Zip_Code')['Highest_Offense_Desc']
    .count()
    .rename('Total_Crime')
    .reset_index()
    .sort_values(by='Total_Crime', ascending=False)
    .reset_index(drop=True)
)
crime_zip['Total_Crime'] = crime_zip['Total_Crime'].astype('int')

# Keep only zip codes that appear in AustinZipCodes.csv, and report the ones
# that were dropped.
known_zip_codes = zip_codes['Zip Code'].astype('str')
is_known_zip = crime_zip['Zip_Code'].isin(known_zip_codes)
missing_zip_codes = crime_zip[~is_known_zip]
crime_zip = crime_zip[is_known_zip]
print(f'missing zip codes: {missing_zip_codes}')

# Read the zip-code polygons; the zip code is cast to a string so it can be
# joined against crime_zip['Zip_Code'].
austinMap = gpd.read_file(os.path.join(dataPath, 'austin_area_zip_codes.geojson'))
austinMap['zipcode'] = austinMap['zipcode'].astype('str')

# Attach the per-zip crime counts to the polygons. The left join keeps every
# polygon in austinMap, whether or not crime_zip has a row for it.
austinMapCrimeZip = austinMap.merge(
    crime_zip,
    how='left',
    left_on='zipcode',
    right_on='Zip_Code',
)

# Choropleth of Total_Crime per zip code.
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
austinMapCrimeZip.plot(
    column='Total_Crime',
    ax=ax,
    figsize=(10, 10),
    cmap='OrRd',
    linewidth=0.5,
    edgecolor='black',
    legend=True,
)
ax.set_title('Total Crime by ZipCode in Austin TX', fontsize=16)
plt.show()



"""
Make a new column, titled crime_per_1000, that is the number of crimes per 1000 people
"""
# Build crime_rate: one row per zip code with `crime_per_1000`, the number of
# reported crimes per 1,000 residents.
crime_rate = crime_zip[['Zip_Code', 'Total_Crime']].copy()
population_zip = pd.read_csv(os.path.join(dataPath, 'AustinZipCodes.csv'))
population_zip = population_zip[['Zip Code', 'Population']].copy()
population_zip['Zip Code'] = population_zip['Zip Code'].astype('str')

# Convert population into an int. The commas are stripped first, so the column
# is expected to hold text such as "12,345".
population_zip['Population'] = population_zip['Population'].str.replace(',', '')
population_zip['Population'] = population_zip['Population'].astype('int')

# Merge population_zip with crime_rate and derive the rate.
crime_rate = crime_rate.merge(population_zip, left_on='Zip_Code', right_on='Zip Code', how='left')
crime_rate['crime_per_1000'] = crime_rate['Total_Crime'] / (crime_rate['Population'] / 1000)

# Keep only the zip code and its rate, highest rate first. Rates that could
# not be computed (NaN) are reported as 0.
crime_rate = (
    crime_rate
    .sort_values(by=['crime_per_1000'], ascending=False)
    .drop(columns=['Total_Crime', 'Zip Code', 'Population'])
)
crime_rate['crime_per_1000'] = crime_rate['crime_per_1000'].fillna(0)
print(crime_rate.head(5))
# info() writes its report itself and returns None, so it must not be wrapped
# in print() (that would emit a stray "None" line).
crime_rate.info()

# Choropleth of crimes per 1,000 residents, drawn on the zip-code polygons.
# Analysis: How correlated is the number of crimes to the population of a zip code?
austinMapCrimeRate = austinMap.merge(
    crime_rate,
    how='left',
    left_on='zipcode',
    right_on='Zip_Code',
)

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
austinMapCrimeRate.plot(
    column='crime_per_1000',
    ax=ax,
    figsize=(10, 10),
    cmap='OrRd',
    linewidth=0.5,
    edgecolor='black',
    legend=True,
)
ax.set_title('Total Crime per 1000 residents in Austin TX', fontsize=16)
plt.show()


# Test whether the crime rate is the same in every zip code.
#
# crime_rate holds exactly one row per zip code, so a Kruskal-Wallis test
# grouped by zip code compares groups of a single observation each. In that
# case the H statistic is determined by the number of zip codes alone
# (H = N - 1 on N - 1 degrees of freedom when there are no ties), and the
# p-value says nothing about the data.
#
# Instead, use a chi-square goodness-of-fit test on the crime counts:
# under the null hypothesis of one common crime rate, each zip code's expected
# share of all crimes equals its share of the total population.
rate_test = crime_zip[['Zip_Code', 'Total_Crime']].merge(
    population_zip, left_on='Zip_Code', right_on='Zip Code', how='inner'
)
# A zip code with no residents has an expected count of 0, which the
# chi-square statistic cannot handle, so it is left out of the test.
rate_test = rate_test[rate_test['Population'] > 0]
assert len(rate_test) > 1, "need at least two zip codes with a known population"

observed = rate_test['Total_Crime'].to_numpy(dtype=float)
population = rate_test['Population'].to_numpy(dtype=float)
# Expected counts are scaled so they sum to the observed total, as
# stats.chisquare requires.
expected = observed.sum() * population / population.sum()
statistic, p_value = stats.chisquare(f_obs=observed, f_exp=expected)

# Output results
alpha = 0.05  # significance level used for the conclusion below
print(f"Chi-square statistic: {statistic}")
print(f"Degrees of freedom: {len(rate_test) - 1}")
print(f"P-value: {p_value}")

# The conclusion follows the computed p-value rather than being hard-coded.
if p_value < alpha:
    print("The chi-square test is significant. We reject the null hypothesis that the crime rates are the same across all zip codes.")
    print("Conclusion: The crime rates are significantly different across zip codes.")
else:
    print("The chi-square test is NOT significant. We fail to reject the null hypothesis that the crime rates are the same across all zip codes.")
    print("Conclusion: The crime rates are not significantly different across all zip codes.")


