In [None]:
import pandas as pd 
import seaborn as sns
from scipy import stats
from matplotlib import pyplot as plt
import os
import sys

In [None]:
# The data directory is resolved relative to the kernel's current working directory.
cwd = os.getcwd()
print(cwd)
dataPath = os.path.join(cwd, 'data')

# Name the two input files first, then read each one into a DataFrame.
housing_csv = os.path.join(dataPath, 'crime-housing-austin-2015.csv')
zip_codes_csv = os.path.join(dataPath, 'AustinZipCodes.csv')
austin_housing = pd.read_csv(housing_csv)
zip_codes = pd.read_csv(zip_codes_csv)

In [None]:
# A notebook cell only renders its last bare expression, so the preview has to be
# displayed explicitly; otherwise only the column index is shown.
display(austin_housing.head())
austin_housing.columns

In [None]:
# A notebook cell only renders its last bare expression, so the preview has to be
# displayed explicitly; otherwise only the column index is shown.
display(zip_codes.head())
zip_codes.columns

In [None]:
""" 
Karl's Analysis 
"""
import geopandas as gpd

df = austin_housing.copy()
# Report_Date is deliberately left as the raw string (format 01-Jan-15: day,
# abbreviated month, 2-digit year). The monthly analysis further down extracts
# the month by splitting that string on '-', so parsing it to datetime here
# would break that step.

# Aggregate crimes by zip code.
# Keep only reports that have both a zip code and an offense description.
crime_zip = df[['Zip_Code_Crime', 'Highest_Offense_Desc']].dropna().copy()

# Total_Crime is the number of reports for the row's (zip code, offense) pair.
# transform('count') broadcasts that number onto every row of the pair, so the
# frame keeps one row per report.
crime_zip['Total_Crime'] = (
    crime_zip
    .groupby(['Zip_Code_Crime', 'Highest_Offense_Desc'])['Highest_Offense_Desc']
    .transform('count')
)

# Cast through int64 before str so the zip code has no decimal part in its
# string form (presumably the column is read as float because of the missing
# values dropped above -- verify against the CSV).
crime_zip['Zip_Code_Crime'] = crime_zip['Zip_Code_Crime'].astype('int64').astype('str')

# Number of THEFT reports per zip code. Total_Crime was broadcast onto every
# report row by transform('count'), so after filtering each zip code appears
# once per theft report with the same total; drop_duplicates collapses those
# repeats into a single row per zip code.
is_theft = crime_zip['Highest_Offense_Desc'] == 'THEFT'
theft_by_zip = (
    crime_zip[is_theft]
    .reset_index(drop=True)
    .drop(columns=['Highest_Offense_Desc'])
    .drop_duplicates()
)

""" 
Map of Austin TX  
https://openaustin.carto.com/u/oa-admin/tables/austin_area_zip_codes/public?redirected=true 
"""
# Read the zip-code polygons. zipcode is cast to str so it can be matched
# against the string zip codes in theft_by_zip.
geojson_path = os.path.join(dataPath, 'austin_area_zip_codes.geojson')
austinMap = gpd.read_file(geojson_path)
austinMap['zipcode'] = austinMap['zipcode'].astype(str)

# Left merge: every polygon is kept; zip codes without a THEFT row get NaN.
austinMap = austinMap.merge(
    theft_by_zip,
    how='left',
    left_on='zipcode',
    right_on='Zip_Code_Crime',
)
# Save GEOJSON file to look at data
# austinMap.to_file(os.path.join(dataPath, 'austinMap.geojson'), driver='GeoJSON')

# Choropleth of THEFT counts per zip code.
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
austinMap.plot(
    ax=ax,
    column='Total_Crime',
    cmap='OrRd',
    linewidth=0.5,
    edgecolor='black',
    legend=True,
    # Zip codes with no THEFT row have NaN in Total_Crime after the left merge.
    # Without missing_kwds geopandas does not draw those polygons at all, which
    # leaves unexplained holes in the map; draw them in a neutral grey instead.
    missing_kwds={'color': 'lightgrey'},
)
ax.set_title('THEFT by ZipCode in Austin TX', fontsize=16)
plt.show()



In [None]:
""" 
Bethany's Analysis
"""
# Grab crimes and dates straight from the loaded table, so this cell does not
# depend on a working copy created inside another analysis cell.
crime_by_date = austin_housing[['Report_Date', 'Highest_Offense_Desc']].copy()
# Optional filters, currently disabled:
# crime_by_date = crime_by_date[crime_by_date.Highest_Offense_Desc != "THEFT"]
# crime_by_date = crime_by_date[crime_by_date.Highest_Offense_Desc != "BURGLARY OF VEHICLE"]
# crime_by_date = crime_by_date[crime_by_date.Highest_Offense_Desc != "THEFT BY SHOPLIFTING"]
# crime_by_date = crime_by_date[crime_by_date.Highest_Offense_Desc != "BURGLARY OF RESIDENCE"]

# Create the month column and define the calendar order of months.
# Report_Date is split on '-' and the second piece is taken as the month.
# The .str accessor propagates missing dates as NaN instead of raising, which a
# plain Python split inside apply() would do.
crime_by_date["Month"] = crime_by_date['Report_Date'].str.split('-').str[1]
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
crime_by_date["Month"] = pd.Categorical(crime_by_date['Month'], categories=month_order, ordered=True)

# Group by month and crime and count the reportings of each crime per month.
# observed=False is passed explicitly because the default for categorical
# groupers differs between pandas versions; with it, month/crime combinations
# that never occur are kept with a count of 0.
crime_by_month = (
    crime_by_date
    .groupby(["Month", "Highest_Offense_Desc"], observed=False)
    .agg(Total_Reportings=('Highest_Offense_Desc', 'count'))
    .reset_index()
)

# For each month, keep the row of the crime with the highest number of reportings.
crime_by_month_idx = crime_by_month.groupby("Month", observed=False)["Total_Reportings"].idxmax()
crime_of_the_month = crime_by_month.loc[crime_by_month_idx]

# crime_of_the_month

In [None]:
plt.figure(figsize=(10, 5))

# One bar per row of crime_of_the_month: height is the number of reportings,
# colour identifies the offense.
month_ax = sns.barplot(
    data=crime_of_the_month,
    x='Month',
    y='Total_Reportings',
    hue='Highest_Offense_Desc',
)
month_ax.set_title("Crime of the Month")

In [None]:
"""
Also Bethany's Analysis
"""
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Add a numeric month column to crime_by_month: the position of each row's month
# within month_order (Jan -> 0, ..., Dec -> 11).
month_positions = [month_order.index(label) for label in crime_by_month["Month"]]
crime_by_month["Month_Ordinal"] = pd.Series(month_positions, index=crime_by_month.index).astype(int)

# Sorted view of the same rows, earliest month first.
crime_by_month_sorted = crime_by_month.sort_values("Month_Ordinal")

display(crime_by_month_sorted)

# Identify the crimes that reached at least 10 reportings in at least one month.
# This is a per-month threshold on Total_Reportings, not a yearly total.
crimes_over_10 = crime_by_month[crime_by_month.Total_Reportings >= 10]["Highest_Offense_Desc"].unique()

# Pearson r results, keyed by crime description.
cor_results = {}

# One regplot per crime, two panels per row. The grid is sized from the number
# of crimes selected, so a different threshold or dataset can neither run past
# the last axis nor leave a large block of empty panels. The height per row
# matches the original 15-row, 100-inch layout.
n_cols = 2
n_rows = max(1, -(-len(crimes_over_10) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, n_rows * 100 / 15), squeeze=False)
axes = axes.flatten()
for i, crime in enumerate(crimes_over_10):
    crime_data = crime_by_month[crime_by_month['Highest_Offense_Desc'] == crime]
    sns.regplot(
        data=crime_data,
        x="Month_Ordinal",
        y="Total_Reportings",
        ax=axes[i],
        ci=None
    )
    axes[i].set_title(f'Reg Plot for {crime}')
    axes[i].set_xticks(range(len(month_order)))
    axes[i].set_xticklabels(month_order)

    # Store the Pearson r result (statistic, p-value) per crime. pearsonr needs
    # at least two points, so record NaN for a crime with fewer rows rather than
    # aborting the whole grid.
    if len(crime_data) < 2:
        cor_results[crime] = (float('nan'), float('nan'))
        continue
    cor = stats.pearsonr(crime_data.Month_Ordinal, crime_data.Total_Reportings)
    cor_results[crime] = cor

# Hide any panel left over when the number of crimes is odd (or zero).
for unused_ax in axes[len(crimes_over_10):]:
    unused_ax.set_visible(False)

# Turn the result dictionary into a DataFrame with one row per crime. from_dict
# with explicit columns also yields a well-formed empty frame when no crime
# passes the threshold.
result_df = pd.DataFrame.from_dict(cor_results, orient='index', columns=["statistic", "pvalue"])

In [None]:
# Pearson r statistic and p-value for every crime.
display(result_df)

# Only the crimes whose p-value is at most 0.05.
display(result_df.loc[result_df["pvalue"] <= 0.05])


In [None]:
"""
EXPERIMENTAL ZONE. STAY CLEAR UNTIL PARSED OUT.
"""
chosen_crime = "THEFT OF AUTO PARTS"
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Rows for the chosen crime only. The filter does not change between rotations,
# so it is applied once up front.
chosen_rows = crime_by_month[crime_by_month['Highest_Offense_Desc'] == chosen_crime]
if chosen_rows.empty:
    raise ValueError(f"No rows found in crime_by_month for crime {chosen_crime!r}")

# Fit the same linear trend over 12 different 12-month windows, each starting
# one month later than the previous one ("Feb to Jan" first, "Jan to Dec" last).
fig, axes = plt.subplots(4, 3, figsize=(20, 30))
axes = axes.flatten()
month_cor = {}
for i in range(len(month_order)):
    # Rotate by slicing: month_order itself is never mutated.
    shift = i + 1
    rotated = month_order[shift:] + month_order[:shift]
    position = {month: idx for idx, month in enumerate(rotated)}

    # The ordinal is computed on a local copy, so the shared crime_by_month
    # frame is not overwritten on every iteration.
    crime_data = chosen_rows.assign(
        Month_Ordinal=chosen_rows["Month"].astype(str).map(position).astype(int)
    )
    window = f"{rotated[0]} to {rotated[-1]}"

    sns.regplot(
        data=crime_data,
        x="Month_Ordinal",
        y="Total_Reportings",
        ax=axes[i],
        ci=None
    )
    axes[i].set_title(f'Reg Plot for {chosen_crime} {window}')
    axes[i].set_xticks(range(len(rotated)))
    axes[i].set_xticklabels(rotated)

    # Store the Pearson r result (statistic, p-value) per window.
    cor = stats.pearsonr(crime_data.Month_Ordinal, crime_data.Total_Reportings)
    month_cor[window] = cor


# One row per window.
results_df = pd.DataFrame.from_dict(month_cor, orient='index', columns=["statistic", "pvalue"])
results_df