In [37]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import ttest_ind
from scipy.stats import pearsonr

# Read the crime/housing CSV into the DataFrame that the later cells work from
file_path = 'crime-housing-austin-2015.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

## What is the relationship between housing affordability and crime rates?

In [None]:
# Columns holding the percent of owner units affordable to each occupation
affordability_cols = [
    'Ownerunitsaffordabletoaveragetechworker',
    'Ownerunitsaffordabletoaverageteacher',
]

# Change percentages to floats so that they can be graphed.
# The dtype guard makes this cell safe to re-run: once a column is numeric
# the `.str` accessor would raise, so already-converted columns are skipped.
for col in affordability_cols:
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].str.replace('%', '', regex=False).astype('float')

# Drop N/A values for the demographics we're interested in (plotting only).
# dropna() returns a new frame, so its result has to be bound to a name to
# have any effect. It is kept separate from `df` so that later cells keep
# working from the same rows as before.
affordability = df.dropna(subset=affordability_cols)

# Look at the distribution for housing affordability.
# kdeplot is the replacement for the deprecated distplot(..., hist=False).
sns.kdeplot(x=affordability['Ownerunitsaffordabletoaveragetechworker'], label='tech worker')
sns.kdeplot(x=affordability['Ownerunitsaffordabletoaverageteacher'], label='teacher')
plt.xlabel('Percent of Affordable Owner Units in the Area of the Crime')
plt.ylabel('Density')
plt.title('Housing Affordability')
plt.legend()
plt.show()

## Interesting Observations:

Most crimes are reported in places where the average tech worker can afford to own a house/unit. That's not terribly surprising, because the average tech worker can probably afford to own a house/unit in most places.

It's strange that there is a spike in places where less than 40% of owner units are affordable to the average tech worker.

There are a fair number of crimes reported in places where less than 40% of owner units are affordable to the average teacher.

There are relatively few crimes recorded for places where only about half of owner units are affordable to the average tech worker, and for places where only about half are affordable to the average teacher.

There is a spike in crimes recorded for places where about 70% of owner units are affordable to the average teacher.

In general, it looks like there are more crimes in more affordable (cheaper) housing areas.

In [None]:
# Create a column for how much time (in whole days) passes between a crime
# being reported and a crime being cleared
report_date = pd.to_datetime(df['Report_Date'], format='%d-%b-%y')
clearance_date = pd.to_datetime(df['Clearance_Date'], format='%d-%b-%y')
df['Clearance_Time'] = (clearance_date - report_date).dt.days
# Keep only crimes that have both a clearance time and a clearance status.
# Everything below, including the crime counts, is based on these rows only.
df = df.dropna(subset=['Clearance_Time', 'Clearance_Status'])

# Make a table for each demographic
# Areas where less than half of owner units are affordable to the average tech worker
lowAffordability = df[df['Ownerunitsaffordabletoaveragetechworker'] < 50]
# Areas where more than half of owner units are affordable to the average teacher
highAffordability = df[df['Ownerunitsaffordabletoaverageteacher'] > 50]

# Each group's share of the crimes that fall in either group (fractions in [0, 1])
totalCrimes = len(lowAffordability) + len(highAffordability)
lowAffordabilityCrimesPercent = len(lowAffordability) / totalCrimes
highAffordabilityCrimesPercent = len(highAffordability) / totalCrimes

# The shares are fractions, so format them as percentages to match their
# "Percent" labels; the gap between two shares is in percentage points.
display(f"Percent of low affordability crimes: {lowAffordabilityCrimesPercent:.1%}")
display(f"Percent of high affordability crimes: {highAffordabilityCrimesPercent:.1%}")
sharePointGap = (highAffordabilityCrimesPercent - lowAffordabilityCrimesPercent) * 100
display(f"There are {sharePointGap:.1f} percentage points more crimes recorded in our high-affordability housing areas than in our low-affordability housing areas")

It looks like there isn't a significant difference in the quantity of crimes recorded for very expensive areas vs very cheap areas. That's somewhat surprising.

## Is there a difference between low-affordability housing areas and high-affordability housing areas in the time it takes for a crime in those areas to be cleared?

In [None]:
# Make a table for each clearance status in low affordability areas
# (status codes: 'C' = cleared by arrest, 'O' = cleared by exception, 'N' = not cleared)
lowAffordabilityArrest = lowAffordability[lowAffordability['Clearance_Status'] == 'C']
lowAffordabilityException = lowAffordability[lowAffordability['Clearance_Status'] == 'O']
lowAffordabilityNot = lowAffordability[lowAffordability['Clearance_Status'] == 'N']

# Make a table for each clearance status in high affordability areas
highAffordabilityArrest = highAffordability[highAffordability['Clearance_Status'] == 'C']
highAffordabilityException = highAffordability[highAffordability['Clearance_Status'] == 'O']
highAffordabilityNot = highAffordability[highAffordability['Clearance_Status'] == 'N']

# One entry per clearance status: the phrase used in the text summaries, the
# wording used in the plot title, and the matching low/high subsets.
status_subsets = {
    'cleared by arrest': {
        'title': 'Cleared by Arrest',
        'low': lowAffordabilityArrest,
        'high': highAffordabilityArrest,
    },
    'cleared by exception': {
        'title': 'Cleared by Exception',
        'low': lowAffordabilityException,
        'high': highAffordabilityException,
    },
    'not cleared': {
        'title': 'Not Cleared',
        'low': lowAffordabilityNot,
        'high': highAffordabilityNot,
    },
}

# Take the mean of each clearance time by type, and housing affordability.
# The share of crimes per status is a fraction, so it is formatted as a
# percentage to match its "percent" label.
for heading, level, area in (
    ("Low Affordability areas:", 'low', lowAffordability),
    ("High Affordability areas:", 'high', highAffordability),
):
    display(heading)
    for phrase, subsets in status_subsets.items():
        subset = subsets[level]
        display(f"mean clearance time when {phrase}: {subset['Clearance_Time'].mean()}")
        display(f"percent of crimes {phrase}: {len(subset) / len(area):.1%}")

# Compare the clearance time of each kind of status between low and high
# affordability housing areas. kdeplot is the replacement for the deprecated
# distplot(..., hist=False).
for subsets in status_subsets.values():
    sns.kdeplot(x=subsets['low']['Clearance_Time'], label='low')
    sns.kdeplot(x=subsets['high']['Clearance_Time'], label='high')
    plt.xlabel('Clearance Time (Days)')
    plt.ylabel('Density')
    plt.title(f"Distribution of Clearance Time for Crimes {subsets['title']}")
    plt.legend()
    plt.show()

It seems that low-affordability housing areas (expensive housing) have lower clearance times than high-affordability housing areas (cheap housing). This could be because crimes being prosecuted in more expensive areas are given a higher priority, or because police departments in those areas have better funding. Let's run a t-test to see if this is statistically significant or not.

In [None]:
# Two-sample t-test of clearance time, high vs. low affordability, run once
# per clearance status (arrest, exception, not cleared)
for high_group, low_group in (
    (highAffordabilityArrest, lowAffordabilityArrest),
    (highAffordabilityException, lowAffordabilityException),
    (highAffordabilityNot, lowAffordabilityNot),
):
    status_ttest = stats.ttest_ind(high_group['Clearance_Time'], low_group['Clearance_Time'])
    display(status_ttest)

Now compare clearance time overall for each housing group:

In [None]:
# Density of clearance time for each housing group, regardless of status.
# kdeplot is the replacement for the deprecated distplot(..., hist=False).
sns.kdeplot(x=lowAffordability['Clearance_Time'], label='low')
sns.kdeplot(x=highAffordability['Clearance_Time'], label='high')
plt.xlabel('Clearance Time (Days)')
plt.ylabel('Density')
plt.title('Distribution of Clearance Time Overall')
plt.legend()
plt.show()

# Group means, followed by a two-sample t-test of high vs. low affordability
display(f"average clearance time for low affordability housing (in days): {lowAffordability['Clearance_Time'].mean()}")
display(f"average clearance time for high affordability housing (in days): {highAffordability['Clearance_Time'].mean()}")
display(stats.ttest_ind(highAffordability['Clearance_Time'], lowAffordability['Clearance_Time']))

The vast majority of crimes are not cleared. A minority of crimes are cleared by exception.

It takes less time for crimes to be cleared in areas with more expensive (low-affordability) housing.

# Is there a significant relationship between crime clearance rates and changes in median home values and median rent across districts in Austin?

In [None]:
def _first_percent(values):
    """Return the first non-missing entry of `values` as a float, without its '%' sign.

    Returns NaN when every entry is missing, rather than raising because a
    NaN first row has no ``.strip`` method.
    """
    present = values.dropna()
    if present.empty:
        return float('nan')
    return float(str(present.iloc[0]).strip('%'))


def _scatter_against_clearance(change_by_district, color, title, ylabel):
    """Scatter a per-district change series (y) against the district clearance rate (x)."""
    plt.figure(figsize=(8, 6))
    plt.scatter(crime_clearance_rate, change_by_district, color=color)
    plt.title(title)
    plt.xlabel('Crime Clearance Rate')
    plt.ylabel(ylabel)
    plt.show()


by_district = df.groupby('Council_District')

# Crime Clearance Rate vs. Change in Median Home Value (2000-2012)
# The clearance rate here is the share of a district's crimes with status 'C'.
crime_clearance_rate = by_district['Clearance_Status'].apply(lambda x: (x == 'C').mean())
# NOTE(review): only one row per district is used for the change columns.
# Presumably the value is constant within a district; if it varies (e.g. by
# zip code), the first row is an arbitrary pick -- confirm against the data.
median_home_value_change = by_district['Changeinmedianhomevalue2000-2012'].apply(_first_percent)

_scatter_against_clearance(
    median_home_value_change,
    color='green',
    title='Crime Clearance Rate vs. Change in Median Home Value (2000-2012)',
    ylabel='Change in Median Home Value (%)',
)

# Crime Clearance Rate vs. Change in Median Rent (2000-2012)
median_rent_change = by_district['Changeinmedianrent2000-2012'].apply(_first_percent)

_scatter_against_clearance(
    median_rent_change,
    color='blue',
    title='Crime Clearance Rate vs. Change in Median Rent (2000-2012)',
    ylabel='Change in Median Rent (%)',
)

The first scatter plot shows the crime clearance rates on the x-axis and the change in median home values on the y-axis for each district. The second scatter plot displays crime clearance rates on the x-axis and the change in median rent on the y-axis. Each point represents a district.

In [None]:
# Group districts by higher and lower clearance rates
median_clearance_rate = crime_clearance_rate.median()
high_clearance_districts = crime_clearance_rate[crime_clearance_rate >= median_clearance_rate].index
low_clearance_districts = crime_clearance_rate[crime_clearance_rate < median_clearance_rate].index

# Extract data for high and low clearance districts
high_clearance_home_value_change = median_home_value_change[high_clearance_districts]
low_clearance_home_value_change = median_home_value_change[low_clearance_districts]

high_clearance_rent_change = median_rent_change[high_clearance_districts]
low_clearance_rent_change = median_rent_change[low_clearance_districts]

t_stat_home_value, p_value_home_value = ttest_ind(high_clearance_home_value_change, low_clearance_home_value_change)
t_stat_rent, p_value_rent = ttest_ind(high_clearance_rent_change, low_clearance_rent_change)

print(f"T-test for Home Value Change: t-statistic = {t_stat_home_value}, p-value = {p_value_home_value}")
print(f"T-test for Rent Change: t-statistic = {t_stat_rent}, p-value = {p_value_rent}")


**T-Test for Home Value Change:**
* t-statistic: 0.8188
* p-value: 0.4366

**t-statistic:** A t-statistic of 0.8188 indicates that the difference in the mean change in median home values between districts with high and low crime clearance rates is small relative to the variability in the data. The t-statistic measures how far apart the two group means are in units of the standard error of their difference. A small t-value (close to 0) suggests that the two groups are not very different in terms of their means.

**p-value:** A p-value of 0.4366 is much higher than the commonly used threshold of 0.05. This means that, if there were truly no difference between the two groups, the probability of observing a difference at least this extreme would be around 43.66%. Since this p-value is high, we fail to reject the null hypothesis.

**Conclusion:**
No significant difference: The difference in home value changes between districts with high and low crime clearance rates is not statistically significant. In simpler terms, there is no strong evidence to suggest that higher crime clearance rates are associated with changes in median home values.

**T-Test for Rent Change:**
* t-statistic: 1.0180
* p-value: 0.3385

**t-statistic:** A t-statistic of 1.0180 is slightly higher than the t-statistic for home values but still small, indicating that the difference in the mean change in median rent between districts with high and low crime clearance rates is also small.

**p-value:** A p-value of 0.3385 is again higher than 0.05. This means that, if there were truly no difference between the two groups, the probability of observing a difference in rent change at least this extreme would be around 33.85%. Like the home value change, this p-value is too high to consider the result statistically significant.

**Conclusion:**
No significant difference: The difference in rent changes between districts with high and low crime clearance rates is also not statistically significant. This means there isn't sufficient evidence to conclude that higher crime clearance rates are associated with changes in median rents.

Both tests show that there is no statistically significant difference between the districts with high crime clearance rates and those with low clearance rates in terms of:

* The change in median home value.
* The change in median rent.

This means that, based on the data, we cannot conclude that crime clearance rates have a meaningful impact on changes in property values or rents. The differences that do exist between the two groups could have arisen by chance, as indicated by the high p-values in both tests.

In [None]:
# Perform Pearson correlation test for crime clearance rate vs. change in median home value
pearson_home_value_corr, p_value_home_value_corr = pearsonr(crime_clearance_rate, median_home_value_change)

# Perform Pearson correlation test for crime clearance rate vs. change in median rent
pearson_rent_corr, p_value_rent_corr = pearsonr(crime_clearance_rate, median_rent_change)

# Print the Pearson correlation coefficients and corresponding p-values
print(f"Pearson Correlation for Crime Clearance Rate vs. Change in Median Home Value: {pearson_home_value_corr}, p-value: {p_value_home_value_corr}")
print(f"Pearson Correlation for Crime Clearance Rate vs. Change in Median Rent: {pearson_rent_corr}, p-value: {p_value_rent_corr}")


**Crime Clearance Rate vs. Change in Median Home Value:**

**Pearson Correlation Coefficient (r):** 0.0267
This value is very close to zero, indicating almost no linear relationship between crime clearance rates and changes in median home values across districts.

**p-value:** 0.9417

This p-value is much higher than the commonly used threshold of 0.05, suggesting that the correlation is not statistically significant. In other words, the weak relationship observed could easily be due to random chance.

**Conclusion:** There is no significant correlation between crime clearance rates and changes in median home values. The data does not suggest that clearing more crimes has any noticeable effect on home value changes.

**Crime Clearance Rate vs. Change in Median Rent:**

**Pearson Correlation Coefficient (r):** 0.4093

This shows a moderate positive relationship between crime clearance rates and changes in median rent. However, the relationship is not particularly strong.

**p-value:** 0.2402

This p-value is higher than 0.05, indicating that the correlation is not statistically significant. The observed relationship could be due to chance, and there is no strong evidence to support a significant impact of clearance rates on rent changes.

**Conclusion:** While there is a moderate positive correlation between crime clearance rates and rent changes, the relationship is not statistically significant. This means the data does not provide strong evidence that higher crime clearance rates lead to greater increases in rent.

# How do crime clearance rates for violent crimes vary across districts in Austin, and what patterns can we observe regarding the likelihood of crimes being cleared by arrest, exception, or not cleared at all?

In [None]:
# Clearance Rates for Violent Crimes by District in Austin (2015)
violent_crimes_keywords = ['Robbery', 'Assault', 'Homicide', 'Murder']
violent_crimes_data = df[df['Highest_NIBRS_UCR_Offense_Description'].str.contains('|'.join(violent_crimes_keywords), case=False, na=False)]
clearance_rate_by_district = violent_crimes_data.groupby(['Council_District', 'Highest_NIBRS_UCR_Offense_Description'])['Clearance_Status'].value_counts(normalize=True).unstack().fillna(0)
clearance_rate_by_district.columns = ['Cleared by Arrest', 'Not Cleared', 'Cleared by Exception']

clearance_rate_by_district.plot(kind='bar', stacked=True, figsize=(14, 8), color=['green', 'red', 'orange'])
plt.title('Clearance Rates for Violent Crimes by District in Austin (2015)')
plt.xlabel('District and Crime Type')
plt.ylabel('Proportion of Cases')
plt.legend(title='Clearance Status')
plt.tight_layout()
plt.show()

The chart shows variability across districts in terms of how violent crimes are resolved. Some districts see more success in clearing crimes through arrests, while others struggle to clear cases or rely more on exceptional circumstances.

There is no district where all violent crimes are cleared by arrest, indicating that across the city, solving violent crimes is complex and challenging.