# Question 4: Is different race a factor influencing the case resolution time?
Hypothesis: different race would have a different case resolution time in the same neighborhood

# Data Munging

In [43]:
import pandas as pd 
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy import stats
import matplotlib.pyplot as plt

In [42]:
data = pd.read_excel('311_Cases_cleaned.xlsx')
data.head()

Unnamed: 0.1,Unnamed: 0,neighborhood,year,mobile.dumm,case,month,request,notes,photo.dumm,income,...,resolution_rate,CaseID,latitude,longitude,date,case_29,registered,turnout,turnout_one_lag,turnout_two_lag
0,1,Bayview,2012,1,Street and Sidewalk Cleaning,September,General Cleaning,Case Completed - resolved:,0,56718.0,...,1,1403458,37.735714,-122.390938,09/10/2012 06:50:38 AM,Street and Sidewalk Cleaning,19566,61.23,47.25,69.69
1,2,Bayview,2012,0,Street and Sidewalk Cleaning,December,Bulky Items,Case Completed - resolved:,0,56718.0,...,1,1744682,37.734825,-122.387383,12/01/2012 03:02:55 PM,Street and Sidewalk Cleaning,19566,61.23,47.25,69.69
2,3,Bayview,2012,1,Street and Sidewalk Cleaning,August,General Cleaning,Case Completed - resolved:,0,56718.0,...,1,1300261,37.73407,-122.384796,08/18/2012 07:20:42 PM,Street and Sidewalk Cleaning,19566,61.23,47.25,69.69
3,4,Bayview,2012,0,Street and Sidewalk Cleaning,October,General Cleaning,Case Completed - resolved:,0,56718.0,...,1,1616379,37.733616,-122.38723,10/30/2012 08:57:12 AM,Street and Sidewalk Cleaning,19566,61.23,47.25,69.69
4,5,Bayview,2012,0,Abandoned Vehicle,March,Abandoned Vehicle - SUV,DPT Abandoned Vehicles - Gone on Arrival,0,56718.0,...,0,1094069,37.730515,-122.387863,03/28/2012 05:50:27 PM,Abandoned Vehicle,19566,61.23,47.25,69.69


In [None]:
data = data.dropna()
data.head()

Unnamed: 0.1,Unnamed: 0,neighborhood,year,mobile.dumm,case,month,request,notes,photo.dumm,income,...,resolution_rate,CaseID,latitude,longitude,date,case_29,registered,turnout,turnout_one_lag,turnout_two_lag
0,1,Bayview,2012,1,Street and Sidewalk Cleaning,September,General Cleaning,Case Completed - resolved:,0,56718.0,...,1,1403458,37.735714,-122.390938,09/10/2012 06:50:38 AM,Street and Sidewalk Cleaning,19566,61.23,47.25,69.69
1,2,Bayview,2012,0,Street and Sidewalk Cleaning,December,Bulky Items,Case Completed - resolved:,0,56718.0,...,1,1744682,37.734825,-122.387383,12/01/2012 03:02:55 PM,Street and Sidewalk Cleaning,19566,61.23,47.25,69.69
2,3,Bayview,2012,1,Street and Sidewalk Cleaning,August,General Cleaning,Case Completed - resolved:,0,56718.0,...,1,1300261,37.73407,-122.384796,08/18/2012 07:20:42 PM,Street and Sidewalk Cleaning,19566,61.23,47.25,69.69
3,4,Bayview,2012,0,Street and Sidewalk Cleaning,October,General Cleaning,Case Completed - resolved:,0,56718.0,...,1,1616379,37.733616,-122.38723,10/30/2012 08:57:12 AM,Street and Sidewalk Cleaning,19566,61.23,47.25,69.69
4,5,Bayview,2012,0,Abandoned Vehicle,March,Abandoned Vehicle - SUV,DPT Abandoned Vehicles - Gone on Arrival,0,56718.0,...,0,1094069,37.730515,-122.387863,03/28/2012 05:50:27 PM,Abandoned Vehicle,19566,61.23,47.25,69.69


In [None]:
grouped_data = data.groupby('neighborhood').agg({
    'resolution_time': 'mean',  
    'num_cases': 'sum',
    'white': 'mean',          
    'black': 'mean',         
    'asian': 'mean',       
}).reset_index()  

In [None]:
grouped_data

Unnamed: 0,neighborhood,resolution_time,num_cases,white,black,asian
0,Bayview,645.132856,147735576,0.130083,0.260681,0.408174
1,Bernal Heights,1300.604011,125140302,0.582355,0.051077,0.171284
2,Chinatown,1541.666593,250740408,0.136574,0.008009,0.814989
3,Excelsior,704.772993,111074123,0.262627,0.021906,0.49418
4,Haight Ashbury,773.104064,34671037,0.793441,0.038801,0.091813
5,Inner Richmond,947.104992,186015244,0.537812,0.017385,0.367065
6,Inner Sunset,937.407031,59528911,0.564878,0.020142,0.325147
7,Marina,971.976605,37465423,0.843114,0.008231,0.099242
8,Mission,751.962209,5883890923,0.566238,0.030077,0.137568
9,Noe Valley,1139.779873,25059948,0.750441,0.025735,0.12868


# Model

In [None]:
X = grouped_data[['white', 'black', 'asian']]
y = grouped_data['resolution_time']

In [None]:
# Fit the linear model
model = LinearRegression()

linear_fit = model.fit(X,y)

In [None]:
coefficients = linear_fit.coef_
intercept = linear_fit.intercept_

predictions = linear_fit.predict(X)

MSE = np.mean((y - predictions)**2)
variance = MSE * (np.linalg.inv(np.dot(X.T, X)).diagonal())
standard_errors = np.sqrt(variance)
t_values = coefficients / standard_errors
p_values = [2 * (1 - stats.t.cdf(np.abs(i), (len(X) - 1))) for i in t_values]

In [None]:
results = pd.DataFrame({
    'Coefficients': coefficients,
    'Standard Errors': standard_errors,
    't values': t_values,
    'p values': p_values
}, index=['white', 'black', 'asian'])

In [None]:
results

Unnamed: 0,Coefficients,Standard Errors,t values,p values
white,3302.559826,231.615099,14.258828,1.337552e-11
black,5596.298518,1492.118149,3.750573,0.001354206
asian,2878.458237,369.738635,7.785116,2.505272e-07


The p-values are very low for all three racial demographic variables, which indicates that changes in the racial composition are associated with changes in resolution times.

Based on these results, we can conclude that there is a very strong statistical relationship between the racial demographics and the resolution time.