## 1. Import a 311 NYC service request.

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
dataframe = pd.read_csv(r'C:\Users\jlod9\OneDrive\Desktop\AI__Projects\Data_Science_with_Python\Projects\311_Service_Requests_from_2010_to_Present.csv')

In [3]:
dataframe.head()

Unnamed: 0,Unique Key,Created Date,Closed Date,Agency,Agency Name,Complaint Type,Descriptor,Location Type,Incident Zip,Incident Address,...,Bridge Highway Name,Bridge Highway Direction,Road Ramp,Bridge Highway Segment,Garage Lot Name,Ferry Direction,Ferry Terminal Name,Latitude,Longitude,Location
0,32310363,12/31/2015 11:59:45 PM,01-01-16 0:55,NYPD,New York City Police Department,Noise - Street/Sidewalk,Loud Music/Party,Street/Sidewalk,10034.0,71 VERMILYEA AVENUE,...,,,,,,,,40.865682,-73.923501,"(40.86568153633767, -73.92350095571744)"
1,32309934,12/31/2015 11:59:44 PM,01-01-16 1:26,NYPD,New York City Police Department,Blocked Driveway,No Access,Street/Sidewalk,11105.0,27-07 23 AVENUE,...,,,,,,,,40.775945,-73.915094,"(40.775945312321085, -73.91509393898605)"
2,32309159,12/31/2015 11:59:29 PM,01-01-16 4:51,NYPD,New York City Police Department,Blocked Driveway,No Access,Street/Sidewalk,10458.0,2897 VALENTINE AVENUE,...,,,,,,,,40.870325,-73.888525,"(40.870324522111424, -73.88852464418646)"
3,32305098,12/31/2015 11:57:46 PM,01-01-16 7:43,NYPD,New York City Police Department,Illegal Parking,Commercial Overnight Parking,Street/Sidewalk,10461.0,2940 BAISLEY AVENUE,...,,,,,,,,40.835994,-73.828379,"(40.83599404683083, -73.82837939584206)"
4,32306529,12/31/2015 11:56:58 PM,01-01-16 3:24,NYPD,New York City Police Department,Illegal Parking,Blocked Sidewalk,Street/Sidewalk,11373.0,87-14 57 ROAD,...,,,,,,,,40.73306,-73.87417,"(40.733059618956815, -73.87416975810375)"


In [4]:
dataframe.shape

(300698, 53)

In [5]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300698 entries, 0 to 300697
Data columns (total 53 columns):
Unique Key                        300698 non-null int64
Created Date                      300698 non-null object
Closed Date                       298534 non-null object
Agency                            300698 non-null object
Agency Name                       300698 non-null object
Complaint Type                    300698 non-null object
Descriptor                        294784 non-null object
Location Type                     300567 non-null object
Incident Zip                      298083 non-null float64
Incident Address                  256288 non-null object
Street Name                       256288 non-null object
Cross Street 1                    251419 non-null object
Cross Street 2                    250919 non-null object
Intersection Street 1             43858 non-null object
Intersection Street 2             43362 non-null object
Address Type                      29

### 2. Read or convert the columns ‘Created Date’ and Closed Date’ to datetime datatype and create a new column ‘Request_Closing_Time’ as the time elapsed between request creation and request closing.

#### explore package/model datetime

In [6]:
dataframe['Created Date'] = pd.to_datetime(dataframe['Created Date'])

In [7]:
dataframe['Closed Date'] = pd.to_datetime(dataframe['Closed Date'])

In [8]:
dataframe['Request_Closing_Time'] = dataframe['Closed Date'] - dataframe['Created Date']

In [9]:
dataframe['Request_Closing_Time'] = dataframe['Request_Closing_Time'].dt.seconds

### 5. Statistical Analysis - Whether the average response time across complaint types is similar or not (overall)

Define the Hypothesis:
- H0:  average response time across complaints are equal 
- HA:  average response time across complaints are not equal
---
- alpha = 1% or CI = 99%
---
- z (samples > 30) or t (samples < 30)
---
- p-value -> It is the probability that the Null Hypothesis is true given the data we have.
---
- if p value is > 0.005 -> Bulk region (Fail to Reject Null Hypothesis)
- if p value is < 0.005 -> critical region (accept alternate)

#### We are dealing with mulitple means so we will do an analysis of variance using the F-statistic, had it only been comparing two means we would use t-test

In [10]:
complaint_response_time = dataframe[['Unique Key', 'Complaint Type','Request_Closing_Time']].groupby(
    by=['Complaint Type', 'Request_Closing_Time'])
complaint_response_time_mean = complaint_response_time.mean()

In [11]:
complaint_response_time_mean

Unnamed: 0_level_0,Unnamed: 1_level_0,Unique Key
Complaint Type,Request_Closing_Time,Unnamed: 2_level_1
Agency Issues,4073.0,30675430.0
Agency Issues,9736.0,30928156.0
Agency Issues,10628.0,30460208.0
Agency Issues,24686.0,30709273.0
Agency Issues,27120.0,31059332.0
...,...,...
Vending,83160.0,31917905.0
Vending,83540.0,30736759.0
Vending,84183.0,30736567.0
Vending,85640.0,30611104.0


In [12]:
dataframe['Request_Closing_Time'] = dataframe['Request_Closing_Time'].fillna(0)

In [13]:
import scipy.stats as stats

Fstatitic, pvalue = stats.f_oneway(dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Noise - Street/Sidewalk'],
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Blocked Driveway'],
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Illegal Parking'],        
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Derelict Vehicle'],
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Noise - Commercial'],
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Noise - House of Worship'],             
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Posting Advertisement'],
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Noise - Vehicle'],
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Animal Abuse'],        
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Vending'],
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Traffic'],
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Drinking'],
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Bike/Roller/Skate Chronic'],
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Panhandling'],
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Noise - Park'],        
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Homeless Encampment'],
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Urinating in Public'],
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Graffiti'],
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Disorderly Youth'],
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Illegal Fireworks'],
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Ferry Complaint'],
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Agency Issues'],
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Squeegee'],
                   dataframe['Request_Closing_Time'][dataframe['Complaint Type'] == 'Animal in a Park'],
                  )

In [14]:
print('The Fstatistic is', round(Fstatitic,4), '\nThe p-value is', pvalue)

The Fstatistic is 426.6598 
The p-value is 0.0


In [15]:
print(pvalue,'< 0.005 so we reject the Null Hypothesis and accept the Alternative Hypothesis.\nWe can say with 99% confidence that average response time across complaints are not equal')

0.0 < 0.005 so we reject the Null Hypothesis and accept the Alternative Hypothesis.
We can say with 99% confidence that average response time across complaints are not equal


### 5. Statistical Analysis - Are the type of complaint or service requested and location related?

#### - HO: There is no relation between type of complaint or service requested and location. (no association),  p-value > 0.05
#### - HA: There is a relation between type of complaint or service requested and location. (association), p-value < 0.05

In [16]:
com_by_loc = dataframe[['Unique Key','Location Type','Complaint Type']].groupby(by=['Location Type','Complaint Type']).count()

In [17]:
com_by_loc = com_by_loc.unstack()

In [18]:
com_by_loc = com_by_loc.fillna(0)
com_by_loc

Unnamed: 0_level_0,Unique Key,Unique Key,Unique Key,Unique Key,Unique Key,Unique Key,Unique Key,Unique Key,Unique Key,Unique Key,Unique Key,Unique Key,Unique Key,Unique Key,Unique Key,Unique Key,Unique Key,Unique Key,Unique Key,Unique Key,Unique Key
Complaint Type,Homeless Encampment,Drinking,Noise - Commercial,Urinating in Public,Animal Abuse,Ferry Complaint,Derelict Vehicle,Traffic,Noise - House of Worship,Animal in a Park,...,Vending,Posting Advertisement,Bike/Roller/Skate Chronic,Disorderly Youth,Graffiti,Blocked Driveway,Illegal Parking,Noise - Street/Sidewalk,Noise - Vehicle,Squeegee
Location Type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Bridge,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Club/Bar/Restaurant,0.0,366.0,16973.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Commercial,0.0,0.0,0.0,0.0,62.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ferry,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Highway,15.0,0.0,0.0,0.0,0.0,0.0,14.0,186.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
House and Store,0.0,0.0,0.0,0.0,93.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
House of Worship,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,929.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Park/Playground,353.0,98.0,0.0,38.0,123.0,0.0,0.0,0.0,0.0,0.0,...,106.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Parking Lot,0.0,0.0,0.0,0.0,110.0,0.0,0.0,0.0,0.0,0.0,...,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
com_by_loc.shape

(18, 23)

In [20]:
com_by_loc.columns

MultiIndex([('Unique Key',       'Homeless Encampment'),
            ('Unique Key',                  'Drinking'),
            ('Unique Key',        'Noise - Commercial'),
            ('Unique Key',       'Urinating in Public'),
            ('Unique Key',              'Animal Abuse'),
            ('Unique Key',           'Ferry Complaint'),
            ('Unique Key',          'Derelict Vehicle'),
            ('Unique Key',                   'Traffic'),
            ('Unique Key',  'Noise - House of Worship'),
            ('Unique Key',          'Animal in a Park'),
            ('Unique Key',         'Illegal Fireworks'),
            ('Unique Key',              'Noise - Park'),
            ('Unique Key',               'Panhandling'),
            ('Unique Key',                   'Vending'),
            ('Unique Key',     'Posting Advertisement'),
            ('Unique Key', 'Bike/Roller/Skate Chronic'),
            ('Unique Key',          'Disorderly Youth'),
            ('Unique Key',     

In [21]:
import scipy 
from scipy.stats import chi2_contingency

In [22]:
chi2, pvalue, dof, Exp_val_table = scipy.stats.chi2_contingency(com_by_loc)  ## we create an array that contains the expacted value of each occurence 
Exp_val_table

array([[2.93245766e-02, 8.51058167e-03, 2.36692651e-01, 3.93256745e-03,
        5.17355531e-02, 1.33081809e-05, 1.17843942e-01, 2.98968283e-02,
        6.18165001e-03, 6.65409044e-06, 1.11788719e-03, 2.68891795e-02,
        2.04280576e-03, 2.52855437e-02, 4.31850469e-03, 2.84129662e-03,
        1.90306986e-03, 7.51912219e-04, 5.12411542e-01, 5.01226016e-01,
        3.23395449e-01, 1.13651865e-01, 2.66163617e-05],
       [2.54537324e+02, 7.38718489e+01, 2.05449221e+03, 3.41346854e+01,
        4.49064601e+02, 1.15515010e-01, 1.02288541e+03, 2.59504470e+02,
        5.36567221e+01, 5.77575050e-02, 9.70326084e+00, 2.33398078e+02,
        1.77315540e+01, 2.19478519e+02, 3.74846207e+01, 2.46624546e+01,
        1.65186464e+01, 6.52659806e+00, 4.44773219e+03, 4.35064182e+03,
        2.80707250e+03, 9.86498185e+02, 2.31030020e-01],
       [9.09061873e-01, 2.63828032e-01, 7.33747218e+00, 1.21909591e-01,
        1.60380215e+00, 4.12553607e-04, 3.65316219e+00, 9.26801678e-01,
        1.91631150e-01

In [23]:
pvalue

0.0

In [24]:
chi2

1638407.5805696272

In [25]:
print('The p-value is', pvalue,'which is less than 0.05 so we reject the Null Hypothesis and accept the Alternative Hypothesis.\nThere is a relation between type of complaint or service requested and location.')

The p-value is 0.0 which is less than 0.05 so we reject the Null Hypothesis and accept the Alternative Hypothesis.
There is a relation between type of complaint or service requested and location.
