# ANALYSIS OF NYPD MOTOR VEHICLE COLLISIONS
* DataSource: https://data.cityofnewyork.us/Public-Safety/NYPD-Motor-Vehicle-Collisions/h9gi-nx95/data
---         
            
### The full analysis will consist of the following:
* WHEN accidents happen
* WHY accidents happen
* WHERE accidents happen
* Is it possible to the contributing factors of accidents?
---
        
### This analysis will consider the WHY 

#### CONTRIBUTING FACTORS of the accidents in NYC Boroughs

* Find total number of accidents by all contributing factors
* Analyse contributing factors by category: injured vs killed
* Find the biggest contributing factors related to the accidents over the years in the available dataset

In [2]:
# Import required packages
import pandas as pd
import numpy as np
import datetime


#Loading NYPD dataset
#Displaying data using pandas dataframe
#The correct encoding must be used to read the CSV in pandas ISO-8859-1

data_load = "NYPD-Data.csv"
data = pd.read_csv(data_load, encoding="ISO-8859-1", low_memory=False)

df = pd.DataFrame(data)
df.head()


Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,UNIQUE KEY,VEHICLE TYPE CODE 1
0,4/22/2019,0:05,MANHATTAN,10002,40.716476,-73.99283,"(40.716476, -73.99283)",ELDRIDGE STREET,HESTER STREET,,...,0.0,0,0,0,0,0,0,Tow Hitch Defective,4119391,Station Wagon/Sport Utility Vehicle
1,4/22/2019,0:15,BROOKLYN,11238,40.68175,-73.96748,"(40.68175, -73.96748)",ATLANTIC AVENUE,VANDERBILT AVENUE,,...,0.0,0,0,0,0,0,0,Unsafe Lane Changing,4119116,Tractor Truck Diesel
2,4/22/2019,0:15,QUEENS,11434,40.664017,-73.7686,"(40.664017, -73.7686)",FARMERS BOULEVARD,BREWER BOULEVARD,,...,0.0,0,0,0,0,1,0,Unspecified,4118449,Station Wagon/Sport Utility Vehicle
3,4/22/2019,0:30,BROOKLYN,11219,40.622326,-73.99806,"(40.622326, -73.99806)",,,6615 NEW UTRECHT AVENUE,...,0.0,0,0,0,0,0,0,Driver Inattention/Distraction,4119011,Pick-up Truck
4,4/22/2019,0:30,BROOKLYN,11228,40.607464,-74.01486,"(40.607464, -74.01486)",,,236 BAY 8 STREET,...,0.0,0,0,0,0,0,0,Fell Asleep,4118458,Convertible


In [3]:
df.drop('BOROUGH', axis='columns', inplace=True)
df.drop('TIME', axis='columns', inplace=True)
df.drop('ZIP CODE', axis='columns', inplace=True)
df.drop('ON STREET NAME', axis='columns', inplace=True)
df.drop('OFF STREET NAME', axis='columns', inplace=True)
df.drop('CROSS STREET NAME', axis='columns', inplace=True)
df.drop('UNIQUE KEY', axis='columns', inplace=True)

In [4]:
#Total registed accidents in the dataset
accidents = data_df['UNIQUE KEY'].count()
print('Total number of registed accidents:', accidents)

NameError: name 'data_df' is not defined

In [None]:
#Checking data types
df.dtypes

In [None]:
# Rename columns
df.columns = ['date', 'latitude', 'longitude', 'location', 'injured', 'killed', 
              'pedestrian_injured', 'pedestrian_killed', 'cyclist_injured', 'cyclist_killed',
             'motorist_injured', 'motorist_killed', 'cont_factor', 'vehicle']    
df.head()

In [None]:
#Converting 'date' to datetime and 'injured' and 'killed' to numeric
df['date'] = pd.to_datetime(df['date'])

df.dtypes


In [None]:
#Extract year from 'DATE' and create separate collumns
df['year'] = pd.DatetimeIndex(df['date']).year
df['month'] = pd.DatetimeIndex(df['date']).month

df.head()

In [None]:
#Replace values based on spelling 
df['cont_factor'].replace('Drugs (Illegal)', 'Drugs (illegal)', inplace=True)


In [None]:
df['cont_factor'].replace('Cell Phone (hand-Held)', 'Cell Phone (hand-held) ', inplace=True)


In [None]:
df['cont_factor'].replace('Cell Phone (hand-held)', 'Cell Phone (hand-held) ', inplace=True)


In [None]:
df['date'].min()


In [None]:
df['date'].max()


In [None]:
#Finding percentage values of 'contributing factors' in the dataset
df["cont_factor"].value_counts(normalize=True)



## ANALYSING THE CONTRIBUTING FACTORS BY YEAR

In [None]:
#Analyse 'MAJOR CONTRIBUTING FACTORS' by 'YEAR'
# df.groupby(['YEAR', 'MAJOR CONTRIBUTING FACTOR']).max().head()
df.groupby(['year']).MAJOR CONTRIBUTING FACTOR.value_counts(normalize=True)

In [None]:
#Analyse 'MAJOR CONTRIBUTING FACTORS' BY 'YEAR' and 'MONTH'
df.groupby(['cont_factor', 'year', 'month']).sum().head()

In [None]:
print('2018: NUMBER OF ACCIDENTS BY CATEGORY THAT CAUSED INJURY OR DEATH:')
df[(df.date.dt.year > 2017) & (df.date.dt.year <= 2018)].max()


In [None]:
print('2017: NUMBER OF ACCIDENTS BY CATEGORY THAT CAUSED INJURY OR DEATH:')
d = df[(df.date.dt.year > 2016) & (df.date.dt.year <= 2017)]


In [None]:
# Descriptive statistics by 'YEAR' and 'MAJOR CONTRIBUTING FACTOR'
df.groupby(['year', 'cont_factor']).describe().head()

In [None]:
#Save clean file in exccel to make analysis in Tableau
df.to_excel("factors.xlsx", index=False)
# df.to_csv("factors_csv.csv", index=False)
