# Importing the required libraries

- pandas is used for manipulating data
- numpy is used for handling numerical data and filling in NaN type of values
- tabulate is used for prettify'ing printing tables

In [None]:
import pandas as pd
import numpy as np
import tabulate

## Setting Display Options

In [2]:
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [None]:
# Loading the dataset to a dataframe
df = pd.read_csv('Motor_Vehicle_Collisions_-_Crashes.csv')

  df = pd.read_csv('Motor_Vehicle_Collisions_-_Crashes.csv')


In [None]:
# Checking the dataset/dataframe size
df.shape

(2155718, 29)

In [None]:
# Printing 20 entries from the begining to see how the dataset looks like
print(tabulate.tabulate(df.head(20), tablefmt="rounded_grid", headers=df.columns))

╭────┬──────────────┬──────────────┬───────────┬────────────┬────────────┬─────────────┬─────────────────────────┬──────────────────────────────┬────────────────────────────────┬────────────────────────────────────────┬─────────────────────────────┬────────────────────────────┬─────────────────────────────────┬────────────────────────────────┬─────────────────────────────┬────────────────────────────┬──────────────────────────────┬─────────────────────────────┬─────────────────────────────────┬─────────────────────────────────┬─────────────────────────────────┬─────────────────────────────────┬─────────────────────────────────┬────────────────┬─────────────────────────────────────┬─────────────────────────────────────┬───────────────────────┬───────────────────────┬───────────────────────╮
│    │ CRASH DATE   │ CRASH TIME   │ BOROUGH   │   ZIP CODE │   LATITUDE │   LONGITUDE │ LOCATION                │ ON STREET NAME               │ CROSS STREET NAME              │ OFF STREET NAME     

In [None]:
# Checking the datatypes of the loaded dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2155718 entries, 0 to 2155717
Data columns (total 29 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   CRASH DATE                     object 
 1   CRASH TIME                     object 
 2   BOROUGH                        object 
 3   ZIP CODE                       object 
 4   LATITUDE                       float64
 5   LONGITUDE                      float64
 6   LOCATION                       object 
 7   ON STREET NAME                 object 
 8   CROSS STREET NAME              object 
 9   OFF STREET NAME                object 
 10  NUMBER OF PERSONS INJURED      float64
 11  NUMBER OF PERSONS KILLED       float64
 12  NUMBER OF PEDESTRIANS INJURED  int64  
 13  NUMBER OF PEDESTRIANS KILLED   int64  
 14  NUMBER OF CYCLIST INJURED      int64  
 15  NUMBER OF CYCLIST KILLED       int64  
 16  NUMBER OF MOTORIST INJURED     int64  
 17  NUMBER OF MOTORIST KILLED      int64  
 18  CO

In [None]:
# Checking the number of null/missing entries
df.isnull().sum()

CRASH DATE                             0
CRASH TIME                             0
BOROUGH                           667415
ZIP CODE                          667683
LATITUDE                          239663
LONGITUDE                         239663
LOCATION                          239663
ON STREET NAME                    463684
CROSS STREET NAME                 822176
OFF STREET NAME                  1784407
NUMBER OF PERSONS INJURED             18
NUMBER OF PERSONS KILLED              31
NUMBER OF PEDESTRIANS INJURED          0
NUMBER OF PEDESTRIANS KILLED           0
NUMBER OF CYCLIST INJURED              0
NUMBER OF CYCLIST KILLED               0
NUMBER OF MOTORIST INJURED             0
NUMBER OF MOTORIST KILLED              0
CONTRIBUTING FACTOR VEHICLE 1       7382
CONTRIBUTING FACTOR VEHICLE 2     341042
CONTRIBUTING FACTOR VEHICLE 3    2000331
CONTRIBUTING FACTOR VEHICLE 4    2120349
CONTRIBUTING FACTOR VEHICLE 5    2146057
COLLISION_ID                           0
VEHICLE TYPE COD

In [8]:
# Printing all the unique values present in each column
for column in df.columns:
  print(column, ":", df[column].unique(), "\n")

CRASH DATE : ['09/11/2021' '03/26/2022' '11/01/2023' ... '11/07/2024' '11/10/2024'
 '11/24/2024'] 

CRASH TIME : ['2:39' '11:45' '1:29' ... '3:14' '6:33' '2:14'] 

BOROUGH : [nan 'BROOKLYN' 'BRONX' 'MANHATTAN' 'QUEENS' 'STATEN ISLAND'] 

ZIP CODE : [nan 11230.0 11208.0 11233.0 10475.0 11207.0 10017.0 11413.0 11434.0
 11217.0 11226.0 10463.0 10001.0 11372.0 10301.0 11215.0 11211.0 10455.0
 11385.0 11418.0 11225.0 11220.0 11411.0 10452.0 10466.0 10453.0 10019.0
 11221.0 11203.0 11419.0 11101.0 11106.0 11223.0 11422.0 11213.0 10128.0
 11218.0 11692.0 11420.0 11205.0 11212.0 10022.0 10011.0 10314.0 10461.0
 11004.0 10025.0 11373.0 10018.0 11234.0 10462.0 10472.0 11206.0 11236.0
 11210.0 11238.0 11209.0 10065.0 11249.0 11432.0 10032.0 11104.0 10002.0
 10456.0 10468.0 11201.0 11219.0 11222.0 11235.0 10012.0 10305.0 10024.0
 10458.0 11228.0 11361.0 10035.0 11354.0 11377.0 11374.0 10467.0 11433.0
 10016.0 10013.0 11369.0 10457.0 10027.0 10028.0 11691.0 10014.0 10310.0
 11231.0 10469.0 10033.0 

<div style="text-align: center;">
<br>
    <h1>1. Filling Missing Values</h1>
</div>

<div style="text-align: center; font-size: 20px;color: #333; padding: 10px; background-color: #FFFFFF; border-radius: 5px;">
    <strong> 1 A. Contributing Factors</strong>
</div>


In [None]:
# Filling in the empty values with 'Unspecified' for "CONTRIBUTING FACTOR VEHICLE 1,2,3,4,5"

# Initially we replace the values with nan and then fill it with 'Unspecified'
contributing_factor_columns = df.filter(regex='CONTRIBUTING').columns
for i in contributing_factor_columns:
    df[i] = df[i].replace(
        ["nan", "NaN", "None", "", " ", "N/A", "1", "80"], np.nan
    )

    df[i] = df[i].fillna("Unspecified")

In [10]:
# Checking if the operation was successful

print(df["CONTRIBUTING FACTOR VEHICLE 1"].isna().sum())
print(df["CONTRIBUTING FACTOR VEHICLE 2"].isna().sum())
print(df["CONTRIBUTING FACTOR VEHICLE 3"].isna().sum())
print(df["CONTRIBUTING FACTOR VEHICLE 4"].isna().sum())
print(df["CONTRIBUTING FACTOR VEHICLE 5"].isna().sum())

0
0
0
0
0


In [None]:
# Checking the columns to see how the data looks

print(df["CONTRIBUTING FACTOR VEHICLE 1"].unique(), "\n")
print(df["CONTRIBUTING FACTOR VEHICLE 2"].unique(), "\n")
print(df["CONTRIBUTING FACTOR VEHICLE 3"].unique(), "\n")
print(df["CONTRIBUTING FACTOR VEHICLE 4"].unique(), "\n")
print(df["CONTRIBUTING FACTOR VEHICLE 5"].unique(), "\n")

['Aggressive Driving/Road Rage' 'Pavement Slippery' 'Unspecified'
 'Following Too Closely' 'Passing Too Closely'
 'Failure to Yield Right-of-Way' 'Driver Inexperience'
 'Passing or Lane Usage Improper' 'Turning Improperly'
 'Unsafe Lane Changing' 'Unsafe Speed' 'Reaction to Uninvolved Vehicle'
 'Steering Failure' 'Traffic Control Disregarded' 'Other Vehicular'
 'Driver Inattention/Distraction' 'Accelerator Defective'
 'Oversized Vehicle'
 'Pedestrian/Bicyclist/Other Pedestrian Error/Confusion'
 'Alcohol Involvement' 'View Obstructed/Limited' 'Illnes'
 'Lost Consciousness' 'Brakes Defective' 'Backing Unsafely' 'Glare'
 'Passenger Distraction' 'Fell Asleep' 'Obstruction/Debris'
 'Tinted Windows' 'Animals Action' 'Drugs (illegal)' 'Pavement Defective'
 'Other Lighting Defects' 'Outside Car Distraction'
 'Driverless/Runaway Vehicle' 'Tire Failure/Inadequate' 'Fatigued/Drowsy'
 'Headlights Defective' 'Failure to Keep Right' 'Physical Disability'
 'Eating or Drinking' 'Cell Phone (hands-free


<div style="text-align: center; font-size: 20px;color: #333; padding: 10px; background-color: #FFFFFF; border-radius: 5px;">
    <strong> 1 B. Deaths and Injuries</strong>
</div>


In [None]:
# Selecting the columns with headers containing the phrase 'NUMBER' and filling them with 0.

str_cols = df.filter(regex='NUMBER').columns
df[str_cols] = df[str_cols].fillna(0)

In [None]:
# Checking if the operation was successful

df[str_cols].isnull().sum()

NUMBER OF PERSONS INJURED        0
NUMBER OF PERSONS KILLED         0
NUMBER OF PEDESTRIANS INJURED    0
NUMBER OF PEDESTRIANS KILLED     0
NUMBER OF CYCLIST INJURED        0
NUMBER OF CYCLIST KILLED         0
NUMBER OF MOTORIST INJURED       0
NUMBER OF MOTORIST KILLED        0
dtype: int64


<div style="text-align: center; font-size: 20px;color: #333; padding: 10px; background-color: #FFFFFF; border-radius: 5px;">
    <strong> 1 C. LATITUDE & LONGITUDE</strong>
</div>


In [None]:
# Filling in missing values for LATITUDE LONGITUDE and ZIP CODE -> replacing 0 with NaN 

df[['LATITUDE', 'LONGITUDE', 'ZIP CODE']] = df[['LATITUDE', 'LONGITUDE', 'ZIP CODE']].replace(0, np.nan)

In [None]:
# Checking if the operation was successful

(df[['LATITUDE', 'LONGITUDE', 'ZIP CODE']] == 0).sum()


LATITUDE     0
LONGITUDE    0
ZIP CODE     0
dtype: int64

 
<div style="text-align: center;">
<br>
    <h1>2. Shifting Attributes</h1>
</div>


<div style="text-align: center; font-size: 20px;color: #333; padding: 10px; background-color: #FFFFFF; border-radius: 5px;">
    <strong> 2 A. Car Types</strong>
</div>


In [None]:
# If vehicle type 1 NA, we can copy the data from other vehicle type to this column, so we don't have drop it.

# Selecting the columns with the phrase 'VEHICLE TYPE CODE' in it's header
vehicle_columns = df.filter(regex='VEHICLE TYPE CODE').columns

# Storing a copy of the original data for comparison after the operation
df_before_bfill = df[vehicle_columns].copy()

In [None]:
# Seeing how many entries have null values

df[vehicle_columns].isnull().sum()

VEHICLE TYPE CODE 1      15087
VEHICLE TYPE CODE 2     423849
VEHICLE TYPE CODE 3    2006218
VEHICLE TYPE CODE 4    2121619
VEHICLE TYPE CODE 5    2146357
dtype: int64

In [None]:
# Applying the back-fill to shift values left
df[vehicle_columns] = df[vehicle_columns].bfill(axis=1)

# Clearing the original positions from where values were taken and shifted to earlier columns
for i in range(len(vehicle_columns) - 1):  
    mask = df_before_bfill[vehicle_columns[i]].isna() & df_before_bfill[vehicle_columns[i + 1]].notna()
    df.loc[mask, vehicle_columns[i + 1]] = None
    

In [None]:
# Checking if the operation was successful

df[vehicle_columns].isnull().sum()

VEHICLE TYPE CODE 1      15076
VEHICLE TYPE CODE 2     423860
VEHICLE TYPE CODE 3    2006218
VEHICLE TYPE CODE 4    2121619
VEHICLE TYPE CODE 5    2146357
dtype: int64

In [20]:
df.shape

(2155718, 29)


<div style="text-align: center; font-size: 20px;color: #333; padding: 10px; background-color: #FFFFFF; border-radius: 5px;">
    <strong> 2 B. Prioritize e-vehicles to car type 1</strong>
</div>


In [21]:
# Need to write code

<div style="text-align: center;">
<br>
    <h1>3. Changing Case</h1>
</div>


 

<div style="text-align: center; font-size: 20px;color: #333; padding: 10px; background-color: #FFFFFF; border-radius: 5px;">
    <strong> 3 A. Combining the "ON STREET NAME", "CROSS STREET NAME" and "OFF STREET NAME" to a new field called "Addresses"</strong>
</div>


In [None]:
# Creating a new column in dataframe where merged address is stored ('ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME')
import re

def merge_addresses(row):
    fields = []
    
    if pd.notna(row["ON STREET NAME"]):
        fields.append(row["ON STREET NAME"].strip().lower())
    
    if pd.notna(row["CROSS STREET NAME"]):
        fields.append(row["CROSS STREET NAME"].strip().lower())
    
    if pd.notna(row["OFF STREET NAME"]):
        fields.append(row["OFF STREET NAME"].strip().lower())
    
    if fields:
        unclean_address = " ".join(fields) + " new york"
        return re.sub(r'\s+', ' ', unclean_address)  # Removeing extra spaces
    return ""

# Applying the function to every row of the dataset
df["Addresses"] = df.apply(merge_addresses, axis=1)

In [None]:
# Checking if the operation was successful

print(tabulate.tabulate(df[['Addresses']].head(10), tablefmt="rounded_grid", headers=["Addresses"]))

╭────┬────────────────────────────────────────────╮
│    │ Addresses                                  │
├────┼────────────────────────────────────────────┤
│  0 │ whitestone expressway 20 avenue new york   │
├────┼────────────────────────────────────────────┤
│  1 │ queensboro bridge upper new york           │
├────┼────────────────────────────────────────────┤
│  2 │ ocean parkway avenue k new york            │
├────┼────────────────────────────────────────────┤
│  3 │ throgs neck bridge new york                │
├────┼────────────────────────────────────────────┤
│  4 │ brooklyn bridge new york                   │
├────┼────────────────────────────────────────────┤
│  5 │ west 54 street new york                    │
├────┼────────────────────────────────────────────┤
│  6 │ hutchinson river parkway new york          │
├────┼────────────────────────────────────────────┤
│  7 │ west 35 street henry hudson river new york │
├────┼────────────────────────────────────────────┤
│  8 │ 61 ed


<div style="text-align: center; font-size: 20px;color: #333; padding: 10px; background-color: #FFFFFF; border-radius: 5px;">
    <strong> 3 B. Vehicle types</strong>
</div>


In [None]:
# Selecting the columns with header containing the phrase 'VEHICLE TYPE CODE'

str_cols = df.filter(regex='VEHICLE TYPE CODE').columns
for i in str_cols:
    df[i] = df[i].str.casefold()

In [None]:
# Checking if the operation was successful

print("Sample data after casefold transformation:")
print(tabulate.tabulate(df[str_cols].head(), tablefmt="rounded_grid", headers=df[str_cols]))

Sample data after casefold transformation:
╭────┬─────────────────────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬───────────────────────╮
│    │ VEHICLE TYPE CODE 1                 │ VEHICLE TYPE CODE 2   │ VEHICLE TYPE CODE 3   │   VEHICLE TYPE CODE 4 │   VEHICLE TYPE CODE 5 │
├────┼─────────────────────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┤
│  0 │ sedan                               │ sedan                 │ nan                   │                   nan │                   nan │
├────┼─────────────────────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┼───────────────────────┤
│  1 │ sedan                               │ nan                   │ nan                   │                   nan │                   nan │
├────┼─────────────────────────────────────┼───────────────────────┼───────────────────────┼───────────────────


<div style="text-align: center; font-size: 20px;color: #333; padding: 10px; background-color: #FFFFFF; border-radius: 5px;">
    <strong> 3 C. Contributing Factors</strong>
</div>


In [None]:
# Selecting the columns with header containing the phrase 'CONTRIBUTING'

str_cols = df.filter(regex='CONTRIBUTING').columns
for i in str_cols:
    df[i] = df[i].str.casefold()

In [None]:
# Checking if the operation was successful

print("Sample data after casefold transformation:")
print(tabulate.tabulate(df[str_cols].head(), tablefmt="rounded_grid", headers=df[str_cols]))

Sample data after casefold transformation:
╭────┬─────────────────────────────────┬─────────────────────────────────┬─────────────────────────────────┬─────────────────────────────────┬─────────────────────────────────╮
│    │ CONTRIBUTING FACTOR VEHICLE 1   │ CONTRIBUTING FACTOR VEHICLE 2   │ CONTRIBUTING FACTOR VEHICLE 3   │ CONTRIBUTING FACTOR VEHICLE 4   │ CONTRIBUTING FACTOR VEHICLE 5   │
├────┼─────────────────────────────────┼─────────────────────────────────┼─────────────────────────────────┼─────────────────────────────────┼─────────────────────────────────┤
│  0 │ aggressive driving/road rage    │ unspecified                     │ unspecified                     │ unspecified                     │ unspecified                     │
├────┼─────────────────────────────────┼─────────────────────────────────┼─────────────────────────────────┼─────────────────────────────────┼─────────────────────────────────┤
│  1 │ pavement slippery               │ unspecified                    


<div style="text-align: center;">
<br>
    <h1>4. Fixing Date and Time</h1>
</div>

In [None]:
# Creating a specific format to store date and time that the crash happened

df['CRASH DATE & TIME'] = pd.to_datetime(df['CRASH DATE']+ ' ' +df['CRASH TIME'] + ':00')

In [None]:
# Checking if the operation was successful

print(tabulate.tabulate(df[['CRASH DATE & TIME']].head(10).astype(str), tablefmt="rounded_grid", headers=["CRASH DATE & TIME"]))

╭────┬─────────────────────╮
│    │ CRASH DATE & TIME   │
├────┼─────────────────────┤
│  0 │ 2021-09-11 02:39:00 │
├────┼─────────────────────┤
│  1 │ 2022-03-26 11:45:00 │
├────┼─────────────────────┤
│  2 │ 2023-11-01 01:29:00 │
├────┼─────────────────────┤
│  3 │ 2022-06-29 06:55:00 │
├────┼─────────────────────┤
│  4 │ 2022-09-21 13:21:00 │
├────┼─────────────────────┤
│  5 │ 2023-04-26 13:30:00 │
├────┼─────────────────────┤
│  6 │ 2023-11-01 07:12:00 │
├────┼─────────────────────┤
│  7 │ 2023-11-01 08:01:00 │
├────┼─────────────────────┤
│  8 │ 2023-04-26 22:20:00 │
├────┼─────────────────────┤
│  9 │ 2021-09-11 09:35:00 │
╰────┴─────────────────────╯



<div style="text-align: center;">
<br>
    <h1>5. Removing Duplicate values</h1>
</div>

In [None]:
# Specifying the columns to exclude
columns_to_exclude = ['CRASH DATE', 'CRASH TIME', 'NeedToProcess', 'Addresses', 'NLat', 'NLong', 'Location']

valid_location_mask = df[['LATITUDE', 'LONGITUDE']].notna().all(axis=1)

# Selecting only the columns to include in counting non-null values
columns_to_count = df.columns.difference(columns_to_exclude)

# Counting non-null values per row for the selected columns
df['non_null_count'] = df[columns_to_count].notna().sum(axis=1)

# Filtering out rows with valid locations before finding duplicates
df_filtered = df[valid_location_mask]

# Sorting by non-null count in descending order for easiness
df_sorted = df_filtered.sort_values(by=['CRASH DATE & TIME', 'LATITUDE', 'LONGITUDE', 'non_null_count'], ascending=[True, True, True, False])

# Keeping the row with the most data and drop the rest
df_cleaned = df_sorted.drop_duplicates(subset=['CRASH DATE & TIME', 'LATITUDE', 'LONGITUDE'], keep='first')

# # Droping the helper column
df = df_cleaned.drop(columns=['non_null_count'])


In [None]:
# Checking if the operation was successful

print(df.shape)

(1905432, 31)



<div style="text-align: center;">
<br>
    <h1>6 A. Based on crash time create a new attribute (morning, afternoon, evening, night)</h1>
</div>

In [None]:

# Filling in the 'Morning', 'Afternoon', 'Evening', 'Night' in the new column 'Time of Day' based on the time of the crash

def get_times_of_day(hour):
    if 5 <= hour < 12:
        return 'morning'
    elif 12 <= hour < 17:
        return 'afternoon'
    elif 17 <= hour < 21:
        return 'evening'
    else:
        return 'night'

# Applying the function on each and every entry of the dataset
df['Time of Day'] = df['CRASH DATE & TIME'].dt.hour.apply(get_times_of_day)

In [None]:
# Checking if the operation was successful

print(tabulate.tabulate(df[[ 'CRASH TIME','Time of Day']].head(10), tablefmt="rounded_grid", headers=['CRASH TIME', "Time of Day"], showindex=False))

╭──────────────┬───────────────╮
│ CRASH TIME   │ Time of Day   │
├──────────────┼───────────────┤
│ 0:05         │ night         │
├──────────────┼───────────────┤
│ 0:05         │ night         │
├──────────────┼───────────────┤
│ 0:10         │ night         │
├──────────────┼───────────────┤
│ 0:10         │ night         │
├──────────────┼───────────────┤
│ 0:20         │ night         │
├──────────────┼───────────────┤
│ 0:22         │ night         │
├──────────────┼───────────────┤
│ 0:23         │ night         │
├──────────────┼───────────────┤
│ 0:25         │ night         │
├──────────────┼───────────────┤
│ 0:30         │ night         │
├──────────────┼───────────────┤
│ 0:30         │ night         │
╰──────────────┴───────────────╯



<div style="text-align: center;">
<br>
    <h1>7. Changing Data Type</h1>
</div>


<div style="text-align: center; font-size: 20px;color: #333; padding: 10px; background-color: #FFFFFF; border-radius: 5px;">
    <strong> 7 A. ZIP CODE</strong>
</div>


In [None]:
df["ZIP CODE"] = pd.to_numeric(df["ZIP CODE"], errors="coerce")  # Converting non-numeric values to np.NaN -> Changing the datatype

# Filling NaN values with 0 for all the values
df["ZIP CODE"] = df["ZIP CODE"].fillna(0).astype("Int64")

In [None]:
# Checking if the operation was successful

print(df["ZIP CODE"].dtype)

Int64



<div style="text-align: center; font-size: 20px;color: #333; padding: 10px; background-color: #FFFFFF; border-radius: 5px;">
    <strong> 7 B. NUMBER OF PERSONS INJURED</strong>
</div>


In [None]:
# Changing the datatype
df['NUMBER OF PERSONS INJURED'] = pd.to_numeric(df['NUMBER OF PERSONS INJURED'], errors='coerce')

df["NUMBER OF PERSONS INJURED"] = df["NUMBER OF PERSONS INJURED"].fillna(0).astype("Int64")

In [None]:
# Checking if the operation was successful

print(df["NUMBER OF PERSONS INJURED"].dtype)

Int64



<div style="text-align: center; font-size: 20px;color: #333; padding: 10px; background-color: #FFFFFF; border-radius: 5px;">
    <strong> 7 C. NUMBER OF PERSONS KILLED</strong>
</div>


In [None]:
# Changing the datatypes
df['NUMBER OF PERSONS KILLED'] = pd.to_numeric(df['NUMBER OF PERSONS KILLED'], errors='coerce')

df["NUMBER OF PERSONS KILLED"] = df["NUMBER OF PERSONS KILLED"].fillna(0).astype("Int64")

In [None]:
# Checking if the operation was successful

print(df["NUMBER OF PERSONS KILLED"].dtype)

Int64



<div style="text-align: center;">
<br>
    <h1>8. Dropping Columns</h1>
</div>


<div style="text-align: center; font-size: 20px;color: #333; padding: 10px; background-color: #FFFFFF; border-radius: 5px;">
    <strong> 8 A. Dropping "ON STREET NAME", "CROSS STREET NAME", "OFF STREET NAME", "CRASH DATE", "CRASH TIME"</strong>
</div>


In [None]:
# Checking all the columns present in the dataframe

print(tabulate.tabulate([df.columns.tolist()], tablefmt="rounded_grid", headers=['Columns'], showindex=False))

╭────────────┬────────────┬─────────┬──────────┬──────────┬───────────┬──────────┬────────────────┬───────────────────┬─────────────────┬───────────────────────────┬──────────────────────────┬───────────────────────────────┬──────────────────────────────┬───────────────────────────┬──────────────────────────┬────────────────────────────┬───────────────────────────┬───────────────────────────────┬───────────────────────────────┬───────────────────────────────┬───────────────────────────────┬───────────────────────────────┬──────────────┬─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────┬─────────────────────┬───────────┬───────────────────┬─────────────╮
│            │            │         │          │          │           │          │                │                   │                 │                           │                          │                               │                              │                           │                  

In [None]:
# Dropping the columns we don't need

df = df.drop(columns=["ON STREET NAME", "CROSS STREET NAME", "OFF STREET NAME", "CRASH DATE", "CRASH TIME"])
# df = df.drop(columns=["ON STREET NAME", "CROSS STREET NAME", "OFF STREET NAME", "CRASH DATE", "CRASH TIME", "NeedToProcess", "Addresses", "NLat", "NLong", "Location"])

In [None]:
df = df[['CRASH DATE & TIME'] + [col for col in df.columns if col != 'CRASH DATE & TIME']]

In [None]:
# Checking if the operation was successful

print(tabulate.tabulate(df.head(5), tablefmt="rounded_grid", headers=df.columns))

╭─────────┬─────────────────────┬───────────┬────────────┬────────────┬─────────────┬───────────────────────────┬─────────────────────────────┬────────────────────────────┬─────────────────────────────────┬────────────────────────────────┬─────────────────────────────┬────────────────────────────┬──────────────────────────────┬─────────────────────────────┬─────────────────────────────────┬─────────────────────────────────┬─────────────────────────────────┬─────────────────────────────────┬─────────────────────────────────┬────────────────┬───────────────────────┬───────────────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬────────────────────────────────────┬───────────────╮
│         │ CRASH DATE & TIME   │ BOROUGH   │   ZIP CODE │   LATITUDE │   LONGITUDE │ LOCATION                  │   NUMBER OF PERSONS INJURED │   NUMBER OF PERSONS KILLED │   NUMBER OF PEDESTRIANS INJURED │   NUMBER OF PEDESTRIANS KILLED │   NUMBER OF CYCLIST INJURED │   

In [None]:
# Writing to CSV for fetching location, latitude and longitude from the API

df.to_csv("Intermediate.csv", index=False)

---  
**End Of Data File - 1 Continue to File DataCleaning2.ipynb**
