In [2]:
#Importing libraries 
import pandas as pd
import numpy as np

In [3]:
# Reading the new file 
df = pd.read_csv("Intermediate3.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1748162 entries, 0 to 1748161
Data columns (total 30 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   CRASH DATE & TIME              object 
 1   BOROUGH                        object 
 2   ZIP CODE                       int64  
 3   LATITUDE                       float64
 4   LONGITUDE                      float64
 5   LOCATION                       object 
 6   NUMBER OF PERSONS INJURED      int64  
 7   NUMBER OF PERSONS KILLED       int64  
 8   NUMBER OF PEDESTRIANS INJURED  int64  
 9   NUMBER OF PEDESTRIANS KILLED   int64  
 10  NUMBER OF CYCLIST INJURED      int64  
 11  NUMBER OF CYCLIST KILLED       int64  
 12  NUMBER OF MOTORIST INJURED     int64  
 13  NUMBER OF MOTORIST KILLED      int64  
 14  CONTRIBUTING FACTOR VEHICLE 1  object 
 15  CONTRIBUTING FACTOR VEHICLE 2  object 
 16  CONTRIBUTING FACTOR VEHICLE 3  object 
 17  CONTRIBUTING FACTOR VEHICLE 4  object 
 18  CO

In [5]:
# Count the number of entries where ZIP CODE is zero and LATITUDE and LONGITUDE are present
num_entries = df[(df['ZIP CODE'] == 0) & (df['LATITUDE'].notna()) & (df['LONGITUDE'].notna())].shape[0]
print(f"Number of entries with ZIP CODE as 0 and valid LATITUDE and LONGITUDE: {num_entries}")

Number of entries with ZIP CODE as 0 and valid LATITUDE and LONGITUDE: 442214



<div style="text-align: center; font-size: 20px;color: #333; padding: 10px; background-color: #FFFFFF; border-radius: 5px;">
    <strong> 9 E. Filling values for ZIP CODE, BOROUGH -> Using Nominatam Local Instance to Fill the data</strong>
</div>


**🚨 DO NOT RUN THIS THE FOLLOWING CELL 🚨**  
⚠️ This operation requires a local nominatim instance running in a docker contrainer for make queries. ⚠️  


In [None]:
#This code will help us to get the ZIP code and borough from the latitude and longitude

from geopy.geocoders import Nominatim
import pandas as pd
import concurrent.futures

# Initialize the geolocator
geolocator = Nominatim(user_agent="geoapiExercises", domain="localhost:8080", scheme="http")

# Function to get ZIP code and borough from latitude and longitude
def get_location_info(lat, lon):
    try:
        location = geolocator.reverse((lat, lon), exactly_one=True, timeout=20)
        if location and 'postcode' in location.raw['address']:
            postcode = location.raw['address']['postcode']
            borough = location.raw['address'].get('borough', 'Unknown')
            return postcode, borough
        else:
            print(f"No postcode found for coordinates: ({lat}, {lon})")
            return None, None
    except Exception as e:
        print(f"Error: {e}")
        return None, None

# Function to process a row and return the updated ZIP CODE and borough
def process_row(index, row):
    if row['ZIP CODE'] == 0 and pd.notna(row['LATITUDE']) and pd.notna(row['LONGITUDE']):
        zip_code, borough = get_location_info(row['LATITUDE'], row['LONGITUDE'])
        row['ZIP CODE'] = zip_code
        row['BOROUGH'] = borough
    return index, row['ZIP CODE'], row['BOROUGH']

# Count the number of entries where ZIP CODE is zero and LATITUDE and LONGITUDE are present
num_entries = df[(df['ZIP CODE'] == 0) & (df['LATITUDE'].notna()) & (df['LONGITUDE'].notna())].shape[0]
print(f"Number of entries with ZIP CODE as 0 and valid LATITUDE and LONGITUDE: {num_entries}")

# Process the DataFrame in parallel using ThreadPoolExecutor
with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
    futures = {executor.submit(process_row, index, row): index for index, row in df.iterrows()}
    for future in concurrent.futures.as_completed(futures):
        index, zip_code, borough = future.result()
        df.at[index, 'ZIP CODE'] = zip_code
        df.at[index, 'BOROUGH'] = borough

Number of entries with ZIP CODE as 0 and valid LATITUDE and LONGITUDE: 442214


  df.at[index, 'ZIP CODE'] = zip_code


No postcode found for coordinates: (40.688206, -74.103)


In [None]:
# Previous step inserted nan in zip code, replacing it with 0, and borough with 'Unknown'
df['ZIP CODE'] = df['ZIP CODE'].replace(np.nan, 0)
df['BOROUGH'] = df['BOROUGH'].replace('Unknown', np.nan)

In [None]:
# Change the borough to lowercase
df['BOROUGH'] = df['BOROUGH'].str.lower()

In [None]:
# Changing the borough names to the correct format
df['BOROUGH'] = df['BOROUGH'].replace('the bronx', 'bronx')

In [None]:
# Set ZIP CODE as 0 which are greater than 12000 and less than 10000 as they are outside the new york city area
df.loc[(df['ZIP CODE'] < 10000) | (df['ZIP CODE'] > 12000), 'ZIP CODE'] = 0

In [17]:
# Count the number of entries where ZIP CODE is zero and LATITUDE and LONGITUDE are present after update
num_entries = df[(df['ZIP CODE'] == 0) & (df['LATITUDE'].notna()) & (df['LONGITUDE'].notna())].shape[0]
print(f"Number of entries with ZIP CODE as 0 and valid LATITUDE and LONGITUDE after update: {num_entries}")

Number of entries with ZIP CODE as 0 and valid LATITUDE and LONGITUDE after update: 0


In [23]:
# Count the number of entries where BOROUGH is invalid and LATITUDE and LONGITUDE are present after removing entries with ZIP CODE as 0
num_entries = df[(df['BOROUGH'].isna()) & (df['LATITUDE'].notna()) & (df['LONGITUDE'].notna())].shape[0]
print(f"Number of entries with invalid BOROUGH and valid LATITUDE and LONGITUDE after removing entries with ZIP CODE as 0: {num_entries}")

Number of entries with invalid BOROUGH and valid LATITUDE and LONGITUDE after removing entries with ZIP CODE as 0: 741


In [24]:
# Printing 20 entries again to check final dataframe
import tabulate
print(tabulate.tabulate(df.head(100), tablefmt="rounded_grid", headers=df.columns))

╭────┬─────────────────────┬───────────────┬────────────┬────────────┬─────────────┬───────────────────────────┬─────────────────────────────┬────────────────────────────┬─────────────────────────────────┬────────────────────────────────┬─────────────────────────────┬────────────────────────────┬──────────────────────────────┬─────────────────────────────┬───────────────────────────────────────────────────────┬─────────────────────────────────┬─────────────────────────────────┬─────────────────────────────────┬─────────────────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬───────────────────────┬──────────────────────────────────────────────────────┬───────────────┬──────────┬──────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬────────────────╮
│    │ CRASH DATE & TIME   │ BOROUGH       │   ZIP CODE │   LATITUDE 

In [25]:
df.shape

(1748161, 30)

In [26]:
df.to_csv('final_data.csv', index=False, header=True)

---  
**End Of Data Cleaning File**  
