In [1]:
# Install geopy
!pip install geopy



In [2]:
# Import necessary libraries
import pandas as pd
from geopy.distance import geodesic

# Load datasets
crash_data_path = 'resources/crash_data_2.csv'
weather_data_path = 'resources/new_york_weather.csv'

crash_data = pd.read_csv(crash_data_path)
weather_data = pd.read_csv(weather_data_path)

In [3]:
# Convert crash date and time to a single datetime column
crash_data['crash_datetime'] = pd.to_datetime(
    crash_data['crash_date'] + ' ' + crash_data['crash_time'], errors='coerce'
)

In [5]:
# Round crash datetime to the nearest hour
crash_data['crash_datetime'] = crash_data['crash_datetime'].dt.round('h')

In [6]:
# Remove timezone information from weather datetime and convert to datetime
weather_data['dt_iso_cleaned'] = weather_data['dt_iso'].str.replace(' \+0000 UTC', '', regex=True)
# Parse the cleaned column into datetime
weather_data['weather_datetime'] = pd.to_datetime(
    weather_data['dt_iso_cleaned'], format='%Y-%m-%d %H:%M:%S', errors='coerce'
)

In [7]:
# Select relevant columns for merging
crash_data_relevant = crash_data[['crash_datetime', 'number_of_persons_injured', 
                                  'number_of_persons_killed','number_of_pedestrians_injured', 
                                  'number_of_pedestrians_killed', 'number_of_cyclist_injured',
                                  'number_of_cyclist_killed', 'number_of_motorist_injured', 
                                  'number_of_motorist_killed', 'collision_id', 'latitude', 'longitude']]
weather_data_relevant = weather_data[['weather_datetime', 'temp', 'visibility', 
                                      'humidity', 'rain_1h', 'weather_main', 'weather_description', 
                                     'lat', 'lon']]


In [8]:
# Merge crash and weather data on datetime
merged_data = pd.merge(crash_data_relevant, weather_data_relevant, 
                                   left_on='crash_datetime', right_on='weather_datetime', how='inner')

In [10]:
# Define the calculate_distance function
def calculate_distance(row):
    try:
        crash_coords = (row['latitude'], row['longitude'])
        weather_coords = (row['lat'], row['lon'])
        return geodesic(crash_coords, weather_coords).meters
    except Exception as e:
        print(f"Error calculating distance for row {row.name}: {e}")
        return None

In [19]:
# Filter out rows with missing coordinates
merged_data_cleaned = merged_data.dropna(subset=['latitude', 'longitude', 'lat', 'lon'])

In [16]:
# Test the function on a single row
sample_row = merged_data_cleaned.iloc[0]
print("Sample row:", sample_row)
distance_test = calculate_distance(sample_row)
print("Distance for sample row:", distance_test)

Sample row: crash_datetime                   2022-01-01 08:00:00
number_of_persons_injured                          0
number_of_persons_killed                           0
number_of_pedestrians_injured                      0
number_of_pedestrians_killed                       0
number_of_cyclist_injured                          0
number_of_cyclist_killed                           0
number_of_motorist_injured                         0
number_of_motorist_killed                          0
collision_id                                 4491400
latitude                                   40.771477
longitude                                  -73.91824
weather_datetime                 2022-01-01 08:00:00
temp                                           50.32
visibility                                   10000.0
humidity                                          88
rain_1h                                          NaN
weather_main                                  Clouds
weather_description               

In [20]:
# Ensure merged_data_cleaned is a standalone DataFrame
merged_data_cleaned = merged_data_cleaned.copy()

# Apply the distance calculation
merged_data_cleaned.loc[:, 'distance'] = merged_data_cleaned.apply(calculate_distance, axis=1)

# Drop rows with invalid distances
merged_data_cleaned = merged_data_cleaned.dropna(subset=['distance'])

# Filter by proximity threshold
merged_data_filtered = merged_data_cleaned[merged_data_cleaned['distance'] <= 10000]

In [21]:
# Add a binary target variable for crashes
merged_data_filtered.loc[:, 'crash_occurred'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data_filtered.loc[:, 'crash_occurred'] = 1


In [22]:
# Save the merged data to a CSV file
output_path = 'resources/merged_crash_weather_data.csv'
merged_data.to_csv(output_path, index=False)