In [None]:
# Install geopy
!pip install geopy

In [35]:
# Import necessary libraries
import pandas as pd
from geopy.distance import geodesic
#pd.options.mode.chained_assignment = None  # Disable the warning

# Load datasets
crash_data_path = 'resources/crash_data_2.csv'
weather_data_path = 'resources/new_york_weather.csv'

crash_data = pd.read_csv(crash_data_path)
weather_data = pd.read_csv(weather_data_path)

In [39]:
# Convert crash date and time to a single datetime column
crash_data['crash_datetime'] = pd.to_datetime(
    crash_data['crash_date'] + ' ' + crash_data['crash_time'], errors='coerce'
)

In [40]:
# Remove timezone information from weather datetime and convert to datetime
weather_data['dt_iso_cleaned'] = weather_data['dt_iso'].str.replace(' \+0000 UTC', '', regex=True)
# Parse the cleaned column into datetime
weather_data['weather_datetime'] = pd.to_datetime(
    weather_data['dt_iso_cleaned'], format='%Y-%m-%d %H:%M:%S', errors='coerce'
)

In [41]:
# Normalize both datetime columns to the same precision (if necessary)
# This ensures no truncation of time during the join
weather_data['weather_datetime'] = weather_data['weather_datetime'].dt.floor('h')  # Round to the nearest hour
crash_data['crash_datetime'] = crash_data['crash_datetime'].dt.floor('h')  # Round to the nearest hour

In [42]:
# Select relevant columns for merging
crash_data_relevant = crash_data[['crash_datetime', 'number_of_persons_injured', 
                                  'number_of_persons_killed','number_of_pedestrians_injured', 
                                  'number_of_pedestrians_killed', 'number_of_cyclist_injured',
                                  'number_of_cyclist_killed', 'number_of_motorist_injured', 
                                  'number_of_motorist_killed', 'collision_id', 'latitude', 'longitude']]
weather_data_relevant = weather_data[['weather_datetime', 'temp', 'visibility', 
                                      'humidity', 'rain_1h', 'weather_main', 'weather_description', 
                                     'lat', 'lon']]


In [43]:
print(crash_data_relevant['crash_datetime'].head())
print(weather_data_relevant['weather_datetime'].head())

0   2022-01-01 07:00:00
1   2022-01-01 04:00:00
2   2022-01-01 07:00:00
3   2022-01-01 05:00:00
4   2022-01-01 01:00:00
Name: crash_datetime, dtype: datetime64[ns]
0   2022-01-01 00:00:00
1   2022-01-01 01:00:00
2   2022-01-01 02:00:00
3   2022-01-01 03:00:00
4   2022-01-01 04:00:00
Name: weather_datetime, dtype: datetime64[ns]


In [51]:
# Merge crash and weather data on datetime
merged_data = pd.merge(weather_data, crash_data, left_on='weather_datetime', right_on='crash_datetime', how='inner')

In [52]:
# Ensure 'weather_datetime' and 'crash_datetime' are datetime objects
merged_data['weather_datetime'] = pd.to_datetime(merged_data['weather_datetime'], errors='coerce')
merged_data['crash_datetime'] = pd.to_datetime(merged_data['crash_datetime'], errors='coerce')

# Now safely apply .dt.strftime() to convert to string with full date and time format
merged_data['weather_datetime'] = merged_data['weather_datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')
merged_data['crash_datetime'] = merged_data['crash_datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')

# Verify the result
print(merged_data[['weather_datetime', 'crash_datetime']].head())

      weather_datetime       crash_datetime
0  2022-01-01 07:00:00  2022-01-01 07:00:00
1  2022-01-01 04:00:00  2022-01-01 04:00:00
2  2022-01-01 07:00:00  2022-01-01 07:00:00
3  2022-01-01 05:00:00  2022-01-01 05:00:00
4  2022-01-01 01:00:00  2022-01-01 01:00:00


In [53]:
# Fill NaT values in crash_datetime with a placeholder or keep as NaT
merged_data['crash_datetime'] = merged_data['crash_datetime'].fillna(pd.NaT)

# Force both columns to retain their full precision
merged_data['weather_datetime'] = pd.to_datetime(merged_data['weather_datetime'])
merged_data['crash_datetime'] = pd.to_datetime(merged_data['crash_datetime'])

# Verify the result: Check that both datetime columns are in full precision
print(merged_data[['weather_datetime', 'crash_datetime']].head())
print(merged_data.dtypes)  # Confirm both columns are still of type datetime64[ns]

     weather_datetime      crash_datetime
0 2022-01-01 07:00:00 2022-01-01 07:00:00
1 2022-01-01 04:00:00 2022-01-01 04:00:00
2 2022-01-01 07:00:00 2022-01-01 07:00:00
3 2022-01-01 05:00:00 2022-01-01 05:00:00
4 2022-01-01 01:00:00 2022-01-01 01:00:00
dt                                        int64
dt_iso                                   object
timezone                                  int64
city_name                                object
lat                                     float64
lon                                     float64
temp                                    float64
visibility                              float64
dew_point                               float64
feels_like                              float64
temp_min                                float64
temp_max                                float64
pressure                                  int64
sea_level                               float64
grnd_level                              float64
humidity                    

In [27]:
# Define the calculate_distance function
def calculate_distance(row):
    try:
        crash_coords = (row['latitude'], row['longitude'])
        weather_coords = (row['lat'], row['lon'])
        return geodesic(crash_coords, weather_coords).meters
    except Exception as e:
        print(f"Error calculating distance for row {row.name}: {e}")
        return None

In [28]:
# Filter out rows with missing coordinates
merged_data_cleaned = merged_data.dropna(subset=['latitude', 'longitude', 'lat', 'lon'])

In [29]:
# Test the function on a single row
sample_row = merged_data_cleaned.iloc[0]
print("Sample row:", sample_row)
distance_test = calculate_distance(sample_row)
print("Distance for sample row:", distance_test)

Sample row: dt                                                  1641020400
dt_iso                           2022-01-01 07:00:00 +0000 UTC
timezone                                                -18000
city_name                                             New York
lat                                                  40.712775
lon                                                 -74.005973
temp                                                     50.05
visibility                                             10000.0
dew_point                                                46.94
feels_like                                               48.94
temp_min                                                 48.94
temp_max                                                 52.14
pressure                                                  1012
sea_level                                                  NaN
grnd_level                                                 NaN
humidity                                   

In [30]:
# Ensure merged_data_cleaned is a standalone DataFrame
merged_data_cleaned = merged_data_cleaned.copy()

# Apply the distance calculation
merged_data_cleaned.loc[:, 'distance'] = merged_data_cleaned.apply(calculate_distance, axis=1)

# Drop rows with invalid distances
merged_data_cleaned = merged_data_cleaned.dropna(subset=['distance'])

# Filter by proximity threshold
merged_data_filtered = merged_data_cleaned[merged_data_cleaned['distance'] <= 10000]

In [31]:
merged_data.head()

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,visibility,dew_point,feels_like,...,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_code1,vehicle_type_code2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5,crash_datetime
0,1641020400,2022-01-01 07:00:00 +0000 UTC,-18000,New York,40.712775,-74.005973,50.05,10000.0,46.94,48.94,...,,,,4491400,Sedan,,,,,2022-01-01 07:00:00
1,1641009600,2022-01-01 04:00:00 +0000 UTC,-18000,New York,40.712775,-74.005973,49.75,10000.0,46.35,47.91,...,,,,4491626,Sedan,Sedan,,,,2022-01-01 04:00:00
2,1641020400,2022-01-01 07:00:00 +0000 UTC,-18000,New York,40.712775,-74.005973,50.05,10000.0,46.94,48.94,...,,,,4491734,Sedan,,,,,2022-01-01 07:00:00
3,1641013200,2022-01-01 05:00:00 +0000 UTC,-18000,New York,40.712775,-74.005973,49.73,,46.62,47.89,...,,,,4491857,Sedan,Sedan,,,,2022-01-01 05:00:00
4,1640998800,2022-01-01 01:00:00 +0000 UTC,-18000,New York,40.712775,-74.005973,49.91,10000.0,46.2,48.76,...,,,,4491344,Sedan,Station Wagon/Sport Utility Vehicle,,,,2022-01-01 01:00:00


In [32]:
# Add a binary target column
merged_data_filtered['crash_occurred'] = merged_data_filtered['crash_datetime'].notnull().astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_data_filtered['crash_occurred'] = merged_data_filtered['crash_datetime'].notnull().astype(int)


In [33]:
# Save the merged data to a CSV file
output_path = 'resources/merged_crash_weather_data.csv'
merged_data_filtered.to_csv(output_path, index=False)