In [1]:
import pandas as pd

# Load the accident and weather datasets
accident_data = pd.read_csv("xydata_trafic.csv")
weather_data = pd.read_csv("xydata_weather.csv")

# Verify that both datasets have the 'Date' and 'Time_Period' columns.
# This check runs before any column is accessed so that a malformed file fails
# with the descriptive message below instead of a bare KeyError raised by the
# datetime conversion.
assert "Date" in accident_data.columns and "Time_Period" in accident_data.columns, "Accident data is missing required columns."
assert "Date" in weather_data.columns and "Time_Period" in weather_data.columns, "Weather data is missing required columns."

# Ensure the Date column is in datetime format
accident_data["Date"] = pd.to_datetime(accident_data["Date"])
weather_data["Date"] = pd.to_datetime(weather_data["Date"])

# Determine the first and last calendar date covered by each dataset
accident_start_date = accident_data['Date'].min().date()
accident_end_date = accident_data['Date'].max().date()
weather_start_date = weather_data['Date'].min().date()
weather_end_date = weather_data['Date'].max().date()
print(f"Traffic Incidents Data Covers from {accident_start_date} to {accident_end_date}")
print(f"Wealther Incidents Data Covers from {weather_start_date} to {weather_end_date}")


Traffic Incidents Data Covers from 2016-12-06 to 2024-11-14
Wealther Incidents Data Covers from 2015-01-01 to 2024-12-31


In [2]:
# Restrict both datasets to the same study window.
# Series.between is inclusive on both ends by default, matching >= / <=.
window_start, window_end = "2017-01-01", "2024-10-31"

accident_data_filtered = accident_data.loc[
    accident_data["Date"].between(window_start, window_end)
]
weather_data_filtered = weather_data.loc[
    weather_data["Date"].between(window_start, window_end)
]

# Report the resulting shapes as a quick sanity check
print(f"Filtered accident data shape: {accident_data_filtered.shape}")
print(f"Filtered weather data shape: {weather_data_filtered.shape}")

Filtered accident data shape: (11444, 33)
Filtered weather data shape: (11444, 12)


In [3]:
# Merge datasets on 'Date' and 'Time_Period'.
# validate="one_to_one" makes pandas raise a MergeError if either side contains
# duplicate (Date, Time_Period) keys, instead of silently multiplying rows.
merged_data = pd.merge(
    accident_data_filtered,
    weather_data_filtered,
    on=["Date", "Time_Period"],
    how="inner",
    suffixes=('_accident', '_weather'),
    validate="one_to_one",
)

# Save the merged dataset to a new CSV file
merged_data.to_csv("xydata_merged.csv", index=False)

# Display summary and first few rows
print(f"Merged Data Covers from {merged_data['Date'].min()} to {merged_data['Date'].max()}")
print(f"Total Rows in Merged Data: {len(merged_data)}")
print(merged_data.head())
print("Columns in dataset:")
print(merged_data.columns.tolist())

Merged Data Covers from 2017-01-01 00:00:00 to 2024-10-31 00:00:00
Total Rows in Merged Data: 11444
        Date  Time_Period  Cluster0  Cluster1  Cluster2  Cluster3  Cluster4  \
0 2017-01-01            0         0         0         0         0         0   
1 2017-01-01            1         0         0         0         0         0   
2 2017-01-01            2         0         0         0         0         0   
3 2017-01-01            3         0         0         0         0         0   
4 2017-01-02            0         0         0         0         0         0   

   Cluster5  C0D-1HA  C0D-2HA  ...  Temp (°C)  Dew Point Temp (°C)  \
0         0      0.0      0.0  ...   0.085667           -14.783333   
1         0      0.0      0.0  ...   0.005667           -16.100000   
2         0      0.0      0.0  ...   2.896000           -17.183333   
3         0      0.0      0.0  ...   1.142667           -12.583333   
4         0      0.0      0.0  ...   1.675000           -25.783333   

   R

In [4]:
# Winter 2017-18 subset: rows with Date from 2017-12-01 to 2018-03-31
# (Series.between includes both bounds by default).
in_season = merged_data["Date"].between("2017-12-01", "2018-03-31")
filtered_data = merged_data.loc[in_season]
# Write the seasonal subset to its own CSV file, without the index column
filtered_data.to_csv("xydata_2017winter.csv", index=False)

In [5]:
# Winter 2018-19 subset: rows with Date from 2018-12-01 to 2019-03-31
# (Series.between includes both bounds by default).
in_season = merged_data["Date"].between("2018-12-01", "2019-03-31")
filtered_data = merged_data.loc[in_season]
# Write the seasonal subset to its own CSV file, without the index column
filtered_data.to_csv("xydata_2018winter.csv", index=False)

In [6]:
# Winter 2019-20 subset: rows with Date from 2019-12-01 to 2020-03-31
# (Series.between includes both bounds by default).
in_season = merged_data["Date"].between("2019-12-01", "2020-03-31")
filtered_data = merged_data.loc[in_season]
# Write the seasonal subset to its own CSV file, without the index column
filtered_data.to_csv("xydata_2019winter.csv", index=False)

In [7]:
# Winter 2020-21 subset: rows with Date from 2020-12-01 to 2021-03-31
# (Series.between includes both bounds by default).
in_season = merged_data["Date"].between("2020-12-01", "2021-03-31")
filtered_data = merged_data.loc[in_season]
# Write the seasonal subset to its own CSV file, without the index column
filtered_data.to_csv("xydata_2020winter.csv", index=False)

In [8]:
# Winter 2021-22 subset: rows with Date from 2021-12-01 to 2022-03-31
# (Series.between includes both bounds by default).
in_season = merged_data["Date"].between("2021-12-01", "2022-03-31")
filtered_data = merged_data.loc[in_season]
# Write the seasonal subset to its own CSV file, without the index column
filtered_data.to_csv("xydata_2021winter.csv", index=False)

In [9]:
# Winter 2022-23 subset: rows with Date from 2022-12-01 to 2023-03-31
# (Series.between includes both bounds by default).
in_season = merged_data["Date"].between("2022-12-01", "2023-03-31")
filtered_data = merged_data.loc[in_season]
# Write the seasonal subset to its own CSV file, without the index column
filtered_data.to_csv("xydata_2022winter.csv", index=False)

In [10]:
# Winter 2023-24 subset: rows with Date from 2023-12-01 to 2024-03-31
# (Series.between includes both bounds by default).
in_season = merged_data["Date"].between("2023-12-01", "2024-03-31")
filtered_data = merged_data.loc[in_season]
# Write the seasonal subset to its own CSV file, without the index column
filtered_data.to_csv("xydata_2023winter.csv", index=False)