In [1]:
import pandas as pd
import folium
from sklearn.cluster import KMeans
from shapely.geometry import Point
from scipy.spatial import ConvexHull
from itertools import product
from datetime import timedelta

# Load the raw traffic-incident records from the CSV export
traffic_df = pd.read_csv('Traffic_Incidents_20241114.csv')  # Adjust this path if necessary
# Number of KMeans clusters used to group incident locations
n_clusters = 6
# Number of previous days of per-cluster history added as lag columns; set according to your actual data
n_history_days = 14


In [2]:


# Parse START_DT into datetimes; values that fail to parse are coerced to NaT
traffic_df['START_DT'] = pd.to_datetime(traffic_df['START_DT'], errors='coerce')

# Rows whose START_DT could not be parsed are kept: the drop step below is disabled
#traffic_df = traffic_df.dropna(subset=['START_DT'])

# Derive a datetime Date column (time stripped) and a 24-hour 'HH:MM:SS' Time string
start_stamps = traffic_df['START_DT']
traffic_df['Date'] = pd.to_datetime(start_stamps.dt.date)
traffic_df['Time'] = start_stamps.dt.strftime('%H:%M:%S')

# Chronological order by the full timestamp (date and time together)
traffic_df = traffic_df.sort_values(by='START_DT', ascending=True)

# Move Date and Time to the front; every other column keeps its relative order
front_columns = ['Date', 'Time']
remaining_columns = [col for col in traffic_df.columns if col not in front_columns]
columns_order = front_columns + remaining_columns
traffic_df = traffic_df[columns_order]

# Determine the first and last calendar dates present in the data
start_date = traffic_df['Date'].min().date()
end_date = traffic_df['Date'].max().date()
# "\n" starts the report on a fresh line (the previous "\T" was an invalid escape
# sequence: it raised a SyntaxWarning and printed a literal backslash)
print(f"\nTraffic Incidents Data Covers from {start_date} to {end_date}")

# Every calendar date between the first and last incident date, inclusive
all_dates = pd.date_range(start=start_date, end=end_date).date
# Dates that have at least one incident row
recorded_dates = set(traffic_df['Date'].dt.date)
# Calendar dates in the span with no incident rows at all
missing_dates = sorted(set(all_dates) - recorded_dates)

if missing_dates:
    print(f"Missing Dates or no accidence date List (first 10):\n {missing_dates[:10]}")

# Count missing and recorded dates
print(f"Recorded Days: {len(recorded_dates)}")
print(f"All Days: {len(all_dates)}")
print(f"Missing Days: {len(missing_dates)}")

# Sanity check: by construction no row should carry one of the missing dates
missing_rows = traffic_df[traffic_df['Date'].dt.date.isin(missing_dates)]

if not missing_rows.empty:
    print("\nUnexpectedly Found Rows for Missing Dates:")
    print(missing_rows)
else:
    print("\nConfirmed: No rows exist for the identified missing dates.")

# Per-column count of null entries, plus the row total used for the percentage
missing_values = traffic_df.isna().sum()
total_rows = len(traffic_df.index)

# Tabulate counts and percentages, highest missing share first
missing_summary = pd.DataFrame({
    'Column': traffic_df.columns,
    'Missing Values': missing_values,
    'Percentage Missing': missing_values.div(total_rows).mul(100),
}).sort_values(by='Percentage Missing', ascending=False)

# Show the summary under its heading
print("\nMissing Values Summary:", missing_summary, sep="\n")
    
# Save the modified DataFrame to a CSV file
# traffic_df.to_csv("justToCheck1.csv", index=False)

  print(f"\Traffic Incidents Data Covers from {start_date} to {end_date}")


\Traffic Incidents Data Covers from 2016-12-06 to 2024-11-14
Missing Dates or no accidence date List (first 10):
 [datetime.date(2016, 12, 21), datetime.date(2016, 12, 22), datetime.date(2016, 12, 23), datetime.date(2016, 12, 24), datetime.date(2016, 12, 25), datetime.date(2016, 12, 26), datetime.date(2016, 12, 27), datetime.date(2016, 12, 28), datetime.date(2016, 12, 29), datetime.date(2016, 12, 30)]
Recorded Days: 2744
All Days: 2901
Missing Days: 157

Confirmed: No rows exist for the identified missing dates.

Missing Values Summary:
                      Column  Missing Values  Percentage Missing
QUADRANT            QUADRANT           14059           27.693187
MODIFIED_DT      MODIFIED_DT           14057           27.689247
DESCRIPTION      DESCRIPTION               2            0.003940
Date                    Date               0            0.000000
Time                    Time               0            0.000000
INCIDENT INFO  INCIDENT INFO               0            0.000000
ST

In [3]:
# Fit KMeans on the incident coordinates and store each row's cluster label
kmeans = KMeans(n_clusters, random_state=0)
coordinates = traffic_df[['Latitude', 'Longitude']]
traffic_df['Cluster_KMeans'] = kmeans.fit_predict(coordinates)

# Base map centred on the mean incident coordinate
map_centre = [traffic_df['Latitude'].mean(), traffic_df['Longitude'].mean()]
map_houston = folium.Map(location=map_centre, zoom_start=11)

# One colour per cluster, cycling through the palette if clusters outnumber colours
colors = ['#3186cc', '#FF5733', '#33FF57', '#f1c40f', '#8e44ad', '#3498db', '#e74c3c']
color_dict = {k: colors[k % len(colors)] for k in range(n_clusters)}

for cluster_id, cluster_color in color_dict.items():
    cluster_points = traffic_df.loc[traffic_df['Cluster_KMeans'] == cluster_id]
    points = cluster_points[['Longitude', 'Latitude']].to_numpy()

    # A hull outline is only drawn when the cluster has at least three points
    if len(points) >= 3:
        hull = ConvexHull(points)
        # `points` rows are (lon, lat); folium expects (lat, lon)
        hull_points = [(lat, lon) for lon, lat in points[hull.vertices]]
        outline = folium.Polygon(
            locations=hull_points,
            color=cluster_color,
            fill=True,
            fill_color=cluster_color,
            fill_opacity=0.5,
            tooltip=f'Cluster {cluster_id}, Points: {len(points)}',
        )
        outline.add_to(map_houston)

    # One small marker per incident, coloured by its cluster
    for lat, lon in zip(cluster_points['Latitude'], cluster_points['Longitude']):
        marker = folium.CircleMarker(
            location=(lat, lon),
            radius=2,
            color=cluster_color,
            fill=True,
            fill_opacity=0.7,
        )
        marker.add_to(map_houston)

map_houston.save("Traffic_Map_Detailed.html")
print("Map saved as 'Traffic_Map_Detailed.html'")

# Save the modified DataFrame to a CSV file
# traffic_df.to_csv("justToCheck2.csv", index=False)

Map saved as 'Traffic_Map_Detailed.html'


In [5]:
# Function to classify descriptions into simplified categories
def classify_description(description):
    """Map a free-text incident description to a numeric category code.

    Matching is a case-insensitive substring search; the first rule that
    matches wins, checked in this order:

        2 - "single vehicle"
        3 - "two vehicle" or "2 vehicle"
        4 - "multi-vehicle" or "multi vehicle"
        5 - "pedestrian"
        6 - "stalled" or "blocking"
        1 - anything else, including non-string input (e.g. None or NaN)
    """
    # Non-string values (missing descriptions) fall into the catch-all category
    if not isinstance(description, str):
        return 1

    text = description.lower()
    # Ordered (keywords, code) rules; earlier entries take precedence
    keyword_rules = (
        (("single vehicle",), 2),
        (("two vehicle", "2 vehicle"), 3),
        (("multi-vehicle", "multi vehicle"), 4),
        (("pedestrian",), 5),
        (("stalled", "blocking"), 6),
    )
    for keywords, code in keyword_rules:
        if any(keyword in text for keyword in keywords):
            return code
    return 1

# Derive a numeric Incident_Type code for every row from its DESCRIPTION text
traffic_df['Incident_Type'] = traffic_df['DESCRIPTION'].map(classify_description)

# Tally rows per code as a sanity check on the classification
incident_type_counts = traffic_df['Incident_Type'].value_counts()

# Show the tally under its heading
print("\nClassified Incident Types and Their Counts:", incident_type_counts, sep="\n")

# Save the modified DataFrame to a CSV file
# traffic_df.to_csv("justToCheck3.csv", index=False)


Classified Incident Types and Their Counts:
Incident_Type
3    15645
1    15096
6     9328
4     4915
2     3917
5     1866
Name: count, dtype: int64


In [6]:
# Define time period mapping function with numeric values
def assign_time_period(hour):
    """Return the numeric time-of-day code for an hour value.

    Codes:
        0 - 6 <= hour < 12
        1 - 12 <= hour < 18
        2 - 18 <= hour < 24
        3 - every other value (hours before 6, and anything that fails the
            comparisons above, such as NaN or out-of-range numbers)
    """
    # Ascending exclusive upper bounds paired with their codes
    period_bounds = ((12, 0), (18, 1), (24, 2))
    if hour >= 6:
        for upper_bound, code in period_bounds:
            if hour < upper_bound:
                return code
    return 3

# Bucket each incident's start hour into a numeric time-of-day code
start_hours = pd.to_datetime(traffic_df['START_DT']).dt.hour
traffic_df['Time_Period'] = start_hours.map(assign_time_period)

# Preview the timestamp alongside its assigned code
time_period_preview = traffic_df[['START_DT', 'Time_Period']].head()
print("\nModified DataFrame with 'Time_Period':", time_period_preview, sep="\n")

# Save the modified DataFrame to a CSV file
# traffic_df.to_csv("justToCheck4.csv", index=False)


Modified DataFrame with 'Time_Period':
                 START_DT  Time_Period
6413  2016-12-06 10:00:00            0
17172 2016-12-06 14:36:00            1
8955  2016-12-06 16:25:00            1
32623 2016-12-06 16:26:00            1
32280 2016-12-06 17:05:00            1


In [7]:
# Date span of the grid: first to last incident date.
# Note: no backward shift for history days is applied here; the span is exactly the data's range.
start_date = traffic_df['Date'].min()
end_date = traffic_df['Date'].max()
print(f"Traffic Incidents Data Covers from {start_date} to {end_date}")

# Every calendar day in the span, including days that have no incidents
all_dates = pd.date_range(start=start_date, end=end_date)
time_periods = [0, 1, 2, 3]  # Four time periods in a day

# One row per (date, time period) combination
date_time_combinations = list(product(all_dates, time_periods))
xydata = pd.DataFrame(date_time_combinations, columns=['Date', 'Time_Period'])

# Number of clusters, derived from the largest assigned cluster label
n_clusters = traffic_df['Cluster_KMeans'].max() + 1

# Incident counts per (date, time period, cluster)
traffic_counts = traffic_df.groupby(['Date', 'Time_Period', 'Cluster_KMeans']).size().reset_index(name='Count')

# Reshape the counts to one 'Cluster<k>' column per cluster. This replaces a
# row-by-row update that rebuilt a full-frame boolean mask for every count row.
cluster_count_columns = [f'Cluster{cluster_num}' for cluster_num in range(n_clusters)]
counts_wide = (
    traffic_counts
    .pivot(index=['Date', 'Time_Period'], columns='Cluster_KMeans', values='Count')
    .reindex(columns=range(n_clusters))  # guarantee a column for every cluster id
    .set_axis(cluster_count_columns, axis=1)
    .reset_index()
)

# Left-join onto the full grid so slots without incidents are kept, then
# record those slots as 0 and restore an integer dtype
xydata = xydata.merge(counts_wide, on=['Date', 'Time_Period'], how='left', validate='one_to_one')
xydata[cluster_count_columns] = xydata[cluster_count_columns].fillna(0).astype('int64')

# Export the result to CSV
# xydata.to_csv("justToCheck5.csv", index=False)


Traffic Incidents Data Covers from 2016-12-06 00:00:00 to 2024-11-14 00:00:00


In [8]:
# Pre-create one zero-filled placeholder column per (cluster, look-back day) pair,
# named 'C<cluster>D-<day>HA'
for cluster_num, day in product(range(n_clusters), range(1, n_history_days + 1)):
    column_name = f'C{cluster_num}D-{day}HA'
    xydata[column_name] = 0

# Preview the widened frame
print(xydata.head())

# Export the result to CSV
# xydata.to_csv("justToCheck6.csv", index=False)

        Date  Time_Period  Cluster0  Cluster1  Cluster2  Cluster3  Cluster4  \
0 2016-12-06            0         0         0         0         0         0   
1 2016-12-06            1         0         1         0         1         0   
2 2016-12-06            2         0         0         2         0         0   
3 2016-12-06            3         0         0         0         0         0   
4 2016-12-07            0         1         1         1         1         3   

   Cluster5  C0D-1HA  C0D-2HA  ...  C5D-5HA  C5D-6HA  C5D-7HA  C5D-8HA  \
0         1        0        0  ...        0        0        0        0   
1         4        0        0  ...        0        0        0        0   
2         2        0        0  ...        0        0        0        0   
3         0        0        0  ...        0        0        0        0   
4         1        0        0  ...        0        0        0        0   

   C5D-9HA  C5D-10HA  C5D-11HA  C5D-12HA  C5D-13HA  C5D-14HA  
0        0       

In [9]:
# Rows must be in chronological order so that shifting within a time period
# steps back through earlier rows of that same period
xydata = xydata.sort_values(by=['Date', 'Time_Period'])

# For every cluster, fill each 'C<cluster>D-<day>HA' column with that cluster's
# count from `day` rows earlier in the same time period
for cluster_num in range(n_clusters):
    counts_by_period = xydata.groupby('Time_Period')[f'Cluster{cluster_num}']
    for day in range(1, n_history_days + 1):
        column_name = f'C{cluster_num}D-{day}HA'
        xydata[column_name] = counts_by_period.shift(day)

# Preview the result; the earliest rows have no history and therefore hold NaN
print(xydata.head())

# Optional: Export the result to CSV
# xydata.to_csv("justToCheck7.csv", index=False)

        Date  Time_Period  Cluster0  Cluster1  Cluster2  Cluster3  Cluster4  \
0 2016-12-06            0         0         0         0         0         0   
1 2016-12-06            1         0         1         0         1         0   
2 2016-12-06            2         0         0         2         0         0   
3 2016-12-06            3         0         0         0         0         0   
4 2016-12-07            0         1         1         1         1         3   

   Cluster5  C0D-1HA  C0D-2HA  ...  C5D-5HA  C5D-6HA  C5D-7HA  C5D-8HA  \
0         1      NaN      NaN  ...      NaN      NaN      NaN      NaN   
1         4      NaN      NaN  ...      NaN      NaN      NaN      NaN   
2         2      NaN      NaN  ...      NaN      NaN      NaN      NaN   
3         0      NaN      NaN  ...      NaN      NaN      NaN      NaN   
4         1      0.0      NaN  ...      NaN      NaN      NaN      NaN   

   C5D-9HA  C5D-10HA  C5D-11HA  C5D-12HA  C5D-13HA  C5D-14HA  
0      NaN       

In [10]:
# Cluster count columns to binarise. Built from n_clusters rather than from
# `cluster_num`, a loop variable leaked by an earlier cell, so this cell does
# not depend on which loop happened to run last.
cluster_columns = [f'Cluster{i}' for i in range(n_clusters)]

# Turn counts into 0/1 indicators: 1 when the slot had at least one incident, else 0
xydata[cluster_columns] = (xydata[cluster_columns] > 0).astype('int64')

# Display the DataFrame to verify the changes
print(xydata.head())

# Export the feature table to CSV
xydata.to_csv("xydata_trafic.csv", index=False)

        Date  Time_Period  Cluster0  Cluster1  Cluster2  Cluster3  Cluster4  \
0 2016-12-06            0         0         0         0         0         0   
1 2016-12-06            1         0         1         0         1         0   
2 2016-12-06            2         0         0         1         0         0   
3 2016-12-06            3         0         0         0         0         0   
4 2016-12-07            0         1         1         1         1         1   

   Cluster5  C0D-1HA  C0D-2HA  ...  C5D-5HA  C5D-6HA  C5D-7HA  C5D-8HA  \
0         1      NaN      NaN  ...      NaN      NaN      NaN      NaN   
1         1      NaN      NaN  ...      NaN      NaN      NaN      NaN   
2         1      NaN      NaN  ...      NaN      NaN      NaN      NaN   
3         0      NaN      NaN  ...      NaN      NaN      NaN      NaN   
4         1      0.0      NaN  ...      NaN      NaN      NaN      NaN   

   C5D-9HA  C5D-10HA  C5D-11HA  C5D-12HA  C5D-13HA  C5D-14HA  
0      NaN       