Weather Data: https://www.visualcrossing.com/weather-data/

Traffic Incident: https://data.austintexas.gov/Transportation-and-Mobility/Real-Time-Traffic-Incident-Reports/dx9v-zd7x/about_data

# Classification Model

Goal: Out of all the areas that have incidents recorded, which ones are high-risk, and which are low-risk?

- Define high-risk: an area (location cluster) with more than 1,000 recorded incidents over the study period (2022-05-01 to 2024-12-11), i.e. roughly 7 incidents per week
- Train a classifier
- Evaluate using accuracy, precision, recall

# Read in Data

In [1]:
import pandas as pd

# Incident reports: read the CSV and parse the timestamp column.
incident = pd.read_csv('Traffic_Incident_Reports.csv')
incident['Date'] = pd.to_datetime(incident['Date'])

# Keep only rows that pass the coordinate bounds and fall inside the date
# range that also appears in the weather file name (2022-05-01 .. 2024-12-11).
# NOTE(review): `Latitude < 300` only removes very large values; confirm this
# is the intended sanity bound for the raw data.
valid_coords = (incident['Latitude'] < 300) & (incident['Longitude'] < -10)
in_window = (incident['Date'] >= '2022-05-01') & (incident['Date'] <= '2024-12-11')
incident = incident[valid_coords & in_window]

# Weather observations, with their own timestamp column parsed.
weather = pd.read_csv('Austin_Weather_20220501_20241211.csv')
weather['datetime'] = pd.to_datetime(weather['datetime'])

# Attach weather to every incident. The join is on exact timestamp equality and
# is a left join, so incidents with no matching weather row are kept with NaNs.
traffic = incident.merge(
    weather,
    how='left',
    left_on='Date',
    right_on='datetime',
)

  from pandas.core import (


# Define "high-risk"

In [2]:
from sklearn.preprocessing import StandardScaler  # imported but not used in this cell
from sklearn.cluster import KMeans

# A cluster counts as "high-risk" when it holds more than this many incidents.
HIGH_RISK_MIN_INCIDENTS = 1000

# Group incidents into 50 geographic areas by running K-means on the raw
# (Latitude, Longitude) pairs; random_state pins the initialisation.
coords = traffic[['Latitude', 'Longitude']]
kmeans = KMeans(n_clusters=50, random_state=1).fit(coords)
traffic['location_cluster'] = kmeans.labels_

# Incidents per cluster, and the ids of the clusters above the cut-off.
cluster_counts = traffic['location_cluster'].value_counts()
high_risk_clusters = cluster_counts[cluster_counts > HIGH_RISK_MIN_INCIDENTS].index

# Target label: 1 when the incident's cluster is high-risk, otherwise 0.
# Vectorised membership test instead of a per-row Python lambda.
traffic['highrisk'] = traffic['location_cluster'].isin(high_risk_clusters).astype(int)



### Visualize clusters/areas

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(6, 4))

# One scatter call per cluster so each area takes the next colour of the
# default cycle. groupby visits the clusters in ascending id order and splits
# the frame in a single pass instead of re-filtering it for every cluster.
for cluster_id, cluster_data in traffic.groupby('location_cluster'):
    plt.scatter(cluster_data['Longitude'], cluster_data['Latitude'], s=5, label=f'Cluster {cluster_id}', alpha=0.6)

# K-means was fitted on (Latitude, Longitude), so each centroid row unpacks in
# that order; the cluster id is drawn at the centroid position.
centroids = kmeans.cluster_centers_
for i, (lat, lon) in enumerate(centroids):
    plt.text(lon, lat, str(i), fontsize=5, fontweight='bold', color='black', ha='center', va='center')

plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.grid(True, linestyle='--', linewidth=0.3, alpha=0.8)
plt.title('Geographic Clusters of Incidents in Austin')
#plt.savefig('old', dpi=300, bbox_inches='tight')  # <-- This line saves the image
# Render explicitly so the cell does not echo the Text object returned by
# plt.title (matches the high-/low-risk plot cell).
plt.show()


In [None]:
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

fig, ax = plt.subplots(figsize=(6, 4))

# Draw every cluster's incidents, coloured by risk class:
# light red for high-risk clusters, light blue for the rest.
for cluster_id, members in traffic.groupby('location_cluster'):
    if cluster_id in high_risk_clusters:
        shade = 'lightcoral'
    else:
        shade = 'lightblue'
    ax.scatter(members['Longitude'], members['Latitude'], s=5, color=shade, alpha=0.6)

# Write each cluster id at its centroid; centroid rows are (Latitude, Longitude).
centroids = kmeans.cluster_centers_
for i, centre in enumerate(centroids):
    lat, lon = centre
    ax.text(lon, lat, str(i), fontsize=5, fontweight='bold', color='black', ha='center', va='center')

# Two-entry legend built by hand, since the scatter calls carry no labels.
legend_elements = [
    Patch(facecolor='lightcoral', label='High-Risk'),
    Patch(facecolor='lightblue', label='Low-Risk'),
]
ax.legend(handles=legend_elements, loc='upper right')

ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.grid(True, linestyle='--', linewidth=0.3, alpha=0.8)
ax.set_title('Geographic Clusters of Incidents in Austin')
#plt.savefig('new', dpi=300, bbox_inches='tight')  # <-- This line saves the image
plt.show()


### Some things to double check

In [None]:
# Number of incidents assigned to each K-means location cluster.
cluster_counts

In [None]:
# Class balance of the target: how many incidents are labelled high-risk (1) vs. not (0).
traffic['highrisk'].value_counts()

In [None]:
# Fitted K-means cluster centres, one (Latitude, Longitude) row per cluster.
# `centroids` is assigned in the plotting cells, so one of them must run first.
centroids

# Models

In [4]:
# Features
traffic['hour'] = traffic['Date'].dt.hour
traffic['day_of_week'] = traffic['Date'].dt.dayofweek
traffic['month'] = traffic['Date'].dt.month
traffic['rush_hour'] = traffic['hour'].apply(lambda x: 1 if 20 <= x <= 23 else 0)  #7<=x<=9 or 16<=x<=18
traffic['AM'] = traffic['AM/PM'].apply(lambda x: 1 if 'AM' else 0)

# Set X, y
X = traffic[['rush_hour', 'hour', 'day_of_week', 'month', 'AM', 'temp', 'humidity', 'precip', 'windspeed', 'cloudcover', 'visibility']]
y = traffic['highrisk']

# Fit models
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn import metrics
from pandas import DataFrame

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

models = {
    'Decision Tree': tree.DecisionTreeClassifier(criterion='entropy'),
    'Bagging': BaggingClassifier(tree.DecisionTreeClassifier(criterion='entropy'), random_state=1),
    'Random Forest': RandomForestClassifier(n_estimators=10, random_state=1),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=200, max_depth=4, random_state=1)    }

for name, model in models.items():
    model.fit(X_train, y_train)

# Evaluate
accuracy_train = {}
accuracy_test = {}

for name, model in models.items():
    prediction_train = model.predict(X_train)
    accuracy_train[name] = metrics.accuracy_score(y_train, prediction_train)

    prediction_test = model.predict(X_test)
    accuracy_test[name] = metrics.accuracy_score(y_test, prediction_test)

#pd.Series(prediction_test).value_counts()
    
# Output
result = DataFrame({'Training Accuracy': accuracy_train,
                    'Test Accuracy': accuracy_test  })
result
#print(classification_report(y_test, prediction_test))

Unnamed: 0,Training Accuracy,Test Accuracy
Decision Tree,0.92198,0.923962
Bagging,0.92198,0.923962
Random Forest,0.92198,0.923962
Gradient Boosting,0.92198,0.923962


### What's the optimal n_estimators (number of trees)?

In [None]:
from pandas import Series, DataFrame
import matplotlib.pyplot as plt

# Test accuracy of a gradient-boosting ensemble (max_depth=4) for several sizes.
scores = {}
for num_trees in [50, 100, 200, 300, 400]:
    gb_model = GradientBoostingClassifier(n_estimators=num_trees,
                                          max_depth=4,
                                          random_state=1)
    gb_model.fit(X_train, y_train)
    # Store the score directly. Assigning it to `accuracy_test` would replace
    # the per-model accuracy dict built in the Models cell with a single float.
    scores[num_trees] = metrics.accuracy_score(y_test, gb_model.predict(X_test))

Series(scores).sort_index().plot()
plt.xlabel('Number of trees in ensemble')
plt.ylabel('Test accuracy')
plt.show()

### What's the optimal max_depth of each tree?

In [None]:
# Test accuracy of a 200-tree gradient-boosting ensemble for tree depths 1..10.
scores = {}
for depth in range(1, 11):
    gb_model = GradientBoostingClassifier(n_estimators=200,
                                          max_depth=depth,
                                          random_state=1)
    gb_model.fit(X_train, y_train)
    # Store the score directly. Assigning it to `accuracy_test` would replace
    # the per-model accuracy dict built in the Models cell with a single float.
    scores[depth] = metrics.accuracy_score(y_test, gb_model.predict(X_test))

# pd.Series rather than the bare `Series` name, which is only imported by the
# separate n_estimators cell.
pd.Series(scores).sort_index().plot()
plt.xlabel('Depth of trees in ensemble')
plt.ylabel('Test accuracy')
plt.show()

# Precision, Recall

In [None]:
from sklearn.metrics import precision_recall_curve

# One precision-recall curve per fitted model, all drawn on the same axes.
for model_name, fitted in models.items():
    # Column 1 of predict_proba is used as the score for the positive class.
    positive_scores = fitted.predict_proba(X_test)[:, 1]
    curve_precision, curve_recall, _ = precision_recall_curve(y_test, positive_scores)
    plt.plot(curve_recall, curve_precision, label=model_name)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend(loc='best')