In [3]:
import pandas as pd

# Load only the header (no data rows)
df = pd.read_csv("US_Accidents_data.csv", nrows=2)

# Print column names
print("Column Names:\n", df.columns.tolist())

Column Names:
 ['ID', 'Source', 'Severity', 'Start_Time', 'End_Time', 'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng', 'Distance(mi)', 'Description', 'Street', 'City', 'County', 'State', 'Zipcode', 'Country', 'Timezone', 'Airport_Code', 'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop', 'Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight']


In [5]:
df.head()

Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Day


In [6]:
import pandas as pd
import random

filename = "US_Accidents_data.csv"

# Step 1: Count total rows (excluding header)
with open(filename, 'r', encoding='utf-8') as f:
    total_rows = sum(1 for _ in f) - 1  # subtract header

# Step 2: Randomly choose rows to **keep**
rows_to_skip = sorted(random.sample(range(1, total_rows + 1), total_rows - 5000))

# Step 3: Read CSV, skipping unwanted rows
df_sample = pd.read_csv(filename, skiprows=rows_to_skip)

# Step 4: Save sample if needed
df_sample.to_csv("US_Accidents_random_sample.csv", index=False)

print("Random 5000-row sample created with shape:", df_sample.shape)


Random 5000-row sample created with shape: (5000, 46)


In [1]:
import pandas as pd

# Load the individual files
traffic_data = pd.read_csv("traffic_data.csv")
incidents_data = pd.read_csv("incidents.csv")
weather_data = pd.read_csv("weather_data.csv")

# Merge all three files on 'timestamp' and 'City'
merged_df = pd.merge(traffic_data, incidents_data, on=['timestamp', 'City'], how='inner')
merged_df = pd.merge(merged_df, weather_data, on=['timestamp', 'City'], how='inner')

# Save to a single CSV
merged_df.to_csv("traffic.csv", index=False)

print("Merged file created: traffic.csv")

Merged file created: traffic.csv


In [3]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import joblib
import os
from datetime import datetime
from typing import Dict, Optional, Union

class TrafficModelTrainer:
    def __init__(self, model_path="models/traffic_model.pkl", save_interval=10):
        self.model_path = model_path
        self.save_interval = save_interval
        self.model = None
        self.features = None
        self.metrics = None
        self.last_trained = None
        os.makedirs(os.path.dirname(self.model_path), exist_ok=True)

    def train_model(self, 
                   data: pd.DataFrame, 
                   test_size: float = 0.2, 
                   n_estimators: int = 100, 
                   random_state: int = 42,
                   verbose: bool = True) -> Optional[Dict]:
        try:
            if not isinstance(data, pd.DataFrame):
                raise ValueError("Input data must be a pandas DataFrame")
            if 'congestion_level' not in data.columns:
                raise ValueError("Target column 'congestion_level' not found in data")

            # Prepare features and target
            X = data.drop(columns=['congestion_level', 'timestamp', 'city'], errors='ignore')
            y = data['congestion_level']
            if X.empty:
                raise ValueError("No features available for training")
            self.features = list(X.columns)

            # Train-test split
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=random_state
            )

            # Train model
            self.model = RandomForestRegressor(
                n_estimators=n_estimators,
                random_state=random_state,
                max_depth=10,
                min_samples_split=5,
                n_jobs=-1
            )
            self.model.fit(X_train, y_train)

            # Evaluate
            y_pred = self.model.predict(X_test)
            self.metrics = {
                'MAE': round(mean_absolute_error(y_test, y_pred), 2),
                'R2': round(r2_score(y_test, y_pred), 2),
                'RMSE': round(mean_squared_error(y_test, y_pred, squared=False), 2)
            }

            # Metadata
            self.last_trained = datetime.now()

            # Save model
            saved = self._save_model(verbose=verbose)
            if not saved:
                raise RuntimeError("❌ Model training succeeded but saving failed.")

            if verbose:
                print(f"✅ Training completed at {self.last_trained}")
                print(f"📊 Metrics: {self.metrics}")
                print(f"📦 Saved to: {self.model_path}")

            return {
                'model': self.model,
                'metrics': self.metrics,
                'features': self.features,
                'last_trained': self.last_trained
            }

        except Exception as e:
            print(f"❌ Training error: {str(e)}")
            return None

    def _save_model(self, verbose: bool = True) -> bool:
        try:
            model_data = {
                'model': self.model,
                'features': self.features,
                'metrics': self.metrics,
                'last_trained': self.last_trained
            }
            print(f"🔄 Saving model to: {os.path.abspath(self.model_path)}")
            joblib.dump(model_data, self.model_path)
            if verbose:
                print(f"💾 Model saved successfully to {self.model_path}")
            return True
        except Exception as e:
            print(f"❌ Failed to save model: {str(e)}")
            return False

    def load_model(self) -> Optional[RandomForestRegressor]:
        try:
            if not os.path.exists(self.model_path):
                print("⚠️ Model file not found.")
                return None

            model_data = joblib.load(self.model_path)
            if isinstance(model_data, dict):
                self.model = model_data.get('model')
                self.features = model_data.get('features')
                self.metrics = model_data.get('metrics')
                self.last_trained = model_data.get('last_trained')
            else:
                self.model = model_data

            print(f"✅ Model loaded (trained on {self.last_trained})")
            return self.model

        except Exception as e:
            print(f"❌ Model loading failed: {str(e)}")
            return None

    def predict(self, input_data: Union[pd.DataFrame, dict], verbose: bool = True) -> Optional[Union[float, list]]:
        try:
            if self.model is None and not self.load_model():
                raise ValueError("No model available for prediction")

            if isinstance(input_data, dict):
                input_data = pd.DataFrame([input_data])

            missing_features = [f for f in self.features if f not in input_data.columns]
            if missing_features:
                raise ValueError(f"Missing features: {missing_features}")

            X = input_data[self.features]
            predictions = self.model.predict(X)

            if verbose:
                print(f"🔮 Prediction made using model trained on {self.last_trained}")
                if len(predictions) == 1:
                    print(f"📊 Predicted congestion level: {predictions[0]:.2f}%")

            return predictions[0] if len(predictions) == 1 else predictions

        except Exception as e:
            print(f"❌ Prediction error: {str(e)}")
            return None

In [2]:
import pandas as pd
import numpy as np

# Load your existing CSV
df = pd.read_csv("data/traffic.csv")

# Add a new column 'incident_count' with random integers (e.g., 0 to 5)
np.random.seed(42)  # for reproducibility
df['incident_count'] = np.random.randint(0, 6, size=len(df))

# Save the updated CSV back
df.to_csv("data/traffic_with_incident_count.csv", index=False)

print("Added 'incident_count' column with random values.")


Added 'incident_count' column with random values.


In [4]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

def train_and_save_model(csv_path, model_path="traffic_model.pkl"):
    try:
        # Load the dataset
        data = pd.read_csv(csv_path)
        print(f"📄 Loaded dataset with shape: {data.shape}")
        print("📋 Columns in CSV:", data.columns.tolist())

        target_col = "is_congested"

        # Check if target column exists
        if target_col not in data.columns:
            print(f"❌ '{target_col}' column not found in data.")
            return False

        # Drop rows with missing target
        data = data.dropna(subset=[target_col])

        # Drop non-numeric and irrelevant columns
        drop_cols = ["timestamp", "Description"]  # Drop text/date columns
        data = data.drop(columns=[col for col in drop_cols if col in data.columns])

        # Label encode categorical columns
        cat_cols = data.select_dtypes(include=["object"]).columns
        le_dict = {}
        for col in cat_cols:
            le = LabelEncoder()
            data[col] = le.fit_transform(data[col].astype(str))
            le_dict[col] = le

        # Split features and target
        X = data.drop(columns=[target_col])
        y = data[target_col]

        # Split into train and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Train model
        print("🚦 Starting model training...")
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        print("✅ Model training completed.")

        # Save model
        joblib.dump(model, model_path)
        print(f"💾 Model saved to: {model_path}")
        return True

    except Exception as e:
        print(f"❌ Error: {e}")
        return False

# Run the training
train_and_save_model("data/traffic.csv")

📄 Loaded dataset with shape: (5006, 16)
📋 Columns in CSV: ['timestamp', 'City', 'latitude', 'longitude', 'severity_x', 'free_flow_speed', 'current_speed', 'is_congested', 'Severity_y', 'description', 'distance', 'temperature', 'humidity', 'precipitation', 'weather_condition', 'incident_count']
🚦 Starting model training...
✅ Model training completed.
💾 Model saved to: traffic_model.pkl


True