In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score
import warnings
warnings.filterwarnings('ignore')

In [13]:
df=pd.read_csv('smart_mobility_dataset.csv')
df.head()

Unnamed: 0,Timestamp,Latitude,Longitude,Vehicle_Count,Traffic_Speed_kmh,Road_Occupancy_%,Traffic_Light_State,Weather_Condition,Accident_Report,Sentiment_Score,Ride_Sharing_Demand,Parking_Availability,Emission_Levels_g_km,Energy_Consumption_L_h,Traffic_Condition
0,2024-03-01 00:00:00,40.842275,-73.703149,205,49.893435,82.65278,Yellow,Clear,0,-0.609199,2,45,450.760055,19.574337,High
1,2024-03-01 00:05:00,40.831119,-73.987354,202,22.383965,45.829298,Green,Clear,0,0.965442,16,1,321.800341,5.385554,High
2,2024-03-01 00:10:00,40.819549,-73.732462,252,46.889699,82.772465,Green,Rain,0,0.28966,16,49,231.152655,10.277477,High
3,2024-03-01 00:15:00,40.725849,-73.980134,37,5.730536,37.695567,Red,Fog,0,-0.271965,66,10,410.384292,29.243279,High
4,2024-03-01 00:20:00,40.813265,-73.961631,64,61.348034,22.313358,Red,Snow,0,-0.797606,3,5,364.466342,16.801459,Low


In [14]:
df.isna().sum()

Timestamp                 0
Latitude                  0
Longitude                 0
Vehicle_Count             0
Traffic_Speed_kmh         0
Road_Occupancy_%          0
Traffic_Light_State       0
Weather_Condition         0
Accident_Report           0
Sentiment_Score           0
Ride_Sharing_Demand       0
Parking_Availability      0
Emission_Levels_g_km      0
Energy_Consumption_L_h    0
Traffic_Condition         0
dtype: int64

In [15]:
df.info

<bound method DataFrame.info of                 Timestamp   Latitude  Longitude  Vehicle_Count  \
0     2024-03-01 00:00:00  40.842275 -73.703149            205   
1     2024-03-01 00:05:00  40.831119 -73.987354            202   
2     2024-03-01 00:10:00  40.819549 -73.732462            252   
3     2024-03-01 00:15:00  40.725849 -73.980134             37   
4     2024-03-01 00:20:00  40.813265 -73.961631             64   
...                   ...        ...        ...            ...   
4995  2024-03-18 08:15:00  40.604663 -73.831032            266   
4996  2024-03-18 08:20:00  40.733790 -73.752213             20   
4997  2024-03-18 08:25:00  40.821354 -73.759376             43   
4998  2024-03-18 08:30:00  40.772343 -73.836698             88   
4999  2024-03-18 08:35:00  40.756688 -73.849335            147   

      Traffic_Speed_kmh  Road_Occupancy_% Traffic_Light_State  \
0             49.893435         82.652780              Yellow   
1             22.383965         45.829298    

In [16]:
df['hour'] = pd.to_datetime(df['Timestamp']).dt.hour
df['is_weekend'] = pd.to_datetime(df['Timestamp']).dt.dayofweek.isin([5, 6]).astype(int)
df = df.drop('Timestamp', axis=1)

In [17]:
X = df.drop('Traffic_Condition', axis=1)  # Assuming Traffic_Condition is the target
y = df['Traffic_Condition']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [19]:
numeric_cols = ['Vehicle_Count', 'Road_Occupancy_%']
categorical_cols = ['Weather_Condition']

In [21]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numeric_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)
    ])

In [22]:
# Define multiple models in pipelines
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=10, max_depth=5, random_state=42))
])

lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42))
])

svm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SVC(kernel='linear', random_state=42))
])

In [23]:
pipelines = {
    'Random Forest': rf_pipeline,
    'Logistic Regression': lr_pipeline,
    'Support Vector Machine': svm_pipeline
}

In [26]:
results = {}
for name, pipeline in pipelines.items():
    # Train
    pipeline.fit(X_train, y_train)
    
    # Predict
    y_pred = pipeline.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    # Store results
    results[name] = {'Accuracy': accuracy, 'F1 Score': f1}
    print(f"\nResults for {name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")


Results for Random Forest:
Accuracy: 0.8153
F1 Score: 0.8274

Results for Logistic Regression:
Accuracy: 0.6833
F1 Score: 0.6637

Results for Support Vector Machine:
Accuracy: 0.6907
F1 Score: 0.6741


In [27]:
best_model = max(results.items(), key=lambda x: x[1]['F1 Score'])
print(f"\nBest Model: {best_model[0]} with F1 Score: {best_model[1]['F1 Score']:.4f}")


Best Model: Random Forest with F1 Score: 0.8274
