In [2]:
# Full reproducible pipeline 
import pandas as pd, numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, classification_report
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Load the raw telemetry CSV.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of allcars.csv (ideally via a relative path or a config constant).
path = r'C:/Users/Asus/Downloads/allcars.csv'   # <-- use your local path
df = pd.read_csv(path)

# Normalise headers: drop stray whitespace and map the camelCase names to the
# snake_case names used by the rest of the pipeline. rename() silently skips
# keys that are absent, so no membership checks are needed.
df.columns = [c.strip() for c in df.columns]
df = df.rename(columns={'timeStamp': 'timestamp', 'tripID': 'trip_id'})

# Fail early with a readable message (same style as the speed-column check
# further down) instead of an opaque KeyError later in the pipeline.
missing = [c for c in ('timestamp', 'trip_id') if c not in df.columns]
if missing:
    raise ValueError(f"Required column(s) missing from {path}: {missing}")

# parse timestamp strings like '46:00.0' to seconds
def parse_ts(s):
    """Convert a timestamp value to a number of seconds.

    Supported inputs (after ``str()`` conversion):
      * ``'MM:SS(.f)'``     -> minutes * 60 + seconds
      * ``'HH:MM:SS(.f)'``  -> hours * 3600 + minutes * 60 + seconds
      * anything without exactly one or two colons is passed to ``float()``

    Returns:
        float: seconds, or ``np.nan`` when the value cannot be converted.
    """
    try:
        parts = str(s).split(':')
        if len(parts) == 2:
            m = float(parts[0]); sec = float(parts[1])
            return m*60 + sec
        if len(parts) == 3:
            h = float(parts[0]); m = float(parts[1]); sec = float(parts[2])
            return h*3600 + m*60 + sec
        return float(s)
    except (ValueError, TypeError):
        # Only conversion failures map to NaN; a bare ``except`` would also
        # swallow KeyboardInterrupt / SystemExit and hide unrelated bugs.
        return np.nan

# Convert every raw timestamp to seconds; parse_ts yields NaN for values it cannot parse.
df['ts_seconds'] = df['timestamp'].apply(parse_ts)

# The pipeline needs a 'speed' column: fall back to 'gps_speed' when only that
# one is present, otherwise stop here.
if 'speed' not in df.columns:
    if 'gps_speed' in df.columns:
        df['speed'] = df['gps_speed']
    else:
        raise ValueError("No speed column found.")

# compute dt, decel, jerk per trip (safe dt handling)
def compute_derivs(g):
    """Add time-step, acceleration and jerk columns to one trip's rows.

    The rows are sorted by ``ts_seconds`` and re-indexed from 0. Columns added,
    in this order: ``dt``, ``speed_mps``, ``dv``, ``decel``, ``jerk``.

    Args:
        g: DataFrame with at least ``ts_seconds`` and ``speed`` columns.

    Returns:
        A new, sorted DataFrame carrying the extra columns.
    """
    ordered = g.sort_values('ts_seconds').reset_index(drop=True)

    # Time step between consecutive samples: the first row (and any NaN gap)
    # counts as 1.0, and non-positive steps are replaced by 0.5 so the
    # divisions below never see a zero or negative denominator.
    step = ordered['ts_seconds'].diff().fillna(1.0)
    ordered['dt'] = step.mask(step <= 0, 0.5)

    # Missing speeds are treated as 0 before scaling by 1000/3600.
    ordered['speed_mps'] = ordered['speed'].fillna(0) * (1000/3600)
    ordered['dv'] = ordered['speed_mps'].diff().fillna(0)

    # Rate of change of speed, bounded to [-50, 50].
    ordered['decel'] = (ordered['dv'] / ordered['dt']).clip(-50, 50)

    # Rate of change of 'decel', bounded to [-100, 100].
    raw_jerk = ordered['decel'].diff().fillna(0) / ordered['dt']
    ordered['jerk'] = raw_jerk.clip(-100, 100)
    return ordered

# Run compute_derivs on every trip and stack the results in trip_id order.
# Iterating over the groups hands compute_derivs the full frame (trip_id
# included) on every pandas version; groupby(...).apply() operating on the
# grouping column is deprecated since pandas 2.2, and once that column is
# excluded a group_keys=False apply would silently drop 'trip_id' from df.
df = pd.concat([compute_derivs(trip) for _, trip in df.groupby('trip_id')])

# label: harsh braking if any decel <= -3.0 m/s^2 in last 20% rows of trip
def label_trip(g, thr=-3.0):
    """Flag a trip whose final stretch contains a hard deceleration.

    The rows are ordered by ``ts_seconds``; everything from position
    ``ceil(0.8 * n)`` onward is the tail that gets inspected.

    Args:
        g: one trip's rows, with ``ts_seconds`` and ``decel`` columns.
        thr: threshold; a tail sample with ``decel <= thr`` triggers the flag.

    Returns:
        int: 1 if any tail sample is at or below ``thr``, else 0 (also 0 when
        the tail is empty, e.g. for trips with fewer than five rows).
    """
    ordered = g.sort_values('ts_seconds')
    tail_start = int(np.ceil(0.8 * len(ordered)))
    tail_decel = ordered['decel'].iloc[tail_start:]
    return int(tail_decel.le(thr).any())

# One row per trip: trip_id plus the 0/1 label produced by label_trip.
trip_labels = (
    df.groupby('trip_id')
      .apply(label_trip)
      .rename('harsh_last20')
      .reset_index()
)

# aggregate features from first 80%
def agg_feats(g):
    """Summarise the leading part of a trip as a flat feature vector.

    The rows are ordered by ``ts_seconds`` and only the first
    ``ceil(0.8 * n)`` of them are aggregated.

    Args:
        g: one trip's rows with ``ts_seconds``, ``speed``, ``decel`` and
           ``jerk`` columns. Speed thresholds (80, 1, 5) are compared against
           the raw ``speed`` column (presumably km/h; confirm against the data).

    Returns:
        pd.Series of 17 features, in a fixed order that downstream code relies
        on for column positions.
    """
    ordered = g.sort_values('ts_seconds').reset_index(drop=True)
    head_len = int(np.ceil(0.8 * len(ordered)))
    first = ordered.iloc[:head_len]

    speed = first['speed']
    decel = first['decel']
    jerk = first['jerk']

    n_rows = len(first)
    heavy_count = (decel <= -3.0).sum()
    heavy_rate = heavy_count / max(1, n_rows)   # max() guards the empty case
    lowest_decel = decel.min()
    avg_jerk = jerk.mean()
    # A NaN mean jerk contributes nothing to the composite score.
    jerk_term = 0 if pd.isna(avg_jerk) else abs(avg_jerk)

    feats = {}
    feats['rows_first80'] = n_rows
    feats['mean_speed'] = speed.mean()
    feats['max_speed'] = speed.max()
    feats['median_speed'] = speed.median()
    feats['pct_time_over_80'] = (speed > 80).mean()
    feats['stop_ratio'] = (speed <= 1).mean()
    feats['idle_ratio'] = (speed < 5).mean()
    feats['mean_decel'] = decel.mean()
    feats['min_decel'] = lowest_decel
    feats['max_decel'] = decel.max()
    feats['mean_jerk'] = avg_jerk
    feats['max_jerk'] = jerk.max()
    feats['heavy_decel_count'] = heavy_count
    feats['heavy_decel_rate'] = heavy_rate
    feats['decel_std'] = decel.std()
    feats['speed_std'] = speed.std()
    # Composite: strongest deceleration weighted by how often heavy
    # decelerations occur, plus the magnitude of the mean jerk.
    feats['harsh_score_like'] = -lowest_decel * heavy_rate + jerk_term
    return pd.Series(feats)

# Trip-level table: one row per trip with the first-80% features from
# agg_feats joined to the last-20% label in trip_labels.
trip_feats = df.groupby('trip_id').apply(agg_feats).reset_index()
data = trip_feats.merge(trip_labels, on='trip_id', how='left')

# Earliest timestamp of each trip (used for the chronological split). It is
# looked up by trip_id rather than pasted in as a positional array, so every
# value stays attached to its own trip whatever the row order of `data` is.
data['start_ts'] = data['trip_id'].map(df.groupby('trip_id')['ts_seconds'].min())

# clean inf / nan
data = data.replace([np.inf, -np.inf], np.nan)
for c in data.columns:
    if data[c].dtype.kind in 'fi':
        # Assign the filled column back: fillna(inplace=True) on data[c] is
        # chained assignment and leaves `data` untouched under pandas
        # Copy-on-Write.
        data[c] = data[c].fillna(data[c].median())

# save features
data.to_csv('trip_level_features_labels.csv', index=False)

# modeling
features = [c for c in data.columns if c not in ['trip_id','harsh_last20','start_ts']]
X = data[features]; y = data['harsh_last20'].astype(int)

# Chronological 60/20/20 split on trip start time: the model is trained on the
# earliest trips and evaluated on the latest ones. `val` is carved out but not
# used by the searches below (they cross-validate inside the training slice).
data = data.sort_values('start_ts').reset_index(drop=True)
n = len(data); train_end = int(0.6*n); val_end = int(0.8*n)
train = data.iloc[:train_end]; val = data.iloc[train_end:val_end]; test = data.iloc[val_end:]

# Fit the scaler on the training slice only. Fitting it on every row would
# leak the mean/variance of the validation and test trips into training.
scaler = StandardScaler().fit(train[features])
Xs = scaler.transform(X)   # all rows (in X's pre-sort order), kept for downstream use
X_train = scaler.transform(train[features]); y_train = train['harsh_last20']
X_test = scaler.transform(test[features]); y_test = test['harsh_last20']

# RandomForest randomized search (n_iter=20)
# 20 random draws from the grid below, 3-fold CV on the training slice,
# selected by F1; the refitted winner is then scored on the held-out test trips.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
param_rf = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 8, 12, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
}
rs_rf = RandomizedSearchCV(
    rf,
    param_rf,
    n_iter=20,
    scoring='f1',
    cv=3,
    random_state=42,
    n_jobs=-1,
)
rs_rf.fit(X_train, y_train)

best_rf = rs_rf.best_estimator_
y_test_pred_rf = best_rf.predict(X_test)
print("RF test F1:", f1_score(y_test, y_test_pred_rf))
print(classification_report(y_test, y_test_pred_rf))

# GradientBoosting randomized search (n_iter=20)
# Same protocol as for the random forest: 20 random draws, 3-fold CV on the
# training slice scored by F1, then evaluation of the refitted winner on test.
gb = GradientBoostingClassifier(random_state=42)
param_gb = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.6, 0.8, 1.0],
}
rs_gb = RandomizedSearchCV(
    gb,
    param_gb,
    n_iter=20,
    scoring='f1',
    cv=3,
    random_state=42,
    n_jobs=-1,
)
rs_gb.fit(X_train, y_train)

best_gb = rs_gb.best_estimator_
y_test_pred_gb = best_gb.predict(X_test)
print("GB test F1:", f1_score(y_test, y_test_pred_gb))
print(classification_report(y_test, y_test_pred_gb))

# permutation importance
# Shuffle each feature 20 times on the test split and record the mean change in
# the tuned gradient-boosting model's score (no `scoring` argument is passed, so
# scikit-learn's default for the estimator applies). The 12 largest are printed.
perm = permutation_importance(
    best_gb,
    X_test,
    y_test,
    n_repeats=20,
    random_state=42,
    n_jobs=-1,
)
imp_df = pd.DataFrame({'feature': features, 'importance': perm.importances_mean})
imp_df = imp_df.sort_values('importance', ascending=False)
print(imp_df.head(12))


RF test F1: 0.5714285714285714
              precision    recall  f1-score   support

           0       0.96      0.93      0.94        27
           1       0.50      0.67      0.57         3

    accuracy                           0.90        30
   macro avg       0.73      0.80      0.76        30
weighted avg       0.92      0.90      0.91        30

GB test F1: 0.4444444444444444
              precision    recall  f1-score   support

           0       0.96      0.85      0.90        27
           1       0.33      0.67      0.44         3

    accuracy                           0.83        30
   macro avg       0.65      0.76      0.67        30
weighted avg       0.90      0.83      0.86        30

             feature  importance
16  harsh_score_like    0.011667
8          min_decel    0.010000
0       rows_first80    0.006667
13  heavy_decel_rate    0.001667
1         mean_speed    0.000000
4   pct_time_over_80    0.000000
5         stop_ratio    0.000000
3       median_speed

In [3]:
# Export this session's input history to cars.py (IPython's %history -f writes
# to the given file instead of the screen; a relative name is presumably
# resolved against the kernel's working directory).
%history -f cars.py