In [91]:
# Importing and reading dataset

import pandas as pd
acc = pd.read_csv("Accelerometer.csv")
gyro = pd.read_csv("Gyroscope.csv")

In [10]:
# Check the first 5 rows of the acclerometer dataset

acc.head()

Unnamed: 0,Timestamp,Milliseconds,X,Y,Z
0,2021-02-02 10:45:08,0,0.006879,-2.54058,9.575478
1,2021-02-02 10:45:08,22,0.565005,-6.328119,9.661919
2,2021-02-02 10:45:08,22,0.565005,-6.328119,9.661919
3,2021-02-02 10:45:08,30,-0.223131,-6.505187,10.342078
4,2021-02-02 10:45:08,40,0.425922,-7.753344,11.015359


In [11]:
# check the first 5 rows of the gyroscope dataset

gyro.head()

Unnamed: 0,Timestamp,Milliseconds,X,Y,Z
0,2021-02-02 10:45:08,3,-0.455387,0.098001,-2.172277
1,2021-02-02 10:45:08,22,-0.44367,-0.264045,-1.642856
2,2021-02-02 10:45:08,22,-0.44367,-0.264045,-1.642856
3,2021-02-02 10:45:08,30,-0.326494,-0.247001,-1.409703
4,2021-02-02 10:45:08,40,0.067509,-0.12876,-0.920894


In [20]:
# check the columns of the accelerometer dataset

acc.columns

Index(['Timestamp', 'Milliseconds', 'X', 'Y', 'Z'], dtype='object')

In [21]:
# check the columns of the gyroscope dataset

gyro.columns

Index(['Timestamp', 'Milliseconds', 'X', 'Y', 'Z'], dtype='object')

In [29]:
# Create acceleration magnitude to removes direction and captures overall driving intensity

import numpy as np

acc["acc_mag"] = np.sqrt( acc["X"]**2 + acc["Y"]**2 + acc["Z"]**2)
gyro["gyro_mag"] = np.sqrt(gyro["X"]**2 + gyro["Y"]**2 + gyro["Z"]**2)

acc[["X" , "Y", "Z", "acc_mag"]].head()

Unnamed: 0,X,Y,Z,acc_mag
0,0.006879,-2.54058,9.575478,9.906784
1,0.565005,-6.328119,9.661919,11.563607
2,0.565005,-6.328119,9.661919,11.563607
3,-0.223131,-6.505187,10.342078,12.219895
4,0.425922,-7.753344,11.015359,13.477161


In [36]:
# Engineer behavioural features

acc_features= {
    "acc_mean": acc["acc_mag"].mean(),
    "acc_std": acc["acc_mag"].std(),
    "acc.max": acc["acc_mag"].max()
}

gyro_features = {
    "gyro_mean": gyro["gyro_mag"].mean(),
    "gyro_std": gyro["gyro_mag"].std(),
    "gyro_max": gyro["gyro_mag"].max()
}

In [37]:
# Combine features into one dataframe

drivers_features = pd.DataFrame([{ ** acc_features, **gyro_features}])
drivers_features

Unnamed: 0,acc_mean,acc_std,acc.max,gyro_mean,gyro_std,gyro_max
0,9.628308,0.884123,25.612467,0.469564,0.526734,3.590601


In [40]:
# Create synthetic driver dataset. Using one driving session to simulate multiple drivers. 

np.random.seed(42)
n_drivers = 200
df = pd.DataFrame({
    "acc_mean": np.random.normal(9.6, 0.5, n_drivers),
    "acc_std": np.random.normal(0.9, 0.2, n_drivers),
    "acc_max": np.random.normal(25, 5, n_drivers),
    "gyro_mean": np.random.normal(0.47, 0.15, n_drivers),
    "gyro_std": np.random.normal(0.53, 0.2, n_drivers),
    "gyro_max": np.random.normal(3.6, 1.0, n_drivers)
})

df

Unnamed: 0,acc_mean,acc_std,acc_max,gyro_mean,gyro_std,gyro_max
0,9.848357,0.971557,17.027862,0.583548,0.717657,4.999355
1,9.530868,1.012157,22.003125,0.331675,0.426791,4.524634
2,9.923844,1.116610,25.026218,0.600441,0.549224,3.659630
3,10.361515,1.110760,25.234903,0.673346,0.437545,2.953063
4,9.482923,0.624466,22.749673,0.532015,0.443101,4.298223
...,...,...,...,...,...,...
195,9.792659,0.806165,22.449918,0.627973,0.473780,4.040475
196,9.158071,0.557373,23.650625,0.464067,0.889537,3.580362
197,9.676863,1.170774,20.106181,0.572225,0.658169,4.152490
198,9.629104,0.877092,22.778534,0.474248,0.415764,3.823914


In [41]:
# check the first five rows. 
df.head()

Unnamed: 0,acc_mean,acc_std,acc_max,gyro_mean,gyro_std,gyro_max
0,9.848357,0.971557,17.027862,0.583548,0.717657,4.999355
1,9.530868,1.012157,22.003125,0.331675,0.426791,4.524634
2,9.923844,1.11661,25.026218,0.600441,0.549224,3.65963
3,10.361515,1.11076,25.234903,0.673346,0.437545,2.953063
4,9.482923,0.624466,22.749673,0.532015,0.443101,4.298223


In [45]:
# Define Risk Label
# 1 = risky, 0 = safe

df["risky_driver"] = (
    (df["acc_std"]> 1.1)| 
    (df["acc_max"]>30)|
    (df["gyro_max"]> 4.5)
).astype(int)

df["risky_driver"].value_counts()

risky_driver
0    105
1     95
Name: count, dtype: int64

In [46]:
df

Unnamed: 0,acc_mean,acc_std,acc_max,gyro_mean,gyro_std,gyro_max,risky_driver
0,9.848357,0.971557,17.027862,0.583548,0.717657,4.999355,1
1,9.530868,1.012157,22.003125,0.331675,0.426791,4.524634,1
2,9.923844,1.116610,25.026218,0.600441,0.549224,3.659630,1
3,10.361515,1.110760,25.234903,0.673346,0.437545,2.953063,1
4,9.482923,0.624466,22.749673,0.532015,0.443101,4.298223,0
...,...,...,...,...,...,...,...
195,9.792659,0.806165,22.449918,0.627973,0.473780,4.040475,0
196,9.158071,0.557373,23.650625,0.464067,0.889537,3.580362,0
197,9.676863,1.170774,20.106181,0.572225,0.658169,4.152490,1
198,9.629104,0.877092,22.778534,0.474248,0.415764,3.823914,0


In [55]:
# Train logistic regression

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, roc_auc_score


x_train, x_test, y_train, y_test = train_test_split( 
    x, y, test_size = 0.3, random_state = 42
)

In [92]:
x = df.drop("risky_driver", axis=1)
y = df["risky_driver"]

Unnamed: 0,acc_mean,acc_std,acc_max,gyro_mean,gyro_std,gyro_max
0,9.848357,0.971557,17.027862,0.583548,0.717657,4.999355
1,9.530868,1.012157,22.003125,0.331675,0.426791,4.524634
2,9.923844,1.116610,25.026218,0.600441,0.549224,3.659630
3,10.361515,1.110760,25.234903,0.673346,0.437545,2.953063
4,9.482923,0.624466,22.749673,0.532015,0.443101,4.298223
...,...,...,...,...,...,...
195,9.792659,0.806165,22.449918,0.627973,0.473780,4.040475
196,9.158071,0.557373,23.650625,0.464067,0.889537,3.580362
197,9.676863,1.170774,20.106181,0.572225,0.658169,4.152490
198,9.629104,0.877092,22.778534,0.474248,0.415764,3.823914


In [59]:
x_train, x_test, y_train, y_test = train_test_split( 
    x, y, test_size = 0.3, random_state = 42
)

In [69]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.fit_transform(x_test)

In [74]:
model = LogisticRegression()
model.fit( x_train_scaled, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [93]:
y_pred = model.predict( x_test_scaled)
y_probs = model.predict_proba(x_test_scaled)

In [85]:
print( classification_report( y_test, y_pred))
print("AUC:", roc_auc_score(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.73      0.76      0.75        29
           1       0.77      0.74      0.75        31

    accuracy                           0.75        60
   macro avg       0.75      0.75      0.75        60
weighted avg       0.75      0.75      0.75        60

AUC: 0.7502780867630701


In [87]:
# Identify Risk Drivers
# Positive coefficient = increases risk
# Negative coefficient = reduces risk

coef_df = pd.DataFrame({
    "feature": x.columns,
    "coefficient": model.coef_[0]
}).sort_values(by="coefficient", ascending=False)

coef_df

Unnamed: 0,feature,coefficient
5,gyro_max,1.323257
2,acc_max,0.981561
1,acc_std,0.790367
3,gyro_mean,0.203295
0,acc_mean,0.053226
4,gyro_std,-0.066669
