In [5]:
import pandas as pd
import os
import numpy as np

In [3]:
def load_and_explore_data(filepath, sample_frac=None):
    """Load a CSV file, print a short overview, and optionally down-sample it.

    Args:
        filepath: Path of the CSV file handed to ``pd.read_csv``.
        sample_frac: Optional fraction of rows to keep. Any falsy value
            (``None`` or ``0``) skips sampling and the full frame is returned.

    Returns:
        The full DataFrame, or a random sample of it drawn with
        ``random_state=42`` when ``sample_frac`` is truthy. Returns ``None``
        (after printing an error) when ``filepath`` does not exist.
    """
    # Only the read itself can raise FileNotFoundError, so keep the try narrow.
    try:
        df = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: Dataset not found at {filepath}")
        return None

    print("Dataset loaded successfully.")
    print("\nDataset Info:")
    # DataFrame.info() writes its report to stdout and returns None; wrapping
    # it in print() would add a stray "None" line to the output.
    df.info()
    print("\nFirst 5 rows:")
    print(df.head())

    if not sample_frac:
        return df

    print(f"\nSampling {sample_frac*100}% of the data...")
    # Fixed seed so repeated runs pick the same rows.
    df_sampled = df.sample(frac=sample_frac, random_state=42)
    print("Sampled data info:")
    df_sampled.info()
    return df_sampled

if __name__ == "__main__":
    # Allow the dataset location to be supplied through the
    # TELEMATICS_DATA_PATH environment variable so the notebook is not tied
    # to one machine; the original absolute path stays as the fallback so
    # existing setups keep working unchanged.
    data_filepath = os.environ.get(
        'TELEMATICS_DATA_PATH',
        '/Users/ajay/Documents/repos/Telematics-Based-Usage-Insurance-System/Telematicsdata.csv',
    )

    telematics_data = load_and_explore_data(data_filepath, sample_frac=0.01)  # Sample 1% for a quick run

    if telematics_data is not None:
        print("\nData loading and exploration complete.")

Dataset loaded successfully.

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173853 entries, 0 to 173852
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   deviceId    173853 non-null  object 
 1   timeMili    52202 non-null   float64
 2   timestamp   173853 non-null  object 
 3   value       173853 non-null  object 
 4   variable    173853 non-null  object 
 5   alarmClass  173853 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 8.0+ MB
None

First 5 rows:
                           deviceId      timeMili                   timestamp  \
0  zRYzhAEAHAABAAAKCRtcAAsAtwB1gBAQ  1.597603e+12  2020-08-17 00:00:02.000000   
1  zRYzhAEAHAABAAAKCRtcAAsAtwB1gBAQ  1.597603e+12  2020-08-17 00:00:03.000000   
2  zRYzhAEAHAABAAAKCRtcAAsAtwB1gBAQ  1.597603e+12  2020-08-17 00:00:07.000000   
3  zRYzhAEAHAABAAAKCRtcAAsAtwB1gBAQ  1.597603e+12  2020-08-17 00:00:10.000000   
4  zRYzhAEAHAABAAAKCRtc

In [8]:
def preprocess_data(df):
    """Return a cleaned copy of the telematics frame.

    The ``timestamp`` column is parsed with ``pd.to_datetime`` and missing
    values in every remaining object-dtype column are replaced by that
    column's most frequent value. Non-object columns are not filled, so
    their missing values are left as they are.

    Args:
        df: DataFrame containing a ``timestamp`` column, or ``None``.

    Returns:
        A new, preprocessed DataFrame, or ``None`` when ``df`` is ``None``.
        The input frame is not modified.
    """
    if df is None:
        return None

    # Work on a copy so the caller's frame stays untouched and re-running the
    # cell gives the same result.
    cleaned = df.copy()
    cleaned['timestamp'] = pd.to_datetime(cleaned['timestamp'])

    # Selected after the timestamp conversion, so that column is no longer
    # object dtype and is skipped here.
    categorical_cols = cleaned.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        mode = cleaned[col].mode()
        # mode() is empty when the column holds no non-null values; there is
        # nothing sensible to fill with in that case.
        if not mode.empty:
            cleaned[col] = cleaned[col].fillna(mode[0])

    print("\nData preprocessing complete.")
    return cleaned

# Preprocess the frame produced by the loading cell, if there is one, and
# show the first rows of the result.
if __name__ == "__main__" and telematics_data is not None:
    processed_data = preprocess_data(telematics_data)

    if processed_data is not None:
        print("\nProcessed data sample:", processed_data.head(), sep="\n")


Data preprocessing complete.

Processed data sample:
                                deviceId      timeMili           timestamp  \
149641  zRYzhAEAHAABAAAKCRtcAAsAtwB1gBAQ           NaN 2020-08-20 20:11:51   
30866   zRYzhAEAHAABAAAKCRtcAAsAtwB1gBAQ  1.597660e+12 2020-08-17 15:53:59   
67937   zRYzhAEAHAABAAAKCRtcAAsAtwB1gBAQ           NaN 2020-08-18 18:07:11   
173299  zRYzhAEAHAABAAAKCRtcAAsAGAB0gBAQ  1.598248e+12 2020-08-24 11:23:28   
74720   zRYzhAEAHAABAAAKCRtcAAsAtwB1gBAQ           NaN 2020-08-18 20:34:22   

                           value                      variable  alarmClass  
149641                         0               HARDWARE STATUS           0  
30866                      740.0                    ENGINE RPM           1  
67937   13.330149,74.744225,17.0                      POSITION           0  
173299                       7.0  ABSOLUTE THROTTLE POSITION E           1  
74720                       -8.8                ACCELERATION Y           1  


In [11]:
def analyze_driving_behavior(df, speed_threshold=80, harsh_braking_threshold=-5):
    """Score each device from its speeding and harsh-braking readings.

    For every ``deviceId`` group, rows whose ``variable`` is ``'SPEED'`` and
    whose value exceeds ``speed_threshold`` count as speeding incidents
    (5 points each). Rows whose ``variable`` is ``'ACCELEROMETER_Z'`` and
    whose value is below ``harsh_braking_threshold`` count as harsh-braking
    incidents (10 points each).

    Args:
        df: DataFrame with ``deviceId``, ``variable`` and ``value`` columns,
            or ``None``.
        speed_threshold: Readings strictly above this are speeding incidents.
        harsh_braking_threshold: Readings strictly below this are
            harsh-braking incidents.

    Returns:
        ``{device_id: {'score': int, 'behaviors': [str, ...]}}``, or ``None``
        when ``df`` is ``None``.
    """
    if df is None:
        return None

    speeding_penalty = 5
    harsh_braking_penalty = 10

    # `value` may arrive as text (it is an object column in the raw CSV and
    # also holds non-scalar entries such as "lat,lon,alt"). Coerce once up
    # front; anything non-numeric becomes NaN and never matches a threshold.
    readings = df.assign(value=pd.to_numeric(df['value'], errors='coerce'))

    risk_scores = {}
    for device_id, device_data in readings.groupby('deviceId'):
        risk_score = 0
        driving_behaviors = []

        speed_data = device_data.loc[device_data['variable'] == 'SPEED', 'value']
        # int() keeps scores as plain Python ints rather than numpy scalars.
        speeding_incidents = int((speed_data > speed_threshold).sum())
        if speeding_incidents > 0:
            risk_score += speeding_incidents * speeding_penalty
            driving_behaviors.append(f"{speeding_incidents} speeding incident(s)")

        accelerometer_z_data = device_data.loc[
            device_data['variable'] == 'ACCELEROMETER_Z', 'value'
        ]
        harsh_braking_incidents = int(
            (accelerometer_z_data < harsh_braking_threshold).sum()
        )
        if harsh_braking_incidents > 0:
            risk_score += harsh_braking_incidents * harsh_braking_penalty
            driving_behaviors.append(f"{harsh_braking_incidents} harsh braking incident(s)")

        risk_scores[device_id] = {'score': risk_score, 'behaviors': driving_behaviors}

    print("\nDriving behavior analysis and risk scoring complete.")
    return risk_scores

# Demo run of analyze_driving_behavior on a small hand-built frame.
# NOTE(review): this cell does not use the preprocessed CSV data. The variable
# names used here ('SPEED', 'ACCELEROMETER_Z') differ from the ones visible in
# the real data sample printed above (e.g. 'ENGINE RPM', 'ACCELERATION Y') —
# confirm which names the real dataset uses before scoring it.
if __name__ == "__main__":
    # Two devices with three readings each; 'value' is numeric here.
    dummy_data = {
        'deviceId': [1, 1, 1, 2, 2, 2],
        'timestamp': pd.to_datetime(['2023-01-01 10:00:00', '2023-01-01 10:01:00', '2023-01-01 10:02:00',
                                    '2023-01-01 10:00:00', '2023-01-01 10:01:00', '2023-01-01 10:02:00']),
        'variable': ['SPEED', 'ACCELEROMETER_Z', 'SPEED', 'SPEED', 'ACCELEROMETER_Z', 'SPEED'],
        'value': [70, -3, 90, 60, -7, 85]
    }
    dummy_df = pd.DataFrame(dummy_data)

    risk_analysis_results = analyze_driving_behavior(dummy_df)

    # An empty dict or None is falsy, so nothing is printed in that case.
    if risk_analysis_results:
        print("\nRisk Analysis Results (Dummy Data):")
        # One summary line per device: score plus comma-joined behaviors.
        for device_id, results in risk_analysis_results.items():
            print(f"Device {device_id}: Risk Score = {results['score']}, Behaviors = {', '.join(results['behaviors'])}")



Driving behavior analysis and risk scoring complete.

Risk Analysis Results (Dummy Data):
Device 1: Risk Score = 5, Behaviors = 1 speeding incident(s)
Device 2: Risk Score = 15, Behaviors = 1 speeding incident(s), 1 harsh braking incident(s)


In [12]:
def generate_feedback(risk_analysis_results):
    """Turn per-device risk results into human-readable feedback text.

    Args:
        risk_analysis_results: Mapping of device id to a dict with a
            ``'score'`` value and a ``'behaviors'`` list of strings.

    Returns:
        A dict mapping each device id to its newline-terminated feedback
        message, or ``None`` when the input is empty or ``None``.
    """
    if not risk_analysis_results:
        return None

    # A score above the first cut-off is "high"; above the second, "medium".
    risk_threshold_high, risk_threshold_medium = 50, 20
    feedback_messages = {}

    for device_id, results in risk_analysis_results.items():
        # Collect the message fragments and join them once at the end.
        parts = [f"Device {device_id} Safety Recommendations:\n"]
        score = results['score']
        behaviors = results['behaviors']

        if score > risk_threshold_high:
            parts.append("Your risk score is high. Urgent action recommended to improve driving habits.\n")
        elif score > risk_threshold_medium:
            parts.append("Your risk score is medium. Consider safer driving practices.\n")
        else:
            parts.append("Good driving habits. Keep it up!\n")

        if behaviors:
            parts.append("Specific areas to focus on:\n")
            parts.extend(f"- {behavior}\n" for behavior in behaviors)

        feedback_messages[device_id] = "".join(parts)

    print("\nFeedback generation complete.")
    return feedback_messages

# Demo run of generate_feedback on hand-written risk results; this input is
# defined inline and does not come from analyze_driving_behavior's output.
if __name__ == "__main__":
    # Device 1 has a score of 65 with two behaviors; device 2 has a score of
    # 15 and no recorded behaviors.
    dummy_risk_results = {
        1: {'score': 65, 'behaviors': ['1 speeding incident(s)', '1 harsh braking incident(s)']},
        2: {'score': 15, 'behaviors': []}
    }

    feedback_generated = generate_feedback(dummy_risk_results)

    # generate_feedback returns None for empty input, which is falsy.
    if feedback_generated:
        print("\nGenerated Feedback (Dummy Data):")
        # Each message already ends with a newline, so print() leaves a blank
        # line between devices.
        for device_id, message in feedback_generated.items():
            print(message)


Feedback generation complete.

Generated Feedback (Dummy Data):
Device 1 Safety Recommendations:
Your risk score is high. Urgent action recommended to improve driving habits.
Specific areas to focus on:
- 1 speeding incident(s)
- 1 harsh braking incident(s)

Device 2 Safety Recommendations:
Good driving habits. Keep it up!

