In [1]:
import pandas as pd
import numpy as np
from src.preprocessing import load_and_process_data, create_user_sequences, train_test_split_by_user
from src.hmm import HiddenMarkovModel
from src.baselines import TimeOfDayBaseline, FrequencyBaseline
from src.utils import LabelEncoder, calculate_accuracy
from src.baselines import TimeOfDayBaseline, FrequencyBaseline,RFBaseline
from sklearn.metrics import accuracy_score

**DATA**

In [2]:
# Relative path to the input CSV (resolved against the notebook's working directory).
data_path = "./data/mode_purpose_hmm.csv" # Adjust path if necessary
# Load the CSV through the project's preprocessing helper.
df = load_and_process_data(data_path)
# Turn the processed frame into sequences; the sample printed further down shows
# each sequence is a list of (mode, purpose, timestamp) tuples.
# NOTE(review): presumably one sequence per user, going by the helper names — confirm in src.preprocessing.
user_sequences = create_user_sequences(df)

In [3]:
# Split the sequences into a training list and a test list.
# NOTE(review): test_size=0.2 presumably holds out ~20% of users — confirm in src.preprocessing.
# NOTE(review): no seed is passed here; if the helper shuffles randomly, the accuracies
# recorded in this notebook will not reproduce on Restart & Run All — confirm.
train_seqs, test_seqs = train_test_split_by_user(user_sequences, test_size=0.2)

In [4]:
# Sanity check: print the first training sequence.
# Each element is a (mode, purpose, timestamp) tuple.
print(train_seqs[0])

[('car', 'errand', Timestamp('2021-08-04 00:00:00')), ('walk', 'home', Timestamp('2021-08-04 00:00:00')), ('car', 'home', Timestamp('2021-08-04 00:00:00'))]


In [5]:
# Label encoders for the two categorical fields (one per field).
mode_encoder, purpose_encoder = LabelEncoder(), LabelEncoder()

# Gather every distinct mode / purpose label across BOTH splits, so labels that
# occur only in the test sequences are still known to the encoders.
all_modes = {mode for seq in train_seqs + test_seqs for mode, _, _ in seq}
all_purposes = {purpose for seq in train_seqs + test_seqs for _, purpose, _ in seq}


In [6]:
# Fit each encoder on a *sorted* list of labels.
# Iterating a set of strings yields an order that can differ between kernel
# restarts (string hashing is randomized per process), so list(set) would hand
# fit() a differently ordered input on each run. Sorting makes that input
# deterministic regardless of how the encoder treats input order.
mode_encoder.fit(sorted(all_modes))
purpose_encoder.fit(sorted(all_purposes))

In [7]:
# Show the label vocabulary each fitted encoder exposes via `classes_`.
print(f"Modes: {mode_encoder.classes_}")
print(f"Purposes: {purpose_encoder.classes_}")

Modes: ['bike', 'bus', 'car', 'train', 'walk']
Purposes: ['eat', 'errand', 'home', 'leisure', 'work']


**BASELINE**

In [8]:
# Rule-based baseline: guess the trip purpose from the timestamp alone.
# It is used as-is here, without any fitting step.
tod_baseline = TimeOfDayBaseline()

# Flatten the test sequences into two aligned lists (same traversal order):
# ground-truth purposes and the baseline's per-timestamp predictions.
tod_true = [purpose for seq in test_seqs for _, purpose, _ in seq]
tod_pred = [tod_baseline.predict(ts) for seq in test_seqs for _, _, ts in seq]

# Fraction of test entries whose predicted purpose matches the true one.
acc_tod = accuracy_score(tod_true, tod_pred)
print(f"TimeOfDay Baseline Accuracy: {acc_tod:.4f}")



TimeOfDay Baseline Accuracy: 0.3522


In [None]:
# Baseline that predicts a purpose given only the travel mode.
# NOTE(review): presumably returns the most frequent training purpose per mode — confirm in src.baselines.
freq_baseline = FrequencyBaseline()

# Training input: keep only (mode, purpose) from each entry, dropping the timestamp.
train_seqs_stripped = [
    [(mode, purpose) for mode, purpose, *_ in seq]
    for seq in train_seqs
]
freq_baseline.fit(train_seqs_stripped)

# Evaluation: flatten the test sequences into two aligned lists (same traversal
# order) of true purposes and mode-based predictions.
freq_true_flattened = [purpose for seq in test_seqs for _, purpose, _ in seq]
freq_pred_flattened = [
    freq_baseline.predict(mode) for seq in test_seqs for mode, _, _ in seq
]

acc_freq = accuracy_score(freq_true_flattened, freq_pred_flattened)
print(f"Frequency Baseline Accuracy: {acc_freq:.4f}")

Frequency Baseline Accuracy: 0.2722
