In [1]:
%%capture setup_output
%run 'setup.ipynb'

# run the following command to get the output of the setup commands
# setup_output.show()

In [117]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ProcessPoolExecutor

import torch
from torch.utils.data import DataLoader, TensorDataset
import pytorch_lightning as pl
from torch import nn
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import ModelCheckpoint

from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error
import matplotlib.pyplot as plt
from src.UserEmbeddingLstm import UserEmbeddingLstm
from src.UserEmbeddingDataset import UserEmbeddingDataset

In [219]:
# Load and filter the dataset
sequence_length = 5
batch_size = 512

# Define dataset file path
file_path = '../results/removed_outliers.parquet'
df = pd.read_parquet(file_path)

# Define features and target variables
# removed 'c_chargingmethod','c_chargingtype' 'mean_dep_time','latitude', 'longitude',  'start_da'
relevant_features = ['c_vin', 'c_realstartsoc', 'weekday_numerical', 'is_weekend',
                     'mean_consumption', 'mean_duration', 'latitude', 'longitude',
                     'start_hour', 'start_day', 'is_home_spot', 'is_location_one',
                     'start_datetime', 'delta_soc_real', 'plugin_duration_hr']

target = 'delta_soc_real' #, 'delta_soc_real', plugin_duration_hr

df = df[relevant_features]

In [220]:
print(len(df))
df = df.dropna()
print(len(df))

8637446
8631511


In [221]:
num_users = df['c_vin'].nunique()

In [222]:
# Sort the DataFrame by 'c_vin' and 'start_datetime'
df = df.sort_values(by=['c_vin', 'start_datetime'])

# Add new features for previous session plugin_duration_hr and delta_soc_real
df['prev_plugin_duration_hr'] = df.groupby('c_vin')['plugin_duration_hr'].shift(1)
df['prev_delta_soc_real'] = df.groupby('c_vin')['delta_soc_real'].shift(1)

# Fill missing prev values with current
df['prev_plugin_duration_hr'] = df['prev_plugin_duration_hr'].fillna(df['plugin_duration_hr'])
df['prev_delta_soc_real'] = df['prev_delta_soc_real'].fillna(df['delta_soc_real'])

In [223]:
label_encoder = LabelEncoder()
df['c_vin_encoded'] = label_encoder.fit_transform(df['c_vin'])
# df = df.drop(columns=['c_vin'])

features = ['c_realstartsoc', 'weekday_numerical', 'is_weekend',
            'mean_consumption', 'mean_duration', 'latitude', 'longitude',
            'start_hour', 'start_day', 'is_home_spot', 'is_location_one',
            'prev_plugin_duration_hr', 'prev_delta_soc_real']


In [224]:
# Scaling numeric features
feature_scaler = StandardScaler()
target_scaler = StandardScaler()

df[features] = feature_scaler.fit_transform(df[features])
df[[target]] = target_scaler.fit_transform(df[[target]])

In [225]:
def process_group(group):
    try:
        group = group.sort_values('start_datetime')
        sequences = []
        targets = []
        vins = []
        for i in range(len(group) - sequence_length):
            sequence = group[features].iloc[i:i + sequence_length].values
            target_value = group[target].iloc[i + sequence_length]
            vin_value = group['c_vin_encoded'].iloc[0]
            sequences.append(sequence)
            targets.append(target_value)
            vins.append(vin_value)
    except Exception as e:
        print(f"Error processing group {group['c_vin'].iloc[0]}: {e}")
    return sequences, targets, vins

In [None]:
# Creates multiple processes to create user sequences faster
with ProcessPoolExecutor(max_workers=31) as executor:
    results = list(executor.map(process_group, [group for _, group in df.groupby('c_vin')]))

In [None]:
merged_X = []
merged_y = []
merged_vin = []

for X, y, vin in results:
    merged_X.extend(X)
    merged_y.extend(y)
    merged_vin.extend(vin)

merged_X = np.array(merged_X)
merged_y = np.array(merged_y)
merged_vin = np.array(merged_vin)

In [None]:
# The user sequences can be either shuffled or split
# sequencially to avoid having parts of the same sequence in the train and test sets

# Option 1 of splitting the dataset
train_size = int(0.8 * len(merged_X))
val_size = int(0.1 * len(merged_X))
X_train, X_val, X_test = merged_X[:train_size], merged_X[train_size:train_size + val_size], merged_X[train_size + val_size:]
y_train, y_val, y_test = merged_y[:train_size], merged_y[train_size:train_size + val_size], merged_y[train_size + val_size:]
vin_train, vin_val, vin_test = merged_vin[:train_size], merged_vin[train_size:train_size + val_size], merged_vin[train_size + val_size:]

# Option 2 of splitting the dataset

# X_train, X_temp, y_train, y_temp, vin_train, vin_temp = train_test_split(
#     merged_X, merged_y, merged_vin, test_size=0.2, random_state=42)

# X_val, X_test, y_val, y_test, vin_val, vin_test = train_test_split(
#     X_temp, y_temp, vin_temp, test_size=0.5, random_state=42)


print("Train Set:", len(X_train), "samples")
print("Validation Set:", len(X_val), "samples")
print("Test Set:", len(X_test), "samples")

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device: ', device)

In [None]:
# Make sure the dimensions are the same for different sets
print(f"X_train: {len(X_train)}, X_val: {len(X_val)}, X_test: {len(X_test)}")
print(f"y_train: {len(y_train)}, y_val: {len(y_val)}, y_test: {len(y_test)}")
print(f"vin_train: {len(vin_train)}, vin_val: {len(vin_val)}, vin_test: {len(vin_test)}")

In [None]:
unique_values = set(df['c_vin_encoded'])
num_users = len(unique_values)
print(f"The number of unique values in the list is: {num_users}")

In [None]:
X_train = np.array(X_train)
X_val = np.array(X_val)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_val = np.array(y_val)
y_test = np.array(y_test)
vins_train = np.array(vin_train)
vins_val = np.array(vin_val)
vins_test = np.array(vin_test)

In [None]:
# Convert the data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(device).unsqueeze(1)
vin_train_tensor = torch.tensor(vin_train, dtype=torch.long).to(device)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).to(device).unsqueeze(1)
vin_test_tensor = torch.tensor(vin_test, dtype=torch.long).to(device)

X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).to(device).unsqueeze(1)
vin_val_tensor = torch.tensor(vin_val, dtype=torch.long).to(device)

In [None]:
# Create datasets and dataloaders
train_dataset = UserEmbeddingDataset(X=X_train_tensor, user_ids=vin_train_tensor, y=y_train_tensor)
val_dataset = UserEmbeddingDataset(X=X_val_tensor, user_ids=vin_val_tensor, y=y_val_tensor)
test_dataset = UserEmbeddingDataset(X=X_test_tensor, user_ids=vin_test_tensor, y=y_test_tensor)


train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# Set up logger
csv_logger = CSVLogger("logs", name="lstm-soc")

checkpoint_callback = ModelCheckpoint(
    dirpath='checkpoints-lstm',
    filename='best_checkpoint_soc',
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min'
)
early_stopping = EarlyStopping(monitor="val_loss", patience=5, mode="min")

In [None]:
embedding_dim = 16

In [None]:
model = UserEmbeddingLstm(sequence_length=sequence_length,
                          num_users=num_users,
                          embedding_dim=embedding_dim,
                          num_features=len(features),
                          dropout_rate=0.3)

# Or load from checkpoint
# checkpoint_path = 'checkpoints/best_checkpoint_lstm_soc.ckpt'
# model = UserEmbeddingLstm.load_from_checkpoint(checkpoint_path,
#                                                sequence_length=sequence_length,
#                                                num_users=num_users,
#                                                embedding_dim=embedding_dim,
#                                                num_features=len(features))

In [None]:
early_stopping = EarlyStopping(monitor="val_loss", patience=5, mode="min")

trainer = pl.Trainer(max_epochs=100,
                     logger=csv_logger,
                     callbacks=[checkpoint_callback, early_stopping],
                     gradient_clip_val=1.0)
trainer.fit(model, train_loader, val_loader)

In [28]:
trainer.test(model, test_loader)

/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.7823531031608582}]

In [56]:
model.eval()
# model.to(device)
with torch.no_grad():
    y_pred = model(torch.tensor(X_test, dtype=torch.float32)).detach().numpy()

prediction:  [[-0.24992622]
 [-0.16984463]
 [ 0.22386815]
 ...
 [-0.6504032 ]
 [-0.62801296]
 [-0.45988226]]
test data:  tensor([-1.1322, -0.3201, -0.8164,  ...,  0.0859, -0.6359, -1.4480])


In [82]:
y_pred_original = target_scaler.inverse_transform(y_pred)
y_test_original = target_scaler.inverse_transform(y_test)

In [92]:
mae_original = mean_absolute_error(y_pred_original, y_test_original)
medae_original = median_absolute_error(y_pred_original, y_test_original)
rmse_original = mean_squared_error(y_pred_original, y_test_original, squared=False)
print(f"Mean Absolute Error(MAE) original: {mae_original}")
print(f"Median Absolute Error(MAE) original: {medae_original}")
print(f"Root Mean Squared Error(MAE) original: {rmse_original}")

Mean Absolute Error(MAE) original: 15.872245022813656
Median Absolute Error(MAE) original: 13.738107681274414
Root Mean Squared Error(MAE) original: 19.606493225309485


