In [1]:
%%capture setup_output
%run 'setup.ipynb'

In [2]:
import optuna
import pytorch_lightning as pl
import torch
import pandas as pd
import numpy as np
import os
import sys
import pdb
import seaborn as sns

from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger
import matplotlib.pyplot as plt

from src.data_processing import load_and_process_data
from src.EVChargingModel import EVDataset
from src.dnn_model import EVChargingModel
from src.FullyConnected import FullyConnected

In [None]:
# Define dataset file path
file_path = '../results/removed_outliers.parquet'
df = pd.read_parquet(file_path)

# Define features and target variables
# removed 'c_chargingmethod','c_chargingtype' 'mean_dep_time','latitude', 'longitude',  'start_datetime'
relevant_features = ['c_vin', 'c_realstartsoc', 'weekday_numerical', 'is_weekend',
                     'mean_consumption', 'mean_duration', 'latitude', 'longitude',
                     'start_hour', 'start_day', 'is_home_spot', 'is_location_one',
                     'start_datetime', 'delta_soc_real', 'plugin_duration_hr']

target = 'plugin_duration_hr' #, 'delta_soc_real', plugin_duration_hr

df = df[relevant_features]

In [None]:
# Training batch size
batch_size = 2048

In [None]:
print(len(df))
df = df.dropna()
print(len(df))

In [None]:
# Sort the DataFrame by 'c_vin' and 'start_datetime'
df = df.sort_values(by=['c_vin', 'start_datetime'])
# Add new features for previous session plugin_duration_hr and delta_soc_real
df['prev_plugin_duration_hr'] = df.groupby('c_vin')['plugin_duration_hr'].shift(1)
df['prev_delta_soc_real'] = df.groupby('c_vin')['delta_soc_real'].shift(1)

df['prev_plugin_duration_hr'] = df['prev_plugin_duration_hr'].fillna(df['plugin_duration_hr'])
df['prev_delta_soc_real'] = df['prev_delta_soc_real'].fillna(df['delta_soc_real'])

In [None]:
label_encoder = LabelEncoder()
df['c_vin_encoded'] = label_encoder.fit_transform(df['c_vin'])
df = df.drop(columns=['c_vin'])

features = ['c_vin_encoded', 'c_realstartsoc', 'weekday_numerical', 'is_weekend',
            'mean_consumption', 'mean_duration', 'latitude', 'longitude',
            'start_hour', 'start_day', 'is_home_spot', 'is_location_one',
            'prev_plugin_duration_hr', 'prev_delta_soc_real']


feature_scaler = StandardScaler()
target_scaler = StandardScaler()

df[features] = feature_scaler.fit_transform(df[features])
df[[target]] = target_scaler.fit_transform(df[[target]])

In [8]:
X = df[features]
y = df[[target]]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [10]:
# Make sure the dimensions are the same for different sets
print(f"X_train: {len(X_train)}, X_val: {len(X_val)}, X_test: {len(X_test)}")
print(f"y_train: {len(y_train)}, y_val: {len(y_val)}, y_test: {len(y_test)}")

6905208 863151 863152
6905208 863151 863152


In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

X_train = np.array(X_train)
X_val = np.array(X_val)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_val = np.array(y_val)
y_test = np.array(y_test)


# Convert the data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).to(device)

X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).to(device)

In [12]:
# Create datasets and dataloaders
train_dataset = EVDataset(X_train_tensor, y_train_tensor)
val_dataset = EVDataset(X_val_tensor, y_val_tensor)
test_dataset = EVDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [14]:
# Set up logger
csv_logger = CSVLogger("logs", name="dnn-soc")

checkpoint_callback = ModelCheckpoint(
    dirpath='checkpoints',
    filename='best_checkpoint',
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min'
)

In [17]:
model = FullyConnected(input_size=X_train.shape[1], dropout_prob=0)

cpu


In [16]:
trainer = pl.Trainer(max_epochs=50,
                     logger=csv_logger,
                     callbacks=[checkpoint_callback])

trainer.fit(model, train_loader, val_loader)

/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/ec2-user/SageMaker/Q658166-thesis/checkpoints exists and is not empty.


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f322f1d1690>>
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 

KeyboardInterrupt



In [21]:
# checkpoint_path = 'checkpoints/best_checkpoint-v71.ckpt'
# model = FullyConnected.load_from_checkpoint(checkpoint_path, input_size=X_train.shape[1])

In [24]:
model.eval()
model.to(device)
with torch.no_grad():
    y_pred = model(X_test_tensor)

In [25]:
mae = mean_absolute_error(y_test_tensor.to('cpu'), y_pred.to('cpu'))
rmse = mean_squared_error(y_test_tensor.to('cpu'), y_pred.to('cpu'), squared=False)
print(f"Mean Absolute Error(MAE): {mae}")
print(f"Mean Squared Error(MAE): {rmse}")

y_pred_original = target_scaler.inverse_transform(y_pred.to('cpu'))
y_test_original = target_scaler.inverse_transform(y_test_tensor.to('cpu'))

mae_original = mean_absolute_error(y_pred_original, y_test_original)
rmse_original = mean_squared_error(y_pred_original, y_test_original, squared=False)
print(f"Mean Absolute Error(MAE) original: {mae_original}")
print(f"Mean Squared Error(MAE) original: {rmse_original}")

Mean Absolute Error(MAE): 0.4965437650680542
Mean Squared Error(MAE): 0.6786376237869263
Mean Absolute Error(MAE) original: 11.006671243734607
Mean Squared Error(MAE) original: 15.043069770238704


