# All the imports

In [None]:
# Set seed for reproducibility
# A single constant feeds every seeding call in this cell.
SEED = 42

# Import necessary libraries
import os

# Set environment variables before importing modules
# NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up, so assigning it
# here cannot change hash randomisation in the already-running kernel; it is only
# inherited by child processes started afterwards.
os.environ['PYTHONHASHSEED'] = str(SEED)
# Redirect matplotlib's config directory to ./configs/ under the current working
# directory; this must happen before matplotlib is imported further down.
os.environ['MPLCONFIGDIR'] = os.getcwd() + '/configs/'

# Suppress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# NOTE(review): Warning is the base class of every warning category, so this filter
# silences all warnings (including FutureWarning above), not just noisy ones.
warnings.simplefilter(action='ignore', category=Warning)

# Import necessary modules
import logging
import random
import numpy as np

# Set seeds for random number generators in NumPy and Python
np.random.seed(SEED)
random.seed(SEED)

# Import PyTorch
import torch
# Seed PyTorch's default random number generator.
torch.manual_seed(SEED)
from torch import nn
# from torchsummary import summary
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader
# Presumably the directory TensorBoard logs are written to; it is not used in the
# cells shown here — TODO confirm against the training cells.
logs_dir = "tensorboard"
# Terminate every running process whose command line matches "tensorboard",
# then load the notebook extension that provides the %tensorboard magic.
!pkill -f tensorboard
%load_ext tensorboard
# Create the ./models directory if it does not exist yet (no error if it does).
!mkdir -p models

# Select the compute device, preferring CUDA whenever a GPU is visible.
if torch.cuda.is_available():
    device = torch.device("cuda")
    # Seed the random number generators of all visible GPUs.
    torch.cuda.manual_seed_all(SEED)
    # This notebook seeds everything for reproducibility, so cuDNN must not
    # autotune: benchmark mode may pick different convolution algorithms from
    # run to run. Deterministic algorithms trade some speed for repeatability.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
else:
    device = torch.device("cpu")

# Report the runtime configuration so it is recorded in the notebook output.
print(f"PyTorch version: {torch.__version__}")
print(f"Device: {device}")

# Import other libraries
import copy
import shutil
from itertools import product
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Configure plot display settings
# seaborn: apply its theme with fonts scaled by 1.4, then switch to the 'white' style.
sns.set(font_scale=1.4)
sns.set_style('white')
# matplotlib: set the base font size to 14.
plt.rc('font', size=14)
# Render figures inline in the notebook output.
%matplotlib inline

# Data Loading (replace the placeholders with your own dataset)

In [None]:
# Set environment variables for Air Quality dataset
os.environ["DATASET_NAME"] = "[your_dataset_name]"
os.environ["DATASET_URL"] = "[your_dataset_url]"

# Check if Air Quality dataset exists, download and unzip if not
if not os.path.exists(os.environ["DATASET_NAME"]):
    print("Downloading [Dataset Name] dataset...")
    !gdown -q ${DATASET_URL} -O ${DATASET_NAME}
    print("[Dataset Name] dataset downloaded!")
else:
    print("[Dataset Name] dataset already downloaded. Using cached data.")

# Basic Dataset Exploration

In [None]:
# Column labels to assign to the data (placeholder: fill in the real names)
column_names = [list_of_column_names]

# Load the header-less CSV with those labels and discard every row that
# contains at least one missing value
df = pd.read_csv('[csv_name]', header=None, names=column_names).dropna(axis=0, how='any')

# Report the dimensions that remain after cleaning
print(f"DataFrame shape: {df.shape}")

# Preview the first 10 rows
df.head(10)

# Dataset-Specific Exploration

In [None]:
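# Dataset-specific exploration and cleaning always depend on the data, so you will need to write this step yourself.
# NOTE: the Data Split cell below reads a DataFrame named `dataset`, which no cell shown here defines — create it in this step (e.g. from `df`).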

# Data Split

In [None]:
# Number of trailing rows held out for the validation and test sets
val_size = 2850
test_size = 2850

# NOTE(review): `dataset` is not defined in any cell shown above; it must be a
# DataFrame created in the dataset-specific step — confirm before running.
n_rows = len(dataset)
if val_size + test_size >= n_rows:
    # Fail early instead of silently producing an empty training set, whose
    # min/max would turn every normalised value into NaN.
    raise ValueError(
        f"val_size + test_size ({val_size + test_size}) must be smaller than "
        f"the number of rows ({n_rows})"
    )

# Split by position (no shuffling): train first, then validation, then test.
# Positive boundaries are used because a negative slice such as iloc[-0:]
# would return the whole frame when a hold-out size is 0.
train_end = n_rows - val_size - test_size
val_end = n_rows - test_size
X_train_raw = dataset.iloc[:train_end]
X_val_raw = dataset.iloc[train_end:val_end]
X_test_raw = dataset.iloc[val_end:]

# Print the shapes of the split datasets
print(f"Train set shape: {X_train_raw.shape}")
print(f"Validation set shape: {X_val_raw.shape}")
print(f"Test set shape: {X_test_raw.shape}")

# Normalise data using training set statistics only, so that no information
# from the validation or test rows leaks into the scaling
X_min = X_train_raw.min()
X_max = X_train_raw.max()

# Columns that are constant in the training set have a zero range; dividing by
# it would yield NaN/inf, so use a range of 1 there (such columns map to 0
# wherever they equal the training value).
X_range = (X_max - X_min).replace(0, 1)

# Apply min-max normalisation
X_train_raw = (X_train_raw - X_min) / X_range
X_val_raw = (X_val_raw - X_min) / X_range
X_test_raw = (X_test_raw - X_min) / X_range

# Define target labels as the column names of the dataset
TARGET_LABELS = dataset.columns

In [None]:
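# For time series, add a step here that converts the *_raw splits into actual sequences (windows);
# otherwise derive the arrays directly from the *_raw data.
# Either way, this cell must define X_train, y_train, X_val, y_val, X_test and y_test as NumPy arrays,
# because the next cell passes them to torch.from_numpy.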

In [None]:
# Wrap each split's NumPy arrays as tensors and pair every feature row with its
# label in a TensorDataset
train_ds, val_ds, test_ds = (
    TensorDataset(torch.from_numpy(features), torch.from_numpy(labels))
    for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Data Loader

In [None]:
def make_loader(ds, batch_size, shuffle, drop_last):
    """Build a DataLoader for ``ds`` with multi-process loading enabled.

    Args:
        ds: Dataset to iterate over.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the samples every epoch.
        drop_last: Whether to discard the final batch when it is incomplete.

    Returns:
        A ``DataLoader`` using between 2 and 4 worker processes, each
        prefetching 4 batches. Page-locked (pinned) host memory is requested
        only when CUDA is available.
    """
    # Use between 2 and 4 workers depending on the core count; fall back to 2
    # cores when os.cpu_count() cannot determine the number.
    cpu_cores = os.cpu_count() or 2
    num_workers = max(2, min(4, cpu_cores))

    # Pinned memory only speeds up host-to-GPU copies; requesting it on a
    # CPU-only machine has no benefit and makes PyTorch emit a warning.
    use_cuda = torch.cuda.is_available()

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
        pin_memory=use_cuda,
        pin_memory_device="cuda" if use_cuda else "",
        prefetch_factor=4,  # Each worker loads 4 batches ahead
    )

In [None]:
# Build one loader per split: only the training split is shuffled, and no split
# discards its final partial batch.
# NOTE(review): BATCH_SIZE is not defined in the cells shown here — confirm it
# is set before this cell runs.
train_loader, val_loader, test_loader = (
    make_loader(split_ds, batch_size=BATCH_SIZE, shuffle=reshuffle, drop_last=False)
    for split_ds, reshuffle in ((train_ds, True), (val_ds, False), (test_ds, False))
)

# Sanity check: show the tensor shapes of the first training batch only
for xb, yb in train_loader:
    print("Features batch shape:", xb.shape)
    print("Labels batch shape:", yb.shape)
    break  # One batch is enough