In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so files
# stored under it can be read by path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from collections import Counter
import torch
import rtdl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, balanced_accuracy_score
from imblearn.over_sampling import SMOTE
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# Read the labelled AQI dataset from the mounted Drive folder.
csv_path = "/content/drive/MyDrive/AMIRL Task/AQI/final-data-with-label.csv"
df = pd.read_csv(csv_path)

# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Separate the classification target from the model inputs. The listed
# columns (including the target itself) are excluded from the feature matrix.
non_feature_cols = ['date', 'address', 'AQI', 'AQI Category', 'day_of_year']
y = df['AQI Category']
X = df.drop(columns=non_feature_cols)

# Hold out 20% of the rows for testing; stratify on the target so both
# splits keep the same class proportions, and fix the seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Show (n_train_rows, n_features) as the cell output.
X_train.shape

In [None]:
# Display the feature frame (everything left after dropping the non-feature
# columns) as the cell output.
X

In [None]:
# Normalize numerical features. The scaler is fit on the training split only
# and then applied unchanged to the test split, so no test statistics are
# used when computing the mean/std.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Compute "balanced" class weights from the training labels (before any
# resampling). np.unique returns a *sorted ndarray*, so class_weights[i] is the
# weight of the i-th smallest label, which is the order CrossEntropyLoss
# expects for class index i. (A list built from set() has no guaranteed order.)
classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)

# Convert to a float32 tensor for the PyTorch loss function. It is kept on the
# CPU here; it is moved to the compute device further down, once `device` has
# actually been defined.
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

# Handle class imbalance (SMOTE). Only the training split is oversampled; the
# test split keeps its original class distribution.
# NOTE(review): the loss below is also class-weighted, so minority classes are
# compensated for twice (oversampling + weights) - confirm this is intended.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Map labels to contiguous class indices 0..C-1, which is what CrossEntropyLoss
# requires as targets. The mapping is built from the sorted unique labels of the
# full target `y`, so it is the identity when labels are already 0..C-1.
class_labels = pd.Index(np.unique(y))
y_train_idx = class_labels.get_indexer(np.asarray(y_train))
y_test_idx = class_labels.get_indexer(np.asarray(y_test))
# get_indexer returns -1 for values that are not in class_labels.
assert (y_train_idx >= 0).all() and (y_test_idx >= 0).all(), "unknown label encountered"

# Convert data to tensors. Everything goes through NumPy arrays first: passing
# a pandas Series straight to torch.tensor makes torch read it element by
# element with obj[i], which pandas resolves by *label*; on the shuffled index
# of y_test that can raise KeyError or pick values in the wrong order.
X_train_tensor = torch.tensor(np.asarray(X_train), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_idx, dtype=torch.long)
X_test_tensor = torch.tensor(np.asarray(X_test), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_idx, dtype=torch.long)

# Create DataLoaders. Training batches are reshuffled every epoch; the test
# loader keeps row order so predictions line up with y_test.
batch_size = 64
train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test_tensor, y_test_tensor), batch_size=batch_size, shuffle=False)

# Seed torch so weight initialisation (and DataLoader shuffling) is repeatable
# across Restart & Run All.
torch.manual_seed(42)

# Define FT-Transformer model: all inputs are numerical features (no
# categorical columns), with one output logit per distinct target class.
model = rtdl.FTTransformer.make_default(
    n_num_features=X.shape[1],
    cat_cardinalities=None,
    d_out=len(np.unique(y))
)


# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nRunning code on {device}")

# Define loss function with class weights. The weight tensor must live on the
# same device as the logits it is applied to.
class_weights_tensor = class_weights_tensor.to(device)
criterion = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Move the model parameters to the compute device before training.
model.to(device)

# Training loop with a tqdm progress bar per epoch. After each epoch the model
# is also scored on the test loader, and loss/accuracy for both splits are
# appended to the history lists used for plotting.
num_epochs = 20
train_losses, test_losses, train_accuracies, test_accuracies = [], [], [], []

for epoch in range(num_epochs):
    model.train()
    total_train_loss, correct_train, total_train = 0, 0, 0

    # Training
    with tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch") as tepoch:
        for X_batch, y_batch in tepoch:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()

            # FTTransformer's forward takes (x_num, x_cat); the model was built
            # without categorical features, so x_cat is passed explicitly as None.
            outputs = model(X_batch, None)
            loss = criterion(outputs, y_batch)

            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            correct_train += (outputs.argmax(1) == y_batch).sum().item()
            total_train += y_batch.size(0)

            tepoch.set_postfix(loss=loss.item())

    # Epoch loss is the average of the per-batch losses; accuracy is per sample.
    train_loss = total_train_loss / len(train_loader)
    train_acc = correct_train / total_train

    # Evaluate on test data (eval mode, gradients disabled).
    model.eval()
    total_test_loss, correct_test, total_test = 0, 0, 0

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            outputs = model(X_batch, None)
            loss = criterion(outputs, y_batch)

            total_test_loss += loss.item()
            correct_test += (outputs.argmax(1) == y_batch).sum().item()
            total_test += y_batch.size(0)

    test_loss = total_test_loss / len(test_loader)
    test_acc = correct_test / total_test

    # Store metrics
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    train_accuracies.append(train_acc)
    test_accuracies.append(test_acc)

    print(f"Epoch [{epoch+1}/{num_epochs}] → Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.4f}")


# Plot training curves: one panel for loss, one for accuracy, each showing the
# train and test series against the epoch number.
epochs = range(1, num_epochs + 1)
panels = [
    # (metric name, train series, test series, panel title)
    ("Loss", train_losses, test_losses, "Training & Testing Loss"),
    ("Accuracy", train_accuracies, test_accuracies, "Training & Testing Accuracy"),
]

plt.figure(figsize=(12, 5))

for position, (metric, train_curve, test_curve, panel_title) in enumerate(panels, start=1):
    # Panels sit side by side in a 1x2 grid.
    plt.subplot(1, 2, position)
    plt.plot(epochs, train_curve, label=f"Train {metric}", marker="o")
    plt.plot(epochs, test_curve, label=f"Test {metric}", marker="s")
    plt.xlabel("Epoch")
    plt.ylabel(metric)
    plt.title(panel_title)
    plt.legend()
    plt.grid()

plt.show()

# Final evaluation: collect predictions for the whole test loader and report
# balanced accuracy plus the per-class classification report.
model.eval()
y_pred_list, y_true_list = [], []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to(device)
        # FTTransformer's forward takes (x_num, x_cat); there are no
        # categorical features, so x_cat is passed explicitly as None.
        outputs = model(X_batch, None)
        # Predicted class = index of the largest logit in each row.
        _, y_pred = torch.max(outputs, 1)
        y_pred_list.extend(y_pred.cpu().numpy())
        # y_batch was never moved off the CPU, so it converts directly.
        y_true_list.extend(y_batch.numpy())

# Compute final metrics
balanced_acc = balanced_accuracy_score(y_true_list, y_pred_list)
print("\nFinal Balanced Accuracy:", balanced_acc)
print("\nClassification Report:\n", classification_report(y_true_list, y_pred_list))
