# 🧪 EDA and Preprocessing - QM7b Dataset
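This notebook performs exploratory data analysis (EDA) and preprocessing for a binary classification task derived from the QM7b dataset: each molecule is labeled by whether its HOMO-LUMO gap falls in the lowest quartile of the dataset.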

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import torch
from torch_geometric.datasets import QM7b
from torch_geometric.transforms import Distance
from torch_geometric.loader import DataLoader

# Use seaborn's "whitegrid" style for the plots drawn in this notebook.
sns.set(style="whitegrid")


## 📥 Load QM7b Dataset

In [None]:

# QM7b graphs as shipped by PyTorch Geometric carry `edge_index`, `edge_attr`
# (the non-zero Coulomb-matrix entries) and a (1, 14) target tensor `y`, but
# no atomic coordinates. The `Distance` transform needs `data.pos`, so
# applying it here fails as soon as a sample is accessed; the dataset is
# therefore loaded without a transform.
dataset = QM7b(root="data/qm7b")
print(f"Dataset size: {len(dataset)}")
print(dataset[0])


## 🧪 Create Labels Based on HOMO-LUMO Gap

In [None]:

# PyTorch Geometric's QM7b samples have no `homo` / `lumo` attributes; all 14
# regression targets live in the (1, 14) tensor `data.y`.
# NOTE(review): the column indices below follow the property order published
# with the QM7b dataset (HOMO and LUMO at the ZINDO level of theory) --
# confirm against the dataset description before relying on the labels.
HOMO_COL = 3
LUMO_COL = 4

# Fraction of molecules (those with the smallest gaps) labeled as class 1.
GOOD_CANDIDATE_QUANTILE = 0.25

data_list = []
energy_gaps = []

for data in dataset:
    targets = data.y.view(-1)  # flatten (1, 14) -> (14,)
    gap = targets[LUMO_COL].item() - targets[HOMO_COL].item()
    energy_gaps.append(gap)
    data.energy_gap = gap  # keep the raw gap on the sample for later inspection
    data_list.append(data)

# A small HOMO-LUMO gap is used here as a proxy for high reactivity, so the
# lowest-gap quartile forms the "good candidate" class.
threshold = np.quantile(energy_gaps, GOOD_CANDIDATE_QUANTILE)
print(f"Energy gap threshold: {threshold:.4f}")

# Replace the 14 regression targets with the binary class label.
for data in data_list:
    is_good_candidate = data.energy_gap < threshold
    data.y = torch.tensor([int(is_good_candidate)], dtype=torch.long)


## 📊 Class Balance

In [None]:

# Collect the binary class label of every sample as a plain Python int.
labels = [graph.y.item() for graph in data_list]

# Bar chart of how many samples fall into each class.
class_axes = sns.countplot(x=labels)
class_axes.set_title("Class Distribution (0: Not Good, 1: Good Candidate)")
plt.show()


## 🔍 Explore Features

In [None]:

# Inspect the first sample. PyTorch Geometric's QM7b samples store the
# Coulomb matrix as edges (`edge_index` / `edge_attr`) and do not provide
# atomic numbers (`z`) or coordinates (`pos`), so those fields are printed
# only when they are actually present instead of crashing the cell.
example = data_list[0]
print(f"Number of atoms: {example.num_nodes}")

atomic_numbers = getattr(example, "z", None)
if atomic_numbers is not None:
    print("Atomic numbers:", atomic_numbers.tolist())

positions = getattr(example, "pos", None)
if positions is not None:
    print("Positions shape:", positions.shape)

for caption, field in (
    ("Edge index shape:", "edge_index"),
    ("Edge attribute shape:", "edge_attr"),
):
    value = getattr(example, field, None)
    if value is not None:
        print(caption, value.shape)


## 🔀 Train/Validation/Test Split

In [None]:

# Hold out 20% of the samples as the test set, stratified by class label.
train_val, test = train_test_split(
    data_list,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Split the remaining 80% again: 25% of it (20% of the full dataset) becomes
# the validation set, giving a 60/20/20 train/validation/test partition.
train_val_labels = [sample.y.item() for sample in train_val]
train, val = train_test_split(
    train_val,
    test_size=0.25,
    random_state=42,
    stratify=train_val_labels,
)

print(f"Train: {len(train)}, Validation: {len(val)}, Test: {len(test)}")
