In [1]:
import torch
from torch.utils.data import DataLoader
from datasets.dataset_utils import *
from utils.feature_utils import generate_feature_matrix
from utils.adjacency_utils import generate_adjacency_matrix

# Path to the CSV file containing the data
csv_file = 'C:/Users/user/Downloads/VAE_GCN/CSD_EES_DB_with_ST_split.csv'
target_col = 'ST_split'  # Column to use as target (replace with your target column)

# Initialize the dataset
dataset = MolecularDataset(csv_file=csv_file, target_col=target_col)

# Create a DataLoader to iterate through the dataset
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Test the dataset by iterating over it
for i, (adj_normalized, features, num_atoms, target, idx) in enumerate(dataloader):
    print(f"Batch {i+1}")
    print("Adjacency Matrix (normalized):")
    print(adj_normalized.shape)
    print("Features Matrix:")
    print(features.shape)
    print("Number of Atoms:", num_atoms)
    print("Target:", target)
    print("Index:", idx)

    # Optionally print the first sample in the batch to inspect
    if i == 0:
        print("\nFirst Sample in the Batch:")
        print("Adjacency Matrix:", adj_normalized[0])
        print("Feature Matrix:", features[0])
        print("Target:", target[0])
        print("Index:", idx[0])
    
    print('-' * 40)
    if i >= 2:  # Limit to a few batches for testing
        break


[15:41:12] Explicit valence for atom # 20 N, 4, is greater than permitted
[15:41:12] Explicit valence for atom # 31 N, 4, is greater than permitted
[15:41:12] Explicit valence for atom # 26 B, 4, is greater than permitted
[15:41:12] Explicit valence for atom # 26 N, 4, is greater than permitted
[15:41:12] Explicit valence for atom # 7 B, 4, is greater than permitted
[15:41:12] Explicit valence for atom # 10 N, 4, is greater than permitted
[15:41:12] Explicit valence for atom # 10 N, 4, is greater than permitted
[15:41:12] Explicit valence for atom # 10 N, 4, is greater than permitted
[15:41:12] Explicit valence for atom # 8 N, 4, is greater than permitted
[15:41:12] Explicit valence for atom # 12 N, 4, is greater than permitted
[15:41:12] Explicit valence for atom # 12 N, 4, is greater than permitted
[15:41:12] Explicit valence for atom # 12 N, 4, is greater than permitted
[15:41:12] Explicit valence for atom # 12 N, 4, is greater than permitted
[15:41:13] Explicit valence for atom # 2

Batch 1
Adjacency Matrix (normalized):
torch.Size([4, 460, 460])
Features Matrix:
torch.Size([4, 460, 61])
Number of Atoms: tensor([44, 42, 40, 21])
Target: tensor([0.4177, 0.4644, 1.5034, 0.3733])
Index: tensor([44896, 23426, 25264, 46085])

First Sample in the Batch:
Adjacency Matrix: tensor([[0.6667, 0.2357, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.2357, 0.3333, 0.2357,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.2357, 0.6667,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 1.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 1.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000]])
Feature Matrix: tensor([[0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
Target: tensor(0.4177)
Index: tensor(44896)


In [2]:
import torch
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import KFold
from utils.feature_utils import generate_feature_matrix
from utils.adjacency_utils import generate_adjacency_matrix

# Load dataset
dataset_path = "C:/Users/user/Downloads/VAE_GCN/CSD_EES_DB_with_ST_split.csv"
dataset = MolecularDataset(dataset_path, target_col="ST_split")

# K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

batch_size = 4  # You can adjust this as needed
fold = 1

for train_index, val_index in kf.split(dataset):
    print(f"Starting Fold {fold}...")

    # Create train and validation subsets using the indices from KFold
    train_subset = Subset(dataset, train_index)
    val_subset = Subset(dataset, val_index)

    # Create DataLoader for training and validation sets
    train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)

    # Train and validate the model for this fold (this is just an example for data loading)
    # You would add your model training and evaluation code here
    
    # Example for inspecting a few batches during training
    for i, (adj_normalized, features, num_atoms, target, idx) in enumerate(train_loader):
        print(f"Train Batch {i+1} - Fold {fold}")
        print("Adjacency Matrix (normalized):", adj_normalized.shape)
        print("Features Matrix:", features.shape)
        print("Number of Atoms:", num_atoms)
        print("Target:", target)
        print("Index:", idx)
        
        if i >= 2:  # Limit to a few batches for inspection
            break
    
    # Example for inspecting a few batches during validation
    for i, (adj_normalized, features, num_atoms, target, idx) in enumerate(val_loader):
        print(f"Validation Batch {i+1} - Fold {fold}")
        print("Adjacency Matrix (normalized):", adj_normalized.shape)
        print("Features Matrix:", features.shape)
        print("Number of Atoms:", num_atoms)
        print("Target:", target)
        print("Index:", idx)
        
        if i >= 2:  # Limit to a few batches for inspection
            break

    fold += 1
    print('-' * 40)


[16:11:33] Explicit valence for atom # 20 N, 4, is greater than permitted
[16:11:33] Explicit valence for atom # 31 N, 4, is greater than permitted
[16:11:33] Explicit valence for atom # 26 B, 4, is greater than permitted
[16:11:33] Explicit valence for atom # 26 N, 4, is greater than permitted
[16:11:33] Explicit valence for atom # 7 B, 4, is greater than permitted
[16:11:33] Explicit valence for atom # 10 N, 4, is greater than permitted
[16:11:33] Explicit valence for atom # 10 N, 4, is greater than permitted
[16:11:33] Explicit valence for atom # 10 N, 4, is greater than permitted
[16:11:33] Explicit valence for atom # 8 N, 4, is greater than permitted
[16:11:33] Explicit valence for atom # 12 N, 4, is greater than permitted
[16:11:33] Explicit valence for atom # 12 N, 4, is greater than permitted
[16:11:33] Explicit valence for atom # 12 N, 4, is greater than permitted
[16:11:33] Explicit valence for atom # 12 N, 4, is greater than permitted
[16:11:33] Explicit valence for atom # 2

Using device: cpu
Starting Fold 1...
Train Batch 1 - Fold 1
Adjacency Matrix (normalized): torch.Size([4, 460, 460])
Features Matrix: torch.Size([4, 460, 61])
Number of Atoms: tensor([17, 14, 32, 38])
Target: tensor([1.4303, 1.2859, 1.2447, 0.7371])
Index: tensor([ 7466, 46454, 15111, 38863], dtype=torch.int32)
Train Batch 2 - Fold 1
Adjacency Matrix (normalized): torch.Size([4, 460, 460])
Features Matrix: torch.Size([4, 460, 61])
Number of Atoms: tensor([20, 28, 19, 36])
Target: tensor([1.1702, 0.6195, 0.7514, 0.3382])
Index: tensor([25443,  9865, 17340, 36732], dtype=torch.int32)


KeyError: 25645