In [None]:
# ====== 1. Cell: Data Loading and Settings ======
import os
import pandas as pd

# Locations of the feature table and of the extracted CIF structure files.
csv_path = "/kaggle/input/features/modelde_kullanlan_Data.csv"
cif_dir = "/kaggle/input/cif-zip/cif_indirme"

# Read the feature table and strip the regression target (when present) so that
# downstream feature engineering cannot see it.
df = pd.read_csv(csv_path).drop(columns=["band_gap"], errors="ignore")

print(f"✅ Total {len(df)} rows loaded from CSV.")
print(f"✅ CIF files: {len(os.listdir(cif_dir))} found.")

# Persist the target-free table for the following cells.
df.to_csv("/kaggle/working/material_no_target.csv", index=False)

In [None]:
import os
import pandas as pd

# Where the CIF files live, and the target-free table written by the first cell.
cif_dir = "/kaggle/input/cif-zip/cif_indirme"
df = pd.read_csv("/kaggle/working/material_no_target.csv")

# Positional indices of every row whose "<material_id>.cif" is present on disk.
valid_indices = [
    row
    for row in range(len(df))
    if os.path.exists(
        os.path.join(cif_dir, "{}.cif".format(df.iloc[row]["material_id"]))
    )
]

print(f"✅ Number of entries with valid CIF files: {len(valid_indices)}")

# Keep only those rows, renumber them from zero and store the result on disk.
df_filtered = df.iloc[valid_indices].reset_index(drop=True)
df_filtered.to_csv("/kaggle/working/material_no_target_filtered.csv", index=False)

In [None]:
import warnings
warnings.filterwarnings("ignore")  # Suppress all warnings

import torch
from torch_geometric.loader import DataLoader
from torch.utils.data import Dataset
from pymatgen.core import Structure
import os
import pandas as pd
from torch_geometric.nn import SchNet
import torch_geometric.data

# Run on the GPU when CUDA is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Directory with the CIF files and the table of materials that have one.
cif_dir = "/kaggle/input/cif-zip/cif_indirme"
df_filtered = pd.read_csv("/kaggle/working/material_no_target_filtered.csv")

class CrystalDataset(Dataset):
    """In-memory dataset of crystal structures read from CIF files.

    Every row of ``dataframe`` must carry a ``material_id``; the structure is
    parsed from ``<cif_dir>/<material_id>.cif`` once, at construction time.
    Items are PyTorch Geometric ``Data`` objects holding atomic numbers ``z``
    and Cartesian coordinates ``pos``.
    """

    def __init__(self, dataframe, cif_dir):
        """Parse and cache the structure of every row in ``dataframe``.

        Args:
            dataframe: Table with a ``material_id`` column.
            cif_dir: Directory containing one ``<material_id>.cif`` per row.
        """
        super().__init__()
        self.df = dataframe
        self.cif_dir = cif_dir
        self.structures = []

        def _cif_paths():
            # Yield the CIF path of each row, in row order.
            for row in range(len(self.df)):
                material_id = self.df.iloc[row]["material_id"]
                yield os.path.join(self.cif_dir, f"{material_id}.cif")

        print("Pre-loading CIF files... (this may take a while)")
        self.structures.extend(Structure.from_file(path) for path in _cif_paths())
        print("CIF files loaded successfully.")

    def __len__(self):
        """Number of rows in the backing dataframe."""
        return len(self.df)

    def __getitem__(self, idx):
        """Build the graph sample (``z``, ``pos``) for the structure at ``idx``."""
        crystal = self.structures[idx]
        atomic_numbers = [site.specie.number for site in crystal.sites]
        return torch_geometric.data.Data(
            z=torch.tensor(atomic_numbers, dtype=torch.long),
            pos=torch.tensor(crystal.cart_coords, dtype=torch.float),
        )

dataset = CrystalDataset(df_filtered, cif_dir)

# Shuffled loader for optimisation.
loader = DataLoader(dataset, batch_size=16, shuffle=True, num_workers=0)
# Order-preserving loader for inference: prediction i must belong to row i of
# df_filtered when the results are written back below.
eval_loader = DataLoader(dataset, batch_size=16, shuffle=False, num_workers=0)

# Initialize SchNet model
model = SchNet(hidden_channels=128, num_filters=128, num_interactions=6,
               num_gaussians=50, cutoff=10.0, readout="add").to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = torch.nn.MSELoss()

# Training loop
for epoch in range(3):
    model.train()
    total_loss = 0
    for batch in loader:
        batch = batch.to(device)
        # Placeholder for real target values (replace with actual labels).
        # While these random targets are in place the predictions saved below
        # carry no physical meaning.
        target = torch.rand(batch.num_graphs, device=device)

        optimizer.zero_grad()
        # SchNet.forward takes (z, pos, batch), not the Batch object itself;
        # batch.batch assigns every atom to its crystal for the readout.
        pred = model(batch.z, batch.pos, batch.batch).view(-1)
        loss = loss_fn(pred, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss/len(loader):.4f}")

# Inference (unshuffled, so the output order matches df_filtered)
model.eval()
predictions = []
with torch.no_grad():
    for batch in eval_loader:
        batch = batch.to(device)
        pred = model(batch.z, batch.pos, batch.batch).view(-1)
        predictions.extend(pred.cpu().tolist())

print(f"Total predictions: {len(predictions)} - DataFrame rows: {len(df_filtered)}")
assert len(predictions) == len(df_filtered), "Prediction count mismatch!"

# Save results
df_filtered["predicted_band_gap"] = predictions
df_filtered.to_csv("/kaggle/working/band_gap_predictions.csv", index=False)
print("Predictions saved to: /kaggle/working/band_gap_predictions.csv")

In [None]:
# Training loader: larger batches, two background workers and pinned host
# memory for quicker host-to-GPU copies.
loader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=2, pin_memory=True)

# Fresh SchNet regressor.
model = SchNet(
    hidden_channels=128, num_filters=128, num_interactions=6,
    num_gaussians=50, cutoff=10.0, readout="add",
)
model = model.to(device)

# Adam optimiser and mean-squared-error objective.
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
loss_fn = torch.nn.MSELoss()

# Three passes over the data.
for epoch in range(3):
    model.train()
    total_loss = 0
    for batch in loader:
        batch = batch.to(device)

        # Random stand-in labels, one per graph; real targets are not wired in here.
        target = torch.rand(batch.num_graphs, device=device)

        optimizer.zero_grad()

        # Atomic numbers, coordinates and the atom-to-graph index go in;
        # one value per graph comes out.
        pred = model(z=batch.z, pos=batch.pos, batch=batch.batch).view(-1)

        loss = loss_fn(pred, target)
        loss.backward()
        optimizer.step()

        total_loss = total_loss + loss.item()

    print(f"Epoch {epoch+1} - Loss: {total_loss/len(loader):.4f}")

In [None]:
# Set model to evaluation mode
model.eval()
predictions = []

# The training loader shuffles its samples, so iterating it here would attach
# predictions to the wrong rows. Use a dedicated order-preserving loader so
# that prediction i corresponds to row i of df_filtered.
inference_loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=0)

# Disable gradient calculation for faster inference and lower memory usage
with torch.no_grad():
    for batch in inference_loader:
        batch = batch.to(device)
        # Forward pass using atomic numbers, positions, and batch mapping
        pred = model(z=batch.z, pos=batch.pos, batch=batch.batch).view(-1)
        predictions.extend(pred.cpu().tolist())

print(f"Total predictions: {len(predictions)} - Filtered DataFrame rows: {len(df_filtered)}")
assert len(predictions) == len(df_filtered), "Prediction count mismatch!"

# Add predictions to the dataframe
df_filtered["predicted_band_gap"] = predictions

# Save the final results to a CSV file
output_path = "/kaggle/working/band_gap_predictions.csv"
df_filtered.to_csv(output_path, index=False)
print(f"Predictions successfully saved to: {output_path}")

In [None]:
import pandas as pd

# Inputs: the original feature table and the predictions written earlier.
main_csv_path = "/kaggle/input/features/modelde_kullanlan_Data.csv"
pred_csv_path = "/kaggle/working/band_gap_predictions.csv"
main_df = pd.read_csv(main_csv_path)
pred_df = pd.read_csv(pred_csv_path)

# Left join on material_id: every row of the main table is kept, and rows
# without a prediction receive NaN in predicted_band_gap.
prediction_columns = pred_df[['material_id', 'predicted_band_gap']]
merged_df = pd.merge(main_df, prediction_columns, on='material_id', how='left')

# Write the combined table.
output_path = "/kaggle/working/main_with_predictions.csv"
merged_df.to_csv(output_path, index=False)

print(f"✅ New CSV saved successfully: {output_path}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Predicted band gaps, as merged onto the main table.
df = pd.read_csv("/kaggle/working/main_with_predictions.csv")
bandgaps = df['predicted_band_gap']

# Share of predictions that are exactly zero (potential metals).
total_count = len(bandgaps)
zero_count = (bandgaps == 0).sum()
zero_ratio = zero_count / total_count * 100

print(f"Total samples: {total_count}")
print(f"Zero band gap count: {zero_count} ({zero_ratio:.2f}%)")

# Summary statistics of the predictions.
print("\nBand gap summary statistics:")
print(bandgaps.describe())

# Histogram with a KDE overlay, written to disk as a PNG.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(bandgaps, bins=50, kde=True, color="blue", ax=ax)
ax.set_title("Distribution of Band Gap Predictions")
ax.set_xlabel("Band Gap (eV)")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.savefig("band_gap_distribution.png")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Original feature table, which still contains the measured band_gap column.
df = pd.read_csv("/kaggle/input/features/modelde_kullanlan_Data.csv")
bandgaps = df['band_gap']

# How many entries have a band gap of exactly zero (often metallic behaviour).
total_count = len(bandgaps)
zero_count = (bandgaps == 0).sum()
zero_ratio = (zero_count / total_count) * 100

print(f"Total number of samples: {total_count}")
print(f"Zero band gap count: {zero_count} ({zero_ratio:.2f}%)")

# Descriptive statistics of the ground-truth values.
print("\nBand gap descriptive statistics:")
print(bandgaps.describe())

# Histogram with a kernel density estimate on top.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(bandgaps, bins=50, kde=True, color="blue", ax=ax)
ax.set_title("Distribution of Band Gap Values")
ax.set_xlabel("Band Gap (eV)")
ax.set_ylabel("Frequency")
ax.grid(True)
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Original table (loaded here but not used by the comparison below) and the
# merged table that holds both the true and the predicted band gap.
df_main = pd.read_csv("/kaggle/input/features/modelde_kullanlan_Data.csv")
df_pred = pd.read_csv("/kaggle/working/main_with_predictions.csv")

# Keep just the identifier, the ground truth and the prediction.
df_compare = df_pred.loc[:, ['material_id', 'band_gap', 'predicted_band_gap']].copy()

# Rows missing either value cannot be scored, so discard them.
df_compare_clean = df_compare.dropna(subset=['band_gap', 'predicted_band_gap'])

print(f"Number of rows used for analysis: {len(df_compare_clean)}")

y_true, y_pred = df_compare_clean['band_gap'], df_compare_clean['predicted_band_gap']

# Regression metrics: MAE, MSE, RMSE (square root of the MSE) and R².
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_true, y_pred)

print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

In [None]:
# Side-by-side summary statistics of the true and the predicted band gaps.
print("Ground truth band gap range (Original Data):", y_true.describe(), sep="\n")

print("\nPredicted band gap range (Model Output):", y_pred.describe(), sep="\n")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(figsize=(10, 5))

# Each point is one material: true value on x, model output on y.
sns.scatterplot(x=y_true, y=y_pred, alpha=0.3, ax=ax)

# Axis labels (in eV) and title.
ax.set_xlabel("Actual Band Gap (eV)")
ax.set_ylabel("Predicted Band Gap (eV)")
ax.set_title("Actual vs. Predicted Band Gap Distribution")

# Dashed red diagonal marks perfect agreement over the range of true values.
diagonal = [y_true.min(), y_true.max()]
ax.plot(diagonal, diagonal, 'r--')

# Add the grid and write the figure to disk.
ax.grid(True)
fig.savefig("parity_plot.png")

In [None]:
import numpy as np

# Force predictions into [0, largest observed true band gap].
y_pred_clipped = np.clip(y_pred, a_min=0, a_max=y_true.max())

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Same four metrics as before, now scored on the clipped predictions.
mae = mean_absolute_error(y_true, y_pred_clipped)
mse = mean_squared_error(y_true, y_pred_clipped)
r2 = r2_score(y_true, y_pred_clipped)
rmse = np.sqrt(mse)  # RMSE is derived from the MSE

print(f"MAE (after clipping): {mae:.4f}")
print(f"MSE (after clipping): {mse:.4f}")
print(f"RMSE (after clipping): {rmse:.4f}")
print(f"R² (after clipping): {r2:.4f}")

In [None]:
# Read the feature table from the Kaggle input directory for use as training data.
df_train = pd.read_csv(
    filepath_or_buffer="/kaggle/input/features/modelde_kullanlan_Data.csv"
)

In [None]:
import pandas as pd
import torch
from torch_geometric.loader import DataLoader
import torch_geometric.data

# The original table still holds the ground-truth band_gap for every material.
df_bandgap = pd.read_csv("/kaggle/input/features/modelde_kullanlan_Data.csv")

# Lookup table material_id -> band_gap; when an id occurs more than once the
# last occurrence wins.
material_to_bandgap = {
    material_id: band_gap
    for material_id, band_gap in zip(df_bandgap["material_id"], df_bandgap["band_gap"])
}

def add_targets_to_dataset(dataset):
    """Check that every material in ``dataset.df`` has an entry in ``material_to_bandgap``.

    Despite its name, the function attaches nothing to the dataset; it only
    validates the lookup and prints a confirmation when every row passes.

    Args:
        dataset: Object exposing a ``df`` dataframe with a ``material_id`` column.

    Raises:
        ValueError: For the first material whose lookup yields ``None``.
    """
    frame = dataset.df
    for row in range(len(frame)):
        mat_id = frame.iloc[row]["material_id"]
        if material_to_bandgap.get(mat_id) is None:
            raise ValueError(f"Band gap value not found for: {mat_id}")
    print("Band gap values successfully mapped.")

# CrystalDataset variant whose samples also carry the regression target 'y'
class CrystalDatasetWithTarget(CrystalDataset):
    """Crystal dataset whose items include the band gap as ``data.y``.

    Structures are the ones pre-loaded by the parent class; targets come from
    the module-level ``material_to_bandgap`` lookup.
    """

    def __getitem__(self, idx):
        """Return the graph sample at ``idx`` with ``z``, ``pos`` and ``y``.

        Raises:
            ValueError: If the lookup yields ``None`` for this material.
        """
        crystal = self.structures[idx]

        # Graph inputs: one atomic number and one Cartesian position per site.
        sample = torch_geometric.data.Data(
            z=torch.tensor([site.specie.number for site in crystal.sites], dtype=torch.long),
            pos=torch.tensor(crystal.cart_coords, dtype=torch.float),
        )

        # Resolve the target through the material id of the same row.
        material_id = self.df.iloc[idx]["material_id"]
        band_gap = material_to_bandgap.get(material_id)
        if band_gap is None:
            raise ValueError(f"Band gap value not found for material: {material_id}")

        # Stored as a float tensor so it can be batched with the graph.
        sample.y = torch.tensor(band_gap, dtype=torch.float)
        return sample

# Build the target-aware dataset on top of the CIF-filtered table.
dataset_with_target = CrystalDatasetWithTarget(df_filtered, cif_dir)

# Shuffled mini-batches of 16 graphs, loaded in the main process
# (batch size and worker count can be tuned to the available hardware).
loader = DataLoader(
    dataset_with_target,
    batch_size=16,
    shuffle=True,
    num_workers=0,
)

In [None]:
import torch
import pandas as pd

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize SchNet model
model = SchNet(
    hidden_channels=128,
    num_filters=128,
    num_interactions=6,
    num_gaussians=50,
    cutoff=10.0,
    readout="add"
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = torch.nn.MSELoss()

# Training Loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()

        # batch.batch assigns every atom to its crystal. Without it SchNet
        # pools all atoms of the mini-batch into a single value, which would
        # then be broadcast against every target in the batch.
        pred = model(batch.z, batch.pos, batch.batch).view(-1)

        # Calculate loss using ground truth (batch.y), one value per graph
        loss = loss_fn(pred, batch.y.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(loader)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f}")

# Evaluation and Data Collection
model.eval()
predictions = []
true_values = []

# Iterate the same dataset without shuffling so that the collected values line
# up with the rows of df_filtered when they are written back below.
eval_loader = DataLoader(loader.dataset, batch_size=16, shuffle=False, num_workers=0)

with torch.no_grad():
    for batch in eval_loader:
        batch = batch.to(device)
        pred = model(batch.z, batch.pos, batch.batch).view(-1)

        # Move data to CPU and convert to list for storage
        predictions.extend(pred.cpu().tolist())
        true_values.extend(batch.y.view(-1).cpu().tolist())

print(f"Total Predictions: {len(predictions)} - Total Ground Truths: {len(true_values)}")
assert len(predictions) == len(df_filtered), "Prediction count mismatch!"
assert len(true_values) == len(df_filtered), "Ground truth count mismatch!"

# Merge results with the filtered dataframe and save to CSV
df_filtered["predicted_band_gap"] = predictions
df_filtered["true_band_gap"] = true_values

output_file = "/kaggle/working/band_gap_predictions_with_true.csv"
df_filtered.to_csv(output_file, index=False)
print(f"Results saved to: {output_file}")

In [None]:
class CrystalDataset(Dataset):
    """In-memory crystal dataset whose samples carry the band gap as ``y``.

    This definition replaces the earlier ``CrystalDataset`` in the notebook.
    Unlike that one, ``__getitem__`` reads the target from the ``band_gap``
    column of ``dataframe``, so the dataframe passed in must contain it.
    """

    def __init__(self, dataframe, cif_dir):
        """Parse and cache ``<cif_dir>/<material_id>.cif`` for every row.

        Args:
            dataframe: Table with a ``material_id`` column (and ``band_gap``
                for item access).
            cif_dir: Directory holding the CIF files.
        """
        super().__init__()
        self.df = dataframe
        self.cif_dir = cif_dir
        self.structures = []

        def _cif_paths():
            # Yield the CIF path of each row, in row order.
            for row in range(len(self.df)):
                material_id = self.df.iloc[row]["material_id"]
                yield os.path.join(self.cif_dir, f"{material_id}.cif")

        # Structures are parsed once, up front, so item access stays cheap.
        print("Pre-loading CIF files... (this may take a while)")
        self.structures.extend(Structure.from_file(path) for path in _cif_paths())
        print("CIF files loaded successfully.")

    def __len__(self):
        """Number of rows in the backing dataframe."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the graph sample at ``idx`` with ``z``, ``pos`` and ``y``."""
        crystal = self.structures[idx]

        # Graph inputs: atomic numbers and Cartesian coordinates of all sites.
        atomic_numbers = [site.specie.number for site in crystal.sites]
        sample = torch_geometric.data.Data(
            z=torch.tensor(atomic_numbers, dtype=torch.long),
            pos=torch.tensor(crystal.cart_coords, dtype=torch.float),
        )

        # Target: band_gap of the same row, stored as a float tensor.
        sample.y = torch.tensor(self.df.iloc[idx]["band_gap"], dtype=torch.float)
        return sample

In [None]:
import torch
import pandas as pd

# Device configuration (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize SchNet model with specific hyperparameters for crystal structures
model = SchNet(
    hidden_channels=128,
    num_filters=128,
    num_interactions=6,
    num_gaussians=50,
    cutoff=10.0,
    readout="add"
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = torch.nn.MSELoss()

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()

        # Pass the atom-to-graph index as well: without batch.batch SchNet
        # treats the whole mini-batch as one structure and returns a single
        # value instead of one prediction per crystal.
        pred = model(batch.z, batch.pos, batch.batch).view(-1)

        # Calculate Mean Squared Error against true values (one per graph)
        loss = loss_fn(pred, batch.y.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(loader)
    print(f"Epoch {epoch+1}/{epochs} - Training Loss: {avg_loss:.4f}")

# Model Evaluation Phase
model.eval()
predictions = []
true_values = []

# Walk the dataset in its stored order so that the results can be attached to
# df_filtered row by row.
eval_loader = DataLoader(loader.dataset, batch_size=16, shuffle=False, num_workers=0)

# Disable gradient calculation for efficiency during inference
with torch.no_grad():
    for batch in eval_loader:
        batch = batch.to(device)
        pred = model(batch.z, batch.pos, batch.batch).view(-1)

        # Collect results back to CPU
        predictions.extend(pred.cpu().tolist())
        true_values.extend(batch.y.view(-1).cpu().tolist())

print(f"Total predictions: {len(predictions)} - Total ground truth values: {len(true_values)}")
assert len(predictions) == len(df_filtered), "Prediction count mismatch!"
assert len(true_values) == len(df_filtered), "Ground truth count mismatch!"

# Append results to the filtered dataframe
df_filtered["predicted_band_gap"] = predictions
df_filtered["true_band_gap"] = true_values

# Save comparison results to a CSV file
output_path = "/kaggle/working/band_gap_predictions_with_true.csv"
df_filtered.to_csv(output_path, index=False)
print(f"Results successfully saved to: {output_path}")

In [None]:
# Sanity check: size of `dataset` versus the dataset wrapped by the current loader.
print("Dataset length: {}".format(len(dataset)))
print("Total samples in Loader: {}".format(len(loader.dataset)))

In [None]:
# Re-initializing loader for inference (Shuffle is False to maintain order)
loader = DataLoader(dataset_with_target, batch_size=16, shuffle=False, num_workers=0)

model.eval()
predictions = []
true_values = []

# Gradient calculation is disabled for faster and more memory-efficient inference
with torch.no_grad():
    for batch in loader:
        batch = batch.to(device)
        # batch.batch tells SchNet which crystal each atom belongs to; leaving
        # it out yields one pooled value per mini-batch instead of one
        # prediction per crystal.
        pred = model(batch.z, batch.pos, batch.batch).view(-1)

        # Collect results and move to CPU
        predictions.extend(pred.cpu().tolist())
        true_values.extend(batch.y.view(-1).cpu().tolist())

print(f"Total predictions: {len(predictions)} - Total ground truth values: {len(true_values)}")
print(f"Dataset length: {len(dataset_with_target)}")

# Fail with a clear message before touching the dataframe if counts disagree.
assert len(predictions) == len(df_filtered), "Prediction count mismatch!"
assert len(true_values) == len(df_filtered), "Ground truth count mismatch!"

# Append predictions and actual values to the dataframe
import pandas as pd
df_filtered["predicted_band_gap"] = predictions
df_filtered["true_band_gap"] = true_values

# Save to CSV for analysis
output_csv = "/kaggle/working/band_gap_predictions_with_true.csv"
df_filtered.to_csv(output_csv, index=False)
print(f"Predictions and ground truths saved to: {output_csv}")

In [None]:
# Ensure Dataset and DataLoader are loaded correctly
print(f"Dataset length: {len(dataset_with_target)}")  # Target: 123,634
# Ordered loader: used for inference so results line up with df_filtered.
loader = DataLoader(dataset_with_target, batch_size=16, shuffle=False, num_workers=0)
print(f"Loader sample count: {len(loader.dataset)}")  # Should match dataset length
# Shuffled loader: used for optimisation only, so that mini-batches are not
# presented in the same fixed order every epoch.
train_loader = DataLoader(dataset_with_target, batch_size=16, shuffle=True, num_workers=0)

# Model and Optimizer Definition
# Using SchNet for continuous property prediction (regression)
model = SchNet(
    hidden_channels=128,
    num_filters=128,
    num_interactions=6,
    num_gaussians=50,
    cutoff=10.0,
    readout="add"
).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = torch.nn.MSELoss()

# Training (If loss returns 'nan', verify your input data/scaling)
for epoch in range(3):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()

        # Forward pass: one predicted band gap per crystal. batch.batch is
        # required for that; without it all atoms are pooled into one value.
        pred = model(batch.z, batch.pos, batch.batch).view(-1)

        # Calculate loss against ground truth
        loss = loss_fn(pred, batch.y.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/3 - Average Loss: {total_loss/len(train_loader):.4f}")

# Inference Phase (the unshuffled loader is critical for data alignment)
model.eval()
predictions = []
true_values = []

with torch.no_grad():
    for batch in loader:
        batch = batch.to(device)
        pred = model(batch.z, batch.pos, batch.batch).view(-1)

        # Collect values for final dataframe construction
        predictions.extend(pred.cpu().tolist())
        true_values.extend(batch.y.view(-1).cpu().tolist())

print(f"Total Predictions: {len(predictions)} - Total True Values: {len(true_values)}")

# Data Integrity Checks
# Ensuring output counts perfectly match the input dataset
assert len(predictions) == len(dataset_with_target), "Prediction count mismatch!"
assert len(true_values) == len(dataset_with_target), "Ground truth count mismatch!"

# Store results back to the dataframe
df_filtered["predicted_band_gap"] = predictions
df_filtered["true_band_gap"] = true_values

# Export the results
output_path = "/kaggle/working/band_gap_predictions_with_true.csv"
df_filtered.to_csv(output_path, index=False)
print(f"✅ Predictions and true values successfully saved to: {output_path}")

In [None]:
import torch
import numpy as np
from torch_geometric.loader import DataLoader
from torch.utils.data import Subset

# 1. Cleaning Problematic Data
def clean_dataset(dataset):
    """Return the indices of the samples that are safe to train on.

    A sample is rejected when its target ``y`` is NaN/Inf, when any coordinate
    in ``pos`` is NaN/Inf, when any atomic number in ``z`` lies outside 1..118,
    or when reading or checking it raises an exception (the message is printed
    for the first ten indices only).

    Args:
        dataset: Indexable collection supporting ``len()`` whose items expose
            ``y``, ``pos`` and ``z`` tensors.

    Returns:
        list[int]: Positions of the accepted samples, in ascending order.
    """
    kept = []
    rejected = 0
    total = len(dataset)

    print("Cleaning dataset...")
    for position in range(total):
        try:
            sample = dataset[position]
            # Evaluated left to right (target, positions, atomic numbers);
            # evaluation stops at the first check that fires.
            invalid = bool(
                torch.isnan(sample.y) or torch.isinf(sample.y)
                or torch.isnan(sample.pos).any() or torch.isinf(sample.pos).any()
                or (sample.z <= 0).any() or (sample.z > 118).any()
            )
        except Exception as err:
            rejected += 1
            if position < 10:
                print(f"Error at Index {position}: {err}")
            continue

        if invalid:
            rejected += 1
            continue
        kept.append(position)

        # Progress line; only reached for accepted samples.
        if position % 10000 == 0:
            print(f"Processed: {position}/{total}, Clean: {len(kept)}, Problematic: {rejected}")

    print(f"Clean samples: {len(kept)}")
    print(f"Problematic samples: {rejected}")
    return kept

# Execute cleaning
clean_indices = clean_dataset(dataset_with_target)

# Create a clean Subset
clean_dataset_obj = Subset(dataset_with_target, clean_indices)

# Hold out a seeded random slice (at most 1000 samples and at most 10% of the
# clean data) so that the final metrics are measured on structures the model
# never saw during training.
split_rng = np.random.default_rng(42)
shuffled_positions = split_rng.permutation(len(clean_dataset_obj)).tolist()
n_test = min(1000, len(clean_dataset_obj) // 10)
test_subset = Subset(clean_dataset_obj, shuffled_positions[:n_test])
train_subset = Subset(clean_dataset_obj, shuffled_positions[n_test:])
if len(train_subset) == 0:
    raise ValueError("No clean samples available for training.")
print(f"Train samples: {len(train_subset)}, held-out samples: {len(test_subset)}")

# Optimized DataLoader (training split only)
loader = DataLoader(
    train_subset,
    batch_size=32,
    shuffle=True,
    num_workers=2,
    pin_memory=torch.cuda.is_available(),
)

# 2. Target Statistics (for normalization context)
# Reported for information only; the targets are not rescaled below. Every
# sample was already validated by clean_dataset, so no error handling is
# needed here.
print("Collecting Y statistics...")
all_y_values = [train_subset[i].y.item() for i in range(min(1000, len(train_subset)))]

y_mean, y_std = np.mean(all_y_values), np.std(all_y_values)
print(f"Y Stats - Mean: {y_mean:.4f}, Std: {y_std:.4f}")

# 3. Model Definition - Scaled down for stability
model = SchNet(
    hidden_channels=64,
    num_filters=64,
    num_interactions=4,
    num_gaussians=25,
    cutoff=8.0,
    readout="mean"  # Using 'mean' instead of 'add' for smoother gradients
).to(device)

# 4. Optimizer & Scheduler
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)
loss_fn = torch.nn.MSELoss()

# 5. Training Loop
print("Starting training...")
for epoch in range(5):
    model.train()
    total_loss, valid_batches = 0, 0
    epoch_losses = []

    for batch_idx, batch in enumerate(loader):
        try:
            batch = batch.to(device)
            if torch.isnan(batch.y).any():
                continue

            optimizer.zero_grad()

            # One prediction per graph, always as a 1-D tensor.
            pred = model(batch.z, batch.pos, batch.batch).view(-1)
            if len(pred) != len(batch.y):
                continue

            # Skip batches whose forward pass or loss is not finite.
            if torch.isnan(pred).any():
                continue
            loss = loss_fn(pred, batch.y)
            if torch.isnan(loss):
                continue

            loss.backward()

            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            total_loss += loss.item()
            epoch_losses.append(loss.item())
            valid_batches += 1

            if batch_idx % 1000 == 0 and batch_idx > 0:
                print(f"Epoch {epoch+1}, Batch {batch_idx}: Avg Loss = {total_loss/valid_batches:.4f}")

        except Exception as e:
            # Best effort: report the failing batch and keep training.
            print(f"Batch {batch_idx} Error: {str(e)[:50]}")
            continue

    # Epoch summary
    if valid_batches > 0:
        avg_loss = total_loss / valid_batches
        scheduler.step(avg_loss)
        print(f"Epoch {epoch+1} Complete: Avg Loss: {avg_loss:.4f}, LR: {optimizer.param_groups[0]['lr']:.6f}")

# 6. Evaluation on the held-out split
print(f"\nRunning Evaluation ({len(test_subset)} held-out samples)...")
model.eval()
test_loader = DataLoader(test_subset, batch_size=32, shuffle=False)

preds, actuals = [], []
with torch.no_grad():
    for batch in test_loader:
        batch = batch.to(device)
        pred = model(batch.z, batch.pos, batch.batch).view(-1)
        preds.extend(pred.cpu().tolist())
        actuals.extend(batch.y.view(-1).cpu().tolist())

# Final Metrics (a correlation needs at least two points)
if len(preds) >= 2:
    mae = np.mean(np.abs(np.array(preds) - np.array(actuals)))
    correlation = np.corrcoef(preds, actuals)[0, 1]
    print(f"Final MAE: {mae:.4f}")
    print(f"Correlation: {correlation:.4f}")
else:
    print("Not enough held-out samples to compute metrics.")

In [None]:
import torch
import numpy as np
from torch_geometric.loader import DataLoader
from torch.utils.data import Subset

# 1. Cleaning Problematic Data
def clean_dataset(dataset):
    """Return the indices of the samples that are safe to train on.

    A sample is rejected when its target ``y`` is NaN/Inf, when any coordinate
    in ``pos`` is NaN/Inf, when any atomic number in ``z`` lies outside 1..118,
    or when reading or checking it raises an exception (the message is printed
    for the first ten indices only).

    Args:
        dataset: Indexable collection supporting ``len()`` whose items expose
            ``y``, ``pos`` and ``z`` tensors.

    Returns:
        list[int]: Positions of the accepted samples, in ascending order.
    """
    kept = []
    rejected = 0
    total = len(dataset)

    print("Cleaning dataset...")
    for position in range(total):
        try:
            sample = dataset[position]
            # Evaluated left to right (target, positions, atomic numbers);
            # evaluation stops at the first check that fires.
            invalid = bool(
                torch.isnan(sample.y) or torch.isinf(sample.y)
                or torch.isnan(sample.pos).any() or torch.isinf(sample.pos).any()
                or (sample.z <= 0).any() or (sample.z > 118).any()
            )
        except Exception as err:
            rejected += 1
            if position < 10:
                print(f"Error at Index {position}: {err}")
            continue

        if invalid:
            rejected += 1
            continue
        kept.append(position)

        # Progress line; only reached for accepted samples.
        if position % 10000 == 0:
            print(f"Processed: {position}/{total}, Clean: {len(kept)}, Problematic: {rejected}")

    print(f"Clean samples: {len(kept)}")
    print(f"Problematic samples: {rejected}")
    return kept

# NOTE(review): this cell appears to repeat the pipeline of the preceding cell;
# consider keeping only one copy.

# Execute cleaning
clean_indices = clean_dataset(dataset_with_target)

# Create a clean Subset
clean_dataset_obj = Subset(dataset_with_target, clean_indices)

# Hold out a seeded random slice (at most 1000 samples and at most 10% of the
# clean data) so that the final metrics are measured on structures the model
# never saw during training.
split_rng = np.random.default_rng(42)
shuffled_positions = split_rng.permutation(len(clean_dataset_obj)).tolist()
n_test = min(1000, len(clean_dataset_obj) // 10)
test_subset = Subset(clean_dataset_obj, shuffled_positions[:n_test])
train_subset = Subset(clean_dataset_obj, shuffled_positions[n_test:])
if len(train_subset) == 0:
    raise ValueError("No clean samples available for training.")
print(f"Train samples: {len(train_subset)}, held-out samples: {len(test_subset)}")

# Optimized DataLoader (training split only)
loader = DataLoader(
    train_subset,
    batch_size=32,
    shuffle=True,
    num_workers=2,
    pin_memory=torch.cuda.is_available(),
)

# 2. Target Statistics (for normalization context)
# Reported for information only; the targets are not rescaled below. Every
# sample was already validated by clean_dataset, so no error handling is
# needed here.
print("Collecting Y statistics...")
all_y_values = [train_subset[i].y.item() for i in range(min(1000, len(train_subset)))]

y_mean, y_std = np.mean(all_y_values), np.std(all_y_values)
print(f"Y Stats - Mean: {y_mean:.4f}, Std: {y_std:.4f}")

# 3. Model Definition - Scaled down for stability
model = SchNet(
    hidden_channels=64,
    num_filters=64,
    num_interactions=4,
    num_gaussians=25,
    cutoff=8.0,
    readout="mean"  # Using 'mean' instead of 'add' for smoother gradients
).to(device)

# 4. Optimizer & Scheduler
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)
loss_fn = torch.nn.MSELoss()

# 5. Training Loop
print("Starting training...")
for epoch in range(5):
    model.train()
    total_loss, valid_batches = 0, 0
    epoch_losses = []

    for batch_idx, batch in enumerate(loader):
        try:
            batch = batch.to(device)
            if torch.isnan(batch.y).any():
                continue

            optimizer.zero_grad()

            # One prediction per graph, always as a 1-D tensor.
            pred = model(batch.z, batch.pos, batch.batch).view(-1)
            if len(pred) != len(batch.y):
                continue

            # Skip batches whose forward pass or loss is not finite.
            if torch.isnan(pred).any():
                continue
            loss = loss_fn(pred, batch.y)
            if torch.isnan(loss):
                continue

            loss.backward()

            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

            total_loss += loss.item()
            epoch_losses.append(loss.item())
            valid_batches += 1

            if batch_idx % 1000 == 0 and batch_idx > 0:
                print(f"Epoch {epoch+1}, Batch {batch_idx}: Avg Loss = {total_loss/valid_batches:.4f}")

        except Exception as e:
            # Best effort: report the failing batch and keep training.
            print(f"Batch {batch_idx} Error: {str(e)[:50]}")
            continue

    # Epoch summary
    if valid_batches > 0:
        avg_loss = total_loss / valid_batches
        scheduler.step(avg_loss)
        print(f"Epoch {epoch+1} Complete: Avg Loss: {avg_loss:.4f}, LR: {optimizer.param_groups[0]['lr']:.6f}")

# 6. Evaluation on the held-out split
print(f"\nRunning Evaluation ({len(test_subset)} held-out samples)...")
model.eval()
test_loader = DataLoader(test_subset, batch_size=32, shuffle=False)

preds, actuals = [], []
with torch.no_grad():
    for batch in test_loader:
        batch = batch.to(device)
        pred = model(batch.z, batch.pos, batch.batch).view(-1)
        preds.extend(pred.cpu().tolist())
        actuals.extend(batch.y.view(-1).cpu().tolist())

# Final Metrics (a correlation needs at least two points)
if len(preds) >= 2:
    mae = np.mean(np.abs(np.array(preds) - np.array(actuals)))
    correlation = np.corrcoef(preds, actuals)[0, 1]
    print(f"Final MAE: {mae:.4f}")
    print(f"Correlation: {correlation:.4f}")
else:
    print("Not enough held-out samples to compute metrics.")