In [1]:
import torch
import pandas as pd
import torch.nn.functional as F
import matplotlib.pyplot as plt
import random



In [2]:
df = pd.read_csv('../data/processed/qm9_clean.csv')
smiles_list = df['smiles'].astype('str').tolist()
gap_values = df['gap_ev'].values

# Create Vocabulary
chars = sorted(list(set(''.join(smiles_list))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
smiles_vocab_size = len(itos)

# Binning the HOMO-LUMO Gap (eV) into 50 discrete tokens
num_buckets = 50
gap_min, gap_max = gap_values.min(), gap_values.max()
gap_buckets = ((gap_values - gap_min) / (gap_max - gap_min) * (num_buckets - 1)).astype(int)


In [3]:

total_vocab_size = smiles_vocab_size + num_buckets
block_size = 24

def build_conditioned_dataset(smiles, buckets):
    X, Y = [], []
    for s, b in zip(smiles, buckets):
        # Prepend the property token (target) to the sequence context
        target_token = smiles_vocab_size + b 
        context = [target_token] + [0] * (block_size - 1)
        for ch in s + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]
    return torch.tensor(X), torch.tensor(Y)

In [4]:
# Split and Build
n1 = int(0.8 * len(smiles_list))
Xtr, Ytr = build_conditioned_dataset(smiles_list[:n1], gap_buckets[:n1])
Xdev, Ydev = build_conditioned_dataset(smiles_list[n1:], gap_buckets[n1:])

In [5]:
class Linear:
    def __init__(self, fan_in, fan_out, bias=True):
        self.weight = torch.randn((fan_in, fan_out)) / fan_in**0.5
        self.bias = torch.zeros(fan_out) if bias else None
    def __call__(self, x):
        self.out = x @ self.weight
        if self.bias is not None: self.out += self.bias
        return self.out
    def parameters(self):
        return [self.weight] + ([] if self.bias is None else [self.bias])

class BatchNorm1d:
    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps, self.momentum, self.training = eps, momentum, True
        self.gamma, self.beta = torch.ones(dim), torch.zeros(dim)
        self.running_mean, self.running_var = torch.zeros(dim), torch.ones(dim)
    def __call__(self, x):
        if self.training:
            dim = (0, 1) if x.ndim == 3 else 0
            xmean = x.mean(dim, keepdim=True)
            xvar = x.var(dim, keepdim=True)
        else:
            xmean, xvar = self.running_mean, self.running_var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta
        if self.training:
            with torch.no_grad():
                self.running_mean = (1-self.momentum)*self.running_mean + self.momentum*xmean
                self.running_var = (1-self.momentum)*self.running_var + self.momentum*xvar
        return self.out
    def parameters(self):
        return [self.gamma, self.beta]

class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out
    def parameters(self):
        return []

class Embedding:
    def __init__(self, num_embeddings, embedding_dim):
        self.weight = torch.randn((num_embeddings, embedding_dim))
    def __call__(self, IX):
        self.out = self.weight[IX]
        return self.out
    def parameters(self):
        return [self.weight]

class FlattenConsecutive:
    def __init__(self, n):
        self.n = n
    def __call__(self, x):
        B, T, C = x.shape
        x = x.view(B, T // self.n, C * self.n)
        if x.shape[1] == 1: x = x.squeeze(1)
        self.out = x
        return self.out
    def parameters(self):
        return []

class Sequential:
    def __init__(self, layers):
        self.layers = layers
    def __call__(self, x):
        for layer in self.layers: x = layer(x)
        self.out = x
        return self.out
    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]

In [6]:
n_embd = 24
n_hidden = 512

model = Sequential([
    Embedding(total_vocab_size, n_embd),
    FlattenConsecutive(2), Linear(n_embd * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(n_hidden, smiles_vocab_size), # Predicting only SMILES characters
])

In [7]:
parameters = model.parameters()
for p in parameters: p.requires_grad = True

In [8]:
max_steps = 150000
batch_size = 32
lossi = []

for i in range(max_steps):
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    Xb, Yb = Xtr[ix], Ytr[ix]
    
    logits = model(Xb)
    if logits.ndim == 3: logits = logits[:, -1, :]
    loss = F.cross_entropy(logits, Yb)
    
    for p in parameters: p.grad = None
    loss.backward()
    
    lr = 0.1 if i < 100000 else 0.01
    for p in parameters: p.data += -lr * p.grad
    
    if i % 10000 == 0:
        print(f'{i:7d}/{max_steps:7d}: loss {loss.item():.4f}')
    lossi.append(loss.log10().item())

      0/ 150000: loss 3.2872
  10000/ 150000: loss 0.8303
  20000/ 150000: loss 1.4544
  30000/ 150000: loss 0.7955
  40000/ 150000: loss 0.9836
  50000/ 150000: loss 1.0029
  60000/ 150000: loss 0.9225
  70000/ 150000: loss 0.7557
  80000/ 150000: loss 0.6870
  90000/ 150000: loss 0.9287
 100000/ 150000: loss 0.8494
 110000/ 150000: loss 0.9757
 120000/ 150000: loss 1.0272
 130000/ 150000: loss 0.9355
 140000/ 150000: loss 0.8669


In [9]:
@torch.no_grad()
def generate_for_property(target_ev):
    for layer in model.layers:
        if hasattr(layer, 'training'): layer.training = False
    
    # Map target eV to specific bucket token
    b = int(((target_ev - gap_min) / (gap_max - gap_min) * (num_buckets - 1)))
    b = max(0, min(num_buckets - 1, b))
    condition_token = smiles_vocab_size + b
    
    context = [condition_token] + [0] * (block_size - 1)
    out = []
    while True:
        logits = model(torch.tensor([context]))
        if logits.ndim == 3: logits = logits[:, -1, :]
        probs = F.softmax(logits, dim=1)
        ix = torch.multinomial(probs, num_samples=1).item()
        context = context[1:] + [ix]
        if ix == 0: break
        out.append(itos[ix])
    return "".join(out)

In [10]:
target_gap = 7.0 
print(f"Generating molecule for target: {target_gap} eV")
print(generate_for_property(target_gap))

Generating molecule for target: 7.0 eV
CC12COC3CC2C13


In [14]:
# 1. Define the Regression Model Architecture (from Task 2)
# It must have the exact same structure: Linear(n_hidden, 1) at the end
predictor_model = Sequential([
    Embedding(total_vocab_size, n_embd),
    FlattenConsecutive(2), Linear(n_embd * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    FlattenConsecutive(2), Linear(n_hidden * 2, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(n_hidden, 1), 
])

saved_params = torch.load('predictor_weights.pt')
for p, p_saved in zip(predictor_model.parameters(), saved_params):
    p.data = p_saved.data
    p.requires_grad = False # Freeze weights so they don't change during gen tests

print("Predictor model successfully loaded in Task 3 notebook.")

Predictor model successfully loaded in Task 3 notebook.


In [20]:
@torch.no_grad()
def validate_inverse_design(target_ev):
    # 1. Generate the molecule using the Conditioned Generator (model)
    gen_smiles = generate_for_property(target_ev)
    
    # 2. Prepare the SMILES for the Predictor
    encoded = [stoi.get(ch, 0) for ch in gen_smiles[:32]]
    padded = encoded + [0] * (32 - len(encoded))
    input_tensor = torch.tensor([padded])
    
    # 3. Predict using the loaded Predictor model
    for layer in predictor_model.layers:
        if hasattr(layer, 'training'): layer.training = False
    
    pred_ev = predictor_model(input_tensor).item()
    
    print(f"Target: {target_ev} eV")
    print(f"Generated: {gen_smiles}")
    print(f"Predictor says: {pred_ev:.4f} eV")
    print(f"Accuracy Error: {abs(target_ev - pred_ev):.4f} eV")

# RUN THE TEST
validate_inverse_design(2.0)

Target: 2.0 eV
Generated: CC1C2COCCCC2O1
Predictor says: 7.3022 eV
Accuracy Error: 5.3022 eV


In [17]:
# Create a set of all training molecules for fast lookup
training_set = set(df['smiles'].tolist())

# Check your generated molecule
if generated_smiles in training_set:
    print(f"Molecule {generated_smiles} was found in the training data (Memorization).")
else:
    print(f"Molecule {generated_smiles} is NEW (Discovery!).")

Molecule N=C1ON=C(N)C#N is NEW (Discovery!).


In [19]:
from rdkit import Chem  # This fixes the NameError

# 1. Prepare the training set for the novelty check
training_set = set(df['smiles'].tolist())

n_to_generate = 100
target_ev = 7.0
results = []

print(f"Starting batch generation for {target_ev} eV...")

for i in range(n_to_generate):
    # Generate the SMILES string
    smiles = generate_for_property(target_ev)
    
    # 1. Check Validity using RDKit
    # We remove the stop token '.' so RDKit can read it properly
    mol = Chem.MolFromSmiles(smiles.replace('.', ''))
    is_valid = mol is not None
    
    # 2. Check Novelty
    # A molecule is novel if it is NOT in the original training CSV
    is_novel = smiles not in training_set
    
    # 3. Predict Property (only if valid)
    pred_ev = predict_generated_property(smiles) if is_valid else None
    
    results.append({
        'smiles': smiles,
        'valid': is_valid,
        'novel': is_novel,
        'pred_ev': pred_ev
    })

# --- CALCULATION ---
valid_count = sum(1 for r in results if r['valid'])
if valid_count > 0:
    novel_count = sum(1 for r in results if r['valid'] and r['novel'])
    avg_error = sum(abs(r['pred_ev'] - target_ev) for r in results if r['valid']) / valid_count
    
    print(f"\n--- Final Performance Report ---")
    print(f"Total Generated: {n_to_generate}")
    print(f"Validity Rate:   {(valid_count/n_to_generate)*100:.1f}%")
    print(f"Novelty Rate:    {(novel_count/valid_count)*100:.1f}% (of valid molecules)")
    print(f"Avg Prediction Error: {avg_error:.4f} eV")
else:
    print("No valid molecules generated in this batch. Try training for more steps.")

Starting batch generation for 7.0 eV...


[04:06:16] SMILES Parse Error: unclosed ring for input: 'NC(=N)C1=O'
[04:06:16] SMILES Parse Error: unclosed ring for input: 'CCC1CC(C)C(C)(C)CO'
[04:06:16] SMILES Parse Error: unclosed ring for input: 'CC(C#C)C1C'
[04:06:17] SMILES Parse Error: unclosed ring for input: 'NCC1C=C(OC=O)C=O'
[04:06:17] Explicit valence for atom # 4 C, 5, is greater than permitted
[04:06:17] SMILES Parse Error: unclosed ring for input: 'CC1(O)C(C)OC=O'
[04:06:17] SMILES Parse Error: unclosed ring for input: 'O=CC1CC1CN1'
[04:06:17] Explicit valence for atom # 7 C, 5, is greater than permitted
[04:06:17] SMILES Parse Error: syntax error while parsing: NC(C[NH2+]2)C[N
[04:06:17] SMILES Parse Error: check for mistakes around position 15:
[04:06:17] NC(C[NH2+]2)C[N
[04:06:17] ~~~~~~~~~~~~~~^
[04:06:17] SMILES Parse Error: Failed parsing SMILES 'NC(C[NH2+]2)C[N' for input: 'NC(C[NH2+]2)C[N'
[04:06:17] SMILES Parse Error: unclosed ring for input: 'CNC1C(O)CN#C'
[04:06:17] SMILES Parse Error: unclosed ring for in


--- Final Performance Report ---
Total Generated: 100
Validity Rate:   49.0%
Novelty Rate:    53.1% (of valid molecules)
Avg Prediction Error: 0.5174 eV


[04:06:18] SMILES Parse Error: unclosed ring for input: 'CC12OC1C1(CC1)C1(CO1)C#N'
[04:06:19] SMILES Parse Error: unclosed ring for input: 'O=CCC1NC1CC21'


In [24]:
# pip install py3Dmol

In [25]:
from rdkit.Chem import AllChem
import py3Dmol

def visualize_3d(smiles):
    mol = Chem.MolFromSmiles(smiles)
    mol = Chem.AddHs(mol) # 3D needs hydrogens
    AllChem.EmbedMolecule(mol, AllChem.ETKDG()) # Generate 3D coordinates
    
    # Convert RDKit mol to block for py3Dmol
    mblock = Chem.MolToMolBlock(mol)
    view = py3Dmol.view(width=400, height=400)
    view.addModel(mblock, 'mol')
    view.setStyle({'stick': {}, 'sphere': {'scale': 0.3}})
    view.zoomTo()
    return view.show()

# Test it with your discovery: CC1C2COCCCC2O1
visualize_3d("CC1C2COCCCC2O1")

In [26]:
torch.save(model.parameters(), 'generator_weights.pt')
print("Weights saved successfully from Task 3.")

Weights saved successfully from Task 3.
