In [None]:
import os
import pandas as pd
import numpy as np
from torchtyping import TensorType

import torch
from torch import nn
import torch.nn.functional as F
from torch_geometric.nn import GATConv, global_mean_pool
from torch.utils.data import DataLoader, TensorDataset

from tqdm import trange
from rdkit import Chem
from rdkit import DataStructs

device = torch.device('cuda')
work_dir = r'E:/Coding/jupyter_root/Kaggle/neurips-open-polymer-prediction-2025/polymer_prediction_notebook'

In [2]:
train_csv = pd.read_csv(os.path.join(work_dir, './train.csv'))
test_csv = pd.read_csv(os.path.join(work_dir, './test.csv'))
merge_csv = pd.concat([train_csv, test_csv], axis = 0)
merge_csv = merge_csv.drop(columns = 'id')
#merge_csv = merge_csv.fillna(merge_csv['FFV'], axis=0).mean()
merge_csv = merge_csv.fillna({'FFV': merge_csv['FFV'].mean()})
merge_csv.to_csv(os.path.join(work_dir, './merge.csv'))

In [3]:
merge_csv

Unnamed: 0,SMILES,Tg,FFV,Tc,Density,Rg
0,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,
1,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.370410,,,
2,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.378860,,,
3,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,
4,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.355470,,,
...,...,...,...,...,...,...
7971,*C=C(*)c1ccccc1C,261.662355,0.367212,,,
7972,*c1ccc(OCCCCCCCCCCCOC(=O)CCCCC(=O)OCCCCCCCCCCC...,,0.374049,,,
0,*Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc(*)cc4)(C(F)...,,0.367212,,,
1,*Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)c4cccc(C(=O)c...,,0.367212,,,


In [4]:
def transform_matrix(df:pd.DataFrame):

    '''
    Add a new column *input* containing feature matrix 
    in the shape of (num_atoms, features)
    for each molecule.

    '''
    def _get_feature_vec(atom):
        return np.array([
            atom.GetAtomicNum(), 
            atom.GetTotalDegree(), 
            atom.GetFormalCharge(), 
            int(atom.GetIsAromatic()), 
            atom.GetTotalNumHs()
        ], dtype=float)
    def _molecule_features(molecule):
        try:
            feature_mtx = torch.tensor(
                [_get_feature_vec(atom) for atom in molecule.GetAtoms()], 
                dtype = torch.float
            )
            return feature_mtx
        except Exception as e:
            print(f'Error occurs: {e}')
            return None
       
    
    molecules = df['SMILES'].apply(lambda smile : Chem.MolFromSmiles(smile))
    df['input'] = molecules.apply(_molecule_features)
    return df


def similarity(
        df,
        n_componds:int
):
    all_smiles = df['SMILES'][:n_componds]
    mols = [Chem.MolFromSmiles(smile) for smile in all_smiles]
    fps = [Chem.RDKFingerprint(mol) for mol in mols]
    similarity_matrix = torch.empty((len(fps), len(fps)), dtype=torch.float32)
    for i in trange(len(similarity_matrix)):
        for j in range(i+1, len(similarity_matrix)):
            similarity_matrix[i][j] = DataStructs.FingerprintSimilarity(fps[i], fps[j])

    return similarity_matrix


def get_symetric_similarity_matrix(
        matrix_tensor:str, 
        threshold:float, 
        set_one:bool, 
        save
):
    similarity_matrix = torch.load(os.path.join(work_dir, matrix_tensor))
    mask = (similarity_matrix>= threshold)
    
    if set_one:
        mask:bool = mask | mask.T
        matrix:TensorType = 1*mask
    else:
        matrix = similarity_matrix * mask
        matrix = matrix + matrix.T 
    matrix = matrix.fill_diagonal_(True)
    if save:
        torch.save(matrix, f'{os.path.join(work_dir, save)}.pt')
    return matrix


In [5]:
merge_csv = transform_matrix(merge_csv)
merge_csv

  feature_mtx = torch.tensor(


Unnamed: 0,SMILES,Tg,FFV,Tc,Density,Rg,input
0,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,,"[[tensor(0.), tensor(1.), tensor(0.), tensor(0..."
1,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.370410,,,,"[[tensor(0.), tensor(1.), tensor(0.), tensor(0..."
2,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.378860,,,,"[[tensor(0.), tensor(1.), tensor(0.), tensor(0..."
3,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,,"[[tensor(0.), tensor(1.), tensor(0.), tensor(0..."
4,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.355470,,,,"[[tensor(0.), tensor(1.), tensor(0.), tensor(0..."
...,...,...,...,...,...,...,...
7971,*C=C(*)c1ccccc1C,261.662355,0.367212,,,,"[[tensor(0.), tensor(1.), tensor(0.), tensor(0..."
7972,*c1ccc(OCCCCCCCCCCCOC(=O)CCCCC(=O)OCCCCCCCCCCC...,,0.374049,,,,"[[tensor(0.), tensor(1.), tensor(0.), tensor(0..."
0,*Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc(*)cc4)(C(F)...,,0.367212,,,,"[[tensor(0.), tensor(1.), tensor(0.), tensor(0..."
1,*Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)c4cccc(C(=O)c...,,0.367212,,,,"[[tensor(0.), tensor(1.), tensor(0.), tensor(0..."


In [6]:
all_smiles = merge_csv['SMILES']
similarity_matrix = torch.load(os.path.join(work_dir, './similarity_matrix.pt'))
weighted_similarity_matrix = get_symetric_similarity_matrix(    
    './similarity_matrix.pt', 
    0.7, False, 'similarity_matrix_weighted'
)
# Use torch.load() when the weighted_similarity_matrix is saved. 
# This matrix is expected to be features input
edge_index = weighted_similarity_matrix.to_sparse_coo()
edge_index

  similarity_matrix = torch.load(os.path.join(work_dir, './similarity_matrix.pt'))
  similarity_matrix = torch.load(os.path.join(work_dir, matrix_tensor))


tensor(indices=tensor([[   0,    0,    0,  ..., 7974, 7974, 7974],
                       [   0,  341,  756,  ..., 7355, 7731, 7974]]),
       values=tensor([1.0000, 0.9112, 0.9704,  ..., 0.8136, 0.7405, 1.0000]),
       size=(7975, 7975), nnz=237003, layout=torch.sparse_coo)

In [None]:
all_labels = merge_csv.drop(columns=['input', 'SMILES'])
df_without_na = all_labels.dropna(axis=0, how='any')
labeled_index = df_without_na.index
train_mask = torch.zeros(len(all_labels), dtype = torch.bool)
train_mask[labeled_index] = True

In [11]:
feature_matrix = torch.load(os.path.join(work_dir, 'input_tensors.pt'))
feature_matrix

  feature_matrix = torch.load(os.path.join(work_dir, 'input_tensors.pt'))


[tensor([[0., 1., 0., 0., 0.],
         [6., 4., 0., 0., 2.],
         [6., 4., 0., 0., 1.],
         [0., 1., 0., 0., 0.],
         [6., 3., 0., 1., 0.],
         [6., 3., 0., 1., 1.],
         [6., 3., 0., 1., 1.],
         [6., 3., 0., 1., 1.],
         [6., 3., 0., 1., 1.],
         [6., 3., 0., 1., 0.],
         [6., 3., 0., 0., 0.],
         [8., 1., 0., 0., 0.],
         [8., 2., 0., 0., 0.],
         [6., 4., 0., 0., 2.],
         [6., 4., 0., 0., 2.],
         [6., 4., 0., 0., 2.],
         [6., 4., 0., 0., 2.],
         [6., 4., 0., 0., 2.],
         [6., 4., 0., 0., 3.]]),
 tensor([[0., 1., 0., 0., 0.],
         [7., 3., 0., 0., 1.],
         [6., 3., 0., 1., 0.],
         [6., 3., 0., 1., 1.],
         [6., 3., 0., 1., 1.],
         [6., 3., 0., 1., 0.],
         [6., 4., 0., 0., 1.],
         [6., 4., 0., 0., 2.],
         [6., 4., 0., 0., 2.],
         [6., 4., 0., 0., 3.],
         [6., 3., 0., 1., 0.],
         [6., 3., 0., 1., 1.],
         [6., 3., 0., 1., 1.],
      

In [None]:
class AtomGAT(nn.Module):
    def __init__(
            self, 
            in_channels, 
            num_hidden, 
            out_channels, 
            heads, 
            dropout
    ):
        super().__init__()
        self.gatlayer1 = GATConv(
            in_channels=in_channels, 
            out_channels=num_hidden, 
            heads = heads, dropout=dropout, 
        )
        self.gatlayer2 = GATConv(
            in_channels= num_hidden * heads, 
            out_channels=out_channels, 
            heads = 1, 
            concat=False
        )
    def forward(
            self, 
            x, 
            edge_index
    ):
        x = F.elu(self.gatlayer1(x, edge_index))
        x = self.gatlayer2(x, edge_index)
        return x
    
#model = AtomGAT(
    #feature_matrix[0].shape[1], 
    #64, 5, 4, 0.1
#)

class MolGAT(nn.Module):
    def __inti__(
            self, 
            atom_infeature, 
            atom_hidden, 
            atom_outfeature,
            atom_gat_heads, 
            atom_dropout, 
            n_hidden, 
            n_heads, 
            n_class,
            dropout
    ):
        self.atomgat = AtomGAT(atom_infeature, atom_hidden, atom_outfeature, atom_gat_heads, atom_dropout)
        self.gatlayer1 = GATConv(atom_outfeature, n_hidden, n_heads, dropout=dropout)
        self.gatlayer2 = GATConv(n_hidden * n_heads, n_class, concat = False)

    def forward(
            self, 
            atom_feature_matrix, atom_edge_idx, 
            mol_edge_index,
            batch, 
    ):
        x = self.atomgat(atom_feature_matrix, atom_edge_idx)
        x = global_mean_pool(x, batch)
        x = F.elu(self.gatlayer1(x, mol_edge_index))
        x = self.gatlayer2(x, mol_edge_index)
        return x


In [None]:
def trainer(
        x, 
        edge_index, 
        lr,
        num_epoch, 

):
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = lr)
    for epoch in range(num_epoch):
        model.train()
        optimizer.zero_grad()
        pred = model(x, edge_index)

        ls = loss_fn(pred[train_mask.numpy()], all_labels[train_mask.numpy()])
        ls.backward()
        optimizer.step()
        
        model.eval()
        with torch.no_grad():
            test_ls = loss_fn(pred(val_mask.numpy()), all_labels[val_mask.numpy()])