In [1]:
import torch
import schnetpack as spk
from ase.db import connect
import ase
#from GOLF.utils import get_radial_basis_by_string, get_cutoff_by_string
import schnetpack.nn as snn
from schnetpack.interfaces import AtomsConverter
from schnetpack.transform import ASENeighborList
import math
from schnetpack import properties
from schnetpack.nn import scatter_add
from copy import copy
import pandas as pd
import pickle as pkl

import numpy as np
from typing import Any, Dict, Optional, Tuple, Type, Union

import pytorch_lightning as pl
import torch
from torch import nn
from torch_geometric.data import Data
from torch_geometric.nn.models import DimeNetPlusPlus

In [2]:
device = torch.device("cuda:1")

In [3]:
class Swish(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return x * x.sigmoid()


class DimeNetPlusPlusPotential(nn.Module):
    def __init__(
        self,
        node_latent_dim: int,
        scaler=None,
        dimenet_hidden_channels=128,
        dimenet_num_blocks=4,
        dimenet_int_emb_size=64,
        dimenet_basis_emb_size=8,
        dimenet_out_emb_channels=256,
        dimenet_num_spherical=7,
        dimenet_num_radial=6,
        dimenet_max_num_neighbors=32,
        dimenet_envelope_exponent=5,
        dimenet_num_before_skip=1,
        dimenet_num_after_skip=2,
        dimenet_num_output_layers=3,
        cutoff=5.0,
        do_postprocessing=False,
    ):
        super().__init__()
        self.scaler = scaler

        self.node_latent_dim = node_latent_dim
        self.dimenet_hidden_channels = dimenet_hidden_channels
        self.dimenet_num_blocks = dimenet_num_blocks
        self.dimenet_int_emb_size = dimenet_int_emb_size
        self.dimenet_basis_emb_size = dimenet_basis_emb_size
        self.dimenet_out_emb_channels = dimenet_out_emb_channels
        self.dimenet_num_spherical = dimenet_num_spherical
        self.dimenet_num_radial = dimenet_num_radial
        self.dimenet_max_num_neighbors = dimenet_max_num_neighbors
        self.dimenet_envelope_exponent = dimenet_envelope_exponent
        self.dimenet_num_before_skip = dimenet_num_before_skip
        self.dimenet_num_after_skip = dimenet_num_after_skip
        self.dimenet_num_output_layers = dimenet_num_output_layers
        self.cutoff = cutoff

        self.linear_output_size = 1

        self.scaler = scaler
        self.do_postprocessing = do_postprocessing

        self.net = DimeNetPlusPlus(
            hidden_channels=self.dimenet_hidden_channels,
            out_channels=self.node_latent_dim,
            num_blocks=self.dimenet_num_blocks,
            int_emb_size=self.dimenet_int_emb_size,
            basis_emb_size=self.dimenet_basis_emb_size,
            out_emb_channels=self.dimenet_out_emb_channels,
            num_spherical=self.dimenet_num_spherical,
            num_radial=self.dimenet_num_radial,
            cutoff=self.cutoff,
            max_num_neighbors=self.dimenet_max_num_neighbors,
            envelope_exponent=self.dimenet_envelope_exponent,
            num_before_skip=self.dimenet_num_before_skip,
            num_after_skip=self.dimenet_num_after_skip,
            num_output_layers=self.dimenet_num_output_layers,
        )

        regr_or_cls_input_dim = self.node_latent_dim
        self.regr_or_cls_nn = nn.Sequential(
            nn.Linear(regr_or_cls_input_dim, regr_or_cls_input_dim),
            Swish(),
            nn.Linear(regr_or_cls_input_dim, regr_or_cls_input_dim // 2),
            Swish(),
            nn.Linear(regr_or_cls_input_dim // 2, regr_or_cls_input_dim // 2),
            Swish(),
            nn.Linear(regr_or_cls_input_dim // 2, self.linear_output_size),
        )

    @torch.enable_grad()
    def forward(self, data: Data):
        pos, atom_z, batch_mapping = data.pos, data.z, data.batch
        pos = pos.requires_grad_(True)
        graph_embeddings = self.net(pos=pos, z=atom_z, batch=batch_mapping)
        # predictions = torch.flatten(self.regr_or_cls_nn(graph_embeddings).contiguous())
        # forces = (
        #     -1
        #     * (
        #         torch.autograd.grad(
        #             predictions,
        #             pos,
        #             grad_outputs=torch.ones_like(predictions),
        #             create_graph=self.training,
        #         )[0]
        #     )
        # )

        if self.scaler and self.do_postprocessing:
            predictions = self.scaler["scale_"] * predictions + self.scaler["mean_"]
        return graph_embeddings

### Representation

In [5]:
model = DimeNetPlusPlusPotential(
    node_latent_dim=50, 
    scaler={"scale_": 0.870582896669776, "mean_": -7.349405628928332},
    dimenet_hidden_channels=256,
    dimenet_num_blocks=6, 
    dimenet_int_emb_size=64,
    dimenet_basis_emb_size=8,
    dimenet_out_emb_channels=256,
    dimenet_num_spherical=7,
    dimenet_num_radial=6,
    dimenet_max_num_neighbors=32,
    dimenet_envelope_exponent=5,
    dimenet_num_before_skip=1,
    dimenet_num_after_skip=2,
    dimenet_num_output_layers=3,
    cutoff=5.0,
    
)
model.to(device)

DimeNetPlusPlusPotential(
  (net): DimeNetPlusPlus(
    (rbf): BesselBasisLayer(
      (envelope): Envelope()
    )
    (sbf): SphericalBasisLayer(
      (envelope): Envelope()
    )
    (emb): EmbeddingBlock(
      (emb): Embedding(95, 256)
      (lin_rbf): Linear(in_features=6, out_features=256, bias=True)
      (lin): Linear(in_features=768, out_features=256, bias=True)
    )
    (output_blocks): ModuleList(
      (0-6): 7 x OutputPPBlock(
        (lin_rbf): Linear(in_features=6, out_features=256, bias=False)
        (lin_up): Linear(in_features=256, out_features=256, bias=False)
        (lins): ModuleList(
          (0-2): 3 x Linear(in_features=256, out_features=256, bias=True)
        )
        (lin): Linear(in_features=256, out_features=50, bias=False)
      )
    )
    (interaction_blocks): ModuleList(
      (0-5): 6 x InteractionPPBlock(
        (lin_rbf1): Linear(in_features=6, out_features=8, bias=False)
        (lin_rbf2): Linear(in_features=8, out_features=256, bias=False)

### Output modules

In [6]:
#DimeNet_train_large_traj_medium
!wget "https://a002dlils-kadurin-nabladft.obs.ru-moscow-1.hc.sbercloud.ru/data/nablaDFTv2/models_checkpoints/DimeNet%2b%2b/DimeNet%2b%2b_dataset_train_100k_epoch=0258.ckpt"

--2024-08-29 18:02:41--  https://a002dlils-kadurin-nabladft.obs.ru-moscow-1.hc.sbercloud.ru/data/nablaDFTv2/models_checkpoints/DimeNet%2b%2b/DimeNet%2b%2b_dataset_train_100k_epoch=0258.ckpt
Resolving a002dlils-kadurin-nabladft.obs.ru-moscow-1.hc.sbercloud.ru (a002dlils-kadurin-nabladft.obs.ru-moscow-1.hc.sbercloud.ru)... 46.243.206.34, 46.243.206.35
Connecting to a002dlils-kadurin-nabladft.obs.ru-moscow-1.hc.sbercloud.ru (a002dlils-kadurin-nabladft.obs.ru-moscow-1.hc.sbercloud.ru)|46.243.206.34|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 61984533 (59M) [binary/octet-stream]
Saving to: ‘DimeNet++_dataset_train_100k_epoch=0258.ckpt.5’


2024-08-29 18:02:42 (135 MB/s) - ‘DimeNet++_dataset_train_100k_epoch=0258.ckpt.5’ saved [61984533/61984533]



In [6]:
checkpoint = torch.load('DimeNet++_dataset_train_100k_epoch=0258.ckpt')
state_dict = checkpoint["state_dict"]

keys = copy(list(state_dict.keys()))
new_keys = [".".join(old_key.split(".")[1:]) for old_key in keys]

for key, new_key in zip(keys, new_keys):
    if "postprocessors" in key:
        state_dict.pop(key)
    else:
        state_dict[new_key] = state_dict.pop(key)

model.load_state_dict(state_dict)

<All keys matched successfully>

# TRAIN

### Load Data files

In [7]:
from torch_geometric.data import Data, Batch

In [56]:
train_clean = pd.read_csv('../../data/train_clean.csv')
train_clean_tmp = train_clean.copy()

In [9]:
train_clean.head(10)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Tag,Chromophore,Solvent,Absorption_max_nm,Emission_max_nm,Lifetime (ns),Quantum_yield,log(e/mol-1 dm3 cm-1),abs FWHM (cm-1),emi FWHM (cm-1),abs FWHM (nm),emi FWHM (nm),Molecular weight (g mol-1),Reference,Stokes_shift
0,0,71,72,O=C([O-])c1ccccc1-c1c2cc(Br)c(=O)c(Br)c-2oc2c(...,O,515.0,538.0,1.425,0.2,,,,,,645.87864,doi/10.1021/ja00455a017,23.0
1,1,72,73,O=C([O-])c1ccccc1-c1c2cc(I)c(=O)c(I)c-2oc2c(I)...,O,521.0,534.0,0.115,0.02,,,,,,833.86264,doi/10.1021/ja00455a017,13.0
2,2,75,76,O=C([O-])c1ccccc1-c1c2cc(I)c(=O)c(I)c-2oc2c(I)...,CO,526.0,545.0,0.5,0.08,,,,,,833.86264,doi/10.1021/ja00455a017,19.0
3,3,78,79,O=C([O-])c1ccccc1-c1c2cc(I)c(=O)c(I)c-2oc2c(I)...,CCO,532.0,551.0,0.565,0.08,,,,,,833.86264,doi/10.1021/ja00455a017,19.0
4,4,80,81,O=C([O-])c1ccccc1-c1c2cc(Br)c(=O)c(Br)c-2oc2c(...,CC(C)O,529.0,551.0,3.715,0.76,,,,,,645.87864,doi/10.1021/ja00455a017,22.0
5,5,81,82,O=C([O-])c1ccccc1-c1c2cc(I)c(=O)c(I)c-2oc2c(I)...,CC(C)O,532.0,557.0,0.66,0.1,,,,,,833.86264,doi/10.1021/ja00455a017,25.0
6,6,98,99,C[Si](C)(C)c1cccc2ccccc12,C1CCCCC1,294.0,328.0,64.0,0.3,3.73,,,,29.6,200.35604,https://doi.org/10.3390/molecules17055108,34.0
7,7,103,104,C[Si](C)(C)c1ccc([Si](C)(C)C)c2ccccc12,C1CCCCC1,300.0,333.0,23.0,0.33,3.87,,,,34.2,272.53856,https://doi.org/10.3390/molecules17055108,33.0
8,8,106,107,COc1ccc([Si](C)(C)C)c2ccccc12,C1CCCCC1,312.0,327.0,10.0,0.65,3.71,,,,38.8,230.38192,https://doi.org/10.3390/molecules17055108,15.0
9,9,107,108,C[Si](C)(C)c1ccc(C#N)c2ccccc12,C1CCCCC1,315.0,333.0,11.0,0.66,3.88,,,,34.1,225.3661,https://doi.org/10.3390/molecules17055108,18.0


#### Embeddings for chromophores

In [57]:
chromophore_smiles = []
chromophore_atoms = []

with connect("../../data/conformers_1_chromophore_train.db") as conn:
    print(len(conn))
    for row in conn.select():
        chromophore_atoms.append(row.toatoms())
        chromophore_smiles.append(row.smiles)

4384


In [58]:
batch_size = 16
chromophore_embeddings = {}

n_batches = len(chromophore_smiles) // batch_size
tail_batch_size = len(chromophore_smiles) - n_batches * batch_size

for i in range(len(chromophore_smiles) // batch_size):
    print(i)
    atoms_list = chromophore_atoms[i * batch_size: (i + 1) * batch_size]
    smiles_list = chromophore_smiles[i * batch_size: (i + 1) * batch_size]

    data_list = [Data(z=torch.tensor(atoms.get_atomic_numbers(), dtype=torch.long), pos=torch.tensor(atoms.get_positions(), dtype=torch.float32)) for atoms in atoms_list]
    batch = Batch.from_data_list(data_list).to(device)
    
    emb = model(batch).detach().cpu()
    print(emb.shape)
    
    #aggregated_emb = scatter_add(emb["scalar_representation"], batch[properties.idx_m], batch_size).detach().cpu()
    
    for j, smiles in enumerate(smiles_list):
        chromophore_embeddings[smiles] = emb[j]
    
if tail_batch_size > 0:
    atoms_list_tail = chromophore_atoms[(i + 1) * batch_size:]
    smiles_list_tail = chromophore_smiles[(i + 1) * batch_size:]
    print(smiles_list_tail)

    data_list = [Data(z=torch.tensor(atoms.get_atomic_numbers(), dtype=torch.long), pos=torch.tensor(atoms.get_positions(), dtype=torch.float32)) for atoms in atoms_list_tail]
    tail_batch = Batch.from_data_list(data_list).to(device)

    emb = model(batch).detach().cpu()
    print(emb.shape)

    for j, smiles in enumerate(smiles_list_tail):
        chromophore_embeddings[smiles] = emb[j]

    #print(atoms_list_tail)
    
#    tail_batch = converter(atoms_list_tail)
#    tail_batch = {k:v.to(device) for k, v in tail_batch.items()}
    #print(tail_batch)

#     assert tail_batch_size == len(tail_batch[properties.R])
    
#    tail_batch = spk.atomistic.PairwiseDistances()(tail_batch)
#    emb_tail = model.representation(tail_batch)
#    aggregated_emb = scatter_add(emb_tail["scalar_representation"], tail_batch[properties.idx_m], tail_batch_size).detach().cpu()
    
#     for j, smiles in enumerate(smiles_list_tail):
#         chromophore_embeddings[smiles] = aggregated_emb[j]

0
torch.Size([16, 50])
1
torch.Size([16, 50])
2
torch.Size([16, 50])
3
torch.Size([16, 50])
4
torch.Size([16, 50])
5
torch.Size([16, 50])
6
torch.Size([16, 50])
7
torch.Size([16, 50])
8
torch.Size([16, 50])
9
torch.Size([16, 50])
10
torch.Size([16, 50])
11
torch.Size([16, 50])
12
torch.Size([16, 50])
13
torch.Size([16, 50])
14
torch.Size([16, 50])
15
torch.Size([16, 50])
16
torch.Size([16, 50])
17
torch.Size([16, 50])
18
torch.Size([16, 50])
19
torch.Size([16, 50])
20
torch.Size([16, 50])
21
torch.Size([16, 50])
22
torch.Size([16, 50])
23
torch.Size([16, 50])
24
torch.Size([16, 50])
25
torch.Size([16, 50])
26
torch.Size([16, 50])
27
torch.Size([16, 50])
28
torch.Size([16, 50])
29
torch.Size([16, 50])
30
torch.Size([16, 50])
31
torch.Size([16, 50])
32
torch.Size([16, 50])
33
torch.Size([16, 50])
34
torch.Size([16, 50])
35
torch.Size([16, 50])
36
torch.Size([16, 50])
37
torch.Size([16, 50])
38
torch.Size([16, 50])
39
torch.Size([16, 50])
40
torch.Size([16, 50])
41
torch.Size([16, 50])
42

In [59]:
# Initialize the new column
train_clean_tmp['Chromophore_embedding'] = None

# for smiles, embedding in chromophore_embeddings.items():
#     mask = train_clean_tmp.Chromophore == smiles
#     train_clean_tmp[mask].Chromophore_embedding = embedding

for index in range(len(train_clean_tmp)):
    smiles_key = train_clean_tmp.iloc[index]['Chromophore']
    train_clean_tmp.at[index, 'Chromophore_embedding'] = chromophore_embeddings.get(smiles_key)

In [60]:
train_clean_tmp

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Tag,Chromophore,Solvent,Absorption_max_nm,Emission_max_nm,Lifetime (ns),Quantum_yield,log(e/mol-1 dm3 cm-1),abs FWHM (cm-1),emi FWHM (cm-1),abs FWHM (nm),emi FWHM (nm),Molecular weight (g mol-1),Reference,Stokes_shift,Chromophore_embedding
0,0,71,72,O=C([O-])c1ccccc1-c1c2cc(Br)c(=O)c(Br)c-2oc2c(...,O,515.0,538.0,1.425,0.20,,,,,,645.87864,doi/10.1021/ja00455a017,23.0,"[tensor(-15244.9355), tensor(15436.5957), tens..."
1,1,72,73,O=C([O-])c1ccccc1-c1c2cc(I)c(=O)c(I)c-2oc2c(I)...,O,521.0,534.0,0.115,0.02,,,,,,833.86264,doi/10.1021/ja00455a017,13.0,
2,2,75,76,O=C([O-])c1ccccc1-c1c2cc(I)c(=O)c(I)c-2oc2c(I)...,CO,526.0,545.0,0.500,0.08,,,,,,833.86264,doi/10.1021/ja00455a017,19.0,
3,3,78,79,O=C([O-])c1ccccc1-c1c2cc(I)c(=O)c(I)c-2oc2c(I)...,CCO,532.0,551.0,0.565,0.08,,,,,,833.86264,doi/10.1021/ja00455a017,19.0,
4,4,80,81,O=C([O-])c1ccccc1-c1c2cc(Br)c(=O)c(Br)c-2oc2c(...,CC(C)O,529.0,551.0,3.715,0.76,,,,,,645.87864,doi/10.1021/ja00455a017,22.0,"[tensor(-15244.9355), tensor(15436.5957), tens..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10336,10336,20208,20209,Cc1cc(C)c(-c2cc(-c3c(C)cc(C)cc3C)c3ccc4c(-c5c(...,ClCCl,363.0,411.0,,0.28,,,,,,674.96900,DOI: 10.1021/ol7023136,48.0,"[tensor(70.2372), tensor(-39.0910), tensor(-2...."
10337,10337,20209,20210,Cc1cc(C)c(C)c(-c2cc(-c3c(C)c(C)cc(C)c3C)c3ccc4...,ClCCl,364.0,412.0,,0.33,,,,,,731.07652,DOI: 10.1021/ol7023136,48.0,"[tensor(77.2582), tensor(-44.5390), tensor(-2...."
10338,10338,20210,20211,COc1cc(C)c(-c2cc(-c3c(C)cc(OC)cc3C)c3ccc4c(-c5...,ClCCl,367.0,411.0,,0.38,,,,,,738.96500,DOI: 10.1021/ol7023136,44.0,"[tensor(72.2080), tensor(-31.8788), tensor(-1...."
10339,10339,20214,20215,c1cc2c3c(c1)-c1cccc4cccc(c14)B3c1cccc3cccc-2c13,Cc1ccccc1,466.0,491.0,6.310,0.90,4.363612,,,,,338.21610,DOI: 10.1021/acs.orglett.5b03167,25.0,"[tensor(31.5132), tensor(-4.3084), tensor(-0.2..."


#### Embeddings for solvents

In [61]:
solvent_smiles = []
solvent_atoms = []

with connect("../../data/conformers_1_solvent_train.db") as conn:
    for row in conn.select():
        solvent_atoms.append(row.toatoms())
        solvent_smiles.append(row.smiles)

In [62]:
batch_size = 16
solvent_embeddings = {}

n_batches = len(solvent_smiles) // batch_size
tail_batch_size = len(solvent_smiles) - n_batches * batch_size

for i in range(len(solvent_smiles) // batch_size):
    print(i)
    atoms_list = solvent_atoms[i * batch_size: (i + 1) * batch_size]
    smiles_list = solvent_smiles[i * batch_size: (i + 1) * batch_size]

    data_list = [Data(z=torch.tensor(atoms.get_atomic_numbers(), dtype=torch.long), pos=torch.tensor(atoms.get_positions(), dtype=torch.float32)) for atoms in atoms_list]
    batch = Batch.from_data_list(data_list).to(device)
    
    emb = model(batch).detach().cpu()
    print(emb.shape)
    
    #aggregated_emb = scatter_add(emb["scalar_representation"], batch[properties.idx_m], batch_size).detach().cpu()
    
    for j, smiles in enumerate(smiles_list):
        solvent_embeddings[smiles] = emb[j]
    
if tail_batch_size > 0:
    atoms_list_tail = solvent_atoms[(i + 1) * batch_size:]
    smiles_list_tail = solvent_smiles[(i + 1) * batch_size:]
    print(smiles_list_tail)

    data_list = [Data(z=torch.tensor(atoms.get_atomic_numbers(), dtype=torch.long), pos=torch.tensor(atoms.get_positions(), dtype=torch.float32)) for atoms in atoms_list_tail]
    tail_batch = Batch.from_data_list(data_list).to(device)

    emb = model(batch).detach().cpu()
    print(emb.shape)

    for j, smiles in enumerate(smiles_list_tail):
        solvent_embeddings[smiles] = emb[j]

0
torch.Size([16, 50])
1
torch.Size([16, 50])
2
torch.Size([16, 50])
3
torch.Size([16, 50])
4
torch.Size([16, 50])
5
torch.Size([16, 50])
6
torch.Size([16, 50])
7
torch.Size([16, 50])
8
torch.Size([16, 50])
9
torch.Size([16, 50])
10
torch.Size([16, 50])
11
torch.Size([16, 50])
12
torch.Size([16, 50])
13
torch.Size([16, 50])
14
torch.Size([16, 50])
15
torch.Size([16, 50])
['CC(C)[Si](C#Cc1c2nc3ccccc3nc2c(C#C[Si](C(C)C)(C(C)C)C(C)C)c2nc3c4ccccc4c(=O)c4ccccc4c3nc12)(C(C)C)C(C)C', 'CC(C)[Si](C#Cc1c2nsnc2c(C#C[Si](C(C)C)(C(C)C)C(C)C)c2nc3c4ccccc4c(=O)c4ccccc4c3nc12)(C(C)C)C(C)C', 'N#Cc1ccc(N(c2ccc(C#N)cc2)c2ccc3c4cccc5c(N(c6ccc(C#N)cc6)c6ccc(C#N)cc6)ccc(c6cccc2c63)c54)cc1', 'Fc1ccc(N(c2ccc(F)cc2)c2ccc3c4cccc5c(N(c6ccc(F)cc6)c6ccc(F)cc6)ccc(c6cccc2c63)c54)cc1', 'c1ccc(N(c2ccccc2)c2ccc3c4cccc5c(N(c6ccccc6)c6ccccc6)ccc(c6cccc2c63)c54)cc1', 'Cc1ccc(N(c2ccc(C)cc2)c2ccc3c4cccc5c(N(c6ccc(C)cc6)c6ccc(C)cc6)ccc(c6cccc2c63)c54)cc1', 'COc1ccc(N(c2ccc(OC)cc2)c2ccc3c4cccc5c(N(c6ccc(OC)cc6)c6ccc(OC)cc6)c

In [63]:
# Initialize the new column
train_clean_tmp['Solvent_embedding'] = None

for index in range(len(train_clean_tmp)):
    smiles_key = train_clean_tmp.iloc[index]['Solvent']
    train_clean_tmp.at[index, 'Solvent_embedding'] = solvent_embeddings.get(smiles_key)

In [64]:
for emb in solvent_embeddings.values():
    assert torch.isnan(emb).sum() == 0

In [65]:
train_clean_tmp.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Tag', 'Chromophore', 'Solvent',
       'Absorption_max_nm', 'Emission_max_nm', 'Lifetime (ns)',
       'Quantum_yield', 'log(e/mol-1 dm3 cm-1)', 'abs FWHM (cm-1)',
       'emi FWHM (cm-1)', 'abs FWHM (nm)', 'emi FWHM (nm)',
       'Molecular weight (g mol-1)', 'Reference', 'Stokes_shift',
       'Chromophore_embedding', 'Solvent_embedding'],
      dtype='object')

### Clean df from non-optimized systems (chromophore or solvent)

In [66]:
rows_to_drop = (train_clean_tmp.Chromophore_embedding.isna() | train_clean_tmp.Solvent_embedding.isna())

In [67]:
train_clean_tmp = train_clean_tmp[~rows_to_drop]

In [68]:
train_clean_tmp

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Tag,Chromophore,Solvent,Absorption_max_nm,Emission_max_nm,Lifetime (ns),Quantum_yield,log(e/mol-1 dm3 cm-1),abs FWHM (cm-1),emi FWHM (cm-1),abs FWHM (nm),emi FWHM (nm),Molecular weight (g mol-1),Reference,Stokes_shift,Chromophore_embedding,Solvent_embedding
0,0,71,72,O=C([O-])c1ccccc1-c1c2cc(Br)c(=O)c(Br)c-2oc2c(...,O,515.0,538.0,1.425,0.20,,,,,,645.87864,doi/10.1021/ja00455a017,23.0,"[tensor(-15244.9355), tensor(15436.5957), tens...","[tensor(1.1118), tensor(0.0048), tensor(-0.071..."
4,4,80,81,O=C([O-])c1ccccc1-c1c2cc(Br)c(=O)c(Br)c-2oc2c(...,CC(C)O,529.0,551.0,3.715,0.76,,,,,,645.87864,doi/10.1021/ja00455a017,22.0,"[tensor(-15244.9355), tensor(15436.5957), tens...","[tensor(5.2158), tensor(2.1571), tensor(0.0906..."
6,6,98,99,C[Si](C)(C)c1cccc2ccccc12,C1CCCCC1,294.0,328.0,64.000,0.30,3.730000,,,,29.6,200.35604,https://doi.org/10.3390/molecules17055108,34.0,"[tensor(18.0971), tensor(4.6580), tensor(0.232...","[tensor(9.9512), tensor(3.9310), tensor(0.0937..."
7,7,103,104,C[Si](C)(C)c1ccc([Si](C)(C)C)c2ccccc12,C1CCCCC1,300.0,333.0,23.000,0.33,3.870000,,,,34.2,272.53856,https://doi.org/10.3390/molecules17055108,33.0,"[tensor(24.1374), tensor(-0.5557), tensor(0.22...","[tensor(9.9512), tensor(3.9310), tensor(0.0937..."
8,8,106,107,COc1ccc([Si](C)(C)C)c2ccccc12,C1CCCCC1,312.0,327.0,10.000,0.65,3.710000,,,,38.8,230.38192,https://doi.org/10.3390/molecules17055108,15.0,"[tensor(20.4435), tensor(3.8566), tensor(0.199...","[tensor(9.9512), tensor(3.9310), tensor(0.0937..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10336,10336,20208,20209,Cc1cc(C)c(-c2cc(-c3c(C)cc(C)cc3C)c3ccc4c(-c5c(...,ClCCl,363.0,411.0,,0.28,,,,,,674.96900,DOI: 10.1021/ol7023136,48.0,"[tensor(70.2372), tensor(-39.0910), tensor(-2....","[tensor(1.0826), tensor(2.2852), tensor(0.2100..."
10337,10337,20209,20210,Cc1cc(C)c(C)c(-c2cc(-c3c(C)c(C)cc(C)c3C)c3ccc4...,ClCCl,364.0,412.0,,0.33,,,,,,731.07652,DOI: 10.1021/ol7023136,48.0,"[tensor(77.2582), tensor(-44.5390), tensor(-2....","[tensor(1.0826), tensor(2.2852), tensor(0.2100..."
10338,10338,20210,20211,COc1cc(C)c(-c2cc(-c3c(C)cc(OC)cc3C)c3ccc4c(-c5...,ClCCl,367.0,411.0,,0.38,,,,,,738.96500,DOI: 10.1021/ol7023136,44.0,"[tensor(72.2080), tensor(-31.8788), tensor(-1....","[tensor(1.0826), tensor(2.2852), tensor(0.2100..."
10339,10339,20214,20215,c1cc2c3c(c1)-c1cccc4cccc(c14)B3c1cccc3cccc-2c13,Cc1ccccc1,466.0,491.0,6.310,0.90,4.363612,,,,,338.21610,DOI: 10.1021/acs.orglett.5b03167,25.0,"[tensor(31.5132), tensor(-4.3084), tensor(-0.2...","[tensor(9.3040), tensor(4.8101), tensor(0.1969..."


In [69]:
# Save the DataFrame to a pickle file
train_clean_tmp.to_pickle('data_DimeNet/train_clean_scalar_dimenet.pkl')

# Load the DataFrame from the pickle file
#loaded_df = pd.read_pickle('../data/train_clean_scalar_painn.pkl')

### Concatenate embeddings of chromophores and solvents

In [70]:
def concatenate_embeddings(df, row):
    # Create tensors from the two columns
    chromophore_emb = df.iloc[row].Chromophore_embedding
    solvent_emb = df.iloc[row].Solvent_embedding
    
    # Concatenate the tensors
    concatenated_emb = torch.cat((chromophore_emb, solvent_emb), dim=0)
    
    return concatenated_emb

In [71]:
concatenated_embeddings = []
for i in range(len(train_clean_tmp)):
    concatenated_emb = concatenate_embeddings(train_clean_tmp, i)
    concatenated_embeddings.append(concatenated_emb)

In [72]:
train_clean_tmp['Concatenated_embedding'] = concatenated_embeddings

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_clean_tmp['Concatenated_embedding'] = concatenated_embeddings


In [74]:
#train_clean_tmp.to_csv('train_clean_scalar_painn.csv', index=True)
train_clean_tmp.to_pickle('data_DimeNet/train_clean_scalar_dimenet.pkl')

In [79]:
train_clean_tmp['Concatenated_embedding'].to_list()

[tensor([-1.5245e+04,  1.5437e+04,  3.1433e+03,  2.6811e+03, -9.9264e+02,
         -5.7958e+02, -8.6143e+02, -4.0209e+03,  1.7480e+03, -3.3326e+03,
          4.0523e+03,  1.7913e+04,  4.6355e+03, -3.0008e+03,  2.6124e+03,
          9.5562e+02, -2.2644e+03,  4.0716e+03,  3.0377e+03, -2.9135e+03,
          2.0791e+03,  1.2010e+03,  3.1896e+03,  1.0099e+03, -6.7163e+03,
         -3.0529e+03,  2.2528e+02,  5.0949e+03,  8.7015e+03, -6.6329e+03,
          9.0604e+02, -1.4212e+04,  2.6385e+03,  2.4816e+03, -1.8057e+03,
         -2.4811e+03,  4.5968e+02, -8.5509e+03,  3.5030e+03, -6.2393e+03,
          1.3497e+03,  1.5054e+04, -1.6451e+03,  1.9280e+04, -1.8648e+03,
         -2.8931e+03, -7.1440e+02, -1.0069e+03,  2.1222e+04, -3.5251e+03,
          1.1118e+00,  4.7746e-03, -7.1059e-02,  1.0234e-01, -4.9714e-02,
         -2.7417e-02,  6.4417e-02, -3.1469e-02, -4.1500e-02, -9.0486e-02,
          1.6728e-01, -1.1581e+00,  5.0338e-02,  8.4720e-03, -1.9787e-02,
         -1.0318e-01,  5.6024e-03,  8.

tensor([        nan,         nan,         nan,         nan,         nan,
                nan,         nan,         nan,         nan,         nan,
                nan,         nan,         nan,         nan,         nan,
                nan,         nan,         nan,         nan,         nan,
                nan,         nan,         nan,         nan,         nan,
                nan,         nan,         nan,         nan,         nan,
                nan,         nan,         nan,         nan,         nan,
                nan,         nan,         nan,         nan,         nan,
                nan,         nan,         nan,         nan,         nan,
                nan,         nan,         nan,         nan,         nan,
         1.0967e+01,  2.1320e+00,  1.1927e-01,  1.0457e-01, -9.4702e-02,
        -2.3379e-01,  1.9676e-02, -4.2237e-02, -1.0285e-01, -4.4828e-01,
         8.9049e-01, -1.1496e+01,  3.0897e-03,  2.1518e-01,  5.7324e-01,
        -3.4684e-02,  2.9107e-01, -8.5320e-02, -2.0

### Split df into train test val

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
# Split the DataFrame into train, test, and validation sets
train_size = 0.9
test_size = 0.1

# Calculate the number of rows for each set
train_rows = int(len(train_clean_tmp) * train_size)
test_rows = int(len(train_clean_tmp) * test_size)

train_df, test_df = train_test_split(train_clean_tmp, train_size=0.9, test_size=0.1)

# Display the split DataFrames
print("Train DataFrame:")
print(train_df.shape)
print("\nTest DataFrame:")
print(test_df.shape)

Train DataFrame:
(9138, 20)

Test DataFrame:
(1016, 20)


In [40]:
train_df.to_pickle('embeddings_DimeNet/train_clean_scalar_dimenet_train.pkl')

In [41]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9138 entries, 10187 to 10173
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0.1                9138 non-null   int64  
 1   Unnamed: 0                  9138 non-null   int64  
 2   Tag                         9138 non-null   int64  
 3   Chromophore                 9138 non-null   object 
 4   Solvent                     9138 non-null   object 
 5   Absorption_max_nm           9138 non-null   float64
 6   Emission_max_nm             9138 non-null   float64
 7   Lifetime (ns)               4088 non-null   float64
 8   Quantum_yield               9138 non-null   float64
 9   log(e/mol-1 dm3 cm-1)       4883 non-null   float64
 10  abs FWHM (cm-1)             382 non-null    float64
 11  emi FWHM (cm-1)             357 non-null    float64
 12  abs FWHM (nm)               2087 non-null   float64
 13  emi FWHM (nm)               3527 

In [42]:
test_df.to_pickle('embeddings_DimeNet/train_clean_scalar_dimenet_test.pkl')

In [43]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1016 entries, 1725 to 7960
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0.1                1016 non-null   int64  
 1   Unnamed: 0                  1016 non-null   int64  
 2   Tag                         1016 non-null   int64  
 3   Chromophore                 1016 non-null   object 
 4   Solvent                     1016 non-null   object 
 5   Absorption_max_nm           1016 non-null   float64
 6   Emission_max_nm             1016 non-null   float64
 7   Lifetime (ns)               424 non-null    float64
 8   Quantum_yield               1016 non-null   float64
 9   log(e/mol-1 dm3 cm-1)       548 non-null    float64
 10  abs FWHM (cm-1)             44 non-null     float64
 11  emi FWHM (cm-1)             45 non-null     float64
 12  abs FWHM (nm)               218 non-null    float64
 13  emi FWHM (nm)               378 non

In [44]:
test_df.Absorption_max_nm.isna().sum()

0

# TEST

### Load Data files

In [75]:
test_clean = pd.read_csv('../../data/test_clean.csv')
test_clean_tmp = test_clean.copy()

In [76]:
test_clean.head(10)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Tag,Chromophore,Solvent,Absorption_max_nm,Emission_max_nm,Lifetime (ns),Quantum_yield,log(e/mol-1 dm3 cm-1),abs FWHM (cm-1),emi FWHM (cm-1),abs FWHM (nm),emi FWHM (nm),Molecular weight (g mol-1),Reference,Stokes_shift
0,0,14307,14308,CC[C@H](C)COc1ccc(C2=C3C(C)=C(I)C(C)=[N+]3[B-]...,ClCCl,532.0,557.0,,0.06,,,,,,662.09838,https://doi.org/10.1016/j.inoche.2015.10.029,25.0
1,1,7421,7422,CN(C)c1ccc(C2=Nc3sc4cc(C(F)(F)F)ccc4[n+]3[B-](...,ClCCl,431.0,478.0,,1.0,,,,42.7,46.6,413.17722,DOI: 10.1021/acs.joc.8b02098,47.0
2,2,5928,5929,CCN1C(=O)C(C2C(=O)c3c4ccccc4cc4cc5ccccc5c2c34)...,ClCCl,532.107414,582.0,10.5,0.237,,,,91.1,84.5,466.55968,DOI: 10.1021/acs.joc.8b03083,49.892586
3,3,12238,12239,Fc1ccc(C#Cc2cc(C#Cc3ccc(F)cc3)c(C#Cc3ccc(F)cc3...,ClCCl,371.0,421.0,,0.43,4.724276,,,49.5,71.1,668.66274,DOI: 10.1021/ol2000183,50.0
4,4,19455,19456,COC(=O)c1[nH]c(-c2ccccc2)c2nnc3ccsc3c12,ClCCl,401.0,478.0,2.13,0.1105,3.778151,,,,,309.34834,DOI: 10.1021/acs.joc.6b01662,77.0
5,5,9004,9005,C(=C/c1cnc2ccccc2n1)\c1ccc(N2CCCCC2)cc1,ClCCl,414.0,583.0,,0.7,4.401401,,,,,315.41874,dx.doi.org/10.1021/jo3004919,169.0
6,6,624,625,Cc1nc(-c2cc(C(F)(F)F)ccc2O)n2c1CCCC2,ClCCl,316.0,432.0,3.7,0.61,5.232488,,,,73.1,296.2911,https://doi.org/10.1016/j.dyepig.2018.09.069,116.0
7,7,10680,10681,CCCCCCN1C(=O)c2cccc3c(-c4ccc(-c5cc(-c6ccc(OC)c...,ClCCl,403.0,521.0,,0.38,4.214,,,,80.8,611.76402,https://doi.org/10.1016/j.saa.2013.07.073,118.0
8,8,3529,3530,O=c1c2cc(I)ccc2nc2n1[B-](F)(F)[n+]1ccccc1-2,ClCCl,362.0,449.0,,0.73,,,,,,396.92558,DOI:10.1002/chem.201803428,87.0
9,9,19888,19889,CC(C)(C)c1ccc2c(c1)sc1[n+]2[B-](F)(F)n2c(c3ccc...,ClCCl,538.0,560.0,6.0,0.6,4.652246,,,,,508.19698,DOI: 10.1021/ol503379c,22.0


#### Embeddings for chromophores

In [77]:
chromophore_smiles = []
chromophore_atoms = []

with connect("../../data/conformers_1_chromophore_test.db") as conn:
    print(len(conn))
    for row in conn.select():
        chromophore_atoms.append(row.toatoms())
        chromophore_smiles.append(row.smiles)

898


In [48]:
len(chromophore_smiles)

898

In [78]:
batch_size = 16
chromophore_embeddings = {}

n_batches = len(chromophore_smiles) // batch_size
tail_batch_size = len(chromophore_smiles) - n_batches * batch_size

for i in range(len(chromophore_smiles) // batch_size):
    print(i)
    atoms_list = chromophore_atoms[i * batch_size: (i + 1) * batch_size]
    smiles_list = chromophore_smiles[i * batch_size: (i + 1) * batch_size]

    data_list = [Data(z=torch.tensor(atoms.get_atomic_numbers(), dtype=torch.long), pos=torch.tensor(atoms.get_positions(), dtype=torch.float32)) for atoms in atoms_list]
    batch = Batch.from_data_list(data_list).to(device)
    
    emb = model(batch).detach().cpu()
    print(emb.shape)
    
    #aggregated_emb = scatter_add(emb["scalar_representation"], batch[properties.idx_m], batch_size).detach().cpu()
    
    for j, smiles in enumerate(smiles_list):
        chromophore_embeddings[smiles] = emb[j]
    
if tail_batch_size > 0:
    atoms_list_tail = chromophore_atoms[(i + 1) * batch_size:]
    smiles_list_tail = chromophore_smiles[(i + 1) * batch_size:]
    print(smiles_list_tail)

    data_list = [Data(z=torch.tensor(atoms.get_atomic_numbers(), dtype=torch.long), pos=torch.tensor(atoms.get_positions(), dtype=torch.float32)) for atoms in atoms_list_tail]
    tail_batch = Batch.from_data_list(data_list).to(device)

    emb = model(batch).detach().cpu()
    print(emb.shape)

    for j, smiles in enumerate(smiles_list_tail):
        chromophore_embeddings[smiles] = emb[j]

0
torch.Size([16, 50])
1
torch.Size([16, 50])
2
torch.Size([16, 50])
3
torch.Size([16, 50])
4
torch.Size([16, 50])
5
torch.Size([16, 50])
6
torch.Size([16, 50])
7
torch.Size([16, 50])
8
torch.Size([16, 50])
9
torch.Size([16, 50])
10
torch.Size([16, 50])
11
torch.Size([16, 50])
12
torch.Size([16, 50])
13
torch.Size([16, 50])
14
torch.Size([16, 50])
15
torch.Size([16, 50])
16
torch.Size([16, 50])
17
torch.Size([16, 50])
18
torch.Size([16, 50])
19
torch.Size([16, 50])
20
torch.Size([16, 50])
21
torch.Size([16, 50])
22
torch.Size([16, 50])
23
torch.Size([16, 50])
24
torch.Size([16, 50])
25
torch.Size([16, 50])
26
torch.Size([16, 50])
27
torch.Size([16, 50])
28
torch.Size([16, 50])
29
torch.Size([16, 50])
30
torch.Size([16, 50])
31
torch.Size([16, 50])
32
torch.Size([16, 50])
33
torch.Size([16, 50])
34
torch.Size([16, 50])
35
torch.Size([16, 50])
36
torch.Size([16, 50])
37
torch.Size([16, 50])
38
torch.Size([16, 50])
39
torch.Size([16, 50])
40
torch.Size([16, 50])
41
torch.Size([16, 50])
42

In [79]:
# Initialize the new column
test_clean_tmp['Chromophore_embedding'] = None

# for smiles, embedding in chromophore_embeddings.items():
#     mask = train_clean_tmp.Chromophore == smiles
#     train_clean_tmp[mask].Chromophore_embedding = embedding

for index in range(len(test_clean_tmp)):
    smiles_key = test_clean_tmp.iloc[index]['Chromophore']
    test_clean_tmp.at[index, 'Chromophore_embedding'] = chromophore_embeddings.get(smiles_key)

### For solvents

In [80]:
solvent_smiles = []
solvent_atoms = []

with connect("../../data/conformers_1_solvent_test.db") as conn:
    for row in conn.select():
        solvent_atoms.append(row.toatoms())
        solvent_smiles.append(row.smiles)

In [81]:
batch_size = 32
solvent_embeddings = {}

n_batches = len(solvent_smiles) // batch_size
tail_batch_size = len(solvent_smiles) - n_batches * batch_size

for i in range(len(solvent_smiles) // batch_size):
    print(i)
    atoms_list = solvent_atoms[i * batch_size: (i + 1) * batch_size]
    smiles_list = solvent_smiles[i * batch_size: (i + 1) * batch_size]

    data_list = [Data(z=torch.tensor(atoms.get_atomic_numbers(), dtype=torch.long), pos=torch.tensor(atoms.get_positions(), dtype=torch.float32)) for atoms in atoms_list]
    batch = Batch.from_data_list(data_list).to(device)
    
    emb = model(batch).detach().cpu()
    print(emb.shape)
    
    #aggregated_emb = scatter_add(emb["scalar_representation"], batch[properties.idx_m], batch_size).detach().cpu()
    
    for j, smiles in enumerate(smiles_list):
        solvent_embeddings[smiles] = emb[j]
    
if tail_batch_size > 0:
    atoms_list_tail = solvent_atoms[(i + 1) * batch_size:]
    smiles_list_tail = solvent_smiles[(i + 1) * batch_size:]
    print(smiles_list_tail)

    data_list = [Data(z=torch.tensor(atoms.get_atomic_numbers(), dtype=torch.long), pos=torch.tensor(atoms.get_positions(), dtype=torch.float32)) for atoms in atoms_list_tail]
    tail_batch = Batch.from_data_list(data_list).to(device)

    emb = model(batch).detach().cpu()
    print(emb.shape)

    for j, smiles in enumerate(smiles_list_tail):
        solvent_embeddings[smiles] = emb[j]

0
torch.Size([32, 50])
['CC1CCCO1', 'NC=O', 'CCCCCCCCCCO', 'ClCCCl', 'CC(C)CC(C)(C)C', 'CNC=O', 'CCCCCCCO', 'CCCCCCCCCCCCCCCC', 'CC(C)OC(C)C', 'CCCCOC(C)=O', 'CCCC#N', 'ClC=C(Cl)Cl', 'CC(O)CO', 'CC(=O)C(C)(C)C', 'CCCCCCCCCCCO', 'COCCOC']
torch.Size([32, 50])


In [82]:
# Initialize the new column
test_clean_tmp['Solvent_embedding'] = None

for index in range(len(test_clean_tmp)):
    smiles_key = test_clean_tmp.iloc[index]['Solvent']
    test_clean_tmp.at[index, 'Solvent_embedding'] = solvent_embeddings.get(smiles_key)

In [83]:
test_clean_tmp

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Tag,Chromophore,Solvent,Absorption_max_nm,Emission_max_nm,Lifetime (ns),Quantum_yield,log(e/mol-1 dm3 cm-1),abs FWHM (cm-1),emi FWHM (cm-1),abs FWHM (nm),emi FWHM (nm),Molecular weight (g mol-1),Reference,Stokes_shift,Chromophore_embedding,Solvent_embedding
0,0,14307,14308,CC[C@H](C)COc1ccc(C2=C3C(C)=C(I)C(C)=[N+]3[B-]...,ClCCl,532.000000,557.000000,,0.0600,,,,,,662.09838,https://doi.org/10.1016/j.inoche.2015.10.029,25.000000,,"[tensor(1.0826), tensor(2.2852), tensor(0.2100..."
1,1,7421,7422,CN(C)c1ccc(C2=Nc3sc4cc(C(F)(F)F)ccc4[n+]3[B-](...,ClCCl,431.000000,478.000000,,1.0000,,,,42.7,46.6,413.17722,DOI: 10.1021/acs.joc.8b02098,47.000000,"[tensor(26.6752), tensor(-0.7856), tensor(0.01...","[tensor(1.0826), tensor(2.2852), tensor(0.2100..."
2,2,5928,5929,CCN1C(=O)C(C2C(=O)c3c4ccccc4cc4cc5ccccc5c2c34)...,ClCCl,532.107414,582.000000,10.50,0.2370,,,,91.1,84.5,466.55968,DOI: 10.1021/acs.joc.8b03083,49.892586,"[tensor(39.2625), tensor(-13.2609), tensor(-0....","[tensor(1.0826), tensor(2.2852), tensor(0.2100..."
3,3,12238,12239,Fc1ccc(C#Cc2cc(C#Cc3ccc(F)cc3)c(C#Cc3ccc(F)cc3...,ClCCl,371.000000,421.000000,,0.4300,4.724276,,,49.5,71.1,668.66274,DOI: 10.1021/ol2000183,50.000000,"[tensor(53.5580), tensor(4.5459), tensor(-0.24...","[tensor(1.0826), tensor(2.2852), tensor(0.2100..."
4,4,19455,19456,COC(=O)c1[nH]c(-c2ccccc2)c2nnc3ccsc3c12,ClCCl,401.000000,478.000000,2.13,0.1105,3.778151,,,,,309.34834,DOI: 10.1021/acs.joc.6b01662,77.000000,"[tensor(22.7576), tensor(3.9541), tensor(0.083...","[tensor(1.0826), tensor(2.2852), tensor(0.2100..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,1110,6923,6924,c1ccc(-n2c(-c3ccc(-c4ccc(-c5ccc(-c6nc7c8ccccc8...,ClC=C(Cl)Cl,343.000000,417.000000,,0.8870,,,,,58.5,714.86996,DOI: 10.1021/acsami.6b14638,74.000000,"[tensor(67.0383), tensor(-43.2481), tensor(-1....","[tensor(7.7127), tensor(4.8227), tensor(0.3333..."
1111,1111,2920,2921,COC(=O)C(Cc1ccc2oc(-c3ccc(Nc4ccccc4)cc3)nc2c1)...,CC(O)CO,355.795915,438.000964,0.54,0.0900,,,,,,487.55426,DOI:10.1039/c1pp05123g,82.205048,"[tensor(43.7428), tensor(-20.0773), tensor(-0....","[tensor(1.1118), tensor(0.0048), tensor(-0.071..."
1112,1112,14071,14072,CC1=[N+]2C(=C(c3ccc(N(C)C)cc3)c3ccc(C)n3[B-]2(...,CC(=O)C(C)(C)C,505.000000,512.000000,,0.0240,,,,31.3,25.3,339.19580,https://doi.org/10.1016/j.dyepig.2017.10.018,7.000000,"[tensor(28.7233), tensor(-2.5031), tensor(-0.2...","[tensor(7.4292), tensor(6.7251), tensor(0.2054..."
1113,1113,3075,3076,CCCCCCC(CCCCCC)N1C(=O)c2cccc3c(-c4ccc(C#N)cc4)...,CCCCCCCCCCCO,365.800000,457.500000,2.12,0.7100,,,,,,480.64984,DOI: 10.1039/c6tc04453k,91.700000,"[tensor(nan), tensor(nan), tensor(nan), tensor...","[tensor(4.9129), tensor(2.5485), tensor(0.1355..."


In [85]:
for emb in solvent_embeddings.values():
    assert torch.isnan(emb).sum() == 0

### Clean df from non-optimized systems (chromophore or solvent)

In [86]:
rows_to_drop = (test_clean_tmp.Chromophore_embedding.isna() | test_clean_tmp.Solvent_embedding.isna())
test_clean_tmp = test_clean_tmp[~rows_to_drop]
# Save the DataFrame to a pickle file
test_clean_tmp.to_pickle('data_DimeNet/test_clean_scalar_dimenet.pkl')

# Load the DataFrame from the pickle file
#loaded_df = pd.read_pickle('../data/train_clean_scalar_painn.pkl')

### Concatenate embeddings of chromophores and solvents

In [87]:
def concatenate_embeddings(df, row):
    # Create tensors from the two columns
    chromophore_emb = df.iloc[row].Chromophore_embedding
    solvent_emb = df.iloc[row].Solvent_embedding
    
    # Concatenate the tensors
    concatenated_emb = torch.cat((chromophore_emb, solvent_emb), dim=0)
    
    return concatenated_emb

In [88]:
concatenated_embeddings = []
for i in range(len(test_clean_tmp)):
    concatenated_emb = concatenate_embeddings(test_clean_tmp, i)
    concatenated_embeddings.append(concatenated_emb)

In [89]:
test_clean_tmp['Concatenated_embedding'] = concatenated_embeddings

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_clean_tmp['Concatenated_embedding'] = concatenated_embeddings


In [90]:
test_clean_tmp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1093 entries, 1 to 1114
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0.1                1093 non-null   int64  
 1   Unnamed: 0                  1093 non-null   int64  
 2   Tag                         1093 non-null   int64  
 3   Chromophore                 1093 non-null   object 
 4   Solvent                     1093 non-null   object 
 5   Absorption_max_nm           1093 non-null   float64
 6   Emission_max_nm             1093 non-null   float64
 7   Lifetime (ns)               505 non-null    float64
 8   Quantum_yield               1093 non-null   float64
 9   log(e/mol-1 dm3 cm-1)       608 non-null    float64
 10  abs FWHM (cm-1)             43 non-null     float64
 11  emi FWHM (cm-1)             45 non-null     float64
 12  abs FWHM (nm)               249 non-null    float64
 13  emi FWHM (nm)               415 non-nu

In [91]:
#train_clean_tmp.to_csv('train_clean_scalar_painn.csv', index=True)
test_clean_tmp.to_pickle('embeddings_DimeNet/test_clean_scalar_dimenet.pkl')