In [1]:
import torch
import schnetpack as spk
from ase.db import connect
import ase
#from GOLF.utils import get_radial_basis_by_string, get_cutoff_by_string
import schnetpack.nn as snn
from schnetpack.interfaces import AtomsConverter
from schnetpack.transform import ASENeighborList
import math
from schnetpack import properties
from schnetpack.nn import scatter_add
from copy import copy
import pandas as pd
import pickle as pkl

import numpy as np

In [2]:
def get_radial_basis_by_string(radial_basis_type):
    if radial_basis_type == "Bessel":
        return snn.BesselRBF
    elif radial_basis_type == "Gaussian":
        return snn.GaussianRBF

    raise ValueError(f"Unexpected radial basis type:{radial_basis_type}")

def get_cutoff_by_string(cutoff_type):
    if cutoff_type == "cosine":
        return snn.cutoff.CosineCutoff

    raise ValueError(f"Unexpected cutoff type:{cutoff_type}")

In [3]:
device = torch.device("cuda:1")

### Representation

In [4]:
representation = spk.representation.SchNet(n_atom_basis=128,
                         n_interactions=6,
                         radial_basis=get_radial_basis_by_string("Gaussian")(n_rbf=100, cutoff=5.0),
                         cutoff_fn=get_cutoff_by_string("cosine")(5.0)
)

### Output modules

In [5]:
output_modules = [
    spk.atomistic.Atomwise(
        n_in=representation.n_atom_basis,
        n_out=1,
        output_key="energy",
    ),
    spk.atomistic.Forces(energy_key="energy", force_key="anti_gradient"),
]

### Model

In [6]:
model = spk.model.NeuralNetworkPotential(
    representation=representation,
    input_modules=[spk.atomistic.PairwiseDistances()],
    postprocessors=None,
    output_modules=output_modules,
)
model.to(device)

NeuralNetworkPotential(
  (postprocessors): ModuleList()
  (representation): SchNet(
    (radial_basis): GaussianRBF()
    (cutoff_fn): CosineCutoff()
    (embedding): Embedding(100, 128, padding_idx=0)
    (interactions): ModuleList(
      (0-5): 6 x SchNetInteraction(
        (in2f): Dense(
          in_features=128, out_features=128, bias=False
          (activation): Identity()
        )
        (f2out): Sequential(
          (0): Dense(in_features=128, out_features=128, bias=True)
          (1): Dense(
            in_features=128, out_features=128, bias=True
            (activation): Identity()
          )
        )
        (filter_network): Sequential(
          (0): Dense(in_features=100, out_features=128, bias=True)
          (1): Dense(
            in_features=128, out_features=128, bias=True
            (activation): Identity()
          )
        )
      )
    )
  )
  (input_modules): ModuleList(
    (0): PairwiseDistances()
  )
  (output_modules): ModuleList(
    (0): Atomw

### Get pretrained checkpoint

In [7]:
#PaiNN_train_large_traj_medium
! wget https://a002dlils-kadurin-nabladft.obs.ru-moscow-1.hc.sbercloud.ru/data/nablaDFTv2/models_checkpoints/SchNet/schnet_100k.ckpt

--2024-08-29 16:27:01--  https://a002dlils-kadurin-nabladft.obs.ru-moscow-1.hc.sbercloud.ru/data/nablaDFTv2/models_checkpoints/SchNet/schnet_100k.ckpt
Resolving a002dlils-kadurin-nabladft.obs.ru-moscow-1.hc.sbercloud.ru (a002dlils-kadurin-nabladft.obs.ru-moscow-1.hc.sbercloud.ru)... 46.243.206.34, 46.243.206.35
Connecting to a002dlils-kadurin-nabladft.obs.ru-moscow-1.hc.sbercloud.ru (a002dlils-kadurin-nabladft.obs.ru-moscow-1.hc.sbercloud.ru)|46.243.206.34|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6007722 (5.7M) [binary/octet-stream]
Saving to: ‘schnet_100k.ckpt’


2024-08-29 16:27:01 (141 MB/s) - ‘schnet_100k.ckpt’ saved [6007722/6007722]



In [7]:
checkpoint = torch.load('schnet_100k.ckpt')
state_dict = checkpoint["state_dict"]

keys = copy(list(state_dict.keys()))
new_keys = [".".join(old_key.split(".")[1:]) for old_key in keys]

for key, new_key in zip(keys, new_keys):
    if "postprocessors" in key:
        state_dict.pop(key)
    else:
        state_dict[new_key] = state_dict.pop(key)

model.load_state_dict(state_dict)

<All keys matched successfully>

### Converter

In [8]:
converter = AtomsConverter(
    neighbor_list=ASENeighborList(cutoff=math.inf),
    dtype=torch.float32,
    device=device,
)

# TRAIN

### Load Data files

In [12]:
train_clean = pd.read_csv('../../data/train_clean.csv')
train_clean_tmp = train_clean.copy()

In [13]:
train_clean.head(10)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Tag,Chromophore,Solvent,Absorption_max_nm,Emission_max_nm,Lifetime (ns),Quantum_yield,log(e/mol-1 dm3 cm-1),abs FWHM (cm-1),emi FWHM (cm-1),abs FWHM (nm),emi FWHM (nm),Molecular weight (g mol-1),Reference,Stokes_shift
0,0,71,72,O=C([O-])c1ccccc1-c1c2cc(Br)c(=O)c(Br)c-2oc2c(...,O,515.0,538.0,1.425,0.2,,,,,,645.87864,doi/10.1021/ja00455a017,23.0
1,1,72,73,O=C([O-])c1ccccc1-c1c2cc(I)c(=O)c(I)c-2oc2c(I)...,O,521.0,534.0,0.115,0.02,,,,,,833.86264,doi/10.1021/ja00455a017,13.0
2,2,75,76,O=C([O-])c1ccccc1-c1c2cc(I)c(=O)c(I)c-2oc2c(I)...,CO,526.0,545.0,0.5,0.08,,,,,,833.86264,doi/10.1021/ja00455a017,19.0
3,3,78,79,O=C([O-])c1ccccc1-c1c2cc(I)c(=O)c(I)c-2oc2c(I)...,CCO,532.0,551.0,0.565,0.08,,,,,,833.86264,doi/10.1021/ja00455a017,19.0
4,4,80,81,O=C([O-])c1ccccc1-c1c2cc(Br)c(=O)c(Br)c-2oc2c(...,CC(C)O,529.0,551.0,3.715,0.76,,,,,,645.87864,doi/10.1021/ja00455a017,22.0
5,5,81,82,O=C([O-])c1ccccc1-c1c2cc(I)c(=O)c(I)c-2oc2c(I)...,CC(C)O,532.0,557.0,0.66,0.1,,,,,,833.86264,doi/10.1021/ja00455a017,25.0
6,6,98,99,C[Si](C)(C)c1cccc2ccccc12,C1CCCCC1,294.0,328.0,64.0,0.3,3.73,,,,29.6,200.35604,https://doi.org/10.3390/molecules17055108,34.0
7,7,103,104,C[Si](C)(C)c1ccc([Si](C)(C)C)c2ccccc12,C1CCCCC1,300.0,333.0,23.0,0.33,3.87,,,,34.2,272.53856,https://doi.org/10.3390/molecules17055108,33.0
8,8,106,107,COc1ccc([Si](C)(C)C)c2ccccc12,C1CCCCC1,312.0,327.0,10.0,0.65,3.71,,,,38.8,230.38192,https://doi.org/10.3390/molecules17055108,15.0
9,9,107,108,C[Si](C)(C)c1ccc(C#N)c2ccccc12,C1CCCCC1,315.0,333.0,11.0,0.66,3.88,,,,34.1,225.3661,https://doi.org/10.3390/molecules17055108,18.0


#### Embeddings for chromophores

In [14]:
chromophore_smiles = []
chromophore_atoms = []

with connect("../../data/conformers_1_chromophore_train.db") as conn:
    print(len(conn))
    for row in conn.select():
        chromophore_atoms.append(row.toatoms())
        chromophore_smiles.append(row.smiles)

4384


In [16]:
batch_size = 32
chromophore_embeddings = {}

n_batches = len(chromophore_smiles) // batch_size
tail_batch_size = len(chromophore_smiles) - n_batches * batch_size

for i in range(len(chromophore_smiles) // batch_size):
    print(i)
    atoms_list = chromophore_atoms[i * batch_size: (i + 1) * batch_size]
    smiles_list = chromophore_smiles[i * batch_size: (i + 1) * batch_size]

    batch = converter(atoms_list)
    batch = {k:v.to(device) for k, v in batch.items()}
    
    batch = spk.atomistic.PairwiseDistances()(batch)
    emb = model.representation(batch)
    
    aggregated_emb = scatter_add(emb["scalar_representation"], batch[properties.idx_m], batch_size).detach().cpu()
    
    for j, smiles in enumerate(smiles_list):
        chromophore_embeddings[smiles] = aggregated_emb[j]
    
if tail_batch_size > 0:
    atoms_list_tail = chromophore_atoms[(i + 1) * batch_size:]
    smiles_list_tail = chromophore_smiles[(i + 1) * batch_size:]
    print(smiles_list_tail)
    #print(atoms_list_tail)
    
    tail_batch = converter(atoms_list_tail)
    tail_batch = {k:v.to(device) for k, v in tail_batch.items()}
    #print(tail_batch)

    assert tail_batch_size == len(tail_batch[properties.R])
    
    tail_batch = spk.atomistic.PairwiseDistances()(tail_batch)
    emb_tail = model.representation(tail_batch)
    aggregated_emb = scatter_add(emb_tail["scalar_representation"], tail_batch[properties.idx_m], tail_batch_size).detach().cpu()
    
    for j, smiles in enumerate(smiles_list_tail):
        chromophore_embeddings[smiles] = aggregated_emb[j]

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136


In [17]:
# Initialize the new column
train_clean_tmp['Chromophore_embedding'] = None

# for smiles, embedding in chromophore_embeddings.items():
#     mask = train_clean_tmp.Chromophore == smiles
#     train_clean_tmp[mask].Chromophore_embedding = embedding

for index in range(len(train_clean_tmp)):
    smiles_key = train_clean_tmp.iloc[index]['Chromophore']
    train_clean_tmp.at[index, 'Chromophore_embedding'] = chromophore_embeddings.get(smiles_key)

#### Embeddings for solvents

In [19]:
solvent_smiles = []
solvent_atoms = []

with connect("../../data/conformers_1_solvent_train.db") as conn:
    for row in conn.select():
        solvent_atoms.append(row.toatoms())
        solvent_smiles.append(row.smiles)

In [20]:
batch_size = 32
solvent_embeddings = {}

n_batches = len(solvent_smiles) // batch_size
tail_batch_size = len(solvent_smiles) - n_batches * batch_size

for i in range(len(solvent_smiles) // batch_size):
    print(i)
    atoms_list = solvent_atoms[i * batch_size: (i + 1) * batch_size]
    smiles_list = solvent_smiles[i * batch_size: (i + 1) * batch_size]

    batch = converter(atoms_list)
    batch = {k:v.to(device) for k, v in batch.items()}
    
    batch = spk.atomistic.PairwiseDistances()(batch)
    emb = model.representation(batch)
    
    aggregated_emb = scatter_add(emb["scalar_representation"], batch[properties.idx_m], batch_size).detach().cpu()
    
    for j, smiles in enumerate(smiles_list):
        solvent_embeddings[smiles] = aggregated_emb[j]
    
if tail_batch_size > 0:
    atoms_list_tail = solvent_atoms[(i + 1) * batch_size:]
    smiles_list_tail = solvent_smiles[(i + 1) * batch_size:]
    
    tail_batch = converter(atoms_list_tail)
    tail_batch = {k:v.to(device) for k, v in tail_batch.items()}

    assert tail_batch_size == max(tail_batch[properties.idx_m]).item() + 1
    
    tail_batch = spk.atomistic.PairwiseDistances()(tail_batch)
    emb_tail = model.representation(tail_batch)
    aggregated_emb = scatter_add(emb_tail["scalar_representation"], tail_batch[properties.idx_m], tail_batch_size).detach().cpu()
    
    for j, smiles in enumerate(smiles_list_tail):
        solvent_embeddings[smiles] = aggregated_emb[j]

0
1


  np.ceil(bin_size * nbins_c / face_dist_c).astype(int)


2
3
4
5
6
7


In [21]:
# Initialize the new column
train_clean_tmp['Solvent_embedding'] = None

for index in range(len(train_clean_tmp)):
    smiles_key = train_clean_tmp.iloc[index]['Solvent']
    train_clean_tmp.at[index, 'Solvent_embedding'] = solvent_embeddings.get(smiles_key)

In [22]:
for emb in solvent_embeddings.values():
    assert torch.isnan(emb).sum() == 0

In [23]:
train_clean_tmp.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Tag', 'Chromophore', 'Solvent',
       'Absorption_max_nm', 'Emission_max_nm', 'Lifetime (ns)',
       'Quantum_yield', 'log(e/mol-1 dm3 cm-1)', 'abs FWHM (cm-1)',
       'emi FWHM (cm-1)', 'abs FWHM (nm)', 'emi FWHM (nm)',
       'Molecular weight (g mol-1)', 'Reference', 'Stokes_shift',
       'Chromophore_embedding', 'Solvent_embedding'],
      dtype='object')

### Clean df from non-optimized systems (chromophore or solvent)

In [24]:
rows_to_drop = (train_clean_tmp.Chromophore_embedding.isna() | train_clean_tmp.Solvent_embedding.isna())

In [25]:
train_clean_tmp = train_clean_tmp[~rows_to_drop]

In [28]:
! ls

Prepare_data_schnet.ipynb  embeddings  schnet_100k.ckpt


In [29]:
# Save the DataFrame to a pickle file
train_clean_tmp.to_pickle('embeddings/train_clean_scalar_schnet.pkl')

# Load the DataFrame from the pickle file
#loaded_df = pd.read_pickle('../data/train_clean_scalar_painn.pkl')

In [28]:
#train_clean_tmp.to_csv('train_clean_scalar_painn.csv', index=True)

### Concatenate embeddings of chromophores and solvents

In [30]:
def concatenate_embeddings(df, row):
    # Create tensors from the two columns
    chromophore_emb = df.iloc[row].Chromophore_embedding
    solvent_emb = df.iloc[row].Solvent_embedding
    
    # Concatenate the tensors
    concatenated_emb = torch.cat((chromophore_emb, solvent_emb), dim=0)
    
    return concatenated_emb

In [31]:
concatenated_embeddings = []
for i in range(len(train_clean_tmp)):
    concatenated_emb = concatenate_embeddings(train_clean_tmp, i)
    concatenated_embeddings.append(concatenated_emb)

In [32]:
train_clean_tmp['Concatenated_embedding'] = concatenated_embeddings

In [33]:
#train_clean_tmp.to_csv('train_clean_scalar_painn.csv', index=True)
train_clean_tmp.to_pickle('embeddings/train_clean_scalar_schnet.pkl')

### Split df into train test val

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
# Split the DataFrame into train, test, and validation sets
train_size = 0.9
test_size = 0.1

# Calculate the number of rows for each set
train_rows = int(len(train_clean_tmp) * train_size)
test_rows = int(len(train_clean_tmp) * test_size)

train_df, test_df = train_test_split(train_clean_tmp, train_size=0.9, test_size=0.1)

# Display the split DataFrames
print("Train DataFrame:")
print(train_df.shape)
print("\nTest DataFrame:")
print(test_df.shape)

Train DataFrame:
(9138, 20)

Test DataFrame:
(1016, 20)


In [36]:
train_df.to_pickle('embeddings/train_clean_scalar_schnet_train.pkl')

In [37]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9138 entries, 9967 to 8642
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0.1                9138 non-null   int64  
 1   Unnamed: 0                  9138 non-null   int64  
 2   Tag                         9138 non-null   int64  
 3   Chromophore                 9138 non-null   object 
 4   Solvent                     9138 non-null   object 
 5   Absorption_max_nm           9138 non-null   float64
 6   Emission_max_nm             9138 non-null   float64
 7   Lifetime (ns)               4050 non-null   float64
 8   Quantum_yield               9138 non-null   float64
 9   log(e/mol-1 dm3 cm-1)       4890 non-null   float64
 10  abs FWHM (cm-1)             375 non-null    float64
 11  emi FWHM (cm-1)             353 non-null    float64
 12  abs FWHM (nm)               2061 non-null   float64
 13  emi FWHM (nm)               3510 no

In [38]:
test_df.to_pickle('embeddings/train_clean_scalar_schnet_test.pkl')

In [39]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1016 entries, 6036 to 7382
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0.1                1016 non-null   int64  
 1   Unnamed: 0                  1016 non-null   int64  
 2   Tag                         1016 non-null   int64  
 3   Chromophore                 1016 non-null   object 
 4   Solvent                     1016 non-null   object 
 5   Absorption_max_nm           1016 non-null   float64
 6   Emission_max_nm             1016 non-null   float64
 7   Lifetime (ns)               462 non-null    float64
 8   Quantum_yield               1016 non-null   float64
 9   log(e/mol-1 dm3 cm-1)       541 non-null    float64
 10  abs FWHM (cm-1)             51 non-null     float64
 11  emi FWHM (cm-1)             49 non-null     float64
 12  abs FWHM (nm)               244 non-null    float64
 13  emi FWHM (nm)               395 non

In [40]:
test_df.Absorption_max_nm.isna().sum()

0

# TEST

### Load Data files

In [41]:
test_clean = pd.read_csv('../../data/test_clean.csv')
test_clean_tmp = test_clean.copy()

In [42]:
test_clean.head(10)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Tag,Chromophore,Solvent,Absorption_max_nm,Emission_max_nm,Lifetime (ns),Quantum_yield,log(e/mol-1 dm3 cm-1),abs FWHM (cm-1),emi FWHM (cm-1),abs FWHM (nm),emi FWHM (nm),Molecular weight (g mol-1),Reference,Stokes_shift
0,0,14307,14308,CC[C@H](C)COc1ccc(C2=C3C(C)=C(I)C(C)=[N+]3[B-]...,ClCCl,532.0,557.0,,0.06,,,,,,662.09838,https://doi.org/10.1016/j.inoche.2015.10.029,25.0
1,1,7421,7422,CN(C)c1ccc(C2=Nc3sc4cc(C(F)(F)F)ccc4[n+]3[B-](...,ClCCl,431.0,478.0,,1.0,,,,42.7,46.6,413.17722,DOI: 10.1021/acs.joc.8b02098,47.0
2,2,5928,5929,CCN1C(=O)C(C2C(=O)c3c4ccccc4cc4cc5ccccc5c2c34)...,ClCCl,532.107414,582.0,10.5,0.237,,,,91.1,84.5,466.55968,DOI: 10.1021/acs.joc.8b03083,49.892586
3,3,12238,12239,Fc1ccc(C#Cc2cc(C#Cc3ccc(F)cc3)c(C#Cc3ccc(F)cc3...,ClCCl,371.0,421.0,,0.43,4.724276,,,49.5,71.1,668.66274,DOI: 10.1021/ol2000183,50.0
4,4,19455,19456,COC(=O)c1[nH]c(-c2ccccc2)c2nnc3ccsc3c12,ClCCl,401.0,478.0,2.13,0.1105,3.778151,,,,,309.34834,DOI: 10.1021/acs.joc.6b01662,77.0
5,5,9004,9005,C(=C/c1cnc2ccccc2n1)\c1ccc(N2CCCCC2)cc1,ClCCl,414.0,583.0,,0.7,4.401401,,,,,315.41874,dx.doi.org/10.1021/jo3004919,169.0
6,6,624,625,Cc1nc(-c2cc(C(F)(F)F)ccc2O)n2c1CCCC2,ClCCl,316.0,432.0,3.7,0.61,5.232488,,,,73.1,296.2911,https://doi.org/10.1016/j.dyepig.2018.09.069,116.0
7,7,10680,10681,CCCCCCN1C(=O)c2cccc3c(-c4ccc(-c5cc(-c6ccc(OC)c...,ClCCl,403.0,521.0,,0.38,4.214,,,,80.8,611.76402,https://doi.org/10.1016/j.saa.2013.07.073,118.0
8,8,3529,3530,O=c1c2cc(I)ccc2nc2n1[B-](F)(F)[n+]1ccccc1-2,ClCCl,362.0,449.0,,0.73,,,,,,396.92558,DOI:10.1002/chem.201803428,87.0
9,9,19888,19889,CC(C)(C)c1ccc2c(c1)sc1[n+]2[B-](F)(F)n2c(c3ccc...,ClCCl,538.0,560.0,6.0,0.6,4.652246,,,,,508.19698,DOI: 10.1021/ol503379c,22.0


#### Embeddings for chromophores

In [43]:
chromophore_smiles = []
chromophore_atoms = []

with connect("../../data/conformers_1_chromophore_test.db") as conn:
    print(len(conn))
    for row in conn.select():
        chromophore_atoms.append(row.toatoms())
        chromophore_smiles.append(row.smiles)

898


In [44]:
len(chromophore_smiles)

898

In [45]:
batch_size = 32
chromophore_embeddings = {}

n_batches = len(chromophore_smiles) // batch_size
tail_batch_size = len(chromophore_smiles) - n_batches * batch_size

for i in range(len(chromophore_smiles) // batch_size):
    print(i)
    atoms_list = chromophore_atoms[i * batch_size: (i + 1) * batch_size]
    smiles_list = chromophore_smiles[i * batch_size: (i + 1) * batch_size]

    batch = converter(atoms_list)
    batch = {k:v.to(device) for k, v in batch.items()}
    
    batch = spk.atomistic.PairwiseDistances()(batch)
    emb = model.representation(batch)
    
    aggregated_emb = scatter_add(emb["scalar_representation"], batch[properties.idx_m], batch_size).detach().cpu()
    
    for j, smiles in enumerate(smiles_list):
        chromophore_embeddings[smiles] = aggregated_emb[j]
    print(len(chromophore_embeddings))
    
if tail_batch_size > 0:
    atoms_list_tail = chromophore_atoms[(i + 1) * batch_size:]
    smiles_list_tail = chromophore_smiles[(i + 1) * batch_size:]
    print(len(smiles_list_tail), len(atoms_list_tail))
    #print(atoms_list_tail)
    
    tail_batch = converter(atoms_list_tail)
    tail_batch = {k:v.to(device) for k, v in tail_batch.items()}
    #print(tail_batch)

    #assert tail_batch_size == len(tail_batch[properties.R])
    
    tail_batch = spk.atomistic.PairwiseDistances()(tail_batch)
    emb_tail = model.representation(tail_batch)
    aggregated_emb = scatter_add(emb_tail["scalar_representation"], tail_batch[properties.idx_m], tail_batch_size).detach().cpu()
    
    for j, smiles in enumerate(smiles_list_tail):
        chromophore_embeddings[smiles] = aggregated_emb[j]

0


  np.ceil(bin_size * nbins_c / face_dist_c).astype(int)


32
1
64
2
96
3
128
4
160
5
192
6
224
7
256
8
288
9
320
10
352
11
384
12
416
13
448
14
480
15
512
16
544
17
576
18
608
19
640
20
672
21
704
22
736
23
768
24
800
25
832
26
864
27
896
2 2


In [46]:
# Initialize the new column
test_clean_tmp['Chromophore_embedding'] = None

# for smiles, embedding in chromophore_embeddings.items():
#     mask = train_clean_tmp.Chromophore == smiles
#     train_clean_tmp[mask].Chromophore_embedding = embedding

for index in range(len(test_clean_tmp)):
    smiles_key = test_clean_tmp.iloc[index]['Chromophore']
    test_clean_tmp.at[index, 'Chromophore_embedding'] = chromophore_embeddings.get(smiles_key)

### For solvents

In [47]:
solvent_smiles = []
solvent_atoms = []

with connect("../../data/conformers_1_solvent_test.db") as conn:
    for row in conn.select():
        solvent_atoms.append(row.toatoms())
        solvent_smiles.append(row.smiles)

In [48]:
batch_size = 32
solvent_embeddings = {}

n_batches = len(solvent_smiles) // batch_size
tail_batch_size = len(solvent_smiles) - n_batches * batch_size

for i in range(len(solvent_smiles) // batch_size):
    print(i)
    atoms_list = solvent_atoms[i * batch_size: (i + 1) * batch_size]
    smiles_list = solvent_smiles[i * batch_size: (i + 1) * batch_size]

    batch = converter(atoms_list)
    batch = {k:v.to(device) for k, v in batch.items()}
    
    batch = spk.atomistic.PairwiseDistances()(batch)
    emb = model.representation(batch)
    
    aggregated_emb = scatter_add(emb["scalar_representation"], batch[properties.idx_m], batch_size).detach().cpu()
    
    for j, smiles in enumerate(smiles_list):
        solvent_embeddings[smiles] = aggregated_emb[j]
    
if tail_batch_size > 0:
    atoms_list_tail = solvent_atoms[(i + 1) * batch_size:]
    smiles_list_tail = solvent_smiles[(i + 1) * batch_size:]
    
    tail_batch = converter(atoms_list_tail)
    tail_batch = {k:v.to(device) for k, v in tail_batch.items()}

    assert tail_batch_size == max(tail_batch[properties.idx_m]).item() + 1
    
    tail_batch = spk.atomistic.PairwiseDistances()(tail_batch)
    emb_tail = model.representation(tail_batch)
    aggregated_emb = scatter_add(emb_tail["scalar_representation"], tail_batch[properties.idx_m], tail_batch_size).detach().cpu()
    
    for j, smiles in enumerate(smiles_list_tail):
        solvent_embeddings[smiles] = aggregated_emb[j]

0


  np.ceil(bin_size * nbins_c / face_dist_c).astype(int)


In [49]:
# Initialize the new column
test_clean_tmp['Solvent_embedding'] = None

for index in range(len(test_clean_tmp)):
    smiles_key = test_clean_tmp.iloc[index]['Solvent']
    test_clean_tmp.at[index, 'Solvent_embedding'] = solvent_embeddings.get(smiles_key)

In [50]:
test_clean_tmp

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Tag,Chromophore,Solvent,Absorption_max_nm,Emission_max_nm,Lifetime (ns),Quantum_yield,log(e/mol-1 dm3 cm-1),abs FWHM (cm-1),emi FWHM (cm-1),abs FWHM (nm),emi FWHM (nm),Molecular weight (g mol-1),Reference,Stokes_shift,Chromophore_embedding,Solvent_embedding
0,0,14307,14308,CC[C@H](C)COc1ccc(C2=C3C(C)=C(I)C(C)=[N+]3[B-]...,ClCCl,532.000000,557.000000,,0.0600,,,,,,662.09838,https://doi.org/10.1016/j.inoche.2015.10.029,25.000000,,"[tensor(2.3278), tensor(-0.2662), tensor(0.303..."
1,1,7421,7422,CN(C)c1ccc(C2=Nc3sc4cc(C(F)(F)F)ccc4[n+]3[B-](...,ClCCl,431.000000,478.000000,,1.0000,,,,42.7,46.6,413.17722,DOI: 10.1021/acs.joc.8b02098,47.000000,"[tensor(3.7830), tensor(5.8822), tensor(3.9722...","[tensor(2.3278), tensor(-0.2662), tensor(0.303..."
2,2,5928,5929,CCN1C(=O)C(C2C(=O)c3c4ccccc4cc4cc5ccccc5c2c34)...,ClCCl,532.107414,582.000000,10.50,0.2370,,,,91.1,84.5,466.55968,DOI: 10.1021/acs.joc.8b03083,49.892586,"[tensor(-4.1799), tensor(3.8600), tensor(-12.1...","[tensor(2.3278), tensor(-0.2662), tensor(0.303..."
3,3,12238,12239,Fc1ccc(C#Cc2cc(C#Cc3ccc(F)cc3)c(C#Cc3ccc(F)cc3...,ClCCl,371.000000,421.000000,,0.4300,4.724276,,,49.5,71.1,668.66274,DOI: 10.1021/ol2000183,50.000000,"[tensor(15.6557), tensor(15.3072), tensor(-7.6...","[tensor(2.3278), tensor(-0.2662), tensor(0.303..."
4,4,19455,19456,COC(=O)c1[nH]c(-c2ccccc2)c2nnc3ccsc3c12,ClCCl,401.000000,478.000000,2.13,0.1105,3.778151,,,,,309.34834,DOI: 10.1021/acs.joc.6b01662,77.000000,"[tensor(3.7891), tensor(3.1593), tensor(-6.667...","[tensor(2.3278), tensor(-0.2662), tensor(0.303..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,1110,6923,6924,c1ccc(-n2c(-c3ccc(-c4ccc(-c5ccc(-c6nc7c8ccccc8...,ClC=C(Cl)Cl,343.000000,417.000000,,0.8870,,,,,58.5,714.86996,DOI: 10.1021/acsami.6b14638,74.000000,"[tensor(7.7314), tensor(11.9064), tensor(-20.7...","[tensor(4.7187), tensor(0.9227), tensor(-0.773..."
1111,1111,2920,2921,COC(=O)C(Cc1ccc2oc(-c3ccc(Nc4ccccc4)cc3)nc2c1)...,CC(O)CO,355.795915,438.000964,0.54,0.0900,,,,,,487.55426,DOI:10.1039/c1pp05123g,82.205048,"[tensor(-6.2300), tensor(2.6832), tensor(-10.6...","[tensor(-3.6177), tensor(-1.5237), tensor(0.72..."
1112,1112,14071,14072,CC1=[N+]2C(=C(c3ccc(N(C)C)cc3)c3ccc(C)n3[B-]2(...,CC(=O)C(C)(C)C,505.000000,512.000000,,0.0240,,,,31.3,25.3,339.19580,https://doi.org/10.1016/j.dyepig.2017.10.018,7.000000,"[tensor(-4.7881), tensor(5.6442), tensor(-0.85...","[tensor(-7.9808), tensor(0.5168), tensor(-0.87..."
1113,1113,3075,3076,CCCCCCC(CCCCCC)N1C(=O)c2cccc3c(-c4ccc(C#N)cc4)...,CCCCCCCCCCCO,365.800000,457.500000,2.12,0.7100,,,,,,480.64984,DOI: 10.1039/c6tc04453k,91.700000,"[tensor(-11.0719), tensor(4.3051), tensor(-9.7...","[tensor(-12.6189), tensor(0.9848), tensor(2.03..."


In [51]:
for emb in solvent_embeddings.values():
    assert torch.isnan(emb).sum() == 0

### Clean df from non-optimized systems (chromophore or solvent)

In [52]:
rows_to_drop = (test_clean_tmp.Chromophore_embedding.isna() | test_clean_tmp.Solvent_embedding.isna())
test_clean_tmp = test_clean_tmp[~rows_to_drop]
# Save the DataFrame to a pickle file
test_clean_tmp.to_pickle('embeddings/test_clean_scalar_schnet.pkl')

# Load the DataFrame from the pickle file
#loaded_df = pd.read_pickle('../data/train_clean_scalar_painn.pkl')

### Concatenate embeddings of chromophores and solvents

In [53]:
def concatenate_embeddings(df, row):
    # Create tensors from the two columns
    chromophore_emb = df.iloc[row].Chromophore_embedding
    solvent_emb = df.iloc[row].Solvent_embedding
    
    # Concatenate the tensors
    concatenated_emb = torch.cat((chromophore_emb, solvent_emb), dim=0)
    
    return concatenated_emb

In [54]:
concatenated_embeddings = []
for i in range(len(test_clean_tmp)):
    concatenated_emb = concatenate_embeddings(test_clean_tmp, i)
    concatenated_embeddings.append(concatenated_emb)

In [55]:
test_clean_tmp['Concatenated_embedding'] = concatenated_embeddings

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_clean_tmp['Concatenated_embedding'] = concatenated_embeddings


In [56]:
test_clean_tmp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1093 entries, 1 to 1114
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0.1                1093 non-null   int64  
 1   Unnamed: 0                  1093 non-null   int64  
 2   Tag                         1093 non-null   int64  
 3   Chromophore                 1093 non-null   object 
 4   Solvent                     1093 non-null   object 
 5   Absorption_max_nm           1093 non-null   float64
 6   Emission_max_nm             1093 non-null   float64
 7   Lifetime (ns)               505 non-null    float64
 8   Quantum_yield               1093 non-null   float64
 9   log(e/mol-1 dm3 cm-1)       608 non-null    float64
 10  abs FWHM (cm-1)             43 non-null     float64
 11  emi FWHM (cm-1)             45 non-null     float64
 12  abs FWHM (nm)               249 non-null    float64
 13  emi FWHM (nm)               415 non-nu

In [57]:
#train_clean_tmp.to_csv('train_clean_scalar_painn.csv', index=True)
test_clean_tmp.to_pickle('embeddings/test_clean_scalar_schnet.pkl')