In [1]:
import numpy as np
from rdkit.Chem import AllChem
from rdkit import Chem
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import os
import json,pickle
from collections import OrderedDict
from rdkit import Chem
from rdkit.Chem import MolFromSmiles
import networkx as nx

In [None]:
def atom_features(atom, explicit_H = False, use_chirality=True):
    """Encode one RDKit atom as a flat feature vector.

    The vector is the concatenation, in this order, of:
    radical-electron count (3), element symbol (37), degree (6),
    explicit valence (7), implicit valence (7), formal charge (3),
    hybridization (3) and aromaticity (1) -- 67 entries. When
    ``explicit_H`` is false, a 5-slot total-hydrogen-count encoding is
    appended; when ``use_chirality`` is true, a 2-slot R/S encoding plus one
    ``_ChiralityPossible`` flag are appended. With the default arguments the
    result has 75 entries.

    Args:
        atom: an RDKit atom (the object yielded by ``mol.GetAtoms()``).
        explicit_H: if true, the total-hydrogen-count slots are omitted.
        use_chirality: if true, the three chirality slots are appended.

    Returns:
        A 1-D ``numpy`` array built from the concatenated one-hot lists.

    Raises:
        Exception: propagated from ``one_of_k_encoding`` when the atom's
            degree is outside 0..5 or a valence is outside 0..6.
    """
    # NOTE: the symbol list contains repeated entries ('Sb', 'Te'). It is kept
    # verbatim because its length (37) fixes the width of the feature vector;
    # de-duplicating would change the layout every consumer relies on.
    # Symbols that are not listed fall back to the last entry ('Zr').
    symbol_one_hot = one_of_k_encoding_unk(
      atom.GetSymbol(), #37
      ['Al', 'Sb', 'Cl', 'Te', 'Si', 'Br', 'Cd', 'S', 'Mn', 'Ba',
       'Ga', 'Cr', 'I', 'Mo', 'B', 'Te', 'As', 'Sb', 'N', 'V',
       'Sn', 'P', 'Sb', 'Ni', 'Pb', 'Se', 'In', 'Be', 'F',
       'Ti', 'O', 'Hg', 'H', 'C', 'Co', 'Fe', 'Zr'
      ])

    # Strict encodings: an out-of-range value raises instead of being bucketed.
    degree_one_hot = one_of_k_encoding(atom.GetDegree(), [0, 1, 2, 3, 4 ,5])
    formal_charge_one_hot = one_of_k_encoding_unk(atom.GetFormalCharge(),[-1, 0, 1])
    explicit_valence_one_hot = one_of_k_encoding(atom.GetExplicitValence(), [0, 1, 2, 3, 4, 5, 6])
    implicit_valence_one_hot = one_of_k_encoding(atom.GetImplicitValence(), [0, 1, 2, 3, 4, 5, 6])
    # Any hybridization other than SP/SP2 is mapped onto the SP3 slot.
    hybridization_one_hot = one_of_k_encoding_unk(atom.GetHybridization(), [
                Chem.rdchem.HybridizationType.SP, Chem.rdchem.HybridizationType.SP2,
                Chem.rdchem.HybridizationType.SP3])
    aromatic_one_hot = [atom.GetIsAromatic()]

    radical_one_hot = one_of_k_encoding_unk(atom.GetNumRadicalElectrons(), [0, 1, 2])
    results = radical_one_hot +symbol_one_hot + degree_one_hot + explicit_valence_one_hot +implicit_valence_one_hot+formal_charge_one_hot + hybridization_one_hot + aromatic_one_hot

    if not explicit_H:
        # Hydrogen counts above 4 share the last slot.
        total_num_hs_one_hot = one_of_k_encoding_unk(atom.GetTotalNumHs(), [0, 1, 2, 3, 4 ])
        results = results + total_num_hs_one_hot

    if use_chirality:
        # Test for the property explicitly rather than relying on a bare
        # ``except:`` to catch the missing-property error; a bare except would
        # also hide unrelated failures and KeyboardInterrupt.
        # NOTE(review): '_CIPCode' is presumably populated by RDKit's
        # stereochemistry perception -- confirm it is set for parsed SMILES.
        if atom.HasProp('_CIPCode'):
            chirality_one_hot = one_of_k_encoding_unk(
                atom.GetProp('_CIPCode'),
                ['R', 'S'])
        else:
            # No CIP label available: neither the R nor the S slot is set.
            chirality_one_hot = [False, False]
        results = results + chirality_one_hot + [atom.HasProp('_ChiralityPossible')]

    return np.array(results)
    

def one_of_k_encoding(x, allowable_set):
    """One-hot encode ``x`` against ``allowable_set``, rejecting unknown values.

    Args:
        x: the value to encode.
        allowable_set: sequence of candidate values; one output slot each.

    Returns:
        A list with one boolean per entry of ``allowable_set``; an entry is
        true where it compares equal to ``x`` (repeated candidates therefore
        yield several true slots).

    Raises:
        ValueError: if ``x`` is not a member of ``allowable_set``.
            ``ValueError`` is a subclass of ``Exception``, so callers that
            catch ``Exception`` keep working.
    """
    if x not in allowable_set:
        raise ValueError("input {0} not in allowable set{1}:".format(x, allowable_set))
    return [x == s for s in allowable_set]

def one_of_k_encoding_unk(x, allowable_set):
    """One-hot encode ``x``, treating the final entry as the "unknown" bucket.

    A value that is not present in ``allowable_set`` is replaced by the set's
    last element before encoding, so the result always has at least one true
    slot. Returns a list of booleans, one per entry of ``allowable_set``.
    """
    target = x if x in allowable_set else allowable_set[-1]
    return [target == candidate for candidate in allowable_set]


def smile_to_graph(smile):
    """Turn a SMILES string into a simple graph description.

    Returns ``None`` when RDKit cannot parse the string or when the molecule
    has no bonds (and therefore no edges). Otherwise returns a 3-tuple
    ``(atom_count, node_features, edge_index)`` where ``node_features`` holds
    one per-atom vector from ``atom_features`` divided by its own sum, and
    ``edge_index`` lists every bond in both directions as ``[src, dst]``.
    """
    molecule = Chem.MolFromSmiles(smile)
    if molecule is None:
        return None

    atom_count = molecule.GetNumAtoms()

    # Normalise each atom's feature vector so that its entries sum to 1.
    node_features = [
        vector / sum(vector)
        for vector in map(atom_features, molecule.GetAtoms())
    ]

    # Bonds are undirected; expand them into both directions via networkx.
    bond_pairs = [
        [bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()]
        for bond in molecule.GetBonds()
    ]
    directed = nx.Graph(bond_pairs).to_directed()
    edge_index = [[src, dst] for src, dst in directed.edges]

    if len(edge_index) == 0:
        return None

    return atom_count, node_features, edge_index

In [None]:
import pandas as pd
import numpy as np

list_tissue = ["liv", "lun", "sto", "mgl"]

# For every tissue: read its CSV, convert each SMILES to a graph, pair it with
# the tissue label (1 / 0 / -1 for missing), then save graphs, labels and the
# retained SMILES strings side by side.
for tissue in list_tissue:
    df = pd.read_csv(f'/../../data/single task/{tissue}_data.csv')
    df = df.dropna(subset=['SMILES'])

    dataX = []
    dataY = []
    valid_smiles_list = []
    skipped_labels = 0  # molecules dropped because their label was not 1/0/NaN

    for smiles, label in zip(df['SMILES'].tolist(), df[tissue].tolist()):
        g = smile_to_graph(smiles)
        if g is None:
            # Unparseable SMILES or a molecule without bonds.
            continue

        # Resolve the label BEFORE appending anything, so that the three
        # parallel lists can never drift out of alignment.
        if pd.isna(label):
            encoded_label = -1
        elif label == 1:
            encoded_label = 1
        elif label == 0:
            encoded_label = 0
        else:
            skipped_labels += 1
            continue

        dataX.append(g)
        dataY.append(encoded_label)
        valid_smiles_list.append(smiles)

    if skipped_labels:
        print(f"Skipped {skipped_labels} molecule(s) with unrecognised labels for tissue {tissue}.")

    # Every saved graph must have exactly one label and one SMILES string.
    assert len(dataX) == len(dataY) == len(valid_smiles_list)

    if len(dataX) == 0 or len(dataY) == 0:
        print(f"No valid data for tissue {tissue}. Skipping...")
        continue

    val_data = np.array(dataX, dtype=object)
    val_label = np.array(dataY)

    np.save(f'/../../data/single task/val/data_{tissue}.npy', val_data)
    np.save(f'/../../data/single task/val/labels_{tissue}.npy', val_label)

    val_smiles_path = f'/../../data/single task/val/val_smiles_{tissue}.txt'

    # One SMILES per line, in the same order as the saved arrays.
    with open(val_smiles_path, 'w', encoding='utf-8') as f:
        for item in valid_smiles_list:
            f.write("%s\n" % item)

    count_0 = dataY.count(0)
    count_1 = dataY.count(1)
    count_minus_1 = dataY.count(-1)
    print(f"Counts for tissue {tissue}: 0 = {count_0}, 1 = {count_1}, -1 = {count_minus_1}")


Counts for tissue liv: 0 = 166, 1 = 177, -1 = 0
Counts for tissue lun: 0 = 159, 1 = 184, -1 = 0
Counts for tissue sto: 0 = 157, 1 = 186, -1 = 0
Counts for tissue mgl: 0 = 169, 1 = 174, -1 = 0
