In [2]:
gg_colab = True
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install rdkit
!pip install torch_geometric
!pip install transformers

Collecting rdkit
  Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (36.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.2/36.2 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.1
Collecting torch_geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.7.0


In [4]:
import pandas as pd
import numpy as np
from rdkit import Chem
# pip install torch torchvision torchaudio
import torch
# pip install torch-geometric
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
import random

from torch_geometric.data import Batch
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from tqdm import tqdm



In [4]:
if gg_colab:
  GG_COLAB = "/content/drive/MyDrive/HD"
  TOX21 = f'{GG_COLAB}/data/processed/tox21.pkl'
  SIDER = f'{GG_COLAB}/data/processed/sider.pkl'
  TOXCAST = f'{GG_COLAB}/data/processed/toxcast.pkl'
else:
  TOX21 = './data/processed/tox21.pkl'
  SIDER = './data/processed/sider.pkl'
  TOXCAST = './data/processed/toxcast.pkl'



In [5]:
tox21_df = pd.read_pickle(TOX21)
sider_df = pd.read_pickle(SIDER)
toxcast_df = pd.read_pickle(TOXCAST)

tox21_df['sources'] = [['tox21']] * len(tox21_df)
sider_df['sources'] = [['sider']] * len(sider_df)
toxcast_df['sources'] = [['toxcast']] * len(toxcast_df)
#test
print('tox21_df: ',type(tox21_df))
print(tox21_df.head())
print('===================================')
print('sider_df: ',type(sider_df))
print(sider_df.head())
print('===================================')
print('toxcast_df: ',type(toxcast_df))
print(toxcast_df.head())
print('===================================')

tox21_df:  <class 'pandas.core.frame.DataFrame'>
   NR-AR  NR-AR-LBD  NR-AhR  NR-Aromatase  NR-ER  NR-ER-LBD  NR-PPAR-gamma  \
0    0.0        0.0     1.0           NaN    NaN        0.0            0.0   
1    0.0        0.0     0.0           0.0    0.0        0.0            0.0   
2    NaN        NaN     NaN           NaN    NaN        NaN            NaN   
3    0.0        0.0     0.0           0.0    0.0        0.0            0.0   
4    0.0        0.0     NaN           0.0    0.0        0.0            0.0   

   SR-ARE  SR-ATAD5  SR-HSE  SR-MMP  SR-p53   mol_id  \
0     1.0       0.0     0.0     0.0     0.0  TOX3021   
1     NaN       0.0     NaN     0.0     0.0  TOX3020   
2     0.0       NaN     0.0     NaN     NaN  TOX3024   
3     NaN       0.0     NaN     0.0     0.0  TOX3027   
4     0.0       0.0     0.0     NaN     0.0  TOX3028   

                                              smiles  \
0                       CCOc1ccc2nc(S(N)(=O)=O)sc2c1   
1                          CCN1C(

In [6]:
def build_merged_dataframe(tox21_df, sider_df, toxcast_df, smiles_col='cleaned_smiles'):
    """
    Hợp nhất 3 nguồn dữ liệu (Tox21, SIDER, ToxCast) dựa trên cleaned_smiles.
    Tự động prefix tên cột label để tránh trùng lặp và đảm bảo traceability.
    """

    # ---- Copy kèm flag nguồn ----
    tox21 = tox21_df.copy()
    sider = sider_df.copy()
    toxcast = toxcast_df.copy()

    # ---- Đảm bảo có cleaned_smiles ----
    for df in [tox21, sider, toxcast]:
        if "cleaned_smiles" not in df.columns:
            if "smiles" in df.columns:
                print('DataFrame ko có cleaned_smiles ==> dùng tạm cột smiles')
                df["cleaned_smiles"] = df["smiles"]
            else:
                raise ValueError("DataFrame thiếu cả smiles và cleaned_smiles")

    # -----------------------------
    # Tạo mol_id chuẩn hoá
    # -----------------------------

    # Tox21: giữ nguyên (nếu không có thì tạo)
    if "mol_id" not in tox21.columns:
        tox21["mol_id"] = ["TOX21_" + str(i+1) for i in range(len(tox21))]

    # SIDER: tạo SIDER_1..n
    sider["mol_id"] = ["SIDER_" + str(i+1) for i in range(len(sider))]

    # ToxCast: tạo TOXCAST_1..n
    toxcast["mol_id"] = ["TOXCAST_" + str(i+1) for i in range(len(toxcast))]

    # ---- Tạo danh sách smile duy nhất ----
    all_smiles = pd.concat(
        [
            tox21[["mol_id", "cleaned_smiles"]],
            sider[["mol_id", "cleaned_smiles"]],
            toxcast[["mol_id", "cleaned_smiles"]],
        ],
        ignore_index=True
    ).drop_duplicates(subset=["cleaned_smiles"]).reset_index(drop=True) # XÓA HÀNG TRÙNG SMILES

    all_smiles = all_smiles.drop_duplicates(subset=["cleaned_smiles"]).reset_index(drop=True)

    # DataFrame nền tảng chứa unified SMILES
    merged = all_smiles[["mol_id", "cleaned_smiles"]].rename(columns={"mol_id": "any_mol_id"})

    # ---- Hàm prefix cột ----
    def prefix_labels(src_df, prefix):
        df = src_df.copy()
        exclude_cols = ["mol_id", "smiles", "cleaned_smiles", "id"]

        label_cols = [c for c in df.columns if c not in exclude_cols]

        # Giữ lại sources  không prefix
        for keep in ["sources"]:
            if keep in label_cols:
                label_cols.remove(keep)
        # Prefix các cột label thật sự
        df = df.rename(columns={c: f"{prefix}_{c}" for c in label_cols})

        # Rename sources để merge không bị đè
        if "sources" in df.columns:
            df = df.rename(columns={"sources": f"{prefix}_sources"})

        return df

    tox21_pref = prefix_labels(tox21, "tox21")
    sider_pref = prefix_labels(sider, "sider")
    toxcast_pref = prefix_labels(toxcast, "toxcast")

    # ---- 5. Merge left theo cleaned_smiles ----
    merged = merged.merge(tox21_pref, on="cleaned_smiles", how="left")
    merged = merged.merge(sider_pref, on="cleaned_smiles", how="left")
    merged = merged.merge(toxcast_pref, on="cleaned_smiles", how="left")

    # Gom sources từ 3 dataset

    def merge_sources(row):
        s = []
        for pref in ["tox21", "sider", "toxcast"]:
            col = f"{pref}_sources"
            if col in row and isinstance(row[col], list):
                s.extend(row[col])
        return list(sorted(set(s)))

    merged["sources"] = merged.apply(merge_sources, axis=1)


    print("===== MERGED DATAFRAME =====")
    print("Shape:", merged.shape)
    print("Preview (transpose):")
    print(merged.head(5))

    return merged


# test
merged_df = build_merged_dataframe(tox21_df, sider_df, toxcast_df)


===== MERGED DATAFRAME =====
Shape: (10890, 668)
Preview (transpose):
  any_mol_id                                     cleaned_smiles  tox21_NR-AR  \
0    TOX3021                       CCOc1ccc2nc(S(N)(=O)=O)sc2c1          0.0   
1    TOX3020                          CCN1C(=O)NC(c2ccccc2)C1=O          0.0   
2    TOX3024  CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...          NaN   
3    TOX3027                    CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C          0.0   
4    TOX3028                          CC(O)(P(=O)(O)O)P(=O)(O)O          0.0   

   tox21_NR-AR-LBD  tox21_NR-AhR  tox21_NR-Aromatase  tox21_NR-ER  \
0              0.0           1.0                 NaN          NaN   
1              0.0           0.0                 0.0          0.0   
2              NaN           NaN                 NaN          NaN   
3              0.0           0.0                 0.0          0.0   
4              0.0           NaN                 0.0          0.0   

   tox21_NR-ER-LBD  tox21_NR-PPAR-

In [7]:
#TEST
merged_df[['any_mol_id','cleaned_smiles','sources']].head()


Unnamed: 0,any_mol_id,cleaned_smiles,sources
0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1,"[tox21, toxcast]"
1,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O,"[sider, tox21, toxcast]"
2,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...,"[tox21, toxcast]"
3,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C,"[tox21, toxcast]"
4,TOX3028,CC(O)(P(=O)(O)O)P(=O)(O)O,"[sider, tox21, toxcast]"


In [8]:
toxcast_cols = [c for c in merged_df.columns if c.startswith("sider_")]
merged_df[toxcast_cols].head()


Unnamed: 0,sider_Hepatobiliary disorders,sider_Metabolism and nutrition disorders,sider_Product issues,sider_Eye disorders,sider_Investigations,sider_Musculoskeletal and connective tissue disorders,sider_Gastrointestinal disorders,sider_Social circumstances,sider_Immune system disorders,sider_Reproductive system and breast disorders,...,sider_Infections and infestations,"sider_Respiratory, thoracic and mediastinal disorders",sider_Psychiatric disorders,sider_Renal and urinary disorders,"sider_Pregnancy, puerperium and perinatal conditions",sider_Ear and labyrinth disorders,sider_Cardiac disorders,sider_Nervous system disorders,"sider_Injury, poisoning and procedural complications",sider_sources
0,,,,,,,,,,,...,,,,,,,,,,
1,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,[sider]
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,[sider]


In [37]:
def select_label_columns(merged_df):
    cols = merged_df.columns.tolist()

    # Detect prefix dạng 1 underscore
    # - lấy label ko lấy sources
    tox21_cols = [c for c in cols if c.startswith("tox21_") and "sources" not in c]
    sider_cols = [c for c in cols if c.startswith("sider_") and "sources" not in c]
    toxcast_cols = [c for c in cols if c.startswith("toxcast_") and "sources" not in c]


    print("Detected tox21 cols:", tox21_cols[:10])
    print("Detected sider cols:", sider_cols[:10])
    print("Detected toxcast cols:", toxcast_cols[:10])

    # ---- Binary head ----
    # any label toxcast = 1 thì sample toxic
    def compute_is_toxic(row):
        if len(toxcast_cols) == 0:
            return np.nan
        # vals = row[tox21_cols].astype(float).fillna(0).values
        # - đảm bảo là number 0/1
        vals = pd.to_numeric(row[toxcast_cols], errors="coerce").fillna(0).values
        return float(vals.sum() > 0)

    # Sinh nhãn binary thật
    merged_df["any_toxicity"] = merged_df.apply(compute_is_toxic, axis=1)

    # ---- Organ head: chọn SIDER columns có 'disorder' ----

    # mapping các keyword liên quan Organ / System trong SIDER
    organ_keywords = [
        "disorder",         # original logic
        "system",           # ví dụ: nervous_system_disorder
        "cardio", "renal", "hepatic", "liver", "kidney",
        "immune", "respiratory", "pulmonary"
    ]


    # tách sider thành 2 nhóm organ + ADR
    # + CHỌN CỘT NÀO CHỨA TỪ KHÓA ORGAN
    organ_cols = [
        c for c in sider_cols
        if any(kw in c.lower() for kw in organ_keywords)
    ]

    # + FALLBACK

    if len(organ_cols) == 0:
        print('CỘT ORGAN ĐANG RỖNG ==> LẤY TẠM FALLBACK')
        binary_sider = [
            c for c in sider_cols
            if merged_df[c].dropna().isin([0,1]).all()
        ]
        organ_cols = binary_sider[:5]    # lấy 5 cột đầu tiên thay vì 3 bừa
        print("!!! Fallback: SIDER không có 'organ keywords', lấy 5 binary columns ĐỂ THỬ NGHIỆM.")


    # ---- ADR head ----
    # + CHỈ GIỮ CÁC CỘT NHỊ PHÂN
    adr_cols = [c for c in sider_cols if c not in organ_cols]

    # chỉ giữ ADR là cột nhị phân 0/1
    adr_cols = [
        c for c in adr_cols
        if merged_df[c].dropna().isin([0,1]).all()
    ]


    if len(adr_cols) == 0:
        adr_cols = toxcast_cols[:10]  # fallback
        print("!!! Fallback ADR → lấy 10 cột toxcast đầu tiên.")

    print("-> Binary:", "any_toxicity") # in log
    print("-> Organ cols ({}): {}".format(len(organ_cols), organ_cols))
    print("-> ADR cols ({}): {}".format(len(adr_cols), adr_cols[:20]))

    label_cols_ordered = ["any_toxicity"] + organ_cols + adr_cols
    return label_cols_ordered, organ_cols, adr_cols


In [38]:
merged_df = build_merged_dataframe(tox21_df, sider_df, toxcast_df)

label_cols_ordered, organ_cols, adr_cols = select_label_columns(merged_df)

print("Final label order length:", len(label_cols_ordered))
print("First labels:", label_cols_ordered[:20])


===== MERGED DATAFRAME =====
Shape: (10890, 668)
Preview (transpose):
  any_mol_id                                     cleaned_smiles  tox21_NR-AR  \
0    TOX3021                       CCOc1ccc2nc(S(N)(=O)=O)sc2c1          0.0   
1    TOX3020                          CCN1C(=O)NC(c2ccccc2)C1=O          0.0   
2    TOX3024  CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...          NaN   
3    TOX3027                    CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C          0.0   
4    TOX3028                          CC(O)(P(=O)(O)O)P(=O)(O)O          0.0   

   tox21_NR-AR-LBD  tox21_NR-AhR  tox21_NR-Aromatase  tox21_NR-ER  \
0              0.0           1.0                 NaN          NaN   
1              0.0           0.0                 0.0          0.0   
2              NaN           NaN                 NaN          NaN   
3              0.0           0.0                 0.0          0.0   
4              0.0           NaN                 0.0          0.0   

   tox21_NR-ER-LBD  tox21_NR-PPAR-

In [39]:
def smiles_to_molecule(smiles):
    try:
        molecule = Chem.MolFromSmiles(smiles, sanitize=False) #Lọc molecule bị valence lỗi
        Chem.SanitizeMol(molecule)
        return molecule
    except:
        return None


# ============================ get_atom_features =======================
def one_hot_encoding(x, permitted_list):
    if x not in permitted_list:
        x = permitted_list[-1]
    binary_encoding = [int(x == s) for s in permitted_list]
    return binary_encoding



def get_atom_features(atom, use_chirality=True, hydrogens_implicit=True):
    permitted_atoms = ['C','N','O','S','F','Si','P','Cl','Br','I','B','Na','K','Ca','Fe','Zn','Cu','Unknown']
    if not hydrogens_implicit:
        permitted_atoms = ['H'] + permitted_atoms

    atom_type_enc = one_hot_encoding(atom.GetSymbol(), permitted_atoms)
    n_heavy_neighbors_enc = one_hot_encoding(min(atom.GetDegree(), 4), [0,1,2,3,4])
    formal_charge_enc = one_hot_encoding(int(atom.GetFormalCharge()), [-1,0,1])
    hybridisation_enc = one_hot_encoding(str(atom.GetHybridization()), ["S","SP","SP2","SP3"])
    is_in_ring_enc = [int(atom.IsInRing())]
    is_aromatic_enc = [int(atom.GetIsAromatic())]

    # optional numeric features
    atomic_mass_scaled = [atom.GetMass() / 100]

    atom_feature_vector = (
        atom_type_enc + n_heavy_neighbors_enc + formal_charge_enc +
        hybridisation_enc + is_in_ring_enc + is_aromatic_enc + atomic_mass_scaled
    )

    if use_chirality:
        chirality_enc = one_hot_encoding(str(atom.GetChiralTag()),
                                         ["CHI_UNSPECIFIED", "CHI_TETRAHEDRAL_CW", "CHI_TETRAHEDRAL_CCW"])
        atom_feature_vector += chirality_enc

    if hydrogens_implicit:
        n_hydrogens_enc = one_hot_encoding(min(atom.GetTotalNumHs(), 4), [0,1,2,3,4])
        atom_feature_vector += n_hydrogens_enc

    return np.array(atom_feature_vector)

# ======================================================================

def get_bond_features(bond, use_stereochemistry=True):
    bond_type_enc = one_hot_encoding(
        bond.GetBondType(),
        [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
         Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]
    )
    bond_is_conj_enc = [int(bond.GetIsConjugated())]
    bond_is_in_ring_enc = [int(bond.IsInRing())]
    bond_feature_vector = bond_type_enc + bond_is_conj_enc + bond_is_in_ring_enc

    if use_stereochemistry:
        stereo_enc = one_hot_encoding(
            str(bond.GetStereo()),
            ["STEREOZ","STEREOE","STEREONONE"]
        )
        bond_feature_vector += stereo_enc

    return np.array(bond_feature_vector)


def molecule_to_graph(molecule,
                      use_atom_features=True, use_bond_features=True,
                      hydrogens_implicit=True):
    """
    Convert RDKit Mol -> PyG Data
    """
    if molecule is None:
        return None

    # ===== NODE FEATURES (Atom) =====
    atom_features = []
    for atom in molecule.GetAtoms():
        feat = get_atom_features(atom, use_chirality=True, hydrogens_implicit=hydrogens_implicit)
        atom_features.append(feat)

    x = torch.tensor(np.vstack(atom_features), dtype=torch.float) if len(atom_features) > 0 else torch.zeros((0,1), dtype=torch.float)

    # ===== EDGE FEATURES (Bond) =====
    edge_index = []
    edge_attr_list = []

    for bond in molecule.GetBonds():
        u = bond.GetBeginAtomIdx()
        v = bond.GetEndAtomIdx()

        # Lấy feature cho cạnh
        bf = get_bond_features(bond) if use_bond_features else np.array([1.0])

        # Thêm cạnh 2 chiều (Undirected graph)
        edge_index.append([u, v])
        edge_attr_list.append(bf)

        edge_index.append([v, u])
        edge_attr_list.append(bf)

    # Chuyển đổi sang Tensor
    if len(edge_index) == 0:
        edge_index = torch.zeros((2, 0), dtype=torch.long)
        # Tính kích thước feature ảo để tránh lỗi shape
        dummy_bond = get_bond_features(Chem.MolFromSmiles("CC").GetBondBetweenAtoms(0,1))
        edge_attr = torch.zeros((0, len(dummy_bond)), dtype=torch.float)
    else:
        edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        edge_attr = torch.tensor(np.vstack(edge_attr_list), dtype=torch.float)


    data = Data(
        x=x,
        edge_index=edge_index,
        edge_attr=edge_attr,
    )

    return data



In [41]:
class SmartDrugDatasetMerged(Dataset):
    def __init__(self, merged_df, tokenizer, label_cols, max_len=128, smiles_col='cleaned_smiles'):
        self.data = []
        self.label_cols = label_cols
        print(f"Processing merged dataset with {len(merged_df)} samples...")
        for idx, row in tqdm(merged_df.iterrows(), total=merged_df.shape[0]):
            smi = row[smiles_col]
            # build raw label vector for ordered label columns
            raw = []
            for c in label_cols:
                if c in row.index:
                    val = row[c]
                else:
                    val = np.nan
                raw.append(float(val) if not pd.isna(val) else np.nan)
            raw = np.array(raw, dtype=float)
            mask = (~np.isnan(raw)).astype(float)
            labels = np.nan_to_num(raw, nan=0.0)
            # molecule to graph using your function
            mol = smiles_to_molecule(smi)
            if mol is None:
                # print debug and skip
                print(f"WARNING: molec parse failed for smiles index {idx}: {smi}")
                continue
            graph = molecule_to_graph(mol)
            if graph is None:
                print(f"WARNING: graph build failed for smiles index {idx}: {smi}")
                continue
            text_enc = tokenizer(smi, max_length=max_len, padding='max_length', truncation=True, return_tensors="pt")
            # Save minimal tracing info for debug
            item = {
                'graph': graph,
                'input_ids': text_enc['input_ids'].squeeze(0),
                'attention_mask': text_enc['attention_mask'].squeeze(0),
                'labels': torch.tensor(labels, dtype=torch.float),
                'mask': torch.tensor(mask, dtype=torch.float),
                'smiles': smi
            }
            self.data.append(item)

        print("Finished building dataset. total processed samples:", len(self.data))
        # print first 3 items for manual check
        for i in range(min(3, len(self.data))):
            it = self.data[i]
            print(f"+ DEBUG Sample {i}:")
            print(f"++ smiles={it['smiles']}")
            print(f"++ labels={it['labels'].numpy()}")
            print(f"++ mask={it['mask'].numpy()}")

    def __len__(self): return len(self.data)
    def __getitem__(self, idx): return self.data[idx]

dataset = SmartDrugDatasetMerged(merged_df, tokenizer, label_cols_ordered, max_len=64)
print("Dataset length:", len(dataset))


Processing merged dataset with 10890 samples...


100%|██████████| 10890/10890 [00:19<00:00, 545.78it/s]


Finished building dataset. total processed samples: 10890
+ DEBUG Sample 0:
++ smiles=CCOc1ccc2nc(S(N)(=O)=O)sc2c1
++ labels=[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
++ mask=[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
+ DEBUG Sample 1:
++ smiles=CCN1C(=O)NC(c2ccccc2)C1=O
++ labels=[0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1.
 0. 0. 0. 0.]
++ mask=[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1.]
+ DEBUG Sample 2:
++ smiles=CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]3CC[C@@]21C
++ labels=[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
++ mask=[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
Dataset length: 10890


In [43]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel
from torch_geometric.nn import GATConv, GATv2Conv, global_mean_pool

class GATEncoder(nn.Module):
    def __init__(self, in_dim, hidden_dim=64, out_dim=128, heads=4, dropout=0.1, use_gatv2=False, debug=False):
        super().__init__()
        Conv = GATv2Conv if use_gatv2 else GATConv

        # conv1 produces hidden_dim (concat heads)
        self.conv1 = Conv(in_dim, hidden_dim // heads, heads=heads, concat=True)
        # conv2 produces out_dim (concat heads)
        self.conv2 = Conv(hidden_dim, out_dim // heads, heads=heads, concat=True)

        self.out_dim = self.conv2.out_channels  # real output feature dim after conv2
        self.act = nn.ReLU()
        self.pool = global_mean_pool
        self.dropout = nn.Dropout(dropout)
        self.debug = debug

    def forward(self, x, edge_index, edge_attr, batch_idx):
        # device-safe: determine device from batch_idx if x may be None
        device = batch_idx.device if isinstance(batch_idx, torch.Tensor) else (x.device if isinstance(x, torch.Tensor) else torch.device("cpu"))

        # empty graph handling
        if x is None or x.shape[0] == 0:
            B = (batch_idx.max().item() + 1) if isinstance(batch_idx, torch.Tensor) and batch_idx.numel() > 0 else 1
            if self.debug:
                print(f"GATEncoder DEBUG: empty graph, returning zeros {(B, self.out_dim)} on {device}")
            return torch.zeros((B, self.out_dim), device=device, dtype=torch.float)

        # standard forward (note: GATConv/GATv2Conv ignore edge_attr unless using edge_dim)
        x = self.conv1(x, edge_index)
        x = self.act(x)
        x = self.dropout(x)

        x = self.conv2(x, edge_index)
        x = self.act(x)

        x = self.pool(x, batch_idx)  # [B, out_dim]
        if self.debug:
            print("GATEncoder DEBUG: pooled node_emb shape:", x.shape)
        return x


class TextEncoder(nn.Module):
    def __init__(self, hf_model_name='seyonec/ChemBERTa-zinc-base-v1', proj_dim=128, freeze_backbone=False, debug=False):
        super().__init__()
        self.hf = AutoModel.from_pretrained(hf_model_name)
        self.proj = nn.Linear(self.hf.config.hidden_size, proj_dim)
        self.act = nn.ReLU()
        self.debug = debug

        if freeze_backbone:
            for p in self.hf.parameters():
                p.requires_grad = False

    def forward(self, input_ids, attention_mask):
        out = self.hf(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        # Roberta-style models: CLS token is at index 0
        cls = out.last_hidden_state[:, 0, :]   # [B, hidden_size]
        cls = self.act(self.proj(cls))         # [B, proj_dim]
        if self.debug:
            print("TextEncoder DEBUG: cls shape:", cls.shape)
        return cls


class ModalityAttentionFusion(nn.Module):
    def __init__(self, input_dims, common_dim=128):
        super().__init__()
        self.n = len(input_dims)
        self.projs = nn.ModuleList([nn.Linear(d, common_dim) for d in input_dims])
        self.attn_fc = nn.ModuleList([
            nn.Sequential(nn.Linear(common_dim, 64), nn.Tanh(), nn.Linear(64, 1))
            for _ in range(self.n)
        ])
        self.final_proj = nn.Linear(common_dim, common_dim)

    def forward(self, embeddings):
        # embeddings: list of [B, d_i]
        projeds = [F.relu(p(e)) for p, e in zip(self.projs, embeddings)]       # list of [B, common_dim]
        scores = [fc(p).squeeze(-1) for fc, p in zip(self.attn_fc, projeds)]    # each [B]
        scores = torch.stack(scores, dim=1)    # [B, n]
        weights = torch.softmax(scores, dim=1) # [B, n]
        stacked = torch.stack(projeds, dim=1) # [B, n, common_dim]
        fused = (stacked * weights.unsqueeze(-1)).sum(dim=1) # [B, common_dim]
        fused = F.relu(self.final_proj(fused))
        return fused, weights


class MultimodalNet(nn.Module):
    def __init__(self, atom_feat_dim, text_model_name, graph_emb_dim=128, text_emb_dim=128, fusion_dim=128,
                 num_binary=1, num_organs=1, num_adr=1, debug=False):
        super().__init__()
        self.graph_enc = GATEncoder(in_dim=atom_feat_dim, out_dim=graph_emb_dim, debug=debug)
        self.text_enc = TextEncoder(hf_model_name=text_model_name, proj_dim=text_emb_dim, debug=debug)
        self.fusion = ModalityAttentionFusion([graph_emb_dim, text_emb_dim], common_dim=fusion_dim)
        self.binary_head = nn.Linear(fusion_dim, num_binary)
        self.organ_head = nn.Linear(fusion_dim, num_organs)
        self.adr_head = nn.Linear(fusion_dim, num_adr)
        self.debug = debug

    def forward(self, batch):
        g = batch['graph']
        node_x = getattr(g, 'x', None)
        edge_index = getattr(g, 'edge_index', None)
        edge_attr = getattr(g, 'edge_attr', None)   # optional
        batch_idx = getattr(g, 'batch', None)

        if self.debug:
            print("Model forward DEBUG: node_x shape:", None if node_x is None else tuple(node_x.shape))

        graph_emb = self.graph_enc(node_x, edge_index, edge_attr, batch_idx)   # [B, graph_emb_dim]
        text_emb = self.text_enc(batch['input_ids'], batch['attention_mask'])  # [B, text_emb_dim]

        fused, weights = self.fusion([graph_emb, text_emb])                    # [B, fusion_dim], [B, 2]
        bin_logits = self.binary_head(fused)
        organ_logits = self.organ_head(fused)
        adr_logits = self.adr_head(fused)

        if self.debug:
            print("Model outputs shapes -> bin:", bin_logits.shape, "organ:", organ_logits.shape, "adr:", adr_logits.shape)
        return {'binary_logits': bin_logits, 'organ_logits': organ_logits, 'adr_logits': adr_logits, 'fusion_weights': weights}


In [44]:
import torch
import torch.nn.functional as F
import numpy as np
from sklearn.metrics import roc_auc_score

def masked_bce_loss(pred, target, weight=None):
    """
    Binary Cross Entropy với mask cho missing labels (NaN)

    Args:
        pred: [batch, num_labels] - Logits (chưa qua sigmoid)
        target: [batch, num_labels] - Labels (chứa NaN cho labels thiếu)
        weight: [num_labels] - Optional, trọng số cho từng label
    Returns:
        loss: Scalar tensor
    """
    # Mask các label KHÔNG phải NaN
    mask = ~torch.isnan(target)

    if mask.sum() == 0:
        return torch.tensor(0.0, requires_grad=True, device=pred.device)

    # Thay NaN = 0 để tránh lỗi (sẽ bị mask anyway)
    target_clean = torch.where(torch.isnan(target), torch.zeros_like(target), target)

    # Compute loss element-wise (chưa reduce)
    loss = F.binary_cross_entropy_with_logits(pred, target_clean, reduction='none')

    # Apply weight TRƯỚC KHI mask (nếu có)
    if weight is not None:
        weight_expanded = weight.unsqueeze(0).expand_as(loss)
        loss = loss * weight_expanded

    # Apply mask và tính mean
    masked_loss = loss[mask]

    return masked_loss.mean()


def calculate_roc_auc(y_true, y_scores):
    """
    Tính ROC-AUC cho multi-label classification
    Tự động bỏ qua NaN và các cột chỉ có 1 class
    """
    auc_scores = []
    num_labels = y_true.shape[1]

    for i in range(num_labels):
        true_col = y_true[:, i]
        pred_col = y_scores[:, i]

        # Lọc bỏ NaN
        mask = ~np.isnan(true_col)
        valid_true = true_col[mask]
        valid_pred = pred_col[mask]

        # Chỉ tính AUC nếu có cả class 0 và class 1
        if len(np.unique(valid_true)) == 2:
            try:
                auc = roc_auc_score(valid_true, valid_pred)
                auc_scores.append(auc)
            except ValueError:
                pass

    return np.mean(auc_scores) if len(auc_scores) > 0 else 0.0


def evaluate_model(model, dataloader, task_name, device):
    """
    Đánh giá model trên 1 dataset cụ thể

    Args:
        model: MultimodalNet
        dataloader: DataLoader của task cần eval (với collate_fn=lambda x: x)
        task_name: 'tox21', 'toxcast', hoặc 'sider'
        device: 'cuda' hoặc 'cpu'
    Returns:
        auc_score: ROC-AUC của task này
    """
    model.eval()

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch_data in dataloader:
            # === Prepare data - batch_data là LIST of dicts ===
            graph_list = [item['graph'] for item in batch_data]
            graph_batch = Batch.from_data_list(graph_list).to(device)

            input_ids = torch.stack([item['input_ids'] for item in batch_data]).to(device)
            attention_mask = torch.stack([item['attention_mask'] for item in batch_data]).to(device)
            labels = torch.stack([item['labels'] for item in batch_data])  # Keep on CPU

            # === Forward với task cụ thể ===
            outputs = model(graph_batch, input_ids, attention_mask, task=task_name)

            # === Convert logits to probabilities ===
            probs = torch.sigmoid(outputs).cpu().numpy()

            # === Collect predictions & labels ===
            all_preds.append(probs)
            all_labels.append(labels.numpy())

    # === Concatenate all batches ===
    if len(all_preds) == 0:
        return 0.0

    y_pred = np.vstack(all_preds)  # [N_samples, N_labels]
    y_true = np.vstack(all_labels)  # [N_samples, N_labels]

    # === Calculate AUC ===
    auc = calculate_roc_auc(y_true, y_pred)

    return auc


In [58]:
# Sửa train_step cho single loader (vị trí sửa: dùng 1 batch, split labels cho 3 losses, sum backward)
def train_step(model, optimizer, batch, device):
    """
    Training 1 step trên single batch từ merged loader.
    Calc masked loss riêng từng head (split labels), accumulate sum.
    """
    model.train()
    optimizer.zero_grad()

    # Move to device
    batch['graph'] = batch['graph'].to(device)
    batch['input_ids'] = batch['input_ids'].to(device)
    batch['attention_mask'] = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)  # [B, 28]

    outputs = model(batch)

    # Split labels: col 0: binary, 1-19: organ (1:20), 20-27: adr (20:28)
    loss_bin = masked_bce_loss(outputs['binary_logits'], labels[:, 0:1])
    loss_organ = masked_bce_loss(outputs['organ_logits'], labels[:, 1:20])
    loss_adr = masked_bce_loss(outputs['adr_logits'], labels[:, 20:28])

    total_loss = loss_bin + loss_organ + loss_adr
    total_loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()

    return total_loss.item()

# Sửa evaluate_model cho single loader (vị trí sửa: collect probs/labels per head, calc AUC riêng)
def evaluate_model(model, loader, device):
    """
    Eval trên full loader, return AUC per head group.
    """
    model.eval()

    all_bin_preds, all_bin_labels = [], []
    all_organ_preds, all_organ_labels = [], []
    all_adr_preds, all_adr_labels = [], []

    with torch.no_grad():
        for batch in loader:
            batch['graph'] = batch['graph'].to(device)
            batch['input_ids'] = batch['input_ids'].to(device)
            batch['attention_mask'] = batch['attention_mask'].to(device)
            labels = batch['labels']  # [B, 28] CPU

            outputs = model(batch)

            bin_probs = torch.sigmoid(outputs['binary_logits']).cpu().numpy()
            organ_probs = torch.sigmoid(outputs['organ_logits']).cpu().numpy()
            adr_probs = torch.sigmoid(outputs['adr_logits']).cpu().numpy()

            all_bin_preds.append(bin_probs)
            all_bin_labels.append(labels[:, 0:1].numpy())

            all_organ_preds.append(organ_probs)
            all_organ_labels.append(labels[:, 1:20].numpy())

            all_adr_preds.append(adr_probs)
            all_adr_labels.append(labels[:, 20:28].numpy())

    # Concat & calc AUC
    auc_bin = calculate_roc_auc(np.vstack(all_bin_labels), np.vstack(all_bin_preds))
    auc_organ = calculate_roc_auc(np.vstack(all_organ_labels), np.vstack(all_organ_preds))
    auc_adr = calculate_roc_auc(np.vstack(all_adr_labels), np.vstack(all_adr_preds))

    return auc_bin, auc_organ, auc_adr

# CODE TRAIN HOÀN THIỆN (single dataset từ merged)
tokenizer = AutoTokenizer.from_pretrained('seyonec/ChemBERTa-zinc-base-v1')

# Label cols ordered từ prompt (binary + organ 19 + adr 8 = 28)
label_cols_ordered = ['any_toxicity', 'sider_Hepatobiliary disorders', 'sider_Metabolism and nutrition disorders', 'sider_Eye disorders', 'sider_Musculoskeletal and connective tissue disorders', 'sider_Gastrointestinal disorders', 'sider_Immune system disorders', 'sider_Reproductive system and breast disorders', 'sider_General disorders and administration site conditions', 'sider_Endocrine disorders', 'sider_Vascular disorders', 'sider_Blood and lymphatic system disorders', 'sider_Skin and subcutaneous tissue disorders', 'sider_Congenital, familial and genetic disorders', 'sider_Respiratory, thoracic and mediastinal disorders', 'sider_Psychiatric disorders', 'sider_Renal and urinary disorders', 'sider_Ear and labyrinth disorders', 'sider_Cardiac disorders', 'sider_Nervous system disorders', 'sider_Product issues', 'sider_Investigations', 'sider_Social circumstances', 'sider_Neoplasms benign, malignant and unspecified (incl cysts and polyps)', 'sider_Surgical and medical procedures', 'sider_Infections and infestations', 'sider_Pregnancy, puerperium and perinatal conditions', 'sider_Injury, poisoning and procedural complications']

# dataset = SmartDrugDatasetMerged(merged_df, tokenizer, label_cols_ordered, max_len=64)
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(merged_df, test_size=0.1, random_state=42)
train_dataset = SmartDrugDatasetMerged(train_df, tokenizer, label_cols_ordered, max_len=64)
val_dataset = SmartDrugDatasetMerged(val_df, tokenizer, label_cols_ordered, max_len=64)


batch_size = 32
# loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)  # Single loader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False) # val_loader shuffle=False để AUC ổn định


# Atom feat dim
atom_feat_dim = 41

model = MultimodalNet(atom_feat_dim=atom_feat_dim,
                      text_model_name='seyonec/ChemBERTa-zinc-base-v1',
                      graph_emb_dim=128, text_emb_dim=128, fusion_dim=128,
                      num_binary=1, num_organs=19, num_adr=8)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

num_epochs = 10  # Adjust
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")

     # -------- TRAINING --------
    model.train()
    total_loss = 0.0
    num_batches = 0

    for batch in train_loader:
        loss = train_step(model, optimizer, batch, device)
        total_loss += loss
        num_batches += 1

    avg_train_loss = total_loss / num_batches
    print(f"Train Loss: {avg_train_loss:.4f}")

    # -------- VALIDATION --------
    model.eval()
    auc_bin, auc_organ, auc_adr = evaluate_model(model, val_loader, device)

    print(f"Val AUCs | Binary: {auc_bin:.4f} | Organ: {auc_organ:.4f} | ADR: {auc_adr:.4f}")


torch.save(model.state_dict(), f'{GG_COLAB}/trained_multimodal_net.pth')
print("Training complete. Model saved.")

Processing merged dataset with 9801 samples...


100%|██████████| 9801/9801 [00:27<00:00, 350.28it/s]


Finished building dataset. total processed samples: 9801
+ DEBUG Sample 0:
++ smiles=COc1cccc2c1[C@@H]1CN(CCCCn3c(=O)[nH]c4c(sc5ncc(-c6ccccc6)nc54)c3=O)C[C@@H]1CO2
++ labels=[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
++ mask=[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
+ DEBUG Sample 1:
++ smiles=O=C(Nc1ccc(/C=C/c2ccc(NC(=O)c3cc(S(=O)(=O)O)c4cccnc4c3O)cc2S(=O)(=O)O)c(S(=O)(=O)O)c1)c1cc(S(=O)(=O)O)c2cccnc2c1O
++ labels=[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
++ mask=[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
+ DEBUG Sample 2:
++ smiles=Cc1cc(C(C)(C)C)c(O)c(C)c1CC1=NCCN1
++ labels=[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
++ mask=[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
Processing merged dataset with 1089 samples...


100%|██████████| 1089/1089 [00:01<00:00, 576.52it/s]


Finished building dataset. total processed samples: 1089
+ DEBUG Sample 0:
++ smiles=O=c1[nH]c2ccccc2n1C1CCNCC1
++ labels=[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
++ mask=[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
+ DEBUG Sample 1:
++ smiles=CC1CC(O)CC(C)(C)C1
++ labels=[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
++ mask=[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
+ DEBUG Sample 2:
++ smiles=c1ccc(N=Nc2ccccc2)cc1
++ labels=[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
++ mask=[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
Epoch 1/10
Train Loss: 1.1986
Val AUCs | Binary: 0.6595 | Organ: 0.8117 | ADR: 0.8276
Epoch 2/10
Train Loss: 0.9832
Val AUCs | Binary: 0.6925 | Organ: 0.8318 | ADR: 0.8200
Epoch 3/10
Train Loss: 0.9093
Val AUCs | Bi

In [59]:
def predict_single_smiles(smiles, model, tokenizer, device):
    model.eval()

    # --- build graph ---
    mol = Chem.MolFromSmiles(smiles)
    g = molecule_to_graph(mol)
    g = g.to(device)

    # --- tokenize ---
    encoded = tokenizer(
        smiles,
        padding='max_length',
        truncation=True,
        max_length=64,
        return_tensors='pt'
    )
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    # --- prepare batch dict giống training ---
    batch = {
        'graph': g,
        'input_ids': input_ids,
        'attention_mask': attention_mask
    }

    with torch.no_grad():
        outputs = model(batch)

        bin_prob = torch.sigmoid(outputs['binary_logits']).cpu().numpy()[0, 0]
        organ_probs = torch.sigmoid(outputs['organ_logits']).cpu().numpy()[0]
        adr_probs = torch.sigmoid(outputs['adr_logits']).cpu().numpy()[0]

    return bin_prob, organ_probs, adr_probs


smiles_test = "CCOc1ccc2nc(S(N)(=O)=O)sc2c1"

bin_p, organ_p, adr_p = predict_single_smiles(
    smiles_test, model, tokenizer, device
)

print("Binary toxicity:", bin_p)
print("Organ (19 probs):", organ_p)
print("ADR (8 probs):", adr_p)


Binary toxicity: 0.9879056
Organ (19 probs): [0.00399554 0.0048158  0.00496453 0.00346897 0.00915911 0.00701358
 0.00482247 0.00876398 0.00087401 0.0072839  0.00522476 0.01270889
 0.00086104 0.0054768  0.00588085 0.00447956 0.00401606 0.00575863
 0.01008808]
ADR (8 probs): [6.0533352e-05 6.3940804e-03 4.8331553e-04 1.5137258e-03 7.4587861e-04
 5.0754477e-03 3.9736007e-04 5.6759380e-03]


In [60]:
smiles_test = "CCOc1ccc2nc(S(N)(=O)=O)sc2c1"

bin_p, organ_p, adr_p = predict_single_smiles(
    smiles_test, model, tokenizer, device
)

print("Binary prob:", bin_p)
print("Organ probs:", organ_p)
print("ADR probs:", adr_p)


Binary prob: 0.9879056
Organ probs: [0.00399554 0.0048158  0.00496453 0.00346897 0.00915911 0.00701358
 0.00482247 0.00876398 0.00087401 0.0072839  0.00522476 0.01270889
 0.00086104 0.0054768  0.00588085 0.00447956 0.00401606 0.00575863
 0.01008808]
ADR probs: [6.0533352e-05 6.3940804e-03 4.8331553e-04 1.5137258e-03 7.4587861e-04
 5.0754477e-03 3.9736007e-04 5.6759380e-03]


In [61]:
def print_prediction_with_labels(bin_p, organ_p, adr_p, label_cols):
    print("\n===== FULL PREDICTION =====\n")

    # Binary toxicity
    print(f"{label_cols[0]}: {bin_p:.4f}")

    print("\n--- ORGAN TOXICITY (19 labels) ---")
    for label, prob in zip(label_cols[1:20], organ_p):
        print(f"{label}: {prob:.4f}")

    print("\n--- ADR (8 labels) ---")
    for label, prob in zip(label_cols[20:], adr_p):
        print(f"{label}: {prob:.4f}")


bin_p, organ_p, adr_p = predict_single_smiles(
    smiles_test, model, tokenizer, device
)

print_prediction_with_labels(bin_p, organ_p, adr_p, label_cols_ordered)



===== FULL PREDICTION =====

any_toxicity: 0.9879

--- ORGAN TOXICITY (19 labels) ---
sider_Hepatobiliary disorders: 0.0040
sider_Metabolism and nutrition disorders: 0.0048
sider_Eye disorders: 0.0050
sider_Musculoskeletal and connective tissue disorders: 0.0035
sider_Gastrointestinal disorders: 0.0092
sider_Immune system disorders: 0.0070
sider_Reproductive system and breast disorders: 0.0048
sider_General disorders and administration site conditions: 0.0088
sider_Endocrine disorders: 0.0009
sider_Vascular disorders: 0.0073
sider_Blood and lymphatic system disorders: 0.0052
sider_Skin and subcutaneous tissue disorders: 0.0127
sider_Congenital, familial and genetic disorders: 0.0009
sider_Respiratory, thoracic and mediastinal disorders: 0.0055
sider_Psychiatric disorders: 0.0059
sider_Renal and urinary disorders: 0.0045
sider_Ear and labyrinth disorders: 0.0040
sider_Cardiac disorders: 0.0058
sider_Nervous system disorders: 0.0101

--- ADR (8 labels) ---
sider_Product issues: 0.0001
s