# Exercise 1


In [7]:
!pip install pandas
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.9.5-cp39-cp39-macosx_11_0_arm64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.5-cp39-cp39-macosx_11_0_arm64.whl (27.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.7/27.7 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: rdkit
Successfully installed rdkit-2024.9.5


In [14]:
import os
from pathlib import Path

import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from rdkit.Chem import rdFingerprintGenerator

# Load the dataset. The CSV location can be overridden with the DATASET_CSV
# environment variable; when it is unset, the original local path is used,
# so existing runs keep working unchanged.
DATASET_CSV = Path(
    os.environ.get("DATASET_CSV", "/Users/adelasilarova/Downloads/dataset.csv")
)
df = pd.read_csv(DATASET_CSV)

# SMILES strings of all rows, with missing values dropped.
smiles_list = df["SMILES"].dropna().tolist()

df.head()


Unnamed: 0,Ambit_InchiKey,Original_Entry_ID,Entrez_ID,Activity_Flag,pXC50,DB,Original_Assay_ID,Tax_ID,Gene_Symbol,Ortholog_Group,SMILES
0,CWYBNSVSYKQGJB-UHFFFAOYNA-N,71541095,6532,A,6.85387,pubchem,727355,9606,SLC6A4,4061,ClC=1C=C(C2CCCCNC2)C=CC1Cl
1,CTETYYAZBPJBHE-UHFFFAOYNA-N,CHEMBL1289,6532,A,6.37,chembl20,774720,9606,SLC6A4,4061,C(#CI)COC=1C=C(Cl)C(=CC1Cl)Cl
2,FQDRMHHCWZAXJM-UHFFFAOYNA-N,125558,25553,A,9.98297,pubchem,205028,10116,SLC6A4,4061,O1C=2C=C3CC(N)CC3=CC2OC1
3,QRAWNNQNLQPNIZ-UHFFFAOYNA-N,CHEMBL205229,6532,A,7.54,chembl20,950695,9606,SLC6A4,4061,N1C=C(CCN)C2=C1C(=CC=C2)F
4,HCFHWXDIZOAUTQ-UHFFFAOYNA-N,CHEMBL6376,25553,A,6.11,chembl20,201505,10116,SLC6A4,4061,C1(=CC=2OCOC2C=C1C)CC(N)C


In [17]:
# Morgan fingerprint generator (radius=2, fpSize=2048) shared by the dataset
# molecules and, further below, the reference molecule.
morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)

# Parse every SMILES string; a falsy molecule gets None as its fingerprint
# placeholder so that the two lists stay aligned with smiles_list.
molecules = list(map(Chem.MolFromSmiles, smiles_list))
fingerprints = [
    None if not mol else morgan_gen.GetFingerprint(mol)
    for mol in molecules
]

# Keep only the (SMILES, fingerprint) pairs that actually have a fingerprint.
valid_data = [
    pair
    for pair in zip(smiles_list, fingerprints)
    if pair[1] is not None
]
fp_df = pd.DataFrame(valid_data, columns=["SMILES", "Fingerprint"])

# Tanimoto similarity
# Reference (query) molecule that every dataset molecule is compared against.
reference_smiles = "CCO"
reference_mol = Chem.MolFromSmiles(reference_smiles)
# MolFromSmiles returns None when the SMILES cannot be parsed; fail with a
# clear message here instead of an obscure error inside GetFingerprint.
if reference_mol is None:
    raise ValueError(f"Could not parse reference SMILES: {reference_smiles!r}")
reference_fp = morgan_gen.GetFingerprint(reference_mol)

def tanimoto_similarity(fp1, fp2):
    """Return the Tanimoto similarity between two fingerprints.

    Thin wrapper that delegates to ``DataStructs.TanimotoSimilarity``.

    Args:
        fp1: First fingerprint.
        fp2: Second fingerprint.

    Returns:
        Whatever ``DataStructs.TanimotoSimilarity(fp1, fp2)`` returns.
    """
    similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
    return similarity

def _similarity_to_reference(fp):
    """Score one dataset fingerprint against the reference fingerprint."""
    return tanimoto_similarity(fp, reference_fp)


# Add one similarity score per row of the fingerprint table.
fp_df["Tanimoto_Similarity"] = fp_df["Fingerprint"].apply(_similarity_to_reference)

# Order the rows from most to least similar and keep the first three.
ranked_by_similarity = fp_df.sort_values(by="Tanimoto_Similarity", ascending=False)
top_hits = ranked_by_similarity.head(3)

print("Top 3 Hits:")
print(top_hits[["SMILES", "Tanimoto_Similarity"]])


Top 3 Hits:
                       SMILES  Tanimoto_Similarity
7    C=1(C=CC(=CC1)CCCCCCCC)O             0.166667
21  C=1(C=CC(=CC1)CCCCCCCCC)O             0.166667
6     O=C(C(NCC)C)C=1C=CC=CC1             0.142857


## Diskuze

Nejvyšší podobnost 0,166667 mají dvě molekuly, konkrétně `C=1(C=CC(=CC1)CCCCCCCC)O` a `C=1(C=CC(=CC1)CCCCCCCCC)O`.
Třetí nejvyšší podobnost 0,142857 má molekula `O=C(C(NCC)C)C=1C=CC=CC1`.

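Nízké hodnoty Tanimotova koeficientu značí, že žádná z analyzovaných sloučenin není příliš podobná referenční molekule. Referenční molekulou je ethanol (`CCO`), tedy velmi malá molekula, jejíž otisk obsahuje jen několik nastavených bitů, takže podobnost s výrazně většími molekulami v datasetu vychází nutně nízká. To může znamenat, že dané molekuly mají jinou základní strukturu nebo že Tanimotova metrika není optimální pro tento dataset. Pro přesnější výsledky by bylo vhodné kombinovat více metod hodnocení podobnosti a zahrnout i experimentální validaci.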
