In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

# Generated molecules: the 'validity' and 'canonical_smiles' columns are read below.
generated_df = pd.read_csv('generated_molecules.csv')

# Reference NTD drugs; their SMILES are kept as a plain list for fingerprinting.
ntd_drugs_df = pd.read_csv('Final_NTD_Drugs_with_names.csv')
ntd_drugs_smiles = ntd_drugs_df['canonical_smiles'].tolist()

# Keep only the SMILES of generated rows whose validity flag is exactly 'valid'.
is_valid = generated_df['validity'] == 'valid'
valid_smiles = generated_df.loc[is_valid, 'canonical_smiles'].tolist()

# Convert SMILES to RDKit Mol objects and compute fingerprints
def smiles_to_mol(smiles_list):
    """Parse every SMILES string in ``smiles_list`` with RDKit.

    Returns a list with the same length and order as the input, holding
    whatever ``Chem.MolFromSmiles`` returned for each entry. RDKit documents
    that this is ``None`` for a string it cannot parse, so callers that need
    real molecules must check for ``None`` themselves.
    """
    return list(map(Chem.MolFromSmiles, smiles_list))

def mol_to_fp(mol, radius=2, n_bits=2048):
    """Compute a Morgan fingerprint bit vector for a single molecule.

    Args:
        mol: RDKit molecule to fingerprint.
        radius: Morgan radius handed to RDKit. Defaults to 2, the value this
            notebook previously hard-coded.
        n_bits: Length of the bit vector. Defaults to 2048, the value this
            notebook previously hard-coded.

    Returns:
        The result of ``AllChem.GetMorganFingerprintAsBitVect`` for ``mol``.
    """
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)

valid_mols = smiles_to_mol(valid_smiles)
ntd_drugs_mols = smiles_to_mol(ntd_drugs_smiles)

# Chem.MolFromSmiles yields None for a SMILES it cannot parse, and the
# fingerprint call below cannot take None. Drop such entries from each SMILES
# list and its Mol list together, so that any later zip() over SMILES and
# fingerprints still pairs every fingerprint with its own string.
valid_parsed = [
    (smile, mol) for smile, mol in zip(valid_smiles, valid_mols) if mol is not None
]
ntd_drugs_parsed = [
    (smile, mol) for smile, mol in zip(ntd_drugs_smiles, ntd_drugs_mols) if mol is not None
]

skipped_count = (
    (len(valid_smiles) - len(valid_parsed))
    + (len(ntd_drugs_smiles) - len(ntd_drugs_parsed))
)
if skipped_count:
    # Report rather than skip silently; nothing is printed when every SMILES parses.
    print('Skipped', skipped_count, 'SMILES that RDKit could not parse')

valid_smiles = [smile for smile, mol in valid_parsed]
valid_mols = [mol for smile, mol in valid_parsed]
ntd_drugs_smiles = [smile for smile, mol in ntd_drugs_parsed]
ntd_drugs_mols = [mol for smile, mol in ntd_drugs_parsed]

# One fingerprint per remaining molecule, in the same order as the SMILES lists.
valid_fps = [mol_to_fp(mol) for mol in valid_mols]
ntd_drugs_fps = [mol_to_fp(mol) for mol in ntd_drugs_mols]

# Compute Tanimoto similarity
def compute_tanimoto(fp1, fp2):
    """Return the Tanimoto similarity of two fingerprints.

    Thin wrapper: both arguments are passed unchanged to
    ``DataStructs.TanimotoSimilarity`` and its result is returned as-is.
    """
    score = DataStructs.TanimotoSimilarity(fp1, fp2)
    return score

# Score every (generated molecule, NTD drug) pair and collect one row per pair
# for the similarity DataFrame.
similarity_data = []

for valid_smile, valid_fp in zip(valid_smiles, valid_fps):
    # A single bulk call scores this molecule against all NTD drugs inside
    # RDKit, instead of making one Python-level call per pair. The scores come
    # back in the order of ntd_drugs_fps, so they line up with ntd_drugs_smiles.
    similarities = DataStructs.BulkTanimotoSimilarity(valid_fp, ntd_drugs_fps)
    for ntd_drug_smile, similarity in zip(ntd_drugs_smiles, similarities):
        similarity_data.append([valid_smile, ntd_drug_smile, similarity])

similarity_df = pd.DataFrame(
    similarity_data,
    columns=['valid_smile', 'ntd_drug_smile', 'tanimoto_similarity'],
)

In [14]:
# Keep only the pairs whose similarity falls in the half-open window [0.45, 0.6).
tanimoto_scores = similarity_df['tanimoto_similarity']
in_window = (tanimoto_scores >= 0.45) & (tanimoto_scores < 0.6)
filtered_similarity_df = similarity_df[in_window]


In [15]:
# Merge with NTD drugs data to get molecule names.
# Always merge from the three pair columns: this cell reassigns
# filtered_similarity_df, so on a re-run the frame already carries the merged
# columns, and merging again would suffix them (_x/_y) and break the
# 'molecule_name' selection below.
pair_columns = ['valid_smile', 'ntd_drug_smile', 'tanimoto_similarity']

# Exact duplicate (SMILES, name) rows in the NTD table would repeat every
# matching pair in the merge, so collapse them first.
ntd_names = ntd_drugs_df[['canonical_smiles', 'molecule_name']].drop_duplicates()

filtered_similarity_df = filtered_similarity_df[pair_columns].merge(
    ntd_names,
    left_on='ntd_drug_smile',
    right_on='canonical_smiles',
    how='left',
)

# Select and reorder columns
final_df = filtered_similarity_df[pair_columns + ['molecule_name']]

In [16]:
# Show the annotated pairs; the recorded output elides the middle rows of the frame.
final_df

Unnamed: 0,valid_smile,ntd_drug_smile,tanimoto_similarity,molecule_name
0,CCOC(=O)c1ccc(NC(=O)N/N=C/c2ccc([N+](=O)[O-])o...,NC(=O)N/N=C/c1ccc([N+](=O)[O-])o1,0.527273,NITROFURAZONE
1,CCN(CC)c1ccc(NC(=O)N/N=C/c2ccc([N+](=O)[O-])o2...,NC(=O)N/N=C/c1ccc([N+](=O)[O-])o1,0.557692,NITROFURAZONE
2,CCCCc1ccc(C(=O)N/N=C/c2ccc([N+](=O)[O-])o2)cc1,NC(=O)N/N=C/c1ccc([N+](=O)[O-])o1,0.557692,NITROFURAZONE
3,CCc1ccc(NC(=O)N/N=C/c2ccc([N+](=O)[O-])o2)cc1,NC(=O)N/N=C/c1ccc([N+](=O)[O-])o1,0.580000,NITROFURAZONE
4,O=[N+]([O-])c1ccc(/N=C/c2ccc([N+](=O)[O-])o2)cc1,NC(=O)N/N=C/c1ccc([N+](=O)[O-])o1,0.500000,NITROFURAZONE
...,...,...,...,...
88,CCN(CC)CCCCCCCCCCCNc1ccnc2ccc(Cl)cc12,CCN(CC)CCCC(C)Nc1ccnc2cc(Cl)ccc12,0.535714,CHLOROQUINE
89,CCCCCCCCCCCCOc1ccc(C(=O)O)cc1,N=C(N)c1ccc(OCCCCCOc2ccc(C(=N)N)cc2)cc1,0.485714,PENTAMIDINE
90,CCCNC(=O)/N=C/c1ccc([N+](=O)[O-])o1,NC(=O)N/N=C/c1ccc([N+](=O)[O-])o1,0.520833,NITROFURAZONE
91,CO/N=C(\Nc1nc2ccccc2[nH]1)c1ccc(-c2ccccc2)cc1,COC(=O)Nc1nc2cc(C(=O)c3ccccc3)ccc2[nH]1,0.454545,MEBENDAZOLE


In [17]:
# Persist the annotated pairs; the DataFrame index is not written to the file.
final_df.to_csv(
    path_or_buf='tanimoto_similarity_results.csv',
    index=False,
)


In [18]:
# Number of generated SMILES that passed the validity filter.
valid_smiles_count = len(valid_smiles)
print(valid_smiles_count)

26623


In [19]:
import pandas as pd

# Read the NTD drugs file and give every distinct canonical_smiles an integer
# label (pd.factorize numbers values in order of first appearance).
ntd_drugs_file = 'Final_NTD_Drugs_with_names.csv'
ntd_drugs_data = pd.read_csv(ntd_drugs_file)
ntd_drugs_data['label'] = pd.factorize(ntd_drugs_data['canonical_smiles'])[0]
label_to_name = ntd_drugs_data.set_index('label')['molecule_name'].to_dict()

# Read tanimoto_similarity_results.csv
tanimoto_file = 'tanimoto_similarity_results.csv'
tanimoto_data = pd.read_csv(tanimoto_file)

# Attach the label to each similarity row. The lookup is reduced to one row per
# distinct SMILES first: if a SMILES appeared on several rows of the NTD file,
# merging against the raw table would repeat every matching similarity row and
# inflate the counts below.
smiles_labels = ntd_drugs_data[['canonical_smiles', 'label']].drop_duplicates()
merged_data = tanimoto_data.merge(
    smiles_labels,
    left_on='ntd_drug_smile',
    right_on='canonical_smiles',
    how='left',
)

# Count rows per label and pick up the SMILES for each label. Both results are
# indexed by the same sorted labels, so their values line up position by position.
label_groups = merged_data.groupby('label')
label_counts = label_groups.size()
label_smiles = label_groups['canonical_smiles'].first()

# Convert labels to molecule names
molecule_names = label_counts.index.map(label_to_name)

# Combine counts, molecule names, and canonical_smiles
results = pd.DataFrame({
    'molecule_name': molecule_names,
    'count': label_counts.values,
    'canonical_smile': label_smiles.values
})

results


Unnamed: 0,molecule_name,count,canonical_smile
0,NIFURTIMOX,2,CC1CS(=O)(=O)CCN1/N=C/c1ccc([N+](=O)[O-])o1
1,BENZNIDAZOLE,1,O=C(Cn1ccnc1[N+](=O)[O-])NCc1ccccc1
2,ALBENDAZOLE,1,CCCSc1ccc2[nH]c(NC(=O)OC)nc2c1
3,OXFENDAZOLE,1,COC(=O)Nc1nc2cc([S+]([O-])c3ccccc3)ccc2[nH]1
4,CHLOROQUINE,19,CCN(CC)CCCC(C)Nc1ccnc2cc(Cl)ccc12
5,MEBENDAZOLE,10,COC(=O)Nc1nc2cc(C(=O)c3ccccc3)ccc2[nH]1
6,PENTAMIDINE,5,N=C(N)c1ccc(OCCCCCOc2ccc(C(=N)N)cc2)cc1
7,NITROFURAZONE,52,NC(=O)N/N=C/c1ccc([N+](=O)[O-])o1
8,FLUCONAZOLE,1,OC(Cn1cncn1)(Cn1cncn1)c1ccc(F)cc1F
9,DAPSONE,1,Nc1ccc(S(=O)(=O)c2ccc(N)cc2)cc1
