In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

# Read the generated molecules and the NTD reference drugs from CSV
generated_df = pd.read_csv('merged_generated_molecules.csv')
ntd_drugs_df = pd.read_csv('Final_NTD_Drugs_with_names.csv')

# SMILES of the reference drugs, in file order
ntd_drugs_smiles = ntd_drugs_df['canonical_smiles'].tolist()

# SMILES of the generated molecules whose 'validity' column equals 'valid'
valid_smiles = generated_df.loc[generated_df['validity'] == 'valid', 'canonical_smiles'].tolist()

# Convert SMILES to RDKit Mol objects and compute fingerprints
def smiles_to_mol(smiles_list):
    """Parse each SMILES string with ``Chem.MolFromSmiles``, preserving order.

    Args:
        smiles_list: iterable of SMILES strings.

    Returns:
        A list holding one ``Chem.MolFromSmiles`` result per input, in the same
        order as ``smiles_list``. Results are not filtered, so the output stays
        positionally aligned with the input.
        NOTE(review): RDKit is documented to return None for a SMILES it cannot
        parse - confirm the inputs are all parseable before fingerprinting.
    """
    molecules = []
    for smiles in smiles_list:
        molecules.append(Chem.MolFromSmiles(smiles))
    return molecules

def mol_to_fp(mol):
    """Compute the Morgan bit-vector fingerprint of a molecule.

    Args:
        mol: molecule object accepted by
            ``AllChem.GetMorganFingerprintAsBitVect``.

    Returns:
        The result of ``AllChem.GetMorganFingerprintAsBitVect`` called with
        radius 2 and ``nBits=2048``.
    """
    radius, n_bits = 2, 2048
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)

# Parse both SMILES collections first (generated molecules, then reference
# drugs), then fingerprint every molecule in the same order.
valid_mols, ntd_drugs_mols = smiles_to_mol(valid_smiles), smiles_to_mol(ntd_drugs_smiles)

valid_fps = list(map(mol_to_fp, valid_mols))
ntd_drugs_fps = list(map(mol_to_fp, ntd_drugs_mols))

# Compute Tanimoto similarity
def compute_tanimoto(fp1, fp2):
    """Return the Tanimoto similarity of two fingerprints.

    Args:
        fp1: first fingerprint.
        fp2: second fingerprint.

    Returns:
        Whatever ``DataStructs.TanimotoSimilarity(fp1, fp2)`` returns.
    """
    score = DataStructs.TanimotoSimilarity(fp1, fp2)
    return score

# Score every generated molecule against every NTD drug and collect the
# results in long format: one row per (generated molecule, drug) pair.
# BulkTanimotoSimilarity scores one fingerprint against the whole drug panel
# in a single call, replacing one Python-level similarity call per pair.
similarity_data = []

for valid_smile, valid_fp in zip(valid_smiles, valid_fps):
    drug_similarities = DataStructs.BulkTanimotoSimilarity(valid_fp, ntd_drugs_fps)
    # Row order matches a nested loop: all drugs for the first generated
    # molecule, then all drugs for the second, and so on.
    similarity_data.extend(
        [valid_smile, ntd_drug_smile, drug_similarity]
        for ntd_drug_smile, drug_similarity in zip(ntd_drugs_smiles, drug_similarities)
    )

similarity_df = pd.DataFrame(similarity_data, columns=['valid_smile', 'ntd_drug_smile', 'tanimoto_similarity'])

In [1]:
# Keep only the pairs whose similarity falls in the half-open window [0.45, 0.6)
filtered_similarity_df = similarity_df.loc[
    lambda pairs: (pairs['tanimoto_similarity'] >= 0.45) & (pairs['tanimoto_similarity'] < 0.6)
]


NameError: name 'similarity_df' is not defined

In [33]:
# Attach each drug's name to the filtered pairs by matching on the drug SMILES.
# The merge starts from the three similarity columns only, so re-running this
# cell does not merge onto its own previous output (which would produce
# '_x'/'_y' suffixed columns and a KeyError on 'molecule_name' below).
filtered_similarity_df = filtered_similarity_df[['valid_smile', 'ntd_drug_smile', 'tanimoto_similarity']].merge(
    ntd_drugs_df[['canonical_smiles', 'molecule_name']],
    left_on='ntd_drug_smile',
    right_on='canonical_smiles',
    how='left',  # keep every filtered pair even if no drug row matches
)

# Keep the reporting columns in display order; this drops the redundant
# 'canonical_smiles' merge key.
final_df = filtered_similarity_df[['valid_smile', 'ntd_drug_smile', 'tanimoto_similarity', 'molecule_name']]

In [34]:
# Display the name-annotated similarity table as this cell's output
final_df

Unnamed: 0,valid_smile,ntd_drug_smile,tanimoto_similarity,molecule_name
0,CCCc1ccc(NC(=O)N/N=C/c2ccc([N+](=O)[O-])o2)cc1,NC(=O)N/N=C/c1ccc([N+](=O)[O-])o1,0.557692,NITROFURAZONE
1,CCCCCCCNc1ccnc2cc(Cl)ccc12,CCN(CC)CCCC(C)Nc1ccnc2cc(Cl)ccc12,0.500000,CHLOROQUINE
2,COc1ccc(/N=C/c2ccc([N+](=O)[O-])o2)cc1,NC(=O)N/N=C/c1ccc([N+](=O)[O-])o1,0.458333,NITROFURAZONE
3,CCN(CC)CCCNc1ccnc2ccccc12,CCN(CC)CCCC(C)Nc1ccnc2cc(Cl)ccc12,0.454545,CHLOROQUINE
4,O=C(Nc1nc2ccccc2[nH]1)c1ccc(-c2ccccc2)cc1,COC(=O)Nc1nc2cc(C(=O)c3ccccc3)ccc2[nH]1,0.470588,MEBENDAZOLE
...,...,...,...,...
450,CCCCCCCCCCCCCCCOc1ccc(CC(=O)N/N=C/c2ccc([N+](=...,NC(=O)N/N=C/c1ccc([N+](=O)[O-])o1,0.500000,NITROFURAZONE
451,CCCCCCCCCCNc1nccn1CC(=O)NCc1ccccc1.Cl,O=C(Cn1ccnc1[N+](=O)[O-])NCc1ccccc1,0.526316,BENZNIDAZOLE
452,N#CSc1nccnc1C(=O)N/N=C/c1ccc([N+](=O)[O-])o1,NC(=O)N/N=C/c1ccc([N+](=O)[O-])o1,0.527273,NITROFURAZONE
453,CCCCCCOc1ccc(C(=O)C(C)=O)cc1,N=C(N)c1ccc(OCCCCCOc2ccc(C(=N)N)cc2)cc1,0.459459,PENTAMIDINE


In [35]:
# Persist the annotated pairs without the row index; a later cell reads this file back
final_df.to_csv(path_or_buf='tanimoto_similarity_results.csv', index=False)


In [36]:
# Number of generated SMILES that passed the validity filter
print(len(valid_smiles))

214454


In [37]:
import pandas as pd

# Load the NTD drug table and give every distinct canonical SMILES an integer
# label via pd.factorize.
ntd_drugs_file = 'Final_NTD_Drugs_with_names.csv'
ntd_drugs_data = pd.read_csv(ntd_drugs_file)
ntd_drugs_data['label'] = pd.factorize(ntd_drugs_data['canonical_smiles'])[0]
label_to_name = ntd_drugs_data.set_index('label')['molecule_name'].to_dict()

# Load the similarity hits saved earlier in the notebook
tanimoto_file = 'tanimoto_similarity_results.csv'
tanimoto_data = pd.read_csv(tanimoto_file)

# Build a SMILES -> label lookup with one row per distinct SMILES. Without the
# de-duplication, a SMILES listed more than once in the drug file would
# multiply the matching hit rows in the left merge and inflate the counts.
label_lookup = ntd_drugs_data[['canonical_smiles', 'label']].drop_duplicates()
merged_data = tanimoto_data.merge(
    label_lookup,
    left_on='ntd_drug_smile',
    right_on='canonical_smiles',
    how='left',
)

# Per label: number of hits and the drug's canonical SMILES.
# Both come from a groupby on 'label', so their rows line up by label.
hits_by_label = merged_data.groupby('label')
label_counts = hits_by_label.size()
label_smiles = hits_by_label['canonical_smiles'].first()

# Translate labels back to molecule names
molecule_names = label_counts.index.map(label_to_name)

# One summary row per drug: name, hit count, canonical SMILES
results = pd.DataFrame({
    'molecule_name': molecule_names,
    'count': label_counts.values,
    'canonical_smile': label_smiles.values
})

results


Unnamed: 0,molecule_name,count,canonical_smile
0,NIFURTIMOX,2,CC1CS(=O)(=O)CCN1/N=C/c1ccc([N+](=O)[O-])o1
1,BENZNIDAZOLE,19,O=C(Cn1ccnc1[N+](=O)[O-])NCc1ccccc1
2,ACETAZOLAMIDE,4,CC(=O)Nc1nnc(S(N)(=O)=O)s1
3,ALBENDAZOLE,7,CCCSc1ccc2[nH]c(NC(=O)OC)nc2c1
4,OXFENDAZOLE,5,COC(=O)Nc1nc2cc([S+]([O-])c3ccccc3)ccc2[nH]1
5,CHLOROQUINE,95,CCN(CC)CCCC(C)Nc1ccnc2cc(Cl)ccc12
6,MEBENDAZOLE,25,COC(=O)Nc1nc2cc(C(=O)c3ccccc3)ccc2[nH]1
7,PENTAMIDINE,24,N=C(N)c1ccc(OCCCCCOc2ccc(C(=N)N)cc2)cc1
8,NITROFURAZONE,266,NC(=O)N/N=C/c1ccc([N+](=O)[O-])o1
9,FLUCONAZOLE,2,OC(Cn1cncn1)(Cn1cncn1)c1ccc(F)cc1F
