<h2 style="color:red;"> fingerprint_calculation</h2>

In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

def compute_fingerprint(smiles_string, radius=2, nBits=1024):
    """Compute the Morgan fingerprint for a molecule given its SMILES string."""
    molecule = Chem.MolFromSmiles(smiles_string)
    if molecule:
        return AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits)
    else:
        raise ValueError(f"Invalid SMILES string: {smiles_string}")

def compute_similarity(fp1, fp2):
    """Compute the Tanimoto similarity between two fingerprints."""
    return DataStructs.TanimotoSimilarity(fp1, fp2)

if __name__ == "__main__":
    smiles1 = 'CCO'
    smiles2 = 'CCN'
    
    fp1 = compute_fingerprint(smiles1)
    fp2 = compute_fingerprint(smiles2)
    
    similarity = compute_similarity(fp1, fp2)
    
    print(f"Similarity between {smiles1} and {smiles2} is: {similarity:.3f}")


Similarity between CCO and CCN is: 0.333


<h2 style="color:blue;">1.  Hierarchical clustering by the RDKit library and the SciPy library </h2>

In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from scipy.cluster.hierarchy import linkage, fcluster
import numpy as np

# Load the data from CSV
df = pd.read_csv('unique_smiles_circular_unique_tnm_1440.csv')

# Compute the fingerprints
molecules = [Chem.MolFromSmiles(smiles) for smiles in df['SMILES']]
fingerprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024) for mol in molecules]

# Calculate the distance matrix
num_molecules = len(molecules)
distance_matrix = np.zeros((num_molecules, num_molecules))
for i in range(num_molecules):
    for j in range(num_molecules):
        if i >= j:  # Only fill half of the matrix due to symmetry
            continue
        similarity = DataStructs.TanimotoSimilarity(fingerprints[i], fingerprints[j])
        distance_matrix[i, j] = distance_matrix[j, i] = 1 - similarity

# Hierarchical clustering
Z = linkage(distance_matrix, method='average')

# Define a threshold to cut the dendrogram and form clusters. This threshold represents 1 - similarity value.
threshold = 1.75
labels = fcluster(Z, threshold, criterion='distance')

df['Cluster'] = labels

# Group compounds that are closely related
clustered_compounds = df.groupby('Cluster')['SMILES'].apply(list)

for cluster, compounds in clustered_compounds.items():
    print(f"Cluster {cluster}:")
    for compound in compounds:
        print(f"\t{compound}")
    print("\n")


FileNotFoundError: [Errno 2] No such file or directory: 'unique_smiles_circular_unique_tnm_1440.csv'

In [3]:
import scipy

In [4]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from scipy.cluster.hierarchy import linkage, fcluster
import numpy as np

# Load the data from CSV
df = pd.read_csv('unique_smiles_allC_circular_unique_tnm_1440.csv')

# Compute the fingerprints
molecules = [Chem.MolFromSmiles(smiles) for smiles in df['SMILES']]
fingerprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024) for mol in molecules]

# Calculate the distance matrix
num_molecules = len(molecules)
distance_matrix = np.zeros((num_molecules, num_molecules))
for i in range(num_molecules):
    for j in range(num_molecules):
        if i >= j:  # Only fill half of the matrix due to symmetry
            continue
        similarity = DataStructs.TanimotoSimilarity(fingerprints[i], fingerprints[j])
        distance_matrix[i, j] = distance_matrix[j, i] = 1 - similarity

# Hierarchical clustering
Z = linkage(distance_matrix, method='average')
threshold = 1.85
labels = fcluster(Z, threshold, criterion='distance')

df['Cluster'] = labels

# Create a column to hold average Tanimoto similarities within the cluster
df['Avg_Similarity'] = 0.0  

for index, row in df.iterrows():
    cluster_label = row['Cluster']
    current_fp = fingerprints[index]
    
    # Get the fingerprints of all molecules in the same cluster
    same_cluster_fps = [fp for idx, fp in enumerate(fingerprints) if labels[idx] == cluster_label and idx != index]
    
    # Calculate the average similarity of the current molecule to all other molecules in its cluster
    if same_cluster_fps:
        similarities = [DataStructs.TanimotoSimilarity(current_fp, fp) for fp in same_cluster_fps]
        avg_similarity = sum(similarities) / len(similarities)
        df.at[index, 'Avg_Similarity'] = avg_similarity

# Save the DataFrame with cluster labels and average similarity
#df.to_csv('clustered_and_similarity.csv', index=False)
print(df)

# Get the column names
cols = df.columns.tolist()

# Remove 'Cluster' and 'Avg_Similarity' from the list
cols.remove('Cluster')
cols.remove('Avg_Similarity')

# Insert 'Cluster' and 'Avg_Similarity' at the desired positions
cols.insert(2, 'Cluster')
cols.insert(3, 'Avg_Similarity')

# Reorder the dataframe columns
df = df[cols]

df

  Z = linkage(distance_matrix, method='average')


                                                SMILES     zscore     Target  \
0      Cc1[nH]nc2Nc3cc(ncc3C(=Nc12)c4ccccc4Cl)N5CCOCC5 -44.513706      toxic   
1    COc1cc(N(C)CCN(C)C)c(NC(=O)C=C)cc1Nc2nccc(n2)-... -40.997577      toxic   
2    Cn1nc(nc1Nc2ccc3[nH]ncc3c2C4CC4)-c5ccc(cc5)C(=... -18.000000      toxic   
3    COCc1nn(Cc2ccc(Cn3ccccc3=O)cc2)cc1C(=O)NCc4ncc... -13.451337      toxic   
4    Cl.Cl.CC(C)(C)CNCCn1c(Sc2cc3OCOc3cc2I)nc4c(N)n...  -8.485281      toxic   
..                                                 ...        ...        ...   
442          CNc1nc(Nc2cn(nc2C)C(C)(C)C#N)ncc1C(F)(F)F  -0.481162  non_toxic   
443                        CC(N(O)C(N)=O)c1cc2ccccc2s1  -0.816072  non_toxic   
444  OC(=O)CCCCN(CCc1ccccc1OCc2ccc(cc2Cl)-c3ccc(cc3...  -0.528668  non_toxic   
445  OC[C@@H]1CC[C@H](CO1)Nc2ncnc3[nH]cc(C(=O)c4ccc...  -0.934325  non_toxic   
446  COc1cc(Nc2ncc(C)c(Nc3ccc4oc(=O)[nH]c4c3)n2)cc(...  -0.939610  non_toxic   

    TSSPECIES SEX  nAtom  nHeavyAtom  n

Unnamed: 0,SMILES,zscore,Cluster,Avg_Similarity,Target,TSSPECIES,SEX,nAtom,nHeavyAtom,nHetero,...,SRW06,SRW08,SRW10,TSRW10,MW,AMW,WPath,Zagreb1,Zagreb2,mZagreb2
0,Cc1[nH]nc2Nc3cc(ncc3C(=Nc12)c4ccccc4Cl)N5CCOCC5,-44.513706,1,0.118791,toxic,DOG,M,47,28,8,...,7.058758,8.686261,10.359170,78.040268,394.130887,8.385764,1.873000e+03,156,189,6.027778
1,COc1cc(N(C)CCN(C)C)c(NC(=O)C=C)cc1Nc2nccc(n2)-...,-40.997577,1,0.145199,toxic,MOUSE,M,70,37,9,...,7.236339,8.843471,10.503724,87.974766,499.269573,7.132422,4.557000e+03,192,226,8.277778
2,Cn1nc(nc1Nc2ccc3[nH]ncc3c2C4CC4)-c5ccc(cc5)C(=...,-18.000000,1,0.140744,toxic,DOG,M,53,32,10,...,7.167809,8.782783,10.447642,88.519310,437.177565,8.248633,3.319000e+03,176,211,6.861111
3,COCc1nn(Cc2ccc(Cn3ccccc3=O)cc2)cc1C(=O)NCc4ncc...,-13.451337,1,0.144756,toxic,MONKEY,F,62,36,10,...,7.167809,8.746875,10.367190,86.273215,491.196883,7.922530,4.842000e+03,184,214,8.138889
4,Cl.Cl.CC(C)(C)CNCCn1c(Sc2cc3OCOc3cc2I)nc4c(N)n...,-8.485281,1,0.107127,toxic,DOG,M,56,31,12,...,7.063904,8.678802,10.341581,82.787983,598.018148,10.678896,5.900002e+09,158,186,6.125000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442,CNc1nc(Nc2cn(nc2C)C(C)(C)C#N)ncc1C(F)(F)F,-0.481162,1,0.093876,non_toxic,RAT,F,40,24,10,...,6.870053,8.500047,10.168579,72.732280,339.141928,8.478548,1.405000e+03,126,147,5.180556
443,CC(N(O)C(N)=O)c1cc2ccccc2s1,-0.816072,1,0.113722,non_toxic,PIG,F,28,16,5,...,6.385194,7.974877,9.604745,62.363328,236.061949,8.430784,4.360000e+02,82,96,3.527778
444,OC(=O)CCCCN(CCc1ccccc1OCc2ccc(cc2Cl)-c3ccc(cc3...,-0.528668,1,0.140276,non_toxic,RAT,M,84,48,11,...,7.500529,9.102867,10.750750,85.992042,680.226485,8.097934,1.034700e+04,250,290,10.444444
445,OC[C@@H]1CC[C@H](CO1)Nc2ncnc3[nH]cc(C(=O)c4ccc...,-0.934325,1,0.138557,non_toxic,RAT,F,57,34,9,...,7.189168,8.801018,10.459296,84.446708,478.140783,8.388435,3.776000e+03,182,216,7.500000


In [5]:
df
df.to_csv('clustered_and_similarity.csv', index=False)

<h2 style="color:blue;">2.Butina clustering using the RDKit library:</h2>

In [42]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from rdkit.ML.Cluster import Butina

# Load the data from CSV
df = pd.read_csv('unique_smiles_circular_unique_tnm_1440.csv')

# Compute the fingerprints
molecules = [Chem.MolFromSmiles(smiles) for smiles in df['SMILES']]
fingerprints = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024) for mol in molecules]

# Calculate the distance matrix
num_molecules = len(molecules)
distance_matrix = []
for i in range(num_molecules):
    for j in range(i+1, num_molecules):
        similarity = DataStructs.TanimotoSimilarity(fingerprints[i], fingerprints[j])
        distance_matrix.append(1 - similarity)

# Using the Butina clustering algorithm
cutoff = 0.91
clusters = Butina.ClusterData(distance_matrix, num_molecules, cutoff, isDistData=True)

# Adding clusters to the DataFrame
df['Cluster'] = -1
for cluster_id, cluster in enumerate(clusters):
    for idx in cluster:
        df.at[idx, 'Cluster'] = cluster_id

# Calculate the average similarity for each molecule to all others in its cluster
df['Avg_Similarity'] = 0.0
for index, row in df.iterrows():
    cluster_id = row['Cluster']
    current_fp = fingerprints[index]

    # Get the fingerprints of all molecules in the same cluster
    same_cluster_fps = [fingerprints[idx] for idx in clusters[cluster_id] if idx != index]

    # Calculate the average similarity of the current molecule to all other molecules in its cluster
    if same_cluster_fps:
        similarities = [DataStructs.TanimotoSimilarity(current_fp, fp) for fp in same_cluster_fps]
        avg_similarity = sum(similarities) / len(similarities)
        df.at[index, 'Avg_Similarity'] = avg_similarity

# Save the DataFrame with cluster labels and average similarity
df.to_csv('butina_clustered_avg_similarity.csv', index=False)




In [43]:
df

Unnamed: 0,SMILES,Cluster,Avg_Similarity
0,Cc1[nH]nc2Nc3cc(ncc3C(=Nc12)c4ccccc4Cl)N5CCOCC5,0,0.113480
1,COc1cc(N(C)CCN(C)C)c(NC(=O)C=C)cc1Nc2nccc(n2)-...,0,0.140841
2,Cn1nc(nc1Nc2ccc3[nH]ncc3c2C4CC4)-c5ccc(cc5)C(=...,0,0.137070
3,COCc1nn(Cc2ccc(Cn3ccccc3=O)cc2)cc1C(=O)NCc4ncc...,0,0.141013
4,Cl.Cl.CC(C)(C)CNCCn1c(Sc2cc3OCOc3cc2I)nc4c(N)n...,0,0.104591
...,...,...,...
442,CNc1nc(Nc2cn(nc2C)C(C)(C)C#N)ncc1C(F)(F)F,0,0.090390
443,CC(N(O)C(N)=O)c1cc2ccccc2s1,0,0.112438
444,OC(=O)CCCCN(CCc1ccccc1OCc2ccc(cc2Cl)-c3ccc(cc3...,3,0.138889
445,OC[C@@H]1CC[C@H](CO1)Nc2ncnc3[nH]cc(C(=O)c4ccc...,0,0.134782
