In [None]:
import sqlite3
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from rdkit import DataStructs

# Connect to the SQLite database and fetch the SMILES strings.
# The connection is only needed for the query, so it is released in a
# ``finally`` clause as soon as the rows are loaded -- even if the query fails.
conn = sqlite3.connect('your_database.db')  # Replace with your database path
try:
    cursor = conn.cursor()
    query = "SELECT id, smiles_column FROM your_table"  # Replace with your query
    cursor.execute(query)
    smiles_data = cursor.fetchall()
finally:
    conn.close()

# Convert SMILES to Morgan fingerprints (radius 2), keeping each row's ID.
# 'fingerprints' ends up as a list of (id, fingerprint) tuples.
fingerprints = []
skipped_ids = []
for mol_id, smiles in smiles_data:  # 'mol_id' avoids shadowing the builtin id()
    # NULL column values arrive as None; treat them like unparsable SMILES
    # instead of handing them to the parser.
    mol = Chem.MolFromSmiles(smiles) if smiles is not None else None
    if mol is None:
        skipped_ids.append(mol_id)
        continue
    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius=2)
    fingerprints.append((mol_id, fp))

# Make dropped rows visible rather than silently losing them.
if skipped_ids:
    print(f"Skipped {len(skipped_ids)} row(s) with missing or unparsable SMILES: {skipped_ids}")

# Calculate Tanimoto similarities for every unordered pair (i < j)
for i, (id1, fp1) in enumerate(fingerprints):
    for id2, fp2 in fingerprints[i + 1:]:
        similarity = DataStructs.FingerprintSimilarity(fp1, fp2)
        print(f"Similarity between {id1} and {id2}: {similarity}")


######################## clustering below

import numpy as np
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt
from rdkit import DataStructs

# Build the pairwise distance matrix and cluster the compounds hierarchically.
# Assumes 'fingerprints' is a list of (id, fingerprint) tuples as in the previous example.
num_molecules = len(fingerprints)
distance_matrix = np.zeros((num_molecules, num_molecules))

# Use 1 - Tanimoto similarity as the distance. The measure is symmetric, so
# each pair is computed once and mirrored; the diagonal keeps its initial 0.
for i in range(num_molecules):
    for j in range(i + 1, num_molecules):
        distance = 1 - DataStructs.FingerprintSimilarity(fingerprints[i][1], fingerprints[j][1])
        distance_matrix[i, j] = distance
        distance_matrix[j, i] = distance

if num_molecules < 2:
    # linkage() cannot cluster fewer than two observations.
    print("Need at least two valid molecules to build a dendrogram.")
else:
    # linkage() interprets a 2-D array as raw observation vectors, NOT as
    # precomputed distances. Precomputed distances must be passed in condensed
    # form: the strict upper triangle flattened row by row, which is exactly
    # the order np.triu_indices(..., k=1) yields.
    condensed_distances = distance_matrix[np.triu_indices(num_molecules, k=1)]

    # Perform hierarchical clustering.
    # 'ward' (like 'centroid' and 'median') is only well defined for Euclidean
    # distances; Tanimoto distance is not Euclidean, so average linkage is used.
    Z = linkage(condensed_distances, method='average')

    # Plot dendrogram
    plt.figure(figsize=(10, 8))
    dendrogram(Z, labels=[f[0] for f in fingerprints])
    plt.title("Hierarchical Clustering Dendrogram")
    plt.xlabel("Compound ID")
    plt.ylabel("Distance")
    plt.show()

