In [1]:
import os
import pickle
import pandas as pd
import numpy as np

def load_object(filename):
    """Unpickle and return the single object stored in ``filename``.

    The file is opened in binary mode and closed again before returning.

    NOTE: ``pickle`` can execute arbitrary code while loading, so this must
    only be pointed at files from a trusted source.
    """
    with open(filename, 'rb') as stream:
        loaded = pickle.load(stream)
    return loaded

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# Load the pickled affinity lookup. Later in this notebook it is indexed as
# affinity_dict[<complex id>]['log_kd_ki'], i.e. one nested entry per complex.
affinity_dict = load_object('/data/grbv/PDBbind/DTI5_general_affinity_dict.pkl')
# Display the number of entries as the cell output.
len(affinity_dict)

19443

In [3]:
def list_complex_ids(graph_dir):
    """Return the complex IDs found in ``graph_dir``.

    Every directory entry whose name contains the substring 'graph' is
    considered, and the first four characters of its name are taken as the ID.
    The result is a list in ``os.listdir`` order (which is not guaranteed to be
    sorted) and may contain duplicates if several files share a prefix.
    """
    return [filename[0:4] for filename in os.listdir(graph_dir) if 'graph' in filename]


# The three datasets are read with the same rule, so they share one helper
# instead of three copy-pasted comprehensions.
casf2016_dir = '/data/grbv/PDBbind/DTI_5/input_graphs_esm2_t6_8M/test_data/casf2016'
casf2016_complexes = list_complex_ids(casf2016_dir)

casf2013_dir = '/data/grbv/PDBbind/DTI_5/input_graphs_esm2_t6_8M/test_data/casf2013'
casf2013_complexes = list_complex_ids(casf2013_dir)

train_dir = '/data/grbv/PDBbind/DTI_5/input_graphs_esm2_t6_8M/training_data'
train_complexes = list_complex_ids(train_dir)

In [None]:
# # Generate sequence dict with all complexes

# sequence_dict = {}
# save_path = '/data/grbv/PDBbind/DTI5_sequence_dict.pkl'


# data_dir = '/data/grbv/PDBbind/DTI5_input_data_processed'

# complexes = [folder for folder in os.listdir(data_dir) if len(folder)==4 and folder[0].isdigit()]
# for compl in complexes: 

#     in_dataset = False
#     if compl in train_complexes: in_dataset=True
#     elif compl in casf2013_complexes: in_dataset=True
#     elif compl in casf2016_complexes: in_dataset=True

#     if in_dataset:
#         path = os.path.join(data_dir, compl)

#         for file in os.scandir(path):
#             if file.name.endswith('protein_dict.pkl'):
#                 id = file.name[0:4]       
#                 protein_dict = load_object(file.path)

#                 seq = ''
#                 for chain in protein_dict:
#                     s = protein_dict[chain]['aa_seq']
#                     seq = seq + s

#                 sequence_dict[id]=seq   

# with open(save_path, 'wb') as fp:
#     pickle.dump(sequence_dict, fp)

In [4]:
# Load the pickled sequence dictionary from disk.
# NOTE(review): presumably maps complex ID -> concatenated chain sequences, as
# built by the commented-out generation code in this notebook -- confirm.
sequence_dict = load_object('/data/grbv/PDBbind/DTI5_sequence_dict.pkl')
# Display the number of entries as the cell output.
len(sequence_dict)

19132

In [None]:
# # Save sequences as fasta file:

# fasta_file = '/data/grbv/PDBbind/DTI5_sequences.fasta'

# with open(fasta_file, 'w') as f:
#     for key in sequence_dict:
#         f.write('>' + key + '\n' + sequence_dict[key] + '\n')

In [5]:
# Cluster the sequences with MMseqs2 (requires the `mmseqs` executable on PATH).
import subprocess

# Equivalent shell command (thresholds match the call below):
#   mmseqs easy-cluster /data/grbv/PDBbind/DTI5_sequences.fasta clusterRes tmp --min-seq-id 0.8 -c 0.8 --cov-mode 1
# Results are written with the 'clusterRes' prefix in the current working
# directory; a later cell reads 'clusterRes_cluster.tsv' from there.
#
# check=True raises subprocess.CalledProcessError on a non-zero exit status, so
# a failed clustering run stops the notebook instead of letting later cells
# read a stale or missing result file.
subprocess.run(
    ['mmseqs', 'easy-cluster', '/data/grbv/PDBbind/DTI5_sequences.fasta', 'clusterRes', 'tmp',
     '--min-seq-id', '0.8', '-c', '0.8', '--cov-mode', '1'],
    check=True,
)

easy-cluster /data/grbv/PDBbind/DTI5_sequences.fasta clusterRes tmp --min-seq-id 0.8 -c 0.8 --cov-mode 1 

MMseqs Version:                     	15.6f452
Substitution matrix                 	aa:blosum62.out,nucl:nucleotide.out
Seed substitution matrix            	aa:VTML80.out,nucl:nucleotide.out
Sensitivity                         	4
k-mer length                        	0
Target search mode                  	0
k-score                             	seq:2147483647,prof:2147483647
Alphabet size                       	aa:21,nucl:5
Max sequence length                 	65535
Max results per query               	20
Split database                      	0
Split mode                          	2
Split memory limit                  	0
Coverage threshold                  	0.8
Coverage mode                       	1
Compositional bias                  	1
Compositional bias                  	1
Diagonal scoring                    	true
Exact k-mer matching                	0
Mask residues                

CompletedProcess(args=['mmseqs', 'easy-cluster', '/data/grbv/PDBbind/DTI5_sequences.fasta', 'clusterRes', 'tmp', '--min-seq-id', '0.8', '-c', '0.8', '--cov-mode', '1'], returncode=0)

In [6]:
# Load the MMseqs2 cluster table: tab-separated, no header row, so the columns
# are labelled 0 and 1.
# dtype=str pins both ID columns as text instead of relying on type inference,
# so identifiers that look numeric (e.g. '1e10') can never be coerced to numbers.
clusterRes = pd.read_csv('clusterRes_cluster.tsv', sep='\t', header=None, dtype=str)
clusterRes.head()


Unnamed: 0,0,1
0,1bcj,1bcj
1,2brp,2brp
2,2brp,1f9g
3,5gmh,5gmh
4,3tww,3tww


In [7]:
'''All members of the clustering are listed line by line. 
Each cluster is a consecutive block in the file. The first column always contains the representative sequence, the second contains the cluster member. For the example the cluster with the representative sequence Q0KJ32 contains four members itself and C0W539, D6KVP9, D1Y890.
 IDs are parsed from the header from the input database (see id parsing from headers).
'''
# Count the clusters: each distinct value in column 0 is one cluster
# representative. dropna=False keeps the count identical to len(unique()).
n_clusters = clusterRes[0].nunique(dropna=False)
print(n_clusters, 'number of clusters')

3464 number of clusters


In [8]:
# Cluster sizes: column 0 holds the representative of each row's cluster, so
# counting how often each representative occurs gives the number of members per
# cluster. value_counts() returns the counts sorted from largest to smallest.
clusterRes[0].value_counts()

0
5kr2    438
5doh    420
3kmx    342
2v22    223
2tpi    194
       ... 
1mfa      1
6bvb      1
4np9      1
4ris      1
5xhz      1
Name: count, Length: 3464, dtype: int64

In [9]:
import csv

# Copy the cluster table and append two columns per row: the dataset split of
# the cluster member and its affinity value.

# Sets give constant-time membership tests; the source lists would otherwise be
# scanned once per row.
train_ids = set(train_complexes)
test_ids = set(casf2016_complexes) | set(casf2013_complexes)

with open('clusterRes_cluster.tsv', 'r', newline='') as infile, open('clusterRes_cluster_DTI5_1.tsv', 'w', newline='') as outfile:
    reader = csv.reader(infile, delimiter='\t')
    writer = csv.writer(outfile, delimiter='\t')

    for row in reader:
        # Second column = cluster member (first column = cluster representative).
        complex_id = row[1]

        # Test membership takes precedence over training, as before.
        # The explicit fallback guarantees a label is assigned for every row:
        # previously an ID found in none of the lists either raised NameError
        # (first row) or silently reused the label of the previous row.
        if complex_id in test_ids:
            split_label = 'test    '
        elif complex_id in train_ids:
            split_label = 'training'
        else:
            split_label = 'unknown '

        # A missing ID raises KeyError on purpose: every clustered complex is
        # expected to have an affinity entry.
        affinity = affinity_dict[complex_id]['log_kd_ki']

        # Write the original columns followed by split label and affinity.
        writer.writerow(row + [split_label, affinity])

In [10]:
import networkx as nx
import matplotlib.pyplot as plt

# Build an undirected graph with one edge per row of the cluster table
# (representative -> member). Rows where a sequence is listed as a member of
# its own cluster become self-loops, exactly as with a row-by-row add_edge.
G = nx.Graph()
G.add_edges_from(zip(clusterRes[0], clusterRes[1]))

# Spring layout starts from random positions; a fixed seed makes the figure
# reproducible across kernel restarts.
pos = nx.spring_layout(G, seed=42)

# Large canvas with a plain white background.
plt.figure(figsize=(50, 50))
plt.gca().set_facecolor('white')

# Nodes, edges and labels are drawn separately so each can be styled.
nx.draw_networkx_nodes(G, pos, node_size=200, node_color='skyblue', alpha=0.8)
nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.5, edge_color='gray')
nx.draw_networkx_labels(G, pos, font_size=8, font_color='black')

# Hide the axes and render the figure.
plt.axis('off')
plt.show()