# Running IF-SitePred in Jupyter NB

## Imports

In [1]:
from pymol import cmd, stored
import pickle
import numpy as np
from sklearn.cluster import DBSCAN
import os
import pandas as pd
import time 
import colorsys

## Utility functions

In [2]:
def save_to_pickle(variable, file_path):
    """Pickle ``variable`` into the file at ``file_path``.

    The file is opened in binary write mode, so an existing file at that
    path is replaced.
    """
    with open(file_path, mode='wb') as handle:
        pickle.Pickler(handle).dump(variable)

def read_from_pickle(file_path):
    """Return the object stored in the pickle file at ``file_path``.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(file_path, mode='rb') as handle:
        loaded = pickle.load(handle)
    return loaded

def write_chimerax_attr_file(data, attr_name, file_name, model_id='1', chain_id='A'):
    """Write a residue attribute-definition text file.

    Parameters
    ----------
    data : mapping
        Maps an attribute value to an iterable of residue numbers. The value
        written for each residue is ``attr_value + 1``.
    attr_name : str
        Name placed on the ``attribute:`` header line.
    file_name : str
        Output path; overwritten if it already exists.
    model_id, chain_id : str
        Used to build each residue specifier ``#<model_id>/<chain_id>:<res_num>``.
    """
    with open(file_name, 'w') as out:
        # Header: attribute name, match mode, recipient, then a blank separator line.
        out.writelines([
            f"attribute: {attr_name}\n",
            "match mode: any\n",
            "recipient: residues\n",
            "\n",
        ])

        # One tab-indented "<specifier>\t<value>" line per residue.
        for attr_value, res_nums in data.items():
            out.writelines(
                f"\t#{model_id}/{chain_id}:{res_num}\t{str(attr_value + 1)}\n"
                for res_num in res_nums
            )

def generate_distinct_colors_hex(n):
    """Return ``n`` hex colour strings with evenly spaced hues.

    Hue ``i / n`` is used for the i-th colour, with lightness and saturation
    both fixed at 0.5. Each RGB channel is scaled by 255 and truncated to an
    int before being rendered as two lowercase hex digits.
    """
    lightness = saturation = 0.5
    palette = []
    for step in range(n):
        # colorsys takes its arguments in (hue, lightness, saturation) order.
        red, green, blue = colorsys.hls_to_rgb(step / n, lightness, saturation)
        palette.append(
            '#{:02x}{:02x}{:02x}'.format(int(red * 255), int(green * 255), int(blue * 255))
        )
    return palette

## Site prediction functions

In [3]:
def write_xyz(coords, target):
    """
    Write a regular grid of pseudoatom points boxed around ``coords`` to
    ``./results/xyz/xyz_{target}.xyz``.

    Along each of the three axes the grid runs from (minimum - 10) up to, but
    not including, (maximum + 10) in steps of 1.5. The file starts with the
    point count and a ``point`` line, followed by one ``PS x y z`` line per
    grid point (x varies slowest, z fastest).

    ``coords`` is indexed as ``coords[:, k]`` for k in 0..2, so it must be a
    2-D array with at least three columns. The output directory must already
    exist.
    """
    padding = 10
    spacing = 1.5

    axes = []
    for dim in range(3):
        column = coords[:, dim]
        axes.append(np.arange(np.min(column) - padding, np.max(column) + padding, spacing))
    xs, ys, zs = axes

    n_points = len(xs) * len(ys) * len(zs)

    with open(f'./results/xyz/xyz_{target}.xyz', 'w') as out:
        out.write(f'{n_points}\npoint\n')
        for gx in xs:
            x_txt = f'{round(gx, 3):.3f}'
            for gy in ys:
                # The x/y part of the record is constant across the inner z loop.
                xy_prefix = f'PS {x_txt} {round(gy, 3):.3f} '
                for gz in zs:
                    out.write(f'{xy_prefix}{round(gz, 3):.3f}\n')


def write_repeats(coords):
    """
    Write ``coords`` to ``repeats.xyz`` in the current working directory.

    The file holds the number of points, a ``point`` line, then one
    ``PS x y z`` record (three decimals) per entry of ``coords``; only the
    first three components of each entry are used.
    """
    with open('repeats.xyz', 'w') as out:
        out.write(str(len(coords)) + '\npoint\n')
        out.writelines(
            'PS {:.3f} {:.3f} {:.3f}\n'.format(pt[0], pt[1], pt[2])
            for pt in coords
        )

def write_coords(coords, filename):
    """
    Write ``coords`` to ``filename`` as an xyz-style point file.

    Layout: the number of points, a ``point`` line, then one ``PS x y z``
    record (three decimals) per entry, using the first three components of
    each entry. An existing file is overwritten.
    """
    with open(filename, 'w') as handle:
        handle.write('{}\npoint\n'.format(len(coords)))
        for point in coords:
            px, py, pz = point[0], point[1], point[2]
            handle.write(f'PS {px:.3f} {py:.3f} {pz:.3f}\n')


def get_final_cloud(pdb, target, final_preds, chain):
    """
    Extract the grid points that lie close to several predicted residues.

    Expects a PyMOL object named ``chA`` to be loaded already. A point grid
    is written around ``chA`` with ``write_xyz`` and loaded as ``cloud1``.
    Points within 3 of ``chA`` are moved out into ``cloud3`` and not used
    again; of the remainder, points within 6 of ``chA`` are moved into
    ``cloud_bubble``. For every residue in ``final_preds`` the
    ``cloud_bubble`` points within 4.5 of that residue are accumulated in
    the object ``cloud``. A point is kept only if it occurs more than twice
    in ``cloud``, i.e. it was collected for at least three residues.

    Parameters
    ----------
    pdb : any
        Not used by this function; kept so existing calls keep working.
    target : str
        Identifier used in the grid file name ``./results/xyz/xyz_{target}.xyz``.
    final_preds : sequence
        Residue identifiers, interpolated into ``chA and resi {id}`` selections.
    chain : any
        Not used by this function (the object name ``chA`` is hard-coded);
        kept so existing calls keep working.

    Returns
    -------
    numpy.ndarray
        One ``[x, y, z]`` row per retained occurrence (a point collected for
        k >= 3 residues appears k times), ordered by the lexicographic order
        of the comma-joined coordinate strings. An empty array (``size == 0``)
        is returned when nothing qualifies, including when ``final_preds`` is
        empty or no residue has any nearby cloud point.

    Raises
    ------
    FileNotFoundError
        If the grid file is missing after ``write_xyz`` has run.
    """
    from collections import Counter

    coords = np.array(cmd.get_coords('chA'))
    write_xyz(coords, target)

    xyz_path = f'./results/xyz/xyz_{target}.xyz'
    if not os.path.exists(xyz_path):
        # Explicit raise rather than assert, so the check survives `python -O`.
        raise FileNotFoundError(xyz_path)
    cmd.load(xyz_path, 'cloud1')

    # extract() removes the matched atoms from cloud1, so the second call only
    # sees points that are NOT within 3 of the chain.
    cmd.extract('cloud3', 'cloud1 within 3 of chA')  # too close: don't want these
    cmd.extract('cloud_bubble', 'cloud1 within 6 of chA')

    cloud_created = False
    for resi in final_preds:
        sele = f'sele_{resi}'
        nearby = f'cloud_bubble within 4.5 of {sele}'
        cmd.select(sele, f'chA and resi {resi}')
        if cmd.count_atoms(nearby) > 0:
            if not cloud_created:
                # The first residue with any nearby points seeds the 'cloud' object.
                cmd.create('cloud', nearby)
                cloud_created = True
            else:
                # Later residues are staged in their own object, then appended.
                cmd.create(f'cloud1_{resi}', nearby)
                cmd.copy_to('cloud', f'cloud1_{resi}')
        cmd.delete(sele)

    if not cloud_created:
        # No residue had nearby points, so there is no 'cloud' object to read.
        return np.array([])

    cloud_coords = cmd.get_coords('cloud')
    # Coordinates are compared as strings so equality is exact per point.
    coord_keys = [','.join([str(n) for n in xyz]) for xyz in cloud_coords]
    occurrences = Counter(coord_keys)
    repeated = sorted(key for key in coord_keys if occurrences[key] > 2)

    return np.array([[float(x) for x in key.split(',')] for key in repeated])

## Target selection

In [4]:
# Directory containing the input structure files; each target file name is
# joined onto this path before being loaded.
target_dir = './../../rep_chains'
# Root directory of the per-target prediction folders (one sub-folder per target id).
preds_dir = "./results/IFSP_preds"

In [5]:
# Load a pickled collection of target file names; presumably the earlier
# version of the target list (going by the file name) — TODO confirm.
OLD_targets = read_from_pickle(r"./results/PDB_rep_chains_files.pkl")

In [6]:
# Load the pickled "V2" collection of target file names.
targets = read_from_pickle("./results/PDB_rep_chains_files_V2.pkl")

In [7]:
# Number of entries in the V2 target list.
N_targets = len(targets)
print(N_targets)

4037


In [10]:
# Targets present in the V2 list but absent from the old list, in V2 order.
# The old list is turned into a set once so each membership test is O(1)
# instead of a scan of the whole list.
# Assumes the entries are hashable (e.g. file-name strings) — TODO confirm.
old_target_set = set(OLD_targets)
new_targets = [target for target in targets if target not in old_target_set]
print(len(new_targets))

2128


In [7]:
# Scratch arithmetic: seven eighths of 23391. What the figures stand for is
# not recorded in this notebook.
(23391/8)*7

20467.125

It is now 16:33 on 26/03/2024

## Running IF-SitePred on Human LIGYSIS representative ligand-binding chains

Started at 23:14 on 04/04/2024

In [13]:
# NOTE(review): this cell sits above the cell that assigns `no_sites`, so a
# top-to-bottom run in a fresh kernel raises NameError here — confirm the
# intended cell order.
len(no_sites)

68

In [17]:
# Reload pickled state from an earlier run (the "_half" suffix presumably
# marks a partial run — TODO confirm).
# no_sites: collection of target ids; the prediction loop appends to it.
no_sites = read_from_pickle("./results/IFSP_no_sites_half.pkl")
# sites_per_prot: mapping keyed by target id; the prediction loop adds entries.
sites_per_prot = read_from_pickle("./results/IFSP_sites_per_prot_half.pkl")

In [19]:
# Snapshot of the target ids that already have an entry in sites_per_prot.
# This is a separate list, so entries added to the dict later do not appear here.
yes_sites = list(sites_per_prot.keys())

In [21]:
# Sizes of the two reloaded collections: ids in no_sites, then ids in yes_sites.
print(len(no_sites))
print(len(yes_sites))

74
1128


In [22]:
# Run site extraction and clustering for every new target.
#
# For each target file name this cell:
#   1. skips ids already present in no_sites / yes_sites,
#   2. loads the pickled predicted binding residues for the target,
#   3. prepares the PyMOL session and calls get_final_cloud(),
#   4. clusters the returned points with DBSCAN and writes one point file and
#      one centre file per cluster, ranked by cluster size (largest first).
#
# Side effects: appends to no_sites and errors, adds entries to sites_per_prot,
# and writes files under the per-target prediction directory.
errors = []

# yes_sites is not modified inside the loop, so a set gives O(1) lookups.
already_predicted = set(yes_sites)

for i, target in enumerate(new_targets):

    # Progress indicator.
    if i % 50 == 0:
        print(i)

    # The id must be derived BEFORE the skip checks; otherwise the checks look
    # at the previous iteration's id (or raise NameError on a fresh kernel).
    target_id = target.split(".")[0]

    if target_id in no_sites or target_id in already_predicted:
        continue

    target_path = os.path.join(target_dir, target)
    prediction_dir = os.path.join(preds_dir, target_id)
    binding_ress_path = os.path.join(prediction_dir, f'{target_id}_binding_ress.pkl')

    # Targets without a residue-prediction file are skipped silently.
    if not os.path.isfile(binding_ress_path):
        continue

    # Only the first (chain, residues) entry of the mapping is used.
    binding_ress = read_from_pickle(binding_ress_path)
    chain = list(binding_ress.keys())[0]
    preds = list(binding_ress.values())[0]

    if len(preds) == 0:
        print("No binding residues for {}".format(target_id))
        no_sites.append(target_id)
        continue

    try:
        cmd.reinitialize()

        cmd.load(target_path, 'complex')

        # Move HETATM records into their own object and discard them.
        cmd.extract('hets', 'complex and HETATM')
        cmd.delete('hets')

        # get_final_cloud() expects the chain of interest in an object named 'chA'.
        cmd.extract('chA', f'complex and chain {chain}')

        # Not used further in this cell; int() raises here if an id is not an integer.
        int_preds = [int(pred) for pred in preds]

        final_coords = get_final_cloud(target_path, target_id, preds, chain)

        if final_coords.size == 0:
            print("No sites were found for {}".format(target_id))
            no_sites.append(target_id)
        else:
            # Cluster the retained points. eps=1.7 links grid points one step
            # (1.5) apart along an axis but not diagonal neighbours (~2.12).
            clustering = DBSCAN(eps=1.7, min_samples=2).fit(final_coords)
            labels = clustering.labels_

            # Size of each cluster for ranking; label -1 is DBSCAN noise.
            site_counts = {}
            for site in set(labels):
                if int(site) != -1:
                    site_counts[site] = int(np.count_nonzero(labels == site))

            print(f'{len(site_counts)} sites found for {target_id}')
            # site_counts is no longer emptied below, so the stored entry
            # keeps its cluster sizes.
            sites_per_prot[target_id] = site_counts

            # Stable descending sort: ties keep dict order, matching what a
            # repeated max() over the dict would pick.
            ranked_sites = sorted(site_counts, key=site_counts.get, reverse=True)

            for site_rank, biggest in enumerate(ranked_sites, start=1):

                # Points of this cluster and their centre (mean per axis).
                main_site = final_coords[labels == biggest]

                write_coords(main_site, os.path.join(prediction_dir, f'site_rank_{site_rank}.xyz'))

                centre = np.array([np.mean(main_site[:, axis]) for axis in range(3)])

                with open(os.path.join(prediction_dir, f'centre_rank_{site_rank}.xyz'), 'w') as w:
                    w.write('1\npoint\n')
                    w.write(f'PS {centre[0]:.3f} {centre[1]:.3f} {centre[2]:.3f}\n')

    except Exception:
        # Record the failing target, then re-raise so the run stops at the
        # first error with the original traceback intact.
        print("ERROR with {}".format(target_id))
        errors.append(target)
        raise

0

10 sites found for 6cph_D
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100


In [23]:
# Number of target ids recorded as having no predicted site after the run.
len(no_sites)

74

In [24]:
# Number of targets that raised an exception during the run.
len(errors)

0

In [25]:
# Persist the ids without predicted sites. Only no_sites is saved by this cell.
save_to_pickle(no_sites, "./results/IFSP_no_site_accs_V2.pkl")

## ChimeraX colouring commands

To colour by ligandability score:
    
    color byattribute r:ligandability #!1 target scab palette 0,white:0.5,#febe55:1,#de2d26

To colour by binary ligand-binding label:

    color byattribute r:ligand_binding #!1 target scab palette 0,white:0.5,white:1,red