In [1]:
# Generates heatmap for wb and wb2 interactions across all available class A GPCRs

import sys
import glob

In [2]:
# Utils.py

PDB_TO_UNIPROT_TABLE_PATH = "/scratch/PI/rondror/akma327/DynamicNetworks/data/crystal-analysis/simulation-analysis/gpcrdb-freq-config/GPCR_PDB_List.txt"
GPCRDB_TABLE_PATH="/scratch/PI/rondror/akma327/DynamicNetworks/data/crystal-analysis/simulation-analysis/gpcrdb-freq-config/All_species_gpcrdb_numbers_strOnly.txt"
GPCRDB_RESIDUE_FREQ_TABLE="/scratch/PI/rondror/akma327/DynamicNetworks/data/crystal-analysis/simulation-analysis/gpcrdb-freq-config/gpcrdb_residue_conservation.txt"


# Rename amino acids to common name
def fixAminoAcidNames(key):
	key = key.replace("HSD", "HIS")
	key = key.replace("HSE", "HIS")
	key = key.replace("HSP", "HIS")
	key = key.replace("HIE", "HIS")
	key = key.replace("HIP", "HIS")
	key = key.replace("HID", "HIS")
	key = key.replace("GLH", "GLU")
	key = key.replace("ASH", "ASP")
	key = key.replace("CYP", "CYS")
	key = key.replace("CYX", "CYS")
	return key

# Retrive gpcrdb from dictionary for specified residue. Return "-" if not found
def getGPCRDB(res, GPCRDB_DICT):
	res = fixAminoAcidNames(res)
	if(res not in GPCRDB_DICT):
		print(res + " not found.")
		return "-"
	return GPCRDB_DICT[res]

# Create directory if not exist
def createDirectory(OUTPUT_FILE):
	directory = os.path.dirname(OUTPUT_FILE)
	if not os.path.exists(directory):
		os.makedirs(directory)

# Generate write file descriptor 
def genWriteDescriptor(OUTPUT_FILE):
	createDirectory(OUTPUT_FILE)
	return open(OUTPUT_FILE, 'w')


# Retrieve Uniprot Code for the PDB_CODE from pdb_to_uniprot_table_path
def getUniprotCode(PDB_CODE):
	f = open(PDB_TO_UNIPROT_TABLE_PATH, 'r')
	for line in f:
		if(line == "\n"): continue 
		l_info = line.split("\t")
		uniprot_code, pdb = l_info[0].strip(), l_info[2].strip()
		if(PDB_CODE.upper() == pdb.upper()): return uniprot_code.upper()
	print("PDB_CODE Not Found in PDB To Uniprot Table")
	exit(1)


# Given uniprot code reads through GPCRDB_TABLE_PATH to generate the amino acid
# to gpcrdb number table. 
# Output {"ASP112": "1x50", "ARG116":"2x45"}
def genGpcrdbDict(UNIPROT_CODE):
	GPCRDB_DICT = {}
	f = open(GPCRDB_TABLE_PATH, 'r')
	for line in f: 
		l_info = line.split("\t")
		uniprot, resnum, resname, gpcrdb = l_info[0].strip(), l_info[1].strip(), l_info[2].strip(), l_info[4].strip()
		if(uniprot == None): continue
		if(uniprot.upper() == UNIPROT_CODE.upper()):
			key = resname.upper() + resnum 
			GPCRDB_DICT[key] = gpcrdb
	return GPCRDB_DICT


# Generates the residue to gpcrdb table for given pdb
def genResidueToGpcrdbTable(PDB_CODE):
	UNIPROT_CODE = getUniprotCode(PDB_CODE)
	GPCRDB_DICT = genGpcrdbDict(UNIPROT_CODE)
	return GPCRDB_DICT


In [8]:
def orderpair(atom1, atom2):
    if(atom1 > atom2): return (atom1, atom2)
    return (atom2, atom1)


def process_columns(pdb_to_lw):
    ### Get union set of all ligand water mediated interaction keys
    interaction_key_union = set()
    for pdb in pdb_to_lw:
        interaction_key_union |= set(pdb_to_lw[pdb])
        
    filtered_columns = []
    for key in interaction_key_union:
        TM = int(key[0].split("x")[0])
        if(TM >= 1 and TM <= 7):
            filtered_columns.append(key)
    
    ### Sort the interaction key union. This determines column order
    sorted_columns = sorted(filtered_columns, key=lambda x: (x[0]))
    return (sorted_columns)
    
def group_by_uniprot(pdb_to_lw):
    pdb_to_lw_protein_grouped = {}
    for pdb in pdb_to_lw:
        uniprot = getUniprotCode(pdb)
        if(uniprot not in pdb_to_lw_protein_grouped):
            pdb_to_lw_protein_grouped[uniprot] = [(pdb, pdb_to_lw[pdb])]
        else:
            pdb_to_lw_protein_grouped[uniprot].append((pdb, pdb_to_lw[pdb]))
    return pdb_to_lw_protein_grouped
    

def write_table(pdb_to_wb, OUTPUT_TABLE):
    fw = open(OUTPUT_TABLE, 'w')
    column_headers = process_columns(pdb_to_wb)
    header = ["receptor"] + [gpcrdb1 + ":" + gpcrdb2 + ":" + c_type for gpcrdb1, gpcrdb2, c_type in column_headers]
    
    pdb_to_wb_protein_grouped = group_by_uniprot(pdb_to_wb)
    
    ### Write column headers
    fw.write("\t".join(header) + "\n")
    
    ### Write out each receptor entry 
    for uniprot in sorted(pdb_to_wb_protein_grouped.keys()):
        for row_key, row_interactions in pdb_to_wb_protein_grouped[uniprot]:
            row_info = [uniprot + ":" + row_key]
            for interaction_key in column_headers:
                if(interaction_key in row_interactions):
                    row_info += "1"
                else:
                    row_info += "0"
            fw.write("\t".join(row_info) + "\n")
        
def group_pdb_chains(pdb_to_wb):
    grouped_pdb_to_wb = {}
    for pdb_chain in pdb_to_wb:
        wbs = pdb_to_wb[pdb_chain]
        pdb, chain = pdb_chain.split("_")
        if (pdb not in grouped_pdb_to_wb):
            grouped_pdb_to_wb[pdb] = wbs
        else:
            grouped_pdb_to_wb[pdb] += wbs 

    for key in grouped_pdb_to_wb:
        grouped_pdb_to_wb[key] = list(set(grouped_pdb_to_wb[key]))
        
    return grouped_pdb_to_wb
        

def extract_wb_info(INPUT_DIR, PDB_LIST=[]):
    pdb_to_wb = {}
    pdb_files = glob.glob(INPUT_DIR + "/*txt")
    for index, pdb_file in enumerate(pdb_files):
        print(pdb_file)
#         if(index > 2): break
        pdb_chain = pdb_file.strip().split("/")[-1].strip(".txt")
        pdb, chain = pdb_chain.split("_")
        print(index, pdb, chain)
        if(len(PDB_LIST) != 0 and pdb not in PDB_LIST): continue
        if(len(pdb) != 4): 
            continue
            
        ### Process each pdb file for its ligand residue interactions
        pdb_to_wb[pdb_chain] = []
        GPCRDB_DICT = genResidueToGpcrdbTable(pdb)
        f = open(pdb_file, 'r')
        for line in f:
            linfo = line.strip().split("@")
            interaction_type = linfo[1].strip("-") # wb or wb2
            atom1, atom2 = linfo[0].split(" -- ")
            print(atom1, atom2)
            residue_atom1, residue_atom2 = orderpair(atom1, atom2)
            residue1 = residue_atom1.split("-")[0]
            residue2 = residue_atom2.split("-")[0]
            gpcrdb1 = getGPCRDB(residue1, GPCRDB_DICT)
            gpcrdb2 = getGPCRDB(residue2, GPCRDB_DICT)
            if(gpcrdb1 != None and gpcrdb1 != "None" and gpcrdb2 != None and gpcrdb2 != "None"):
                pdb_to_wb[pdb_chain].append((gpcrdb1, gpcrdb2, interaction_type))

    ### Group chains that are part of same pdb
    pdb_to_wb = group_pdb_chains(pdb_to_wb)
    return pdb_to_wb


def heatmap(INPUT_DIR, OUTPUT_TABLE, PDB_LIST=[]):
    pdb_to_wb = extract_wb_info(INPUT_DIR, PDB_LIST)
    write_table(pdb_to_wb, OUTPUT_TABLE)


In [12]:
# watermarks all receptors

INPUT_DIR="/scratch/PI/rondror/akma327/DynamicNetworks/data/crystal-analysis/ligand-wetness/watermarks/heatmaps/030717-receptor/individual-pdb-chain"
OUTPUT_TABLE="/scratch/PI/rondror/akma327/DynamicNetworks/data/crystal-analysis/ligand-wetness/watermarks/heatmaps/030717-receptor/watermark_receptor_heatmap.txt"
heatmap(INPUT_DIR, OUTPUT_TABLE)

(0, '2VT4', 'A')
('ARG183-NH2', 'GLU115-OE1')
('ARG183-NH2', 'ILE177-O')
('ARG183-NH2', 'TRP182-NE1')
('ASP195-O', 'CYS198-O')
('ASP195-O', 'PRO196-O')
('CYS198-O', 'PRO196-O')
('CYS302-O', 'PHE306-N')
('CYS302-O', 'PHE328-O')
('GLU115-OE1', 'ILE177-O')
('GLU115-OE1', 'TRP182-NE1')
('ILE177-O', 'TRP182-NE1')
('LEU113-N', 'TRP109-O')
('PHE306-N', 'PHE328-O')
('SER173-O', 'TYR207-OH')
('SER173-O', 'VAL172-O')
('TYR207-OH', 'VAL172-O')
(1, '3VG9', 'B')
('ALA51-O', 'SER65-N')
SER65 not found.
('ALA51-O', 'SER65-O')
SER65 not found.
('ALA51-O', 'TRP35-NE1')
TRP35 not found.
('ALA55-O', 'VAL58-O')
VAL58 not found.
ALA55 not found.
('ASN137-ND2', 'SER116-OG')
SER116 not found.
ASN137 not found.
('ASN137-ND2', 'THR114-O')
THR114 not found.
ASN137 not found.
('ASP1-O', 'GLU27-OE2')
GLU27 not found.
ASP1 not found.
('ASP1-O', 'SER26-OG')
SER26 not found.
ASP1 not found.
('ASP143-O', 'LYS199-N')
LYS199 not found.
ASP143 not found.
('ASP151-N', 'GLY152-N')
ASP151 not found.
('ASP151-N', 'SER191-OG

ValueError: invalid literal for int() with base 10: '-'

In [9]:
# watermarks high res receptors

INPUT_DIR="/scratch/PI/rondror/akma327/DynamicNetworks/data/crystal-analysis/ligand-wetness/watermarks/heatmaps/030717-receptor/individual-pdb-chain"
OUTPUT_TABLE="/scratch/PI/rondror/akma327/DynamicNetworks/data/crystal-analysis/ligand-wetness/watermarks/heatmaps/030717-receptor/watermark_receptor_heatmap_highres.txt"
PDB_LIST = ['5IU4', '4N6H', '4EIY', '5IU7', '5K2C', '5K2D', '5IUB', '5C1M', '4BVN']
heatmap(INPUT_DIR, OUTPUT_TABLE, PDB_LIST)

/scratch/PI/rondror/akma327/DynamicNetworks/data/crystal-analysis/ligand-wetness/watermarks/heatmaps/030717-receptor/individual-pdb-chain/2VT4_A.txt
(0, '2VT4', 'A')
/scratch/PI/rondror/akma327/DynamicNetworks/data/crystal-analysis/ligand-wetness/watermarks/heatmaps/030717-receptor/individual-pdb-chain/3VG9_B.txt
(1, '3VG9', 'B')
/scratch/PI/rondror/akma327/DynamicNetworks/data/crystal-analysis/ligand-wetness/watermarks/heatmaps/030717-receptor/individual-pdb-chain/4WW3_A.txt
(2, '4WW3', 'A')
/scratch/PI/rondror/akma327/DynamicNetworks/data/crystal-analysis/ligand-wetness/watermarks/heatmaps/030717-receptor/individual-pdb-chain/4BEY_A.txt
(3, '4BEY', 'A')
/scratch/PI/rondror/akma327/DynamicNetworks/data/crystal-analysis/ligand-wetness/watermarks/heatmaps/030717-receptor/individual-pdb-chain/4X1H_C.txt
(4, '4X1H', 'C')
/scratch/PI/rondror/akma327/DynamicNetworks/data/crystal-analysis/ligand-wetness/watermarks/heatmaps/030717-receptor/individual-pdb-chain/2G87_A.txt
(5, '2G87', 'A')
/scr

ValueError: invalid literal for int() with base 10: '-'