In [73]:
# Generate heatmap for wb and wb2 across all class A GPCRs
# Rows = GPCRDB1:GPCRDB2:Interaction Type
# Columns = PDB

import sys
import glob

In [74]:
# Utils.py

PDB_TO_UNIPROT_TABLE_PATH = "/scratch/PI/rondror/akma327/DynamicNetworks/data/crystal-analysis/simulation-analysis/gpcrdb-freq-config/GPCR_PDB_List.txt"
GPCRDB_TABLE_PATH="/scratch/PI/rondror/akma327/DynamicNetworks/data/crystal-analysis/simulation-analysis/gpcrdb-freq-config/All_species_gpcrdb_numbers_strOnly.txt"
GPCRDB_RESIDUE_FREQ_TABLE="/scratch/PI/rondror/akma327/DynamicNetworks/data/crystal-analysis/simulation-analysis/gpcrdb-freq-config/gpcrdb_residue_conservation.txt"


# Rename amino acids to common name
def fixAminoAcidNames(key):
	key = key.replace("HSD", "HIS")
	key = key.replace("HSE", "HIS")
	key = key.replace("HSP", "HIS")
	key = key.replace("HIE", "HIS")
	key = key.replace("HIP", "HIS")
	key = key.replace("HID", "HIS")
	key = key.replace("GLH", "GLU")
	key = key.replace("ASH", "ASP")
	key = key.replace("CYP", "CYS")
	key = key.replace("CYX", "CYS")
	return key

# Retrive gpcrdb from dictionary for specified residue. Return "-" if not found
def getGPCRDB(res, GPCRDB_DICT):
	res = fixAminoAcidNames(res)
	if(res not in GPCRDB_DICT):
		print(res + " not found.")
		return "None"
	return GPCRDB_DICT[res]

# Create directory if not exist
def createDirectory(OUTPUT_FILE):
	directory = os.path.dirname(OUTPUT_FILE)
	if not os.path.exists(directory):
		os.makedirs(directory)

# Generate write file descriptor 
def genWriteDescriptor(OUTPUT_FILE):
	createDirectory(OUTPUT_FILE)
	return open(OUTPUT_FILE, 'w')


# Retrieve Uniprot Code for the PDB_CODE from pdb_to_uniprot_table_path
def getUniprotCode(PDB_CODE):
	f = open(PDB_TO_UNIPROT_TABLE_PATH, 'r')
	for line in f:
		if(line == "\n"): continue 
		l_info = line.split("\t")
		uniprot_code, pdb = l_info[0].strip(), l_info[2].strip()
		if(PDB_CODE.upper() == pdb.upper()): return uniprot_code.upper()
	print("PDB_CODE Not Found in PDB To Uniprot Table")
	exit(1)


# Given uniprot code reads through GPCRDB_TABLE_PATH to generate the amino acid
# to gpcrdb number table. 
# Output {"ASP112": "1x50", "ARG116":"2x45"}
def genGpcrdbDict(UNIPROT_CODE):
	GPCRDB_DICT = {}
	f = open(GPCRDB_TABLE_PATH, 'r')
	for line in f: 
		l_info = line.split("\t")
		uniprot, resnum, resname, gpcrdb = l_info[0].strip(), l_info[1].strip(), l_info[2].strip(), l_info[4].strip()
		if(uniprot.upper() == UNIPROT_CODE.upper()):
			key = resname.upper() + resnum 
			GPCRDB_DICT[key] = gpcrdb
	return GPCRDB_DICT


# Generates the residue to gpcrdb table for given pdb
def genResidueToGpcrdbTable(PDB_CODE):
	UNIPROT_CODE = getUniprotCode(PDB_CODE)
	GPCRDB_DICT = genGpcrdbDict(UNIPROT_CODE)
	return GPCRDB_DICT


In [77]:
def orderpair(atom1, atom2):
    if(atom1 < atom2): return (atom1, atom2)
    return (atom2, atom1)

# def write_table(pdb_to_wb, OUTPUT_TABLE):
#     f = open(OUTPUT_TABLE, 'w')
#     pdbs = pdb_to_wb.keys()
#     all_gpcrdb = set()
#     for pdb in pdb_to_wb:
#         all_gpcrdb |= set(pdb_to_wb[pdb])
        
#     header = "GPCRDB1:GPCRDB2\t" + "\t".join(pdbs)
#     f.write(header + "\n")
#     all_rows = []
#     for gpcrdb in all_gpcrdb:
#         row_info = [gpcrdb[0], gpcrdb[1], gpcrdb[2]]
#         for pdb in pdbs:
#             if(gpcrdb not in pdb_to_wb[pdb]):
#                 row_info.append("0")
#             else:
#                 row_info.append("1")
#         all_rows.append(row_info)
        
#     all_rows.sort(key=lambda x: (x[0], x[1], x[2]))
#     print(all_rows)
#     for row_info in all_rows:
#         f.write(":".join(row_info[0:3]) + "\t" + "\t".join(row_info[3:]) + "\n")


def write_table(pdb_to_wb, OUTPUT_TABLE):
    f = open(OUTPUT_TABLE, 'w')
    pdb_to_wb_grouped_uniprot_ligand = group_uniprot_and_ligand(pdb_to_wb)
    
    ### Get column headers of uniprot:ligand and union of water bridge interactions 
    all_gpcrdb = set()
    header = ["GPCRDB1:GPCRDB2"]
    for uniprot in sorted(pdb_to_wb_grouped_uniprot_ligand.keys()):
        for ligand in sorted(pdb_to_wb_grouped_uniprot_ligand[uniprot].keys()):
            header.append(uniprot + ":" + ligand)
            all_gpcrdb |= set(pdb_to_wb_grouped_uniprot_ligand[uniprot][ligand])
    
    f.write("\t".join(header) + "\n")
    
    all_rows = []
    for gpcrdb in all_gpcrdb:
        row_info = [gpcrdb[0], gpcrdb[1], gpcrdb[2]]
        ### Iterate uniprot and then ligands in same order 
        for uniprot in sorted(pdb_to_wb_grouped_uniprot_ligand.keys()):
            for ligand in sorted(pdb_to_wb_grouped_uniprot_ligand[uniprot].keys()):
                wbs = pdb_to_wb_grouped_uniprot_ligand[uniprot][ligand]
                if(gpcrdb not in wbs):
                    row_info.append("0")
                else:
                    row_info.append("1")
        all_rows.append(row_info)
    
    all_rows.sort(key=lambda x: (x[0], x[1], x[2]))
    for row_info in all_rows:
        f.write(":".join(row_info[0:3]) + "\t" + "\t".join(row_info[3:]) + "\n")
            
    
            
def group_uniprot_and_ligand(pdb_to_wb):
    uniprot_to_ligand_to_interactions = {}
    pdb_to_ligand_dict = pdb_to_ligand()
    for pdb in pdb_to_wb:
        uniprot = getUniprotCode(pdb)
        ligand = pdb_to_ligand_dict[pdb]
        wbs = pdb_to_wb[pdb]
        print(pdb, uniprot, ligand, wbs)
        if (uniprot not in uniprot_to_ligand_to_interactions):
            uniprot_to_ligand_to_interactions[uniprot] = {ligand:set(wbs)}
        else:
            if(ligand not in uniprot_to_ligand_to_interactions[uniprot]):
                uniprot_to_ligand_to_interactions[uniprot][ligand] = set(wbs)
            else:
                uniprot_to_ligand_to_interactions[uniprot][ligand] |= set(wbs)
    return uniprot_to_ligand_to_interactions
    


def group_pdb_chains(pdb_to_wb):
    grouped_pdb_to_wb = {}
    for pdb_chain in pdb_to_wb:
        wbs = pdb_to_wb[pdb_chain]
        pdb, chain = pdb_chain.split("_")
        if (pdb not in grouped_pdb_to_wb):
            grouped_pdb_to_wb[pdb] = wbs
        else:
            grouped_pdb_to_wb[pdb] += wbs 

    for key in grouped_pdb_to_wb:
        grouped_pdb_to_wb[key] = list(set(grouped_pdb_to_wb[key]))
        
    return grouped_pdb_to_wb
        
def group_by_uniprot(pdb_to_wb):
    pdb_to_wb_protein_grouped = {}
    for pdb in pdb_to_wb:
        uniprot = getUniprotCode(pdb)
        if(uniprot not in pdb_to_wb_protein_grouped):
            pdb_to_wb_protein_grouped[uniprot] = [(pdb, pdb_to_wb[pdb])]
        else:
            pdb_to_wb_protein_grouped[uniprot].append((pdb, pdb_to_wb[pdb]))
    return pdb_to_wb_protein_grouped
    
def pdb_to_ligand():
    classA_GPCR_table="/scratch/PI/rondror/akma327/DynamicNetworks/data/crystal-analysis/ligand-wetness/watermarks/tables/classA_GPCRs_known_ligands.tsv"
    f = open(classA_GPCR_table, 'r')
    header = f.readline()
    pdb_to_ligand = {}
    for line in f:
        linfo = line.strip().split("\t")
        uniprot, gpcr_class, pdb, ligand = linfo
        pdb_to_ligand[pdb] = ligand
        
    return pdb_to_ligand

        
def notTM7(gpcrdb):
    """
        Return True if gpcrdb not in TM 1 through 7
    """
    
    TM = int(gpcrdb.split("x")[0])
    if(TM >=1 and TM <= 7): return False
    return True 

def extract_wb_info(INPUT_DIR):
    pdb_to_wb = {}
    pdb_files = glob.glob(INPUT_DIR + "/*txt")
    for index, pdb_file in enumerate(pdb_files):
#         if(index > 4): break
        pdb_chain = pdb_file.strip().split("/")[-1].strip(".txt")
        pdb, chain = pdb_chain.split("_")
        if(len(pdb) != 4): 
            continue
            
        ### Process each pdb file for its ligand residue interactions
        pdb_to_wb[pdb_chain] = []
        GPCRDB_DICT = genResidueToGpcrdbTable(pdb)
        f = open(pdb_file, 'r')
        for line in f:
            linfo = line.strip().split("@")
            interaction_type = linfo[1].strip("-")
            atom1, atom2 = linfo[0].split(" -- ")
            residue_atom1, residue_atom2 = orderpair(atom1, atom2)
            residue1 = residue_atom1.split("-")[0]
            residue2 = residue_atom2.split("-")[0]
            gpcrdb1, gpcrdb2 = orderpair(getGPCRDB(residue1, GPCRDB_DICT), getGPCRDB(residue2, GPCRDB_DICT))
            if(gpcrdb1 != None and gpcrdb1 != "None" and gpcrdb2 != None and gpcrdb2 != "None"):
                if(notTM7(gpcrdb1) or notTM7(gpcrdb2)): continue
                pdb_to_wb[pdb_chain].append((gpcrdb1, gpcrdb2, interaction_type))
    
    pdb_to_wb = group_pdb_chains(pdb_to_wb)
    
    return pdb_to_wb
        
def heatmap(INPUT_DIR, OUTPUT_TABLE):
    pdb_to_wb = extract_wb_info(INPUT_DIR)
    write_table(pdb_to_wb, OUTPUT_TABLE)


In [78]:
INPUT_DIR="/scratch/PI/rondror/akma327/DynamicNetworks/data/crystal-analysis/ligand-wetness/watermarks/heatmaps/030717-receptor/individual-pdb-chain"
OUTPUT_TABLE="/scratch/PI/rondror/akma327/DynamicNetworks/data/crystal-analysis/ligand-wetness/watermarks/heatmaps/030717-receptor/watermark_receptor_heatmap_test.txt"
pdb_to_wb = heatmap(INPUT_DIR, OUTPUT_TABLE)

SER65 not found.
SER65 not found.
TRP35 not found.
ALA55 not found.
VAL58 not found.
ASN137 not found.
SER116 not found.
ASN137 not found.
THR114 not found.
ASP1 not found.
GLU27 not found.
ASP1 not found.
SER26 not found.
ASP143 not found.
LYS199 not found.
ASP151 not found.
ASP151 not found.
SER191 not found.
ASP82 not found.
GLN37 not found.
ASP82 not found.
GLU81 not found.
GLN124 not found.
SER121 not found.
GLN37 not found.
GLU81 not found.
GLN37 not found.
LYS39 not found.
GLN37 not found.
SER43 not found.
GLU27 not found.
SER26 not found.
SER191 not found.
THR178 not found.
THR180 not found.
HIS90 not found.
ILE2 not found.
HIS90 not found.
THR95 not found.
TYR140 not found.
TYR173 not found.
ILE2 not found.
THR95 not found.
LYS39 not found.
SER43 not found.
PHE98 not found.
PHE98 not found.
PRO119 not found.
SER131 not found.
SER116 not found.
THR114 not found.
SER162 not found.
TRP163 not found.
SER174 not found.
THR172 not found.
SER174 not found.
THR172 not found.
SER174 no

In [70]:
pdb_to_wb

{'2VT4': [('6x47', '6x51', 'wb'),
  ('4x56', '5x39', 'wb'),
  ('6x47', '7x37', 'wb'),
  ('6x51', '7x37', 'wb'),
  ('4x57', '5x39', 'wb'),
  ('3x26', '4x61', 'wb'),
  ('4x56', '4x57', 'wb')],
 '3VG9': [],
 '4BEY': [('6x51', '7x37', 'wb'),
  ('6x40', '6x44', 'wb2'),
  ('3x49', '4x42', 'wb'),
  ('2x50', '7x45', 'wb'),
  ('6x44', '7x53', 'wb2'),
  ('5x58', '7x53', 'wb'),
  ('6x44', '6x44', 'wb2'),
  ('6x40', '6x44', 'wb'),
  ('2x40', '7x57', 'wb'),
  ('6x43', '6x44', 'wb'),
  ('7x49', '7x53', 'wb2'),
  ('2x50', '7x49', 'wb'),
  ('7x45', '7x49', 'wb'),
  ('3x43', '5x58', 'wb'),
  ('1x59', '1x60', 'wb'),
  ('6x43', '7x49', 'wb'),
  ('6x44', '7x49', 'wb2'),
  ('6x43', '6x44', 'wb2'),
  ('6x40', '7x49', 'wb2'),
  ('2x60', '3x28', 'wb'),
  ('6x40', '6x43', 'wb2'),
  ('6x44', '7x49', 'wb'),
  ('6x47', '7x37', 'wb'),
  ('6x40', '7x53', 'wb'),
  ('6x47', '6x51', 'wb'),
  ('3x43', '7x53', 'wb'),
  ('2x60', '2x60', 'wb'),
  ('6x44', '7x53', 'wb'),
  ('1x60', '1x60', 'wb'),
  ('6x43', '7x53', 'wb2')]

In [71]:
def group_uniprot_and_ligand(pdb_to_wb):
    uniprot_to_ligand_to_interactions = {}
    pdb_to_ligand_dict = pdb_to_ligand()
    for pdb in pdb_to_wb:
        uniprot = getUniprotCode(pdb)
        ligand = pdb_to_ligand_dict[pdb]
        wbs = pdb_to_wb[pdb]
        print(pdb, uniprot, ligand, wbs)
        if (uniprot not in uniprot_to_ligand_to_interactions):
            uniprot_to_ligand_to_interactions[uniprot] = {ligand:set(wbs)}
        else:
            if(ligand not in uniprot_to_ligand_to_interactions[uniprot]):
                uniprot_to_ligand_to_interactions[uniprot][ligand] = set(wbs)
            else:
                uniprot_to_ligand_to_interactions[uniprot][ligand] |= set(wbs)
    return uniprot_to_ligand_to_interactions

In [67]:

blah = group_uniprot_and_ligand(pdb_to_wb)
# blah

('4BEY', 'OPSD_BOVIN', 'RET', [('3x49', '4x42', 'wb'), ('6x43', '7x49', 'wb'), ('6x43', '6x44', 'wb'), ('2x50', '7x49', 'wb'), ('7x45', '7x49', 'wb'), ('6x44', '7x49', 'wb'), ('2x40', '7x57', 'wb'), ('2x50', '7x45', 'wb'), ('6x47', '7x37', 'wb'), ('6x47', '6x51', 'wb'), ('1x59', '1x60', 'wb'), ('1x59', '1x60', 'wb'), ('2x60', '3x28', 'wb'), ('2x60', '3x28', 'wb'), ('1x60', '1x60', 'wb'), ('3x43', '5x58', 'wb'), ('3x43', '7x53', 'wb'), ('6x40', '6x44', 'wb'), ('6x40', '7x53', 'wb'), ('6x44', '7x53', 'wb'), ('6x51', '7x37', 'wb'), ('2x60', '2x60', 'wb'), ('5x58', '7x53', 'wb'), ('6x40', '7x49', 'wb2'), ('6x44', '7x49', 'wb2'), ('6x40', '6x43', 'wb2'), ('6x44', '7x49', 'wb2'), ('6x44', '7x53', 'wb2'), ('6x43', '7x53', 'wb2'), ('6x40', '6x44', 'wb2'), ('6x40', '6x44', 'wb2'), ('6x40', '6x43', 'wb2'), ('6x43', '6x44', 'wb2'), ('6x40', '7x49', 'wb2'), ('6x43', '7x53', 'wb2'), ('6x44', '7x53', 'wb2'), ('6x43', '6x44', 'wb2'), ('7x49', '7x53', 'wb2'), ('7x49', '7x53', 'wb2'), ('6x44', '6x44', 

In [72]:
blah

{'AA2AR_HUMAN': {'ZMA': set()},
 'ADRB1_MELGA': {'P32': {('3x26', '4x61', 'wb'),
   ('4x56', '4x57', 'wb'),
   ('4x56', '5x39', 'wb'),
   ('4x57', '5x39', 'wb'),
   ('6x47', '6x51', 'wb'),
   ('6x47', '7x37', 'wb'),
   ('6x51', '7x37', 'wb')}},
 'OPSD_BOVIN': {'BNG': set(),
  'RET': {('1x59', '1x60', 'wb'),
   ('1x60', '1x60', 'wb'),
   ('2x40', '7x57', 'wb'),
   ('2x50', '7x45', 'wb'),
   ('2x50', '7x49', 'wb'),
   ('2x60', '2x60', 'wb'),
   ('2x60', '3x28', 'wb'),
   ('3x43', '5x58', 'wb'),
   ('3x43', '7x53', 'wb'),
   ('3x49', '4x42', 'wb'),
   ('5x58', '7x53', 'wb'),
   ('6x40', '6x43', 'wb2'),
   ('6x40', '6x44', 'wb'),
   ('6x40', '6x44', 'wb2'),
   ('6x40', '7x49', 'wb2'),
   ('6x40', '7x53', 'wb'),
   ('6x43', '6x44', 'wb'),
   ('6x43', '6x44', 'wb2'),
   ('6x43', '7x49', 'wb'),
   ('6x43', '7x53', 'wb2'),
   ('6x44', '6x44', 'wb2'),
   ('6x44', '7x49', 'wb'),
   ('6x44', '7x49', 'wb2'),
   ('6x44', '7x53', 'wb'),
   ('6x44', '7x53', 'wb2'),
   ('6x47', '6x51', 'wb'),
   ('6x4