In [None]:
from services.pdb import pfam, pdbligand, pdbmolecule
from services.classyfire import classyfire
from services.uniprot import pdb2uniprot
from services.uniprot import pdb2uniprotAC
from services.uniprot import entry2ACC
from services.pubchem import get_image
from collections import defaultdict
from tqdm import tqdm
from math import ceil
import pandas as pd

## Create dictionary mapping PDB ID to chain containing the binding pocket

In [None]:
# Column labels for the headerless, tab-separated pocket list.
header = {0: "pdb", 1: "lig", 2: "chain", 3: "num"}
pdbs = pd.read_csv('./data/PDB_List_Final.txt', header=None, sep='\t').rename(columns=header)

# PDB ID -> chain. If a PDB ID occurs on several rows, the last row wins.
pdb_dict = {pdb_id: chain_id for pdb_id, chain_id in zip(pdbs.pdb, pdbs.chain)}

# NOTE: pdb_ligs is the *same* object as pdbs (no copy), so the 'pdb_lig'
# column added below is visible through both names.
pdb_ligs = pdbs
pdb_ligs['pdb_lig'] = [
    '_'.join(pair.astype(str)) for pair in pdbs[['pdb', 'lig']].to_numpy()
]

# "<pdb>_<lig>" pocket key -> chain.
pdb_lig_dict = dict(zip(pdb_ligs.pdb_lig, pdb_ligs.chain))

## Gather relationships between PDB ID + chain and UniProt accession

In [None]:
from pprint import pprint  # kept: other cells may rely on this name being in scope


def _chain_and_accession(entity):
    """Return ``(chain_id, uniprot_accession)`` for one polymer record.

    ``entity['chain']`` is either a single ``{'id': ...}`` record or a list of
    them; when it is a list only the first chain id is used.
    NOTE(review): if the list holds every chain of the entity, the other chain
    ids never get an entry in ``seqDict`` -- confirm this is intended.

    The accession is ``None`` when the record has no
    ``macroMolecule -> accession -> id`` path.
    """
    chains = entity['chain']
    first_chain = chains[0] if isinstance(chains, list) else chains
    try:
        accession = entity['macroMolecule']['accession']['id']
    except (KeyError, TypeError):
        accession = None
    return first_chain['id'], accession


PDB_list = pdb_dict.keys()

count = 0  # number of polymer records without a usable UniProt accession

# seqDict[pdb_id][chain_id] -> UniProt accession
seqDict = {}
for mol in PDB_list:
    polymer = pdbmolecule(mol)['polymer']  # Getting data from the PDB API
    # 'polymer' is either a list of records or one bare record; normalise to a
    # list so a single record is processed exactly once.
    entities = polymer if isinstance(polymer, list) else [polymer]

    chain_entry = {}
    for entity in entities:
        chain, uniprotACC = _chain_and_accession(entity)
        if uniprotACC is None:
            # Skip instead of storing the accession left over from a previous
            # record, which would silently map this chain to the wrong protein.
            count += 1
            print(f"{mol} chain {chain}: empty uniprot accession (missing #{count})")
            continue
        chain_entry[chain] = uniprotACC

    if entities:
        seqDict[mol] = chain_entry

## Prepare the sequence alignment matrix

In [None]:
# Map each "<pdb>_<lig>" pocket key to the UniProt accession of its chain.
pdb_acc = {}
missing_pockets = []  # pocket keys whose PDB entry or chain is absent from seqDict
for pdbid, chain in pdb_lig_dict.items():
    try:
        # The first four characters of the pocket key are the PDB ID.
        pdb_acc[pdbid] = seqDict[pdbid[0:4]][chain]
    except KeyError:
        # Only "not found" is expected here; any other error should surface.
        missing_pockets.append(pdbid)

missing = bool(missing_pockets)  # flag name kept from the original cell
print(f"{len(missing_pockets)} of {len(pdb_lig_dict)} pockets have no UniProt accession")

# Pairwise alignment scores; rows are labelled by the 'seq1' column.
align = pd.read_csv('./data/align_scores.csv', sep=',', index_col='seq1')

# entry2ACC(...) must yield something read_csv can parse as a tab-separated
# table with 'From' and 'To' columns.
# NOTE(review): presumably From = alignment-matrix label, To = accession -- confirm.
id_acc_mapping = pd.read_csv(entry2ACC(align.columns), sep='\t')
id_acc_dict = dict(zip(id_acc_mapping.To, id_acc_mapping.From))

def getID(pdb):
    """Translate a pocket key into its entry in ``acc_id``.

    Two chained lookups on the notebook-level mappings: ``pdb_acc`` resolves
    the pocket key, and that result is then used as the key into ``acc_id``.
    Both mappings are read at call time, so they must exist before the first
    call.
    """
    accession = pdb_acc[pdb]
    return acc_id[accession]

# Unknown pockets / accessions resolve to 0, which marks "no sequence available".
pdb_acc = defaultdict(lambda: 0, pdb_acc)
acc_id = defaultdict(lambda: 0, id_acc_dict)

# The pocket-by-pocket matrix is written as this many row slices,
# seqAlign_0.csv ... seqAlign_9.csv.
N_CHUNKS = 10

pockets = list(pdb_lig_dict.keys())

# Resolve every pocket to its alignment-matrix label once, instead of once per
# matrix cell inside the nested loop.
pocket_seq_id = {pocket: acc_id[pdb_acc[pocket]] for pocket in pockets}
mapped_pockets = [pocket for pocket in pockets if pocket_seq_id[pocket] != 0]
mapped_seq_ids = [pocket_seq_id[pocket] for pocket in mapped_pockets]

for i in range(N_CHUNKS):
    # Integer ceiling division: the float form ceil(len/10*i) can overshoot the
    # last index (e.g. 7/10*10 == 7.000000000000001) and raise IndexError.
    start = (len(pockets) * i + N_CHUNKS - 1) // N_CHUNKS
    stop = (len(pockets) * (i + 1) + N_CHUNKS - 1) // N_CHUNKS
    chunk = pockets[start:stop]

    # Float zeros so alignment scores can be written without a dtype change;
    # pairs with an unmapped pocket keep the score 0.
    seq_align = pd.DataFrame(0.0, index=chunk, columns=pockets)

    for pdb1 in tqdm(chunk):
        seqID1 = pocket_seq_id[pdb1]
        if seqID1 != 0:
            # One row-wise write replaces len(pockets) single-cell writes.
            seq_align.loc[pdb1, mapped_pockets] = align.loc[seqID1, mapped_seq_ids].to_numpy()

    # to_csv opens and closes the file itself, so no handle leaks on error.
    seq_align.to_csv('seqAlign_' + str(i) + '.csv')

## Read in sequence alignment values and normalize final matrix

In [None]:
def _normalize_by_self_score(scores):
    """Divide every score by the larger of the two pockets' self-scores.

    ``scores`` must be square with the same labels on rows and columns.
    The result is ``scores[i, j] / max(scores[i, i], scores[j, j])``.

    The self-scores are captured *before* anything is divided. Normalising in
    place overwrites the diagonal with 1 as it goes, so later cells would be
    divided by 1 instead of the original self-score.

    Pairs where both self-scores are 0 come out as NaN (0 / 0).
    """
    self_scores = pd.Series(
        [scores.at[label, label] for label in scores.index], index=scores.index
    )
    # Column j holds max(self_scores[i], self_scores[j]) for every row i.
    denominators = pd.DataFrame(
        {label: self_scores.clip(lower=self_scores.at[label]) for label in scores.columns}
    )
    return scores / denominators


# Stitch the ten row slices back into the full pocket-by-pocket matrix.
seq_align_raw = pd.concat(
    [pd.read_csv('seqAlign_' + str(i) + '.csv', index_col=0) for i in range(10)],
    axis=0,
)

# Builds a new frame rather than assigning through chained indexing
# (df[a][b] = ...), which may write to a temporary copy.
seq_align = _normalize_by_self_score(seq_align_raw)
seq_align.head()