# Generating Contact Maps of Somatic Mutations

In [6]:
# symbol : [sequence,]
# well_typed_proteins = {
#     "RASK":"P01116", # https://www.uniprot.org/uniprotkb/P01116/entry#sequences # "4OBE"
#     "TP53":"P04637", # https://www.uniprot.org/uniprotkb/P04637/entry#sequences "2OCJ"
#     "AKT1":"P31749", # https://www.uniprot.org/uniprotkb/P31749/entry#sequences # "1UNQ"
#     "RB1":"P06400", # https://www.uniprot.org/uniprotkb/P06400/entry#sequences  "4ELJ"
#     "BAD":"Q92934" # https://www.uniprot.org/uniprotkb/Q92934/entry#sequences "1MAZ"
# }

# symbol : {missense mutation : sequence,structure}
most_common_somatic_mutations = {
        "RASK":["G12D","G12V","G13D"], # https://portal.gdc.cancer.gov/genes/ENSG00000133703
        "TP53":["R175H", "R248Q", "R273C"], # https://portal.gdc.cancer.gov/genes/ENSG00000141510
        "AKT1":["E17K", "E40K", "W80R"], # https://portal.gdc.cancer.gov/genes/ENSG00000142208
        "RB1":[], # no missense mutation in most common somatic mutations, https://portal.gdc.cancer.gov/genes/ENSG00000139687 
        "BAD":["G136D", "M117I", "G62E"], # https://portal.gdc.cancer.gov/genes/ENSG00000002330
    }

In [9]:
%run utils.ipynb
folder_path = "uniprot_sequences"
all_data,lengths = process_uniprot_folder(folder_path) # data: (id,seq,symbol)
original_sequences = {symbol:(id,seq) for (id,seq,symbol) in all_data}
print(original_sequences)

In [12]:
# generate mutated sequences
mutated_sequences = {}
for (id,seq,symbol) in all_data:
    mutations = most_common_somatic_mutations[symbol]
    for mutation in mutations:
        if symbol not in mutated_sequences:
            mutated_sequences[symbol] = [] # add protein
        mutated_sequences[symbol].append((mutation,mutated_seq,)) # update list of mutations
print(mutated_sequences)

{'RASK': [('G12D', 'MSDVAIVKEGWLHKRGEYIKTWRPRYFLLKNDGTFIGYKERPQDVDQREAPLNNFSVAQCQLMKTERPRPNTFIIRCLQRTTVIERTFHVETPEEREEWTTAIQTVADGLKKQEEEEMDFRSGSPSDNSGAEEMEVSLAKPKHRVTMNEFEYLKLLGKGTFGKVILVKEKATGRYYAMKILKKEVIVAKDEVAHTLTENRVLQNSRHPFLTALKYSFQTHDRLCFVMEYANGGELFFHLSRERVFSEDRARFYGAEIVSALDYLHSEKNVVYRDLKLENLMLDKDGHIKITDFGLCKEGIKDGATMKTFCGTPEYLAPEVLEDNDYGRAVDWWGLGVVMYEMMCGRLPFYNQDHEKLFELILMEEIRFPRTLGPEAKSLLSGLLKKDPKQRLGGGSEDAKEIMQHRFFAGIVWQHVYEKKLSPPFKPQVTSETDTRYFDEEFTAQMITITPPDQDDSMECVDSERRPHFPQFSYSASGTA'), ('G12V', 'MSDVAIVKEGWLHKRGEYIKTWRPRYFLLKNDGTFIGYKERPQDVDQREAPLNNFSVAQCQLMKTERPRPNTFIIRCLQRTTVIERTFHVETPEEREEWTTAIQTVADGLKKQEEEEMDFRSGSPSDNSGAEEMEVSLAKPKHRVTMNEFEYLKLLGKGTFGKVILVKEKATGRYYAMKILKKEVIVAKDEVAHTLTENRVLQNSRHPFLTALKYSFQTHDRLCFVMEYANGGELFFHLSRERVFSEDRARFYGAEIVSALDYLHSEKNVVYRDLKLENLMLDKDGHIKITDFGLCKEGIKDGATMKTFCGTPEYLAPEVLEDNDYGRAVDWWGLGVVMYEMMCGRLPFYNQDHEKLFELILMEEIRFPRTLGPEAKSLLSGLLKKDPKQRLGGGSEDAKEIMQHRFFAGIVWQHVYEKKLSPPFKPQVTSETDTRYFDEEFTAQMITITPPDQDDSMECVDSERRPHFPQFSYSASGTA'), ('

In [None]:
# Source: https://github.com/facebookresearch/esm/tree/main?tab=readme-ov-file#esmfold
model, alphabet = torch.hub.load("facebookresearch/esm:main", "esm2_t33_650M_UR50D")
batch_converter = alphabet.get_batch_converter()

In [None]:
# generate contact maps via categorical jacobian
cjs = {}
#3. Calculate the categorical jacobian for each protein sequence
def get_cj_by_symbol(symbol):
    protein = original_sequences[symbol]
    # ∂in/∂out
    x = batch_converter([protein])[-1] 
    ln = lengths[i] 
    cj = get_categorical_jacobian(x,ln,model)
    # add protein with its corresponding categorical jacobian to dict
    cjs[protein[0]]=cj
 get_cj_by_symbol("RASK")