# Parsing UnitProt data

In [133]:
import pandas as pd
from pandas import DataFrame
from re import escape

## Functions to generate queries from data

In [134]:
 def toSQLStr(tup):
    result = "("
    if type(tup) == str:
        subitem = ""
        for c in tup:
            if c in "'":
                subitem = subitem + "'"
            subitem = subitem + c
        result = result + "'" + subitem + "'"
    elif type(tup) == tuple:
        first = True
        for item in tup:
            if first:
                first = False
            else:
                result = result + ", "
            if type(item) == str:
                subitem = ""
                for c in item:
                    if c in "'":
                        subitem = subitem + "'"
                    subitem = subitem + c
                result = result + "'" + subitem + "'"
            else:
                result = result + str(item)
    return result + ")"

In [135]:
def writeInsertQuery(table, lines, filepath):
    if len(lines) == 0 or not table or not filepath:
        return
    with open(filepath, "w") as outf:
        outf.write("INSERT INTO TABLE " + table + " VALUES")
        outf.writelines("\n" + toSQLStr(line) for line in lines)
        outf.write(";")

## Useful functions

In [136]:
# return a dataframe's columns named after array 'attributes'
# inefficient, rather slow
def extractColumns(df, attributes):
    return [df[attributes].iloc[[i]].values.tolist()[0] for i in range(len(df))]

# split a string and remove the empty elements
def split(s, delimiter):
    content = s.split(delimiter)
    return [item for item in content if item]

def appendToSubArray(di, key, value):
    if not key in di:
        di[key] = []
    di[key].append(value)
        

## Parsing UnitProt

In [137]:
cancer_lines = []
allowedCategories = ['ID', 'AC', 'DE', 'GN', 'KW', 'DR']
with open("../Datasets/UniProtKB/unitprot-cancer/unitprot-cancer.txt") as cf:
    for line in cf:
        content = split(line, ' ')
        if len(content) > 0 and content[0] in allowedCategories:
            # 1. separate line prefix from content
            # 2. remove '\n' character at the end of every line (supposedly, otherwise don't do it like this)
            cancer_lines.append((content[0], (' '.join(content[1:]))[:-1]))


In [138]:
#cancer_lines

In [143]:
# For every type of line, specific parsing is required

# how to parse each line : 
# https://web.expasy.org/docs/userman.html

class CancerDataNode:
    def __init__(self, id):
        self.id = id
        self.ac = []
        
        # desc related
        self.desc = {"AltName" : {}, 
                     "RecName" : {}, 
                     "Flags" : [], 
                     "Contains" : {"RecName" : {} , "AltName" : {}}, 
                     "Includes" : {"RecName" : {} , "AltName" : {}}
                    }
        self.flags = []
        self.go = []
        self.keywords = []
        self.gn = {}
        
    # generate triples from the data
    def triples(self):
        triples = []
        
        # Keywords
        for item in self.keywords:
            triples.append((self.id, "keyword", item))
            
        # Accession numbers
        for item in self.ac:
            triples.append((self.id, "acnumber", item))
            
        # Recommended Names
        if 'RecName' in self.desc:
            for item in self.desc['RecName']:
                triples.append((self.id, "recname", item))
        
        # Alternative Names
        if 'AltName' in self.desc:
            for item in self.desc['AltName']:
                triples.append((self.id, "altname", item))
            
        return triples
    
    def __str__(self):
        desc = ""
        desc = desc + "ID=" + self.id 
        desc = desc +"\nDescription=" + str(self.desc)
        desc = desc + "\nKeywords=" + str(self.keywords)
        desc = desc + "\nACcessionNumbers=" + str(self.ac)
        desc = desc + "\nGeneNames=" + str(self.gn)
        return desc

cancerData = []
currentNode = None

# We need this data to be external, because DE content is spread across several lines w.r.t previous lines
descBuf = {}
subDescType = None
superDescType = None
            
for line in cancer_lines:
    if not line[1]:
        continue
    form = line[0]
    if (form == 'ID'):
        currentNode = CancerDataNode(line[1].split(' ')[0])
        cancerData.append(currentNode)
    elif (form == 'AC'):
        if currentNode:
                [currentNode.ac.append(item.strip()) for item in split(line[1], ';')]
    elif (form == 'DE'):
        if currentNode:
            spacesplit = split(line[1], ' ')
            colonsplit = split(line[1], ':')
            if len(colonsplit) and colonsplit[0] in ['Contains', 'Includes']:
                superDescType = colonsplit[0]
            elif len(colonsplit) and colonsplit[0] in ['AltName', 'RecName', 'SubName']:
                subDescType = colonsplit[0]
                content = split(colonsplit[1][:-1], '=')
                value = split(content[1], '{')[0].strip()
                if superDescType:
                    appendToSubArray(currentNode.desc[superDescType][subDescType], content[0].strip(), value)
                else:
                    appendToSubArray(currentNode.desc[subDescType], content[0].strip(), value)
            elif spacesplit[0] == 'Flags:':
                currentNode.desc['Flags'].append(colonsplit[1][:-1].strip())
            else:
                content = split(line[1], '=')
                if len(content) < 2:
                    continue
                # to remove the pubmed content
                value = split(content[1][:-1], '{')[0].strip()
                if superDescType and subDescType:
                    appendToSubArray(currentNode.desc[superDescType][subDescType], content[0].strip(), value)
                elif subDescType:
                    appendToSubArray(currentNode.desc[subDescType], content[0].strip(), value)
                
    elif (form == 'GN'):
        if currentNode:
            if line[1] == 'and':
                continue
            for item in split(line[1], ';'):
                content = split(item, '=')
                if len(content) > 1:
                    commasplit = split(content[1], ',')
                    for i in commasplit:
                        appendToSubArray(currentNode.gn, content[0], i)
            
    elif (form == 'KW'):
        if currentNode:
                [currentNode.keywords.append(item.strip()) for item in split(line[1], ';')]
    elif (form == 'DR'):
        if currentNode:
                semicolonsplit = split(line[1], ';')
                if len(semicolonsplit) > 1 and semicolonsplit[0] == 'GO':
                    currentNode.go.append(split(semicolonsplit[1], ':')[1].strip())
    
#if len(cancerData) > 0:
#    print(cancerData[0]) 

## Content of a data node


In [140]:
for i in cancerData:
#    if i.id == "BRCA1_HUMAN":
    if i.id == "RACK1_HUMAN":
        print(i)
        #[print(a, b) for a, b in i.desc.items()]
        break

ID=RACK1_HUMAN
Description={'AltName': {}, 'Includes': {'AltName': {}, 'RecName': {}}, 'Contains': {'AltName': {'Full': ['Cell proliferation-inducing gene 21 protein', 'Guanine nucleotide-binding protein subunit beta-2-like 1', 'Guanine nucleotide-binding protein subunit beta-like protein 12.3', 'Human lung cancer oncogene 7 protein', 'Receptor for activated C kinase', 'Small ribosomal subunit protein RACK1', 'Guanine nucleotide-binding protein subunit beta-2-like 1, N-terminally processed'], 'Short': ['HLC-7']}, 'RecName': {'Full': ['Receptor of activated protein C kinase 1', 'Receptor of activated protein C kinase 1, N-terminally processed']}}, 'RecName': {}, 'Flags': []}
Keywords=['3D-structure', 'Acetylation', 'Apoptosis', 'Biological rhythms', 'Cell cycle', 'Cell membrane', 'Cell projection', 'Complete proteome', 'Cytoplasm', 'Developmental protein', 'Direct protein sequencing', 'Gastrulation', 'Growth regulation', 'Host-virus interaction', 'Membrane', 'Nucleus', 'Phosphoprotein',

## Generating insert queries from data nodes

In [144]:
# change your table names here
id_table = "id"
ac_table = "ac"
name_table = "name"
keyword_table = "keyword"
genename_table = "genename"
gene_ontology_table = "go"
flag_table = "flag"

lines = {
    id_table : [],
    ac_table : [], 
    # if you want to have both altname and recname in the same table, use this one
    name_table: [],
    keyword_table : [], 
    genename_table : [], 
    gene_ontology_table : [],
    flag_table : []}

for node in cancerData:
    
    lines[id_table].append((node.id))
    
    [lines[ac_table].append((node.id, item)) for item in node.ac]
    
    # alternative names
    [lines[name_table].append((node.id, "AltName", k, item)) for k, names in node.desc["AltName"].items() for item in names]
    [lines[name_table].append((node.id, "AltName", k, item)) for k, names in node.desc["Contains"]["AltName"].items() for item in names]
    [lines[name_table].append((node.id, "AltName", k, item)) for k, names in node.desc["Includes"]["AltName"].items() for item in names]
    
    # recommended names
    [lines[name_table].append((node.id, "RecName", k, item)) for k, names in node.desc["RecName"].items() for item in names]
    [lines[name_table].append((node.id, "RecName", k, item)) for k, names in node.desc["Contains"]["RecName"].items() for item in names]
    [lines[name_table].append((node.id, "RecName", k, item)) for k, names in node.desc["Includes"]["RecName"].items() for item in names]
    
    [lines[flag_table].append((node.id, item)) for item in node.desc["Flags"]]
    
    [lines[keyword_table].append((node.id, item)) for item in node.keywords]
    
    [lines[gene_ontology_table].append((node.id, item)) for item in node.go]
    
    [lines[genename_table].append((node.id, item)) for k, v in node.gn.items() for item in v]
    
for key, value in lines.items():
    writeInsertQuery(key, value, "queries/" + str(key) + ".sql")

In [142]:
print("Reached the end of the notebook !")

Reached the end of the notebook !
