# Parsing UnitProt data

In [1]:
import pandas as pd
from pandas import DataFrame

## Functions to generate queries from data

In [2]:
# Generating INSERT scripts
def generateInsertScript(name, lines):
    query = "INSERT INTO TABLE " + name  + " VALUES"
    for line in lines:
        first = True
        query = query + " ("
        for v in line:
            if first:
                first = False
            else:
                query = query + ", "
            query = query + str(v)
        query = query + ")"
    if len(lines) == 0:
        return ";"
    else:
        return query + ";"
generateInsertScript("users", [("1", "25", "hello"), (2, 3, 4)])

'INSERT INTO TABLE users VALUES (1, 25, hello) (2, 3, 4);'

In [3]:
def writeInsertQuery(table, lines, filepath):
    if len(lines) == 0 or not table or not filepath:
        return
    with open(filepath, "w") as outf:
        outf.write("INSERT INTO TABLE " + table + " VALUES")
        outf.writelines("\n" + str(line) for line in lines)
        outf.write(";")

## Useful functions

In [4]:
# return a dataframe's columns named after array 'attributes'
# inefficient, rather slow
def extractColumns(df, attributes):
    return [df[attributes].iloc[[i]].values.tolist()[0] for i in range(len(df))]

# split a string and remove the empty elements
def split(s, delimiter):
    content = s.split(delimiter)
    return [item for item in content if item]

def appendToSubArray(di, key, value):
    if not key in di:
        di[key] = []
    di[key].append(value)
        

## Parsing UnitProt

In [5]:
cancer_lines = []
allowedCategories = ['ID', 'AC', 'DE', 'GN', 'KW']
with open("../Datasets/UniProtKB/unitprot-cancer/unitprot-cancer.txt") as cf:
    for line in cf:
        content = split(line, ' ')
        if len(content) > 0 and content[0] in allowedCategories:
            # 1. separate line prefix from content
            # 2. remove '\n' character at the end of every line (supposedly, otherwise don't do it like this)
            cancer_lines.append((content[0], (' '.join(content[1:]))[:-1]))


In [6]:
#cancer_lines

In [14]:
# For every type of line, specific parsing is required

# how to parse each line : 
# https://web.expasy.org/docs/userman.html

class CancerDataNode:
    def __init__(self, id):
        self.id = id
        self.ac = []
        
        # desc related
        self.desc = {"AltName" : {}, 
                     "RecName" : {}, 
                     "Flags" : [], 
                     "Contains" : {"RecName" : {} , "AltName" : {}}, 
                     "Includes" : {"RecName" : {} , "AltName" : {}}
                    }
        self.flags = []
        
        self.keywords = []
        self.gn = {}
        
    # generate triples from the data
    def triples(self):
        triples = []
        
        # Keywords
        for item in self.keywords:
            triples.append((self.id, "keyword", item))
            
        # Accession numbers
        for item in self.ac:
            triples.append((self.id, "acnumber", item))
            
        # Recommended Names
        if 'RecName' in self.desc:
            for item in self.desc['RecName']:
                triples.append((self.id, "recname", item))
        
        # Alternative Names
        if 'AltName' in self.desc:
            for item in self.desc['AltName']:
                triples.append((self.id, "altname", item))
            
        return triples
    
    def __str__(self):
        desc = ""
        desc = desc + "ID=" + self.id 
        desc = desc +"\nDescription=" + str(self.desc)
        desc = desc + "\nKeywords=" + str(self.keywords)
        desc = desc + "\nACcessionNumbers=" + str(self.ac)
        desc = desc + "\nGeneNames=" + str(self.gn)
        return desc

cancerData = []
currentNode = None

# We need this data to be external, because DE content is spread across several lines w.r.t previous lines
descBuf = {}
subDescType = None
superDescType = None
            
for line in cancer_lines:
    if not line[1]:
        continue
    form = line[0]
    if (form == 'ID'):
        currentNode = CancerDataNode(line[1].split(' ')[0])
        cancerData.append(currentNode)
    elif (form == 'AC'):
        if currentNode:
                [currentNode.ac.append(item.strip()) for item in split(line[1], ';')]
    elif (form == 'DE'):
        # As you might have guessed, this is the most difficult to parse. WIP
        # It works almost correctly, but there are some incorrect split operations. Only the first word of =$1 is captured.
        # There are several flaws here.
        # The split by ' ' messes up things.
        # 
        if currentNode:
            spacesplit = split(line[1], ' ')
            colonsplit = split(line[1], ':')
            if len(colonsplit) and colonsplit[0] in ['Contains', 'Includes']:
                superDescType = colonsplit[0]
            elif len(colonsplit) and colonsplit[0] in ['AltName', 'RecName', 'SubName']:
                subDescType = colonsplit[0]
                # to remove the ';'
                content = split(colonsplit[1][:-1], '=')
                if superDescType:
                    appendToSubArray(currentNode.desc[superDescType][subDescType], content[0].strip(), content[1].strip())
                else:
                    appendToSubArray(currentNode.desc[subDescType], content[0].strip(), content[1].strip())
            elif spacesplit[0] == 'Flags:':
                currentNode.desc['Flags'].append(colonsplit[1][:-1].strip())
            else:
                content = split(line[1], '=')
                if len(content) < 2:
                    continue
                if superDescType and subDescType:
                    appendToSubArray(currentNode.desc[superDescType][subDescType], content[0], content[1][:-1])
                elif subDescType:
                    appendToSubArray(currentNode.desc[subDescType], content[0], content[1][:-1])
                else:
                    print("No subtype ? Impossiburu !")
                
    elif (form == 'GN'):
        if currentNode:
            if line[1] == 'and':
                continue
            gn = {}
            for item in split(line[1], ';'):
                content = split(item, '=')
                if len(content) > 1:
                    #{ split
                    #, split
                    commasplit = split(content[1], ',')
                    for i in commasplit:
                        appendToSubArray(currentNode.gn, content[0], i)
            
    elif (form == 'KW'):
        if currentNode:
                [currentNode.keywords.append(item.strip()) for item in split(line[1], ';')]
    
#if len(cancerData) > 0:
#    print(cancerData[0]) 

## Content of a data node


In [8]:
for i in cancerData:
#    if i.id == "BRCA1_HUMAN":
    if i.id == "RACK1_HUMAN":
        print(i)
        #[print(a, b) for a, b in i.desc.items()]
        break

ID=RACK1_HUMAN
Description={'Contains': {'AltName': {'Short': ['HLC-7'], ' Full': ['Cell proliferation-inducing gene 21 protein', 'Guanine nucleotide-binding protein subunit beta-2-like 1', 'Guanine nucleotide-binding protein subunit beta-like protein 12.3', 'Human lung cancer oncogene 7 protein', 'Receptor for activated C kinase', 'Small ribosomal subunit protein RACK1 {EC', 'Guanine nucleotide-binding protein subunit beta-2-like 1, N-terminally processed']}, 'RecName': {' Full': ['Receptor of activated protein C kinase 1', 'Receptor of activated protein C kinase 1, N-terminally processed']}}, 'RecName': {}, 'Flags': [], 'Includes': {'AltName': {}, 'RecName': {}}, 'AltName': {}}
Keywords=['3D-structure', 'Acetylation', 'Apoptosis', 'Biological rhythms', 'Cell cycle', 'Cell membrane', 'Cell projection', 'Complete proteome', 'Cytoplasm', 'Developmental protein', 'Direct protein sequencing', 'Gastrulation', 'Growth regulation', 'Host-virus interaction', 'Membrane', 'Nucleus', 'Phosphopro

## Generating insert queries from data nodes

In [15]:
lines = {
    "id": [],
    "ac" : [], 
    "altname" : [], 
    "recname" : [], 
    "keyword" : [], 
    "genename" : [], 
    "flag" : []}

for node in cancerData:
    
    lines["id"].append(node.id)
    
    [lines["ac"].append((node.id, item)) for item in node.ac]
    
    [lines["altname"].append((node.id, k, item)) for k, names in node.desc["AltName"].items() for item in names]
    [lines["altname"].append((node.id, k, item)) for k, names in node.desc["Contains"]["AltName"].items() for item in names]
    [lines["altname"].append((node.id, k, item)) for k, names in node.desc["Includes"]["AltName"].items() for item in names]
    
    [lines["recname"].append((node.id, k, item)) for k, names in node.desc["RecName"].items() for item in names]
    [lines["recname"].append((node.id, k, item)) for k, names in node.desc["Contains"]["RecName"].items() for item in names]
    [lines["recname"].append((node.id, k, item)) for k, names in node.desc["Includes"]["RecName"].items() for item in names]
    
    [lines["flag"].append((node.id, item)) for item in node.desc["Flags"]]
    
    [lines["keyword"].append((node.id, item)) for item in node.keywords]
    
    [lines["genename"].append((node.id, item)) for k, v in node.gn.items() for item in v]
    
for key, value in lines.items():
    writeInsertQuery("up_" + key, value, "queries/" + str(key) + ".sql")

In [10]:
print("Reached the end of the notebook !")

Reached the end of the notebook !
