In [1]:
import pandas as pd
from pandas import DataFrame

## uncomment all of this if you wish to work on parsing gene2go
#gene2go = pd.read_csv("../Datasets/NCBI/gene2go/gene2go" , "\t")
#gene2go = gene2go.loc[gene2go['#tax_id'] == 9606]
#gene2go = gene2go.drop(columns=['#tax_id'])
##

#humanGenome = pd.read_csv("../Datasets/NCBI/Homo_sapiens.gene_info/Homo_sapiens.gene_info", "\t")

#gene2go

In [2]:
# Generating the scripts

# (1) name : name of the table
# (2) lines : list of all lines to be inserted
# e.g. generateInsertScript("users", [["1", "25", "hello"], ['2', '3', '4']])
def generateInsertScript(name, lines):
    query = "INSERT INTO TABLE " + name  + " VALUES"
    for line in lines:
        first = True
        query = query + " ("
        for v in line:
            if first:
                first = False
            else:
                query = query + ", "
            query = query + str(v)
        query = query + ")"
    if len(lines) == 0:
        return ";"
    else:
        return query + ";"
generateInsertScript("users", [("1", "25", "hello"), (2, 3, 4)])

'INSERT INTO TABLE users VALUES (1, 25, hello) (2, 3, 4);'

In [7]:
# return a dataframe's columns named after array 'attributes'
# inefficient, rather slow
def extractColumns(df, attributes):
    return [df[attributes].iloc[[i]].values.tolist()[0] for i in range(len(df))]

# split a string and remove the empty elements
def split(s, delimiter):
    content = s.split(delimiter)
    return [item for item in content if item]

def appendToSubArray(di, key, value):
    if not key in di:
        di[key] = []
    di[key].append(value)
        

## Parsing UnitProt

In [4]:
cancer_lines = []
allowedCategories = ['ID', 'AC', 'DE', 'GN', 'KW']
with open("../Datasets/UniProtKB/unitprot-cancer/unitprot-cancer.txt") as cf:
    for line in cf:
        content = split(line, ' ')
        if len(content) > 0 and content[0] in allowedCategories:
            # 1. separate line prefix from content
            # 2. remove '\n' character at the end of every line (supposedly, otherwise don't do it like this)
            cancer_lines.append((content[0], (' '.join(content[1:]))[:-1]))
#cancer_lines

In [40]:
# For every type of line, specific parsing is required
# You should search for the specific syntax and parsing process on unitprot

# how to parse each line : 
# https://web.expasy.org/docs/userman.html

class CancerDataNode:
    def __init__(self, id):
        self.id = id
        self.ac = []
        
        # desc related
        self.desc = {"AltName" : {}, 
                     "RecName" : {}, 
                     "Flags" : [], 
                     "Contains" : {"RecName" : {} , "AltName" : {}}, 
                     "Includes" : {"RecName" : {} , "AltName" : {}}
                    }
        self.include = {}
        self.contains = {}
        self.flags = []
        
        self.keywords = []
        self.gn = []
        
    # generate triples from the data
    def triples(self):
        triples = []
        
        # Keywords
        for item in self.keywords:
            triples.append((self.id, "keyword", item))
            
        # Accession numbers
        for item in self.ac:
            triples.append((self.id, "acnumber", item))
            
        # Recommended Names
        if 'RecName' in self.desc:
            for item in self.desc['RecName']:
                triples.append((self.id, "recname", item))
        
        # Alternative Names
        if 'AltName' in self.desc:
            for item in self.desc['AltName']:
                triples.append((self.id, "altname", item))
            
        return triples
    
    def __str__(self):
        desc = ""
        desc = desc + "ID=" + self.id 
        desc = desc +"\nDescription=" + str(self.desc)
        desc = desc + "\nKeywords=" + str(self.keywords)
        desc = desc + "\nACcessionNumbers=" + str(self.ac)
        desc = desc + "\nGeneNames=" + str(self.gn)
        return desc

cancerData = []
currentNode = None

# We need this data to be external, because DE content is spread across several lines w.r.t previous lines
descBuf = {}
subDescType = None
superDescType = None
            
for line in cancer_lines:
    if not line[1]:
        continue
    form = line[0]
    if (form == 'ID'):
        currentNode = CancerDataNode(line[1].split(' ')[0])
        cancerData.append(currentNode)
    elif (form == 'AC'):
        if currentNode:
                [currentNode.ac.append(item.strip()) for item in split(line[1], ';')]
    elif (form == 'DE'):
        # As you might have guessed, this is the most difficult to parse. WIP
        # It works almost correctly, but there are some incorrect split operations. Only the first word of =$1 is captured.
        if currentNode:
            spacesplit = split(line[1], ' ')
            if spacesplit[0] in ['Contains:', 'Includes:']:
                superDescType = spacesplit[0][:-1]
            elif spacesplit[0] in ['AltName:', 'RecName:']:
                subDescType = spacesplit[0][:-1]
                content = split(spacesplit[1], '=')
                if superDescType:
                    appendToSubArray(currentNode.desc[superDescType][subDescType], content[0], content[1])
                else:
                    appendToSubArray(currentNode.desc[subDescType], content[0], content[1])
            elif spacesplit[0] == 'Flags:':
                currentNode.desc['Flags'].append(spacesplit[1].strip())
            else:
                content = split(line[1], '=')
                if len(content) < 2:
                    print(content)
                    continue
                if superDescType and subDescType:
                    
                    appendToSubArray(currentNode.desc[superDescType][subDescType], content[0], content[1])
                elif subDescType:
                    
                    appendToSubArray(currentNode.desc[subDescType], content[0], content[1])
                else:
                    print("No subtype ? Impossiburu !")
                
    elif (form == 'GN'):
        if currentNode:
            if line[1] == 'and':
                continue
            gn = {}
            for item in split(line[1], ';'):
                content = split(item, '=')
                if len(content) > 1:
                    gn[content[0]] = content[1]
            currentNode.gn.append(gn)
            
    elif (form == 'KW'):
        if currentNode:
                [currentNode.keywords.append(item.strip()) for item in split(line[1], ';')]
    
#if len(cancerData) > 0:
#    print(cancerData[0]) 

In [41]:
for i in cancerData:
#    if i.id == "BRCA1_HUMAN":
    if i.id == "RACK1_HUMAN":
        [print(a, b) for a, b in i.desc.items()]
        break

Includes {'RecName': {}, 'AltName': {}}
RecName {}
Contains {'RecName': {'Full': ['Receptor', 'Receptor']}, 'AltName': {'Short': ['HLC-7;'], 'Full': ['Cell', 'Guanine', 'Guanine', 'Human', 'Receptor', 'Small', 'Guanine']}}
AltName {}
Flags []
