## .ini

In [1]:
import os
import re
import pandas as pd
from collections import defaultdict

In [2]:
# Folder that holds the BRENDA input files and receives the results.
PATH = 'D:\\Python\\2021-Brenda and rhea Database\\others\\'

# Every entry is PATH followed by a file (or folder) name.
paths = {
    key: PATH + tail
    for key, tail in (
        ("file", "brenda_download.txt"),  # the BRENDA text download.
        ("NAME", "NAME.txt"),             # the table-name file.
        ("results", "results"),           # where results are stored.
    )
}

## API

In [4]:
# Build the parser; the constructor reads both files and parses them right away.
bp = brenda_parser(txtfile = paths["file"], namefile = paths["NAME"])

Get the EC numbers (one per parsed page).

In [5]:
# First five EC numbers known to the parser.
bp.getECs()[:5]

('0.0.0.0', '1.1.1.1', '1.1.1.10', '1.1.1.100', '1.1.1.101')

In [6]:
# Number of EC pages that were parsed.
len(bp.getECs())

7558

Get names of records (raw records) that were extracted but not processed further.

In [7]:
# Table names read from the name file (the slice skips index 0 and shows the next four).
bp.getRawNames()[1:5]

('PROTEIN', 'RECOMMENDED_NAME', 'SYSTEMATIC_NAME', 'SYNONYMS')

In [8]:
# First five unprocessed SYNONYMS record strings of EC 3.1.1.3.
bp.getRawRecords('3.1.1.3', 'SYNONYMS')[:5]

[' Alip1p <154>',
 ' LipY <136>',
 '#1,7,9,14,21,26,34,43,47,85,90,93,104,106,135,150# triacylglycerol acylhydrolase <111,122,189,193,208,209,218,238,264,266,315,316,320,331,332,348>',
 '#1,9# TPL <315,348>',
 '#10# Tgl4p <135>']

Get names of records (handled records) that were extracted and processed further.

In [9]:
# First five table names that have a regular expression attached.
bp.getHandledNames()[:5]

('PROTEIN', 'REFERENCE', 'RECOMMENDED_NAME', 'REACTION', 'SYSTEMATIC_NAME')

In [10]:
# One handled PROTEIN record of EC 3.1.1.3 (index 20): a tuple of the regex groups.
bp.getHandledRecords('3.1.1.3', 'PROTEIN')[20]

('21',
 'Pseudomonas aeruginosa  ',
 None,
 '24,57,66,106,111,115,123,139,140,141,144,147,168,181,196,197,201,210 214,245,269,293')

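## Code

Note: this cell (`In [3]`) defines `brenda_parser` and must be executed before the API cells above.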

In [3]:
class brenda_parser():
    """
    Parser for a BRENDA text download.

    The text file is read line by line and split into one "page" per EC number.
    Every page holds two ``defaultdict(list)`` mappings keyed by table name:

    * ``raw_records``     -- record strings as re-assembled from the file lines;
    * ``handled_records`` -- tuples of regular-expression groups extracted from
      the raw records of the tables listed in ``RE_patterns``.

    Constructing an instance reads both input files and parses them immediately.
    """

    def __init__(self, txtfile = None, namefile = None):
        """
        txtfile  : path of the BRENDA text file; ``None`` means ``paths["file"]``.
        namefile : path of the table-name file; ``None`` means ``paths["NAME"]``.

        The ``paths`` fallbacks are looked up when the files are opened (in
        ``register``), not when the class is defined, so re-running the
        configuration cell takes effect without re-defining the class.
        """
        self._pages = dict()

        # 5 types of RE patterns (raw strings, so the backslashes reach `re` untouched).
        self.RE_patterns = {'PROTEIN': r'#([\d]+)# (.+?)(?:\((?=#)(.+?)(?<=>)\))? <([\d, ]+)>',
                            'REFERENCE': r'<([\d]+)> (.+?) {Pubmed:[\dn]*}(?: \(c\))?',
                            }
        # Tables whose whole record is kept as a single group.
        for name in ['RECOMMENDED_NAME', 'REACTION', 'SYSTEMATIC_NAME', 'REACTION_TYPE', ]:
            self.RE_patterns[name] = r'(.*)'
        # Tables with five groups: '#...#' numbers, text, optional '(...)' part,
        # optional '|...|' part, and the trailing '<...>' numbers.
        for name in [ 'SYNONYMS',  'SOURCE_TISSUE',  'LOCALIZATION', 'NATURAL_SUBSTRATE_PRODUCT', 'SUBSTRATE_PRODUCT',
                      'TURNOVER_NUMBER', 'KM_VALUE', 'PH_OPTIMUM', 'PH_RANGE', 'SPECIFIC_ACTIVITY',
                      'TEMPERATURE_OPTIMUM', 'TEMPERATURE_RANGE', 'COFACTOR', 'ACTIVATING_COMPOUND', 'INHIBITORS', 'METALS_IONS',
                      'MOLECULAR_WEIGHT', 'POSTTRANSLATIONAL_MODIFICATION', 'SUBUNITS', 'PI_VALUE',
                      'APPLICATION', 'ENGINEERING', 'GENERAL_STABILITY', 'ORGANIC_SOLVENT_STABILITY',
                      'OXIDATION_STABILITY', 'PH_STABILITY', 'STORAGE_STABILITY', 'TEMPERATURE_STABILITY', 'KI_VALUE',
                      'IC50_VALUE', 'KCAT_KM_VALUE', 'EXPRESSION', 'GENERAL_INFORMATION']:
            self.RE_patterns[name] = r'#([\d, ]+)# (.+?)(?: \((?=#)(.+?)(?<=>)\))?(?: \|(?=#)(.+?)(?<=>)\|)?(?: \{.*?\})? <([\d, ]+)>'
        # Tables with three groups: '#...#' numbers, free text, '<...>' numbers.
        for name in ['CLONED', 'CRYSTALLIZATION', 'PURIFICATION', 'RENATURED']:
            self.RE_patterns[name] = r"#([\d, ]+)# (.*)<([\d, ]+)>"

        self.register(txtfile, namefile)
        self.parse()

    ### API
    def getECs(self,):
        """Return the EC numbers of all parsed pages as a tuple."""
        return tuple(self._pages.keys())
    def getRawNames(self, ):
        """Return the table names (first item of every name-file entry) as a tuple."""
        return tuple(tn[0] for tn in self.table_names)
    def getHandledNames(self,):
        """Return the names of the tables that have a regular expression, as a tuple."""
        return tuple(self.RE_patterns.keys())
    def getRawRecords(self, ec, table_name):
        """
        Return the list of raw record strings of `table_name` on page `ec`.

        Raises KeyError for an unknown `ec`. The per-page store is a
        defaultdict, so an unknown `table_name` yields (and stores) an empty list.
        """
        return self._pages[ec]["raw_records"][table_name]
    def getHandledRecords(self, ec, table_name):
        """
        Return the list of regex-group tuples of `table_name` on page `ec`.

        Raises KeyError for an unknown `ec`. The per-page store is a
        defaultdict, so an unknown `table_name` yields (and stores) an empty list.
        """
        return self._pages[ec]["handled_records"][table_name]

    ### do the job.
    def register(self, txtfile = None, namefile = None):
        """
        Load the input files into `self.content` and `self.table_names`.

        txtfile  : BRENDA text file, kept as a list of lines (newlines included).
        namefile : file holding a Python expression; its items are used as
                   item[0] = table name and item[1] = prefix of the record lines.
        `None` falls back to the notebook-level `paths` dict at call time.
        """
        if txtfile is None:
            txtfile = paths["file"]
        if namefile is None:
            namefile = paths["NAME"]
        with open(txtfile, 'r') as f:
            self.content = f.readlines()
        with open(namefile, 'r') as f:
            # NOTE(security): eval() executes whatever Python the name file
            # contains -- only point this at a trusted file. If the file is a
            # plain literal, ast.literal_eval would be safer -- TODO confirm.
            self.table_names = eval(f.read())
    def parse(self, ):
        """Build the raw records from the file lines, then derive the handled records."""
        self.genRecords()
        self.handleRecords()

    def addPage(self, ec):
        """
        Create (or reset) the page of `ec`.
        use defaultdict(list) as the kernel.
        """
        self._pages[ec] = {
            "raw_records": defaultdict(list),
            "handled_records": defaultdict(list),
        }
    def addRawRecord(self, ec, table_name, record):
        """Append one raw record string to `table_name` on page `ec`."""
        self._pages[ec]["raw_records"][table_name].append(record)
    def addHandledRecord(self, ec, table_name, record):
        """Append one handled record (tuple) to `table_name` on page `ec`."""
        self._pages[ec]["handled_records"][table_name].append(record)

    @staticmethod
    def _lineText(line):
        """Return the tab-separated fields after the first one, joined by spaces, without the newline."""
        # rstrip instead of [:-1]: a final line without '\n' must not lose a character.
        return ' '.join(line.split('\t')[1:]).rstrip('\n')

    def genRecords(self, ):
        """
        Model the file as a Finite State Machine.

        BEGIN    : wait for an 'ID\\t' line, which opens the page of that EC number.
        IN_PAGE  : skip blank and '*' lines; '///' closes the page; a line that
                   starts with a known table name opens that table.
        IN_TABLE : a line starting with the table's prefix starts a new record,
                   a blank line closes the table, '///' closes table and page,
                   any other line is appended to the current record.
        """
        BEGIN, IN_PAGE, IN_TABLE = 'BEGIN', 'IN_PAGE', 'IN_TABLE'
        # Try longer names first so that a name which is a prefix of another
        # one (e.g. 'REACTION' / 'REACTION_TYPE') cannot capture its header.
        tables_by_length = sorted(self.table_names, key=lambda tn: len(tn[0]), reverse=True)

        state = BEGIN
        ec = ''
        table_name = ''
        abbre = ''
        record = ''
        for line in self.content:
            if state == BEGIN:
                if line.startswith('ID\t'):
                    ec = line[3:].rstrip('\n')
                    self.addPage(ec)
                    state = IN_PAGE
            elif state == IN_PAGE:
                if line.startswith('\n') or line.startswith('*'):
                    continue
                if line.startswith('///'):
                    ec = ''
                    state = BEGIN
                    continue
                for tn in tables_by_length:
                    if line.startswith(tn[0]):
                        table_name = tn[0]
                        abbre = tn[1]
                        state = IN_TABLE
                        break
            else: # IN_TABLE
                if line.startswith(abbre):
                    # A new record begins; store the one assembled so far.
                    if record:
                        self.addRawRecord(ec, table_name, record)
                    record = self._lineText(line)
                elif line.startswith('\n') or line.startswith('///'):
                    if record:
                        self.addRawRecord(ec, table_name, record)
                    table_name = ''
                    abbre = ''
                    record = ''
                    if line.startswith('///'):
                        # Page ended without a blank line after the table.
                        ec = ''
                        state = BEGIN
                    else:
                        state = IN_PAGE
                else:
                    # Continuation line of the current record.
                    record = record + ' ' + self._lineText(line)

        # The file may end inside a table: do not lose the record in progress.
        if state == IN_TABLE and record:
            self.addRawRecord(ec, table_name, record)

    def handleRecords(self, ):
        """
        Process records using Regular Expressions

        Every raw record of a table listed in `RE_patterns` is matched from its
        start; on success the tuple of groups is stored as a handled record.
        Unmatched SYNONYMS records are kept in a 5-tuple with the text in
        position 1 and, when the record has the form 'text <refs>', the refs in
        position 4. Unmatched records of other tables are dropped.
        """
        compiled = {name: re.compile(pattern) for name, pattern in self.RE_patterns.items()}
        for ec in self._pages.keys():
            for table_name, regex in compiled.items():
                for record in self.getRawRecords(ec, table_name):
                    re_result = regex.match(record)
                    if re_result:
                        self.addHandledRecord(ec, table_name, re_result.groups())
                    elif table_name == 'SYNONYMS':
                        if record.endswith('>') and len(record.split('<')) == 2:
                            content, ref = record.split('<')
                            self.addHandledRecord(ec, table_name, (None, content, None, None, ref[:-1]))
                        else:
                            self.addHandledRecord(ec, table_name, (None, record, None, None, None))
                    # Debug hook: uncomment to collect unmatched records in a
                    # notebook-level ERROR_RECORDS list.
                    # else:
                    #     ERROR_RECORDS.append((ec, table_name, record))


## draft

In [None]:
# Collector for records that no pattern matched. It stays empty unless the
# commented-out `else` branch at the end of brenda_parser.handleRecords is enabled.
ERROR_RECORDS = []
bp = brenda_parser(txtfile = paths["file"], namefile = paths["NAME"])

In [167]:
# Print each entry of ERROR_RECORDS on its own line.
for records in ERROR_RECORDS:
    print(records)