In [1]:
with open('data/airbus_report.xhtml') as f:
    report = f.read()

report[:1000]

'<?xml version="1.0" encoding="UTF-8"?><html xmlns="http://www.w3.org/1999/xhtml" xmlns:link="http://www.xbrl.org/2003/linkbase" xmlns:xbrli="http://www.xbrl.org/2003/instance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xbrldi="http://xbrl.org/2006/xbrldi" xmlns:iso4217="http://www.xbrl.org/2003/iso4217" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:utr="http://www.xbrl.org/2009/utr" xmlns:num="http://www.xbrl.org/dtr/type/numeric" xmlns:nonnum="http://www.xbrl.org/dtr/type/non-numeric" xmlns:xbrldt="http://xbrl.org/2005/xbrldt" xmlns:ref="http://www.xbrl.org/2006/ref" xmlns:ix="http://www.xbrl.org/2013/inlineXBRL" xmlns:ixt="http://www.xbrl.org/inlineXBRL/transformation/2020-02-12" xmlns:airbus="http://www.airbus.com" xml:lang="en" xmlns:esef_cor="https://www.esma.europa.eu/taxonomy/2022-03-24/esef_cor" xmlns:ifrs-full="https://xbrl.ifrs.org/taxonomy/2022-03-24/ifrs-full"><head><title></title><style type="text/css">#text

In [14]:
from html.parser import HTMLParser
from collections import Counter, defaultdict

class ReportParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.tags = defaultdict(list)
        self.xbrlTags = []
        self.contexts = []
        self.context = []
        self.datas = []
        self.last_idx = -1

    def get_current_context(self):
        def process(item):
            res = item['tag']
            if 'class' in item.keys():
                res += f' {item['class']}'
            return res
        
        # return " >> ".join([process(item) for item in self.context])
        start_idx = self.last_idx + 1
        self.last_idx = len(self.datas) - 1
        return " ".join(self.datas[start_idx:]).strip() + f' >> {self.xbrlTags[-1]}'
        
    def handle_starttag(self, tag, attr):
        attr = dict(attr)
        curr = {
            'tag': tag, 
        }
        
        if tag not in ['html', 'body']:
            curr['attr'] = attr
            curr['class'] = attr.get('class', '')
        
        self.context.append(curr)
        
        if tag.startswith('ix') or tag.startswith('xbrl'):
            self.tags[tag].append(attr)
            if tag.startswith('ix'):
                xbrlTag = attr.get('name', '')
                self.xbrlTags.append(xbrlTag)
            
    def handle_endtag(self, tag):
        self.context.pop(-1)
        if tag.startswith('ix') and self.xbrlTags[-1] != '':
            cxt = self.get_current_context()
            self.contexts.append(cxt)
        
    def handle_data(self, data):
        if self.context[-1]['tag'] not in ['style'] and not self.context[-1]['tag'].startswith('ix') and not self.context[-1]['tag'].startswith('xbrl'):
            if 'data' in self.context[-1].keys():
                self.context[-1]['data'] += data
            else:
                self.context[-1]['data'] = data
        
        if self.context[-1]['tag'] not in ['style'] and not self.context[-1]['tag'].startswith('ix') and not self.context[-1]['tag'].startswith('xbrl'):
            if self.context[-1]['tag'] == 'span':
                self.datas.append(data)

parser = ReportParser()
parser.feed(report)
empty_count, big_count = 0, 0
for context in parser.contexts:
    if len(context) <= 1:
        empty_count += 1
    elif len(context) <= 1000:
        print(context)
        print('-------------')
    else:
        big_count += 1
# print(f"Empty contexts: {empty_count}\tLong contexts: {big_count}")

 >> ifrs-full:Revenue
-------------
Cost of sales  ( >> ifrs-full:CostOfSales
-------------
)  ( >> ifrs-full:CostOfSales
-------------
)  Gross margin  12 >> ifrs-full:GrossProfit
-------------
 >> ifrs-full:GrossProfit
-------------
Selling expenses  ( >> ifrs-full:SellingExpense
-------------
)  ( >> ifrs-full:SellingExpense
-------------
)  Administrative expenses  ( >> ifrs-full:AdministrativeExpense
-------------
)  ( >> ifrs-full:AdministrativeExpense
-------------
)  Research and development expenses  13  ( >> ifrs-full:ResearchAndDevelopmentExpense
-------------
)  ( >> ifrs-full:ResearchAndDevelopmentExpense
-------------
)  Other income  14 >> ifrs-full:OtherIncome
-------------
 >> ifrs-full:OtherIncome
-------------
Other expenses  14  ( >> ifrs-full:OtherExpenseByFunction
-------------
)  ( >> ifrs-full:OtherExpenseByFunction
-------------
)  Share of profit from investments accounted for under the equity method  15 >> ifrs-full:ShareOfProfitLossOfAssociatesAndJointVentur