In [1]:
# !pip install PyPDF2
from PyPDF2 import PdfFileReader
import re
import pandas as pd

In [2]:
def extract_information(pdf_path):
    """Extract the sections of the IFRS 15 standard from its PDF.

    The page whose text starts with "CONTENT" (presumably the contents page)
    supplies the section headings and the paragraph number listed next to
    each of them. The text of every page from the standard's title page up to
    the page starting with "Appendix A" is merged, stripped of page footers,
    and then split on those headings.

    Args:
        pdf_path: Path of the IFRS 15 PDF file.

    Returns:
        A list of dicts in the order the headings were found, each holding
        "Paragraph" (int) and "Tag" (the heading). The key "data" (the section
        text) is added only when non-empty text was found between the heading
        and the following one, so callers must check for its presence.

    Raises:
        ValueError: if a page starting with "CONTENT" does not contain both
            the "OBJECTIVE" and "APPENDICES" markers.

    Note:
        The original author recorded that the entries for paragraphs 91, 95,
        113, 123 and 127 come back without "data".
    """
    content_start = "OBJECTIVE"
    content_end = "APPENDICES"
    content_head_end = "Appendix A"
    # Words of the title may be separated by any whitespace in the extracted text.
    content_head_start = (
        "INTERNATIONAL FINANCIAL REPORTING STANDARD 15 REVENUE FROM CONTRACTS WITH CUSTOMERS"
        .replace(" ", r"(\s|\n)")
    )
    # "<heading> <paragraph number>" pairs as they appear on the contents page.
    heading_pattern = r"(.*?)\s([1-9][0-9][0-9]|[1-9][0-9]|[0-9]\s)"
    # Page footers; compiled once with IGNORECASE so detection and removal
    # always use the same flags.
    footer_copyright = re.compile(r"(A\d+ © IFRS Foundation|© IFRS Foundation A\d+)", re.I)
    footer_page_num = re.compile(r"(\d+\n)*(\d+IFRS 15\s\n)$", re.I)
    # Newlines are replaced by this marker so that "." in the section regex
    # can span what used to be several lines.
    separator = "$$$"

    ifrs_data = []
    merged_content = ""
    in_body = False

    with open(pdf_path, 'rb') as f:
        pdf = PdfFileReader(f)
        for page in pdf.pages:
            data = (page.extract_text() + "\n\n").lstrip()

            if data.startswith("CONTENT"):
                heads = (
                    data[data.index(content_start): data.index(content_end)]
                    .replace("\n", " ")
                    .replace("  ", " ")
                )
                for head, para_num in re.findall(heading_pattern, heads):
                    head = head.strip()
                    # All-caps headings are normalised to "Sentence case".
                    if head.isupper():
                        head = head.capitalize()
                    ifrs_data.append({
                        "Paragraph": int(para_num.strip()),
                        "Tag": head
                    })

            if re.match(content_head_end, data, re.I):
                # Stop collecting and leave an end marker for the last section.
                in_body = False
                merged_content += "\n" + content_head_end
            elif re.match(content_head_start, data, re.I):
                # Title page of the standard: (re)start collecting from here.
                in_body = True
                merged_content = ""

            if in_body:
                data = footer_copyright.sub("", data)
                data = footer_page_num.sub("", data)
                merged_content += data

    merged_content = merged_content.replace("\n", separator)
    sep = re.escape(separator)

    for i, entry in enumerate(ifrs_data):
        # A section ends at the next heading, or at "Appendix A" for the last one.
        if i + 1 < len(ifrs_data):
            next_heading = ifrs_data[i + 1]["Tag"]
        else:
            next_heading = content_head_end
        # Headings are escaped so regex metacharacters in them match literally.
        section_pattern = (
            sep + re.escape(entry["Tag"]) + sep
            + "(.*?)"
            + sep + re.escape(next_heading) + sep
        )
        match = re.search(section_pattern, merged_content)
        if match and match.group(1).strip():
            entry["data"] = (
                match.group(1)
                .replace(separator, " ")
                .strip()
                .replace("  ", " ")
                .replace(" .", ".")
                .replace(" -", " - ")
            )

    return ifrs_data

In [3]:
# Relative path of the IFRS 15 PDF to parse.
file = "..//data//IFRS15.pdf"
# List of dicts with "Paragraph" and "Tag"; "data" is present only where section text was found.
ifrs_data = extract_information(file)

In [4]:
# Display the extracted sections for a visual check.
ifrs_data

[{'Paragraph': 1,
  'Tag': 'Objective',
  'data': 'The objective of this Standard is to establish the principles that an entity shall apply to report useful information to users of financial statements about the nature, amount, timing and uncertainty of revenue and cash flows arising from a contract with a customer.'},
 {'Paragraph': 2,
  'Tag': 'Meeting the objective',
  'data': 'To meet the objective in paragraph 1, the core principle of this Standard is that an entity shall recognise revenue to depict the transfer of promised goods or services to customers in an amount that reflects the consideration to which the entity expects to be entitled in exchange for those goods or services. An entity shall consider the terms of the contract and all relevant facts and circumstances when applying this Standard. An entity shall apply this Standard, including the use of any practical expedients, consistently to contracts with similar characteristics and in similar circumstances. This Standard s

In [5]:
# Keep only the entries for which section text was extracted (those with a "data" key).
ifrs_data = list(filter(lambda entry: "data" in entry, ifrs_data))

In [6]:
# Headings of the remaining sections; reused further down as zero-shot class names.
tags = [entry["Tag"] for entry in ifrs_data]
tags

['Objective',
 'Meeting the objective',
 'Scope',
 'Identifying the contract',
 'Combination of contracts',
 'Contract modifications',
 'Identifying performance obligations',
 'Satisfaction of performance obligations',
 'Measurement',
 'Changes in the transaction price',
 'Amortisation and impairment',
 'Presentation',
 'Disclosure']

## Zero Shot Learning

In [7]:
from flair.models import TARSClassifier
from flair.data import Sentence

In [8]:
# Load the pre-trained English TARS model ('tars-base') used below for zero-shot prediction.
tars = TARSClassifier.load('tars-base')

2022-12-06 19:28:03,446 loading file C:\Users\UIF13879\.flair\models\tars-base-v8.pt


In [9]:
# Candidate labels for zero-shot prediction: the section headings collected in `tags`.
classes = tags

for senten in ifrs_data:
    # Wrap the section text in a flair Sentence so the model can label it.
    sentence = Sentence(senten["data"])

    # Score the sentence against the candidate labels; predictions are attached to `sentence`.
    tars.predict_zero_shot(sentence, classes)

    # Show the true heading, then the sentence together with its predicted labels.
    print(senten["Tag"], sentence, sep="\n")

Objective
Sentence: "The objective of this Standard is to establish the principles that an entity shall apply to report useful information to users of financial statements about the nature , amount , timing and uncertainty of revenue and cash flows arising from a contract with a customer ." → Objective (0.5696)
Meeting the objective
Sentence: "To meet the objective in paragraph 1 , the core principle of this Standard is that an entity shall recognise revenue to depict the transfer of promised goods or services to customers in an amount that reflects the consideration to which the entity expects to be entitled in exchange for those goods or services . An entity shall consider the terms of the contract and all relevant facts and circumstances when applying this Standard . An entity shall apply this Standard , including the use of any practical expedients , consistently to contracts with similar characteristics and in similar circumstances . This Standard specifies the accounting for an i

Combination of contracts
Sentence: "An entity shall combine two or more contracts entered into at or near the same time with the same customer ( or related parties of the customer ) and account for the contracts as a single contract if one or more of the following criteria are met : ( a ) the contracts are negotiated as a package with a single commercial objective ; ( b ) the amount of consideration to be paid in one contract depends on the price or performance of the other contract ; or ( c ) the goods or services promised in the contracts ( or some goods or services promised in each of the contracts ) are a single performance obligation in accordance with paragraphs 22 – 30 ."
Contract modifications
Sentence: "A contract modification is a change in the scope or price ( or both ) of a contract that is approved by the parties to the contract . In some industries and jurisdictions , a contract modification may be described as a change order , a variation or an amendment . A contract mod

Satisfaction of performance obligations
Sentence: "An entity shall recognise revenue when ( or as ) the entity satisfies a performance obligation by transferring a promised good or service ( ie an asset ) to a customer . An asset is transferred when ( or as ) the customer obtains control of that asset . For each performance obligation identified in accordance with paragraphs 22 – 30 , an entity shall determine at contract inception whether it satisfies the performance obligation over time ( in accordance with paragraphs 35 – 37 ) or satisfies the performance obligation at a point in time ( in accordance with paragraph 38 ) . If an entity does not satisfy a performance obligation over time , the performance obligation is satisfied at a point in time . Goods and services are assets , even if only momentarily , when they are received and used ( as in the case of many services ) . Control of an asset refers to the ability to direct the use of , and obtain substantially all of the remaining

Measurement
Sentence: "When ( or as ) a performance obligation is satisfied , an entity shall recognise as revenue the amount of the transaction price ( which excludes estimates of variable consideration that are constrained in accordance with paragraphs 56 – 58 ) that is allocated to that performance obligation ."
Changes in the transaction price
Sentence: "After contract inception , the transaction price can change for various reasons , including the resolution of uncertain events or other changes in circumstances that change the amount of consideration to which an entity expects to be entitled in exchange for the promised goods or services . An entity shall allocate to the performance obligations in the contract any subsequent changes in the transaction price on the same basis as at contract inception . Consequently , an entity shall not reallocate the transaction price to reflect changes in stand-alone selling prices after contract inception . Amounts allocated to a satisfied perfo

In [10]:
# Relative paths of the bulletin dataset CSV files.
# NOTE(review): `test_filename` is not used in the cells visible here — confirm it is still needed.
filename = "../data/bulletin-dataset-tm-fsl.csv"
test_filename = "../data/test-bulletin-dataset-tm-fsl.csv"

In [11]:
# Load the bulletin dataset and preview its first rows.
df = pd.read_csv(filename)
df.head()

Unnamed: 0,label,text,category
0,0,The Associate offices ordinarily do not issue ...,legislation
1,0,The taxpayer is strongly encouraged to inform ...,legislation
2,0,"When filing the request, the taxpayer must ide...",legislation
3,0,The taxpayer also must notify the Associate of...,legislation
4,0,The question must be on the interpretation and...,legislation


In [16]:
# Candidate labels for zero-shot prediction on the bulletin dataset.
classes = ["legislation", "compensation", "corporation"]

for idx, senten in df.iterrows():
    # Wrap the row's text in a flair Sentence so the model can label it.
    sentence = Sentence(senten['text'])

    # Score the sentence against the candidate labels; predictions are attached to `sentence`.
    tars.predict_zero_shot(sentence, classes)

    # Show the true category, then the sentence together with its predicted labels.
    print(senten["category"], sentence, sep="\n")

legislation
Sentence: "The Associate offices ordinarily do not issue letter rulings on a matter involving the Federal tax consequences of any proposed Federal , state , local , municipal , or foreign legislation ." → legislation (0.9704); corporation (0.9409)
legislation
Sentence: "The taxpayer is strongly encouraged to inform the Service about , and discuss the implications of , any authority believed to be contrary to the position advanced , such as legislation , tax treaties , court decisions , regulations , notices , revenue rulings , revenue procedures , or announcements ." → legislation (0.8707); corporation (0.5805)
legislation
Sentence: "When filing the request , the taxpayer must identify any pending legislation that may affect the proposed transaction ." → legislation (0.9778); corporation (0.7754)
legislation
Sentence: "The taxpayer also must notify the Associate office if any such legislation is introduced after the request is filed but before a change in method of accounti

## In the output above, each printed `Sentence` is followed by its predicted labels, each with a confidence score between 0 and 1 (e.g. `→ legislation (0.9704)`).

Reference:
@inproceedings{halder2020coling,
  title={Task Aware Representation of Sentences for Generic Text Classification},
  author={Halder, Kishaloy and Akbik, Alan and Krapac, Josip and Vollgraf, Roland},
  booktitle = {{COLING} 2020, 28th International Conference on Computational Linguistics},
  year      = {2020}
}