In [12]:
!pip install langextract

Collecting langextract
  Downloading langextract-1.0.8-py3-none-any.whl.metadata (18 kB)
Collecting async_timeout>=4.0.0 (from langextract)
  Downloading async_timeout-5.0.1-py3-none-any.whl.metadata (5.1 kB)
Collecting exceptiongroup>=1.1.0 (from langextract)
  Downloading exceptiongroup-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Collecting ml-collections>=0.1.0 (from langextract)
  Downloading ml_collections-1.1.0-py3-none-any.whl.metadata (22 kB)
Downloading langextract-1.0.8-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.0/85.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading async_timeout-5.0.1-py3-none-any.whl (6.2 kB)
Downloading exceptiongroup-1.3.0-py3-none-any.whl (16 kB)
Downloading ml_collections-1.1.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.7/76.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ml-collections, exceptiongroup, async_ti

In [13]:
import langextract as lx
import textwrap

# 1. Task instructions sent to the model together with the examples below.
prompt = "\n".join([
    "Extract key finance concepts, decisions, and organizational roles in order of appearance.",
    "Use exact text from the source for extractions. Do not paraphrase or overlap entities.",
    "Provide meaningful attributes for each entity to add context.",
])

# 2. Few-shot examples, kept as plain data so they are easy to scan and extend.
#    Each entry is (source text, [(extraction class, exact span, attributes), ...]).
_example_specs = [
    (
        "INTRODUCTION TO FINANCE: In general, finance is defined as the provision of money at the time it is required.",
        [
            (
                "concept",
                "finance",
                {"definition": "provision of money at the time it is required"},
            ),
            (
                "definition",
                "the provision of money at the time it is required",
                {"context": "general meaning of finance"},
            ),
        ],
    ),
    (
        "The Traditional Phase (Up to 1940) - finance function was episodic in nature.",
        [
            (
                "phase",
                "The Traditional Phase (Up to 1940)",
                {"focus": "operational activities", "characteristic": "episodic finance function"},
            ),
        ],
    ),
    (
        "The key issues in capital structure decision are: what should be the optimal debt-equity ratio?",
        [
            (
                "financial_decision",
                "capital structure decision",
                {"key_issue": "optimal debt-equity ratio"},
            ),
        ],
    ),
    (
        "Chief Finance Officer supervises the work of treasurer and controller.",
        [
            (
                "role",
                "Chief Finance Officer",
                {"responsibility": "supervises treasurer and controller"},
            ),
            (
                "role",
                "treasurer",
                {"responsibility": "obtaining finance, banking, cash management, credit admin"},
            ),
            (
                "role",
                "controller",
                {"responsibility": "financial accounting, auditing, taxation, management accounting"},
            ),
        ],
    ),
]

# Turn the plain specs into the langextract objects passed to lx.extract.
examples = [
    lx.data.ExampleData(
        text=source_text,
        extractions=[
            lx.data.Extraction(
                extraction_class=entity_class,
                extraction_text=span,
                attributes=attrs,
            )
            for entity_class, span, attrs in entity_specs
        ],
    )
    for source_text, entity_specs in _example_specs
]

In [14]:
# Sample sentence to run the extraction on.
input_text = "In general, finance is defined as the provision of money at time when it is required"

# Gather the call arguments first so the extraction settings read as one table,
# then run the extraction with the prompt and examples defined above.
extract_args = {
    "text_or_documents": input_text,
    "prompt_description": prompt,
    "examples": examples,
    "model_id": "gemini-2.5-flash",
}
result = lx.extract(**extract_args)

DEBUG:absl:Registered GeminiLanguageModel with patterns ['^gemini'] at priority 10
DEBUG:absl:Registered OllamaLanguageModel with patterns ['^gemma', '^llama', '^mistral', '^mixtral', '^phi', '^qwen', '^deepseek', '^command-r', '^starcoder', '^codellama', '^codegemma', '^tinyllama', '^wizardcoder', '^gpt-oss', '^meta-llama/[Ll]lama', '^google/gemma', '^mistralai/[Mm]istral', '^mistralai/[Mm]ixtral', '^microsoft/phi', '^Qwen/', '^deepseek-ai/', '^bigcode/starcoder', '^codellama/', '^TinyLlama/', '^WizardLM/'] at priority 10
DEBUG:absl:Registered OpenAILanguageModel with patterns ['^gpt-4', '^gpt4\\.', '^gpt-5', '^gpt5\\.'] at priority 10
2025-08-30 18:11:15,590 - langextract.debug - DEBUG - [langextract.inference] CALL: BaseLanguageModel.__init__(self=<GeminiLanguageModel>, constraint=Constraint(co...NONE: 'none'>), kwargs={})
2025-08-30 18:11:15,591 - langextract.debug - DEBUG - [langextract.inference] RETURN: BaseLanguageModel.__init__ -> None (0.0 ms)
2025-08-30 18:11:15,594 - langex

[92m✓[0m Extraction processing complete



INFO:absl:Finalizing annotation for document ID doc_1ea0400d.
INFO:absl:Document annotation completed.


[92m✓[0m Extracted [1m2[0m entities ([1m2[0m unique types)
  [96m•[0m Time: [1m2.12s[0m
  [96m•[0m Speed: [1m40[0m chars/sec
  [96m•[0m Chunks: [1m1[0m


In [15]:
# Persist the annotated document as JSONL in the current working directory.
lx.io.save_annotated_documents([result], output_name="extraction_results.jsonl", output_dir=".")

# Build the HTML visualization from the file that was just written.
html_content = lx.visualize("extraction_results.jsonl")

# In Jupyter/Colab the returned object carries the markup in `.data`;
# otherwise the return value is written as-is.
html_text = html_content.data if hasattr(html_content, 'data') else html_content

# Write with an explicit UTF-8 encoding: the platform default encoding is
# locale-dependent and can fail (or produce mojibake) on non-ASCII characters
# in the generated HTML.
with open("visualization.html", "w", encoding="utf-8") as f:
    f.write(html_text)

[94m[1mLangExtract[0m: Saving to [92mextraction_results.jsonl[0m: 1 docs [00:00, 1213.98 docs/s]

[92m✓[0m Saved [1m1[0m documents to [92mextraction_results.jsonl[0m



[94m[1mLangExtract[0m: Loading [92mextraction_results.jsonl[0m: 100%|██████████| 754/754 [00:00<00:00, 1.66MB/s]

[92m✓[0m Loaded [1m1[0m documents from [92mextraction_results.jsonl[0m





In [16]:
!pip install docling

Collecting docling
  Downloading docling-2.48.0-py3-none-any.whl.metadata (10 kB)
Collecting docling-core<3.0.0,>=2.42.0 (from docling-core[chunking]<3.0.0,>=2.42.0->docling)
  Downloading docling_core-2.45.0-py3-none-any.whl.metadata (6.5 kB)
Collecting docling-parse<5.0.0,>=4.2.2 (from docling)
  Downloading docling_parse-4.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.5 kB)
Collecting docling-ibm-models<4,>=3.9.0 (from docling)
  Downloading docling_ibm_models-3.9.0-py3-none-any.whl.metadata (6.7 kB)
Collecting pypdfium2!=4.30.1,<5.0.0,>=4.30.0 (from docling)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting easyocr<2.0,>=1.7 (from docling)
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting rtree<2.0.0,>=1.3.0 (from docling)
  Downloading rtree-1.4.1-

In [17]:
from docling.document_converter import DocumentConverter

# Document to convert, given as a local path or URL.
source = "/content/task2.pdf.pdf"

# Convert the document with docling.
# NOTE: `result` is rebound here; it no longer refers to the value returned by
# the earlier lx.extract call.
converter = DocumentConverter()
result = converter.convert(source)

# Print the converted document as Markdown.
markdown_text = result.document.export_to_markdown()
print(markdown_text)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## INTRODUCTION TO FINANCE

In general, finance is defined as the provision of money at the tim It is required.

Specifically  it is defined as procurement of funds and their effective utilisation.

Financial management is defined as the management of flow of funds in a firm.

All business decisions have financial implications and therefore financial management is inevitably related with every aspect of business operations.

## EVOLUTION OF FINANCE

It may be divided into three broad categories, i.e., traditional phase, transitional phase and modern phase.

## 1.The Traditional Phase (Up to 1940)

Initially finance was a part of economic activities and business owners were more concerned with the operational activities.

Characteristics of this phase were :

- finance function was episodic in nature.
- funds were arranged mainly from financial institutions or through shares/ debentures.
- the outsider's point of view was dominant

## 2.Transitional Phase (1940 - 1950)

Here though the 

In [21]:

# `result` is the docling conversion result from the cell above, not text.
# lx.extract is given text here, so pass the document's Markdown export (the
# same text that cell printed) rather than the conversion object itself.
results = lx.extract(
    text_or_documents=result.document.export_to_markdown(),
    prompt_description=prompt,
    examples=examples,
    model_id="gemini-2.5-flash",
    extraction_passes=3,    # Improves recall through multiple passes
    max_workers=20,         # Parallel processing for speed
    max_char_buffer=1000    # Smaller contexts for better accuracy
)

2025-08-30 18:24:40,050 - langextract.debug - DEBUG - [langextract.inference] CALL: BaseLanguageModel.__init__(self=<GeminiLanguageModel>, constraint=Constraint(co...NONE: 'none'>), kwargs={})
2025-08-30 18:24:40,051 - langextract.debug - DEBUG - [langextract.inference] RETURN: BaseLanguageModel.__init__ -> None (0.0 ms)
2025-08-30 18:24:40,053 - langextract.debug - DEBUG - [langextract.inference] CALL: BaseLanguageModel.apply_schema(self=<GeminiLanguageModel>, schema_instance=GeminiSchema(...xtractions']}))
2025-08-30 18:24:40,055 - langextract.debug - DEBUG - [langextract.inference] RETURN: BaseLanguageModel.apply_schema -> None (0.0 ms)
DEBUG:absl:Initialized Annotator with prompt:
Extract key finance concepts, decisions, and organizational roles in order of appearance.
Use exact text from the source for extractions. Do not paraphrase or overlap entities.
Provide meaningful attributes for each entity to add context.

Examples
Q: INTRODUCTION TO FINANCE: In general, finance is define