In [26]:
!pip install -Uqq docling huggingface_hub
!pip install -Uqq transformers

In [27]:
from docling.document_converter import DocumentConverter
from IPython.display import display, HTML
import os
import sys
import logging

import io


class DevNull(io.TextIOBase):
    """Write-only text stream that discards everything written to it.

    Built on ``io.TextIOBase`` so that it also answers the usual stream
    queries (``isatty()``, ``writable()``, ``closed``, ...) instead of raising
    ``AttributeError`` when it is used as a stand-in for a real text stream.
    """

    def writable(self):
        """Report that the stream accepts writes."""
        return True

    def write(self, msg):
        """Discard ``msg`` and return the number of characters "written".

        Never raises: objects without a length are counted as 0 characters.
        """
        try:
            return len(msg)
        except TypeError:
            return 0

    def flush(self):
        """Nothing is buffered, so flushing is a no-op."""
        pass

# Quiet the notebook output for the rest of the session:
#   * everything written to stderr is routed to the discard-only DevNull stream
#   * every logging call at CRITICAL severity or below is suppressed
# NOTE(review): this also hides genuine warnings and errors from the libraries
# used in later cells.
sys.stderr = DevNull()
logging.disable(level=logging.CRITICAL)

In [28]:
import os

# Path of the PDF to analyse.
pdf_path = "/content/sample_1.pdf"

# Fail fast with a clear message instead of handing a missing path to the
# converter (stderr and logging are silenced earlier in this notebook, so a
# failure further down would be much harder to diagnose).
if not os.path.isfile(pdf_path):
    raise FileNotFoundError(f"File not found at: {pdf_path}")
print(f"PDF file found: {pdf_path}")

# Convert the PDF; `doc` is the converted document used by the later cells.
converter = DocumentConverter()
result = converter.convert(pdf_path)
doc = result.document

PDF file found: /content/sample_1.pdf


In [29]:
import os, re
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from huggingface_hub import InferenceClient

# Keep an HF_TOKEN that is already present in the environment; only fall back
# to an empty value when none is set (unconditionally assigning "" would wipe
# a real token). Never paste a token literal into the notebook.
os.environ.setdefault("HF_TOKEN", "")
# An empty token is passed as None rather than as an empty API key.
# NOTE(review): presumably the client then uses its default credential lookup
# -- confirm against the huggingface_hub documentation.
hf_token = os.environ["HF_TOKEN"] or None

# Local FinBERT classifier, plus a hosted-inference client used as a fallback.
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
client = InferenceClient(provider="auto", api_key=hf_token)

# Export the converted document to markdown and split it after sentence-ending
# punctuation. The split runs over the whole text, so a chunk can span a
# paragraph break when the preceding text has no terminal punctuation.
structured_text = doc.export_to_markdown()
sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', structured_text) if s.strip()]

# One (sentence, label, score) tuple per sentence, in document order.
results = []
for sentence in sentences:
    try:
        output = pipe(sentence, truncation=True, max_length=512)[0]
        results.append((sentence, output["label"], round(output["score"], 2)))
    except Exception:
        # Local inference failed: retry through the hosted endpoint.
        try:
            out = client.text_classification(sentence, model="ProsusAI/finbert")[0]
            results.append((sentence, out["label"], round(out["score"], 2)))
        except Exception:
            # Best effort: record a placeholder so every sentence keeps an entry.
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
            # still stop the loop.
            results.append((sentence, "error", 0.0))

In [31]:
import html

# Render the document as HTML with each scored sentence highlighted by its
# FinBERT label, and save the page to disk.
try:
    sentiment_data = results
except NameError:
    raise RuntimeError("'results' variable not found. Run Task 2 (FinBERT analysis) first.")

# Background colour per sentiment label.
color_map = {
    "positive": "#b6fcb6",
    "negative": "#f5a3a3",
    "neutral": "#fff7b2",
}
# Highlight used when a sentence has no entry in `sentiment_data`.
FALLBACK_COLOR = "#fff7b2"
# Fragments with this many words or fewer are treated as headings/labels.
MAX_PLAIN_WORDS = 6
SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')

# Look labels up by sentence text instead of by running position: this cell
# segments the text per paragraph and leaves headings/short fragments
# unhighlighted, so positions here do not line up with the order of
# `sentiment_data` and a positional index would colour the wrong sentences.
label_by_sentence = {}
for scored_text, label, _score in sentiment_data:
    label_by_sentence.setdefault(scored_text.strip(), label)
# A scored chunk may span a paragraph break (e.g. a heading followed by the
# first sentence of the next paragraph); register each paragraph-level piece
# too, without overriding an exact match recorded above.
for scored_text, label, _score in sentiment_data:
    for piece in scored_text.split("\n\n"):
        piece = piece.strip()
        if piece:
            label_by_sentence.setdefault(piece, label)

paragraphs = [p.strip() for p in structured_text.split("\n\n") if p.strip()]

html_content = """
<h2 style='font-family:Arial;'>FinBERT Sentiment Highlight Visualization</h2>
<div style='font-family:Arial; line-height:1.6;'>
"""

for para in paragraphs:
    fragments = []

    for s in SENTENCE_BOUNDARY.split(para):
        stripped = s.strip()
        if not stripped:
            continue

        # The text comes from the PDF, so escape it before embedding it in HTML.
        safe_text = html.escape(stripped)

        # Headings, all-caps lines and very short fragments stay unhighlighted.
        if stripped.startswith("##") or stripped.isupper() or len(stripped.split()) <= MAX_PLAIN_WORDS:
            fragments.append(f"{safe_text} ")
            continue

        label = label_by_sentence.get(stripped)
        if label is None:
            color = FALLBACK_COLOR
        else:
            # Labels outside the colour map (e.g. "error") get a white background.
            color = color_map.get(label.lower(), "#ffffff")
        fragments.append(
            f"<span style='background-color:{color}; padding:2px 4px; border-radius:4px;'>{safe_text} </span>"
        )

    paragraph_html = "".join(fragments)
    html_content += f"<div style='margin-bottom:12px;'>{paragraph_html}</div>"

html_content += "</div>"

output_path = "/content/visualization.html"
with open(output_path, "w", encoding="utf-8") as f:
    # Declare the encoding so browsers decode non-ASCII text in the file correctly.
    f.write(f"<html><head><meta charset='utf-8'></head><body>{html_content}</body></html>")

print(f"Paragraph-colored HTML saved to: {output_path}")


Paragraph-colored HTML saved to: /content/visualization.html
