In [None]:
# Open Colab's browser file picker and wait for the user to choose file(s).
from google.colab import files
# NOTE(review): despite its name, df1 is not a DataFrame; it holds whatever
# files.upload() returns (elsewhere in this notebook that result is iterated
# as filename -> bytes pairs).
df1 = files.upload()

Saving valution.docx to valution.docx


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import zipfile
import xml.etree.ElementTree as ET
import json
import csv
# import argparse  # Removed argparse
from pathlib import Path

# Prefix -> namespace-URI table used for every ElementTree lookup in this cell.
# "w" (WordprocessingML) and "m" (Office Math / OMML) are the ones the helpers
# below actually resolve through qname(); the remaining prefixes are registered
# but not referenced by the code in this cell.
NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "m": "http://schemas.openxmlformats.org/officeDocument/2006/math",
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
}

def qname(ns_prefix, local):
    """Return the ElementTree-style qualified name ``{namespace-uri}local``.

    ``ns_prefix`` must be a key of the module-level ``NS`` table; an unknown
    prefix raises ``KeyError``.
    """
    namespace_uri = NS[ns_prefix]
    return "{%s}%s" % (namespace_uri, local)

def get_attr(elem, ns_prefix, local, default=None):
    """Read the namespaced attribute ``ns_prefix:local`` from ``elem``.

    Returns ``default`` when the attribute is not present.
    """
    attributes = elem.attrib
    return attributes.get(qname(ns_prefix, local), default)

def text_of(elem):
    """Flatten ``elem`` to plain text.

    Walks the subtree in document order and concatenates the text of every
    m:t / w:t element plus the ``m:val`` attribute of every m:chr element.
    Returns "" for ``None``.
    """
    if elem is None:
        return ""

    text_tags = (qname("m", "t"), qname("w", "t"))
    chr_tag = qname("m", "chr")

    pieces = []
    for node in elem.iter():
        if node.tag in text_tags:
            # m:t carries math text; w:t is a fallback for plain Word runs.
            pieces.append(node.text or "")
        elif node.tag == chr_tag:
            # m:chr keeps its single character in the m:val attribute.
            value = get_attr(node, "m", "val")
            if value:
                pieces.append(value)
    return "".join(pieces)

def render_math(elem):
    """
    Recursive, *lightweight* OMML -> linear-text renderer.

    Handles fractions, super/subscripts, radicals, n-ary operators
    (sum/integral/product), delimiters, m:oMath / m:oMathPara wrappers and
    plain runs. Anything else falls back to the concatenated text of the
    subtree (see ``text_of``).

    Parts of a construct (m:num, m:den, m:e, m:sup, m:sub, m:deg) are looked up
    as *direct* children. A descendant search would pick up the part of a
    nested construct first, e.g. the denominator of a fraction nested in the
    numerator. The argument containers themselves are rendered child by child,
    so nested constructs keep their structure instead of being flattened.

    Returns "" for ``None``.
    """
    if elem is None:
        return ""

    tag = elem.tag
    math_ns = "{" + NS["m"] + "}"
    # Local name for elements in the math namespace; None for everything else
    # (e.g. w:* elements), which ends up in the plain-text fallback.
    local = tag[len(math_ns):] if isinstance(tag, str) and tag.startswith(math_ns) else None

    def arg(name):
        """Render the direct m:<name> child of this element ('' when absent)."""
        return render_math(elem.find(f"./m:{name}", NS))

    def children():
        """Render the child elements in order, skipping property children (*Pr)."""
        return "".join(
            render_math(child) for child in elem
            if not (isinstance(child.tag, str) and child.tag.endswith("Pr"))
        )

    # ARGUMENT CONTAINERS: recurse so nested constructs are rendered structurally.
    if local in ("e", "num", "den", "sup", "sub", "deg", "fName", "lim"):
        return children() or text_of(elem)

    # FRACTION: m:f with m:num and m:den
    if local == "f":
        return f"({arg('num')})/({arg('den')})"

    # SUPERSCRIPT: m:sSup (base m:e, exponent m:sup)
    if local == "sSup":
        return f"{paren_if_needed(arg('e'))}^{{{arg('sup')}}}"

    # SUBSCRIPT: m:sSub (base m:e, index m:sub)
    if local == "sSub":
        return f"{paren_if_needed(arg('e'))}_{{{arg('sub')}}}"

    # SUB & SUP: m:sSubSup
    if local == "sSubSup":
        return f"{paren_if_needed(arg('e'))}_{{{arg('sub')}}}^{{{arg('sup')}}}"

    # RADICAL: m:rad. An m:deg element may be present but empty (plain square
    # root), so decide on the rendered degree rather than on its presence.
    if local == "rad":
        degree = arg("deg").strip()
        body = arg("e")
        if degree:
            return f"root({degree}, {body})"
        return f"sqrt({body})"

    # N-ARY (sum/integral/product...): operator from m:naryPr/m:chr@val.
    if local == "nary":
        # Per the OMML spec the operator defaults to the integral sign when
        # m:chr is omitted.
        op_chr = "∫"
        chr_el = elem.find("./m:naryPr/m:chr", NS)
        if chr_el is not None:
            op_chr = get_attr(chr_el, "m", "val", op_chr)
        lower = arg("sub")
        upper = arg("sup")
        # Emit each limit only when it has content (avoids a dangling "_{}").
        limits = (f"_{{{lower}}}" if lower else "") + (f"^{{{upper}}}" if upper else "")
        return f"{op_chr}{limits}({arg('e')})"

    # DELIMITER: m:d with optional m:dPr/{m:begChr,m:sepChr,m:endChr}@val
    if local == "d":
        beg, sep, end = "(", "|", ")"  # defaults when the property is omitted
        pr = elem.find("./m:dPr", NS)
        if pr is not None:
            b = pr.find("./m:begChr", NS)
            s = pr.find("./m:sepChr", NS)
            e = pr.find("./m:endChr", NS)
            beg = get_attr(b, "m", "val", beg) if b is not None else beg
            sep = get_attr(s, "m", "val", sep) if s is not None else sep
            end = get_attr(e, "m", "val", end) if e is not None else end
        # Several m:e arguments are separated by the separator character.
        content = sep.join(render_math(item) for item in elem.findall("./m:e", NS))
        if not content:
            content = text_of(elem)
        return f"{beg}{content}{end}"

    # OMATH PARAGRAPH: space-join the contained m:oMath equations.
    if local == "oMathPara":
        pieces = [render_math(child) for child in elem.findall(".//m:oMath", NS)]
        return " ".join(p for p in pieces if p.strip()) or text_of(elem)

    # OMATH INLINE: render children in order, fall back to raw text.
    if local == "oMath":
        rendered = children().strip()
        return rendered if rendered else text_of(elem)

    # Runs (m:r) and every construct not handled above: plain concatenated text.
    return text_of(elem)

def paren_if_needed(s):
    """Wrap ``s`` in parentheses when it contains a space or one of + - * / =.

    Empty or falsy input is returned unchanged.
    """
    loose_tokens = (" ", "+", "-", "*", "/", "=")
    if s and any(token in s for token in loose_tokens):
        return f"({s})"
    return s

def find_equations_in_part(xml_bytes):
    """Parse one XML part and return its equations in document order.

    Each m:oMathPara is reported once, as a whole. An m:oMath is reported on
    its own only when it is not nested inside an m:oMathPara, so an equation in
    a math paragraph is no longer listed twice.

    Returns a list of dicts with keys ``index_in_part`` (1-based), ``linear``
    (rendered text) and ``omml_xml`` (the serialized node).
    Raises ``xml.etree.ElementTree.ParseError`` on malformed XML.
    """
    root = ET.fromstring(xml_bytes)
    para_tag = qname("m", "oMathPara")
    math_tag = qname("m", "oMath")

    # ElementTree has no parent pointers, so remember which m:oMath nodes live
    # inside a math paragraph by identity.
    inside_paragraph = set()
    for para in root.iter(para_tag):
        inside_paragraph.update(id(node) for node in para.iter(math_tag))

    eq_nodes = [
        node for node in root.iter()
        if node.tag == para_tag
        or (node.tag == math_tag and id(node) not in inside_paragraph)
    ]

    equations = []
    for i, node in enumerate(eq_nodes, 1):
        equations.append({
            "index_in_part": i,
            "linear": render_math(node).strip(),
            "omml_xml": ET.tostring(node, encoding="unicode"),
        })
    return equations

def extract_from_docx(docx_path):
    """Collect equations from every text-bearing part of a .docx archive.

    Scans the main document, footnotes, endnotes and all header/footer parts.
    Header and footer parts are stored directly under ``word/`` (for example
    ``word/header1.xml``), so they are matched by that prefix.

    Returns the dicts produced by ``find_equations_in_part`` with an added
    ``source_part`` key, grouped by part in sorted part-name order.
    Raises ``zipfile.BadZipFile`` if ``docx_path`` is not a zip archive.
    """
    fixed_parts = {"word/document.xml", "word/footnotes.xml", "word/endnotes.xml"}
    eqs = []

    with zipfile.ZipFile(docx_path, "r") as z:
        parts_to_scan = set()
        for n in z.namelist():
            if n in fixed_parts:
                parts_to_scan.add(n)
            # "word/footer" does not match "word/footnotes.xml" (handled above).
            elif n.endswith(".xml") and n.startswith(("word/header", "word/footer")):
                parts_to_scan.add(n)

        # Every name comes from namelist(), so read() cannot miss.
        for part in sorted(parts_to_scan):
            equations = find_equations_in_part(z.read(part))
            for e in equations:
                e["source_part"] = part
            eqs.extend(equations)

    return eqs

def write_outputs(eqs, out_prefix):
    """Write the equation records to ``<out_prefix>.json`` and ``<out_prefix>.csv``.

    The JSON file holds the records as-is; the CSV has one row per record with
    the columns source_part, index_in_part, linear, omml_xml (missing keys
    become empty cells). Returns ``(json_path, csv_path)`` as strings.
    """
    columns = ["source_part", "index_in_part", "linear", "omml_xml"]
    json_target = Path(f"{out_prefix}.json")
    csv_target = Path(f"{out_prefix}.csv")

    # JSON dump of the full records
    with open(json_target, "w", encoding="utf-8") as handle:
        json.dump(eqs, handle, ensure_ascii=False, indent=2)

    # CSV with a fixed column order
    with open(csv_target, "w", encoding="utf-8", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(columns)
        out.writerows([record.get(col, "") for col in columns] for record in eqs)

    return str(json_target), str(csv_target)

def main(docx_path="/content/valution.docx", out_prefix="equations"):
    """Extract the equations of one .docx file and write them to JSON and CSV.

    Args:
        docx_path: Document to scan. Defaults to the file uploaded in the
            first cell, so a bare ``main()`` behaves as before.
        out_prefix: Output path without extension; ``.json`` and ``.csv`` are
            appended by ``write_outputs``.

    Raises:
        SystemExit: if ``docx_path`` is not an existing ``.docx`` file.
    """
    docx_path = Path(docx_path)

    # is_file() also rejects a directory that happens to end in ".docx".
    if not docx_path.is_file() or docx_path.suffix.lower() != ".docx":
        raise SystemExit("Please provide a valid .docx file.")

    equations = extract_from_docx(docx_path)
    print(f"Found {len(equations)} equation(s).")

    out_json, out_csv = write_outputs(equations, out_prefix)
    print(f"Wrote: {out_json}")
    print(f"Wrote: {out_csv}")

if __name__ == "__main__":
    main()

Found 0 equation(s).
Wrote: equations.json
Wrote: equations.csv


In [None]:
# Colab upload/download helper plus the stdlib modules used below (nothing is installed here)
from google.colab import files
import zipfile, xml.etree.ElementTree as ET, json, csv
from pathlib import Path

# Prefix -> namespace-URI table for the lookups in this cell:
# "w" is WordprocessingML, "m" is Office Math (OMML, not MathML).
# NOTE(review): this rebinds the NS defined by the earlier extractor cell.
NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "m": "http://schemas.openxmlformats.org/officeDocument/2006/math",
}

def qname(ns_prefix, local):
    """Build the ``{namespace-uri}local`` tag name for a prefix registered in ``NS``."""
    return "{%s}%s" % (NS[ns_prefix], local)

def get_attr(elem, ns_prefix, local, default=None):
    """Return the namespaced attribute ``ns_prefix:local`` of ``elem``, or ``default``."""
    attributes = elem.attrib
    return attributes.get(qname(ns_prefix, local), default)

def text_of(elem):
    """Concatenate, in document order, the text of all m:t / w:t descendants
    and the ``m:val`` of all m:chr descendants of ``elem``."""
    text_tags = (qname("m", "t"), qname("w", "t"))
    chr_tag = qname("m", "chr")
    collected = []
    for node in elem.iter():
        if node.tag in text_tags:
            collected.append(node.text or "")
        elif node.tag == chr_tag:
            value = get_attr(node, "m", "val")
            if value:
                collected.append(value)
    return "".join(collected)

def render_math(elem):
    """Render an OMML element as linear text.

    Supports fractions, super/subscripts, radicals, m:oMath / m:oMathPara and
    runs; everything else is flattened with ``text_of``. Returns "" for None.

    The parts of a construct (m:num, m:den, m:e, m:sup, m:sub, m:deg) are
    looked up as direct children, because a descendant search would return the
    part of a nested construct first. The parts themselves are rendered child
    by child, so nested fractions/scripts keep their structure.
    """
    if elem is None:
        return ""
    tag = elem.tag

    def arg(name):
        """Render the direct m:<name> child ('' when absent)."""
        return render_math(elem.find(f"./m:{name}", NS))

    def children():
        """Render child elements in order, skipping property children (*Pr)."""
        return "".join(
            render_math(c) for c in elem
            if not (isinstance(c.tag, str) and c.tag.endswith("Pr"))
        )

    containers = {qname("m", n) for n in ("e", "num", "den", "sup", "sub", "deg")}
    if tag in containers:  # argument containers: recurse instead of flattening
        return children() or text_of(elem)
    if tag == qname("m","f"):  # fraction
        return f"({arg('num')})/({arg('den')})"
    if tag == qname("m","sSup"):  # superscript
        return f"{arg('e')}^{{{arg('sup')}}}"
    if tag == qname("m","sSub"):  # subscript
        return f"{arg('e')}_{{{arg('sub')}}}"
    if tag == qname("m","rad"):  # radical; an empty degree means square root
        degree = arg("deg").strip()
        if degree:
            return f"root({degree}, {arg('e')})"
        return f"sqrt({arg('e')})"
    if tag == qname("m","oMathPara"):
        return " ".join(render_math(c) for c in elem.findall(".//m:oMath", NS))
    if tag == qname("m","oMath"):
        return children() or text_of(elem)
    # m:r and every construct not handled above
    return text_of(elem)

def find_equations(xml_bytes):
    """Parse one XML part and return its equations in document order.

    Each m:oMathPara is reported once; an m:oMath is reported separately only
    when it is not nested inside an m:oMathPara, so equations in math
    paragraphs are not listed twice.

    Returns a list of dicts with keys ``index`` (1-based), ``linear`` and
    ``omml_xml``.
    """
    root = ET.fromstring(xml_bytes)
    para_tag = qname("m", "oMathPara")
    math_tag = qname("m", "oMath")

    # No parent pointers in ElementTree: track nested m:oMath nodes by identity.
    nested = set()
    for para in root.iter(para_tag):
        nested.update(id(node) for node in para.iter(math_tag))

    eq_nodes = [
        node for node in root.iter()
        if node.tag == para_tag or (node.tag == math_tag and id(node) not in nested)
    ]
    eqs = []
    for i, node in enumerate(eq_nodes, 1):
        eqs.append({
            "index": i,
            "linear": render_math(node).strip(),
            "omml_xml": ET.tostring(node, encoding="unicode")
        })
    return eqs

def extract_from_docx(docx_path):
    """Return the equations found in the main document, footnotes and endnotes
    of the .docx at ``docx_path`` (parts missing from the archive are skipped)."""
    candidate_parts = ("word/document.xml", "word/footnotes.xml", "word/endnotes.xml")
    collected = []
    with zipfile.ZipFile(docx_path, "r") as archive:
        for part_name in candidate_parts:
            if part_name not in archive.namelist():
                continue
            collected.extend(find_equations(archive.read(part_name)))
    return collected

# --- Upload your DOCX (the first uploaded file is the one processed) ---
uploaded = files.upload()
docx_path = list(uploaded)[0]

# --- Extract equations ---
equations = extract_from_docx(docx_path)
print(f"Found {len(equations)} equations")

# --- Save results as JSON (full records) and CSV (one row per equation) ---
out_json, out_csv = "formulas.json", "formulas.csv"

with open(out_json, "w", encoding="utf-8") as f:
    json.dump(equations, f, ensure_ascii=False, indent=2)

with open(out_csv, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["index", "linear", "omml_xml"])
    writer.writerows([eq["index"], eq["linear"], eq["omml_xml"]] for eq in equations)

print("Files saved:", out_json, out_csv)

# --- Download to your machine ---
for saved_file in (out_json, out_csv):
    files.download(saved_file)


Saving valution.docx to valution (1).docx
Found 0 equations
Files saved: formulas.json formulas.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!pip -q install pymupdf lxml tqdm

import re, os, json, csv, io, zipfile
from pathlib import Path
from typing import List, Tuple, Dict, Any
from tqdm import tqdm

# PDF
import fitz  # PyMuPDF

# PPTX XML
from zipfile import ZipFile
from lxml import etree

# Colab helpers: probe for the google.colab package and record the result in
# IN_COLAB. Any failure of the import (not only ImportError) is treated as
# "not running in Colab"; in that case `files` is not bound by this cell.
try:
    from google.colab import files
    IN_COLAB = True
except Exception:
    IN_COLAB = False

print("Ready ✅ (PyMuPDF, lxml, tqdm)")


Ready ✅ (PyMuPDF, lxml, tqdm)


In [None]:
# Characters counted as "math-like" by equation_score's symbol density. Note
# that ordinary punctuation such as ( ) [ ] : ; - is part of the set.
MATH_SYMBOLS = set(list("=±×÷≈≃∝≡≠≤≥→←∞∑∏∫√∂∇∆µπϕφθλΩωσΣΠΘΛ·⋅^_*/+-()[]{}<>|:;"))
# Regex fragments for function/operator names; matched case-insensitively.
# The last two require a word character right after "E[" / "P(" because of
# the trailing \b.
MATH_WORDS = [
    r"\blim\b", r"\bsin\b", r"\bcos\b", r"\btan\b", r"\blog\b", r"\bln\b",
    r"\bexp\b", r"\bmin\b", r"\bmax\b", r"\bargmin\b", r"\bargmax\b",
    r"\bvar\b", r"\bcov\b", r"\bE\[\b", r"\bP\(\b"
]
# Single alternation of all fragments above.
MATH_WORDS_RE = re.compile("|".join(MATH_WORDS), re.IGNORECASE)

def equation_score(s: str) -> float:
    """Heuristic score in [0, 1] for how equation-like the text ``s`` looks.

    Combines the density of characters from MATH_SYMBOLS with boolean cues
    (an "=" sign, a digit/digit fraction or "frac", a sub/superscript pattern,
    a MATH_WORDS match, a Greek letter) and subtracts a penalty for long text
    (0.2 above 220 characters, 0.35 above 400). Empty input or fewer than three
    non-blank-trimmed characters scores 0.0.
    """
    if not s:
        return 0.0
    s_clean = s.strip()
    n_chars = len(s_clean)
    if n_chars < 3:
        return 0.0

    sym_density = sum(1 for ch in s_clean if ch in MATH_SYMBOLS) / max(1, n_chars)
    has_equal = "=" in s_clean
    has_fraction_like = (
        re.search(r"\b\d+\s*/\s*\d+\b", s_clean) is not None
        or "frac" in s_clean.lower()
    )
    has_supsub = re.search(
        r"[A-Za-z]\_\{?[A-Za-z0-9]+\}?|[A-Za-z]\^\{?[A-Za-z0-9+\-*/]+\}?", s_clean
    ) is not None
    has_functions = MATH_WORDS_RE.search(s_clean) is not None
    has_greek = re.search(r"[α-ωΑ-ΩµπϕφθλΩωσ]", s_clean) is not None

    if n_chars > 400:
        length_penalty = 0.35
    elif n_chars > 220:
        length_penalty = 0.2
    else:
        length_penalty = 0.0

    # Keep the term order: float addition is order-sensitive and callers round
    # the result.
    score = (
        0.35*sym_density + 0.25*has_equal + 0.15*has_fraction_like +
        0.15*has_supsub + 0.10*has_functions + 0.10*has_greek
    ) - length_penalty
    return max(0.0, min(1.0, score))

def is_equation_candidate(s: str, threshold: float = 0.35) -> bool:
    """True when ``equation_score(s)`` reaches ``threshold`` (inclusive)."""
    score = equation_score(s)
    return threshold <= score

def split_into_blocks(text: str) -> List[str]:
    """Split ``text`` at runs of two or more "\\n" (or two or more "\\r") and
    return each non-empty chunk as one line: its lines are stripped, empty
    lines dropped and the rest joined with single spaces."""
    blocks: List[str] = []
    for chunk in re.split(r"\n{2,}|\r{2,}", text):
        stripped_lines = (line.strip() for line in chunk.splitlines())
        merged = " ".join(line for line in stripped_lines if line)
        if merged:
            blocks.append(merged)
    return blocks

def nearby_context(blocks: List[str], idx: int, window_before=1, window_after=1) -> Tuple[str, str]:
    """Return ``(before, after)``: the space-joined text of up to
    ``window_before`` blocks preceding ``blocks[idx]`` and up to
    ``window_after`` blocks following it (empty strings at the edges)."""
    first = max(0, idx - window_before)
    last = min(len(blocks), idx + 1 + window_after)
    preceding = blocks[first:idx]
    following = blocks[idx + 1:last]
    return " ".join(preceding).strip(), " ".join(following).strip()


In [None]:
def extract_from_pdf(path: Path) -> List[Dict[str, Any]]:
    """Find equation-like text in a PDF, page by page.

    For every page the text blocks are split with ``split_into_blocks``. A
    block that passes ``is_equation_candidate`` is reported as-is. A block that
    does not pass and is longer than 160 characters is split after
    '.', ';' or ':' and each passing fragment is reported instead. Context is
    always taken from the neighbours of the (parent) block.

    Returns one dict per hit with the keys source, page_or_slide (1-based),
    formula, context_before, context_after, confidence and format ("pdf").
    The document is closed even when processing a page raises.
    """
    def make_row(page_no: int, text: str, before: str, after: str) -> Dict[str, Any]:
        """Build one result record for ``text`` found on ``page_no``."""
        return {
            "source": path.name,
            "page_or_slide": page_no,
            "formula": text.strip(),
            "context_before": before,
            "context_after": after,
            "confidence": round(equation_score(text), 3),
            "format": "pdf",
        }

    results = []
    doc = fitz.open(path.as_posix())
    try:
        for i in range(doc.page_count):
            page = doc.load_page(i)
            blocks = []
            for b in page.get_text("blocks"):
                txt = b[4]  # the block's text is the fifth tuple item
                if txt:
                    blocks.extend(split_into_blocks(txt))

            # Decide once per block; both passes below need the answer.
            is_candidate = [is_equation_candidate(blk) for blk in blocks]

            # direct equation-like blocks
            for idx, blk in enumerate(blocks):
                if is_candidate[idx]:
                    before, after = nearby_context(blocks, idx)
                    results.append(make_row(i + 1, blk, before, after))

            # look inside long non-candidate blocks for equation-like fragments
            for idx, blk in enumerate(blocks):
                if is_candidate[idx] or len(blk) <= 160:
                    continue
                for sub in re.split(r"(?<=[.;:])\s+", blk):
                    if is_equation_candidate(sub):
                        before, after = nearby_context(blocks, idx)
                        results.append(make_row(i + 1, sub, before, after))
    finally:
        doc.close()
    return results


In [None]:
# Prefix -> namespace-URI table for the PPTX helpers in this cell.
# This assignment rebinds the module-level NS that the DOCX helpers defined in
# the earlier cells also read, so "w" is kept here as well: without it their
# qname("w", ...) lookups raise KeyError once this cell has run.
NS = {
    "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "m": "http://schemas.openxmlformats.org/officeDocument/2006/math",
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
}

def omml_to_text(elem: etree._Element) -> str:
    """Flatten an OMML node to one line of text.

    The non-empty m:t texts below ``elem`` are joined with spaces, the
    characters = + - * / ^ _ ( ) are surrounded by single spaces, and runs of
    whitespace are collapsed.
    """
    fragments = [node.text for node in elem.xpath(".//m:t", namespaces=NS) if node.text]
    joined = " ".join(fragments).strip()
    spaced = re.sub(r"\s*([=+\-*/^_()])\s*", r" \1 ", joined)
    return re.sub(r"\s+", " ", spaced).strip()

def slide_plain_text(tree: etree._Element) -> List[str]:
    """Return the visible text of a slide, one string per a:p paragraph.

    For each paragraph the non-empty a:t texts of its a:r runs are joined with
    spaces; paragraphs that end up empty after stripping are omitted.
    """
    paragraphs = []
    for para in tree.xpath("//a:p", namespaces=NS):
        text_nodes = (run.find("a:t", namespaces=NS) for run in para.xpath(".//a:r", namespaces=NS))
        run_texts = [node.text for node in text_nodes if node is not None and node.text]
        if not run_texts:
            continue
        joined = " ".join(run_texts).strip()
        if joined:
            paragraphs.append(joined)
    return paragraphs

def extract_from_pptx(path: Path) -> List[Dict[str, Any]]:
    """Find equations in the slides of a .pptx file.

    Slides are visited in numeric file order (slide2.xml before slide10.xml)
    and numbered by that position.
    NOTE(review): file numbering usually, but not necessarily, matches the
    presentation order defined in presentation.xml.

    Per slide:
      * if OMML math is present, each m:oMathPara and each m:oMath that is not
        inside an m:oMathPara is reported once (confidence at least 0.6); the
        context is taken around the paragraph sharing the most alphanumeric
        tokens with the formula;
      * otherwise the visible paragraphs are screened with
        ``is_equation_candidate``.

    Returns one dict per hit with the keys source, page_or_slide, formula,
    context_before, context_after, confidence and format ("pptx").
    """
    results = []
    slide_name_re = re.compile(r"^ppt/slides/slide(\d+)\.xml$")
    with ZipFile(path, "r") as z:
        # Sort by the numeric suffix; a plain string sort would put slide10
        # ahead of slide2 and shift every reported slide number.
        numbered = []
        for name in z.namelist():
            hit = slide_name_re.match(name)
            if hit:
                numbered.append((int(hit.group(1)), name))
        slide_files = [name for _, name in sorted(numbered)]

        for idx, sfile in enumerate(slide_files, start=1):
            xml = z.read(sfile)
            tree = etree.fromstring(xml)
            paras = slide_plain_text(tree)

            # Math paragraphs as a whole, plus equations outside any paragraph.
            math_nodes = tree.xpath(
                "//m:oMathPara | //m:oMath[not(ancestor::m:oMathPara)]", namespaces=NS
            )
            if not math_nodes:
                # fallback: heuristic on visible text
                for p_i, para in enumerate(paras):
                    if is_equation_candidate(para):
                        before = paras[p_i-1] if p_i-1 >= 0 else ""
                        after = paras[p_i+1] if p_i+1 < len(paras) else ""
                        results.append({
                            "source": path.name,
                            "page_or_slide": idx,
                            "formula": para.strip(),
                            "context_before": before.strip(),
                            "context_after": after.strip(),
                            "confidence": round(equation_score(para), 3),
                            "format": "pptx",
                        })
                continue

            for mnode in math_nodes:
                ftxt = omml_to_text(mnode)
                if not ftxt:
                    continue
                before = after = ""
                if paras:
                    # Anchor the context on the paragraph with the largest
                    # token overlap (first one wins on ties).
                    best_j, best_overlap = 0, -1
                    f_tokens = set(re.findall(r"[A-Za-z0-9]+", ftxt))
                    for j, ptxt in enumerate(paras):
                        p_tokens = set(re.findall(r"[A-Za-z0-9]+", ptxt))
                        overlap = len(f_tokens & p_tokens)
                        if overlap > best_overlap:
                            best_overlap, best_j = overlap, j
                    before = paras[best_j-1] if best_j-1 >= 0 else ""
                    after = paras[best_j+1] if best_j+1 < len(paras) else ""
                # Real OMML is trusted more than the text heuristic.
                conf = max(0.6, equation_score(ftxt))
                results.append({
                    "source": path.name,
                    "page_or_slide": idx,
                    "formula": ftxt,
                    "context_before": before.strip(),
                    "context_after": after.strip(),
                    "confidence": round(conf, 3),
                    "format": "pptx",
                })
    return results


In [None]:
# Choose your input method:

USE_UPLOAD_WIDGET = True   # set False if you prefer Google Drive mount

input_paths = []

if not IN_COLAB:
    # Neither the upload widget nor Drive exists outside Colab (importing
    # google.colab would fail), so pick up files from the working directory.
    input_paths = sorted(Path(".").glob("*.pdf")) + sorted(Path(".").glob("*.pptx"))
elif USE_UPLOAD_WIDGET:
    print("👇 Select one or more .pdf / .pptx files from your computer")
    uploaded = files.upload()  # opens a file chooser
    for fname, data in uploaded.items():
        # Write the uploaded bytes under the name Colab reported for the file.
        with open(fname, "wb") as f:
            f.write(data)
        input_paths.append(Path(fname))
else:
    # (Optional) mount Google Drive instead
    from google.colab import drive
    drive.mount('/content/drive')
    # Example: point to a folder in your Drive
    folder = Path("/content/drive/MyDrive/formula_inputs")
    input_paths = list(folder.glob("*.pdf")) + list(folder.glob("*.pptx"))

print("Files to process:", [p.name for p in input_paths])


👇 Select one or more .pdf / .pptx files from your computer


Saving dcfegs.pdf to dcfegs.pdf
Files to process: ['dcfegs.pdf']


In [None]:
def write_outputs(rows: List[Dict[str, Any]], outdir: Path):
    """Write ``rows`` to ``outdir/formulas.csv`` and ``outdir/formulas.jsonl``.

    ``outdir`` is created if needed. The CSV contains a fixed set of columns
    (missing keys become empty cells, extra keys are dropped); the JSONL file
    holds each full row as one JSON object per line.
    Returns ``(csv_path, jsonl_path)`` as Path objects.
    """
    fieldnames = ["source", "format", "page_or_slide", "formula",
                  "context_before", "context_after", "confidence"]

    outdir.mkdir(parents=True, exist_ok=True)
    csv_path = outdir / "formulas.csv"
    jsonl_path = outdir / "formulas.jsonl"

    with csv_path.open("w", newline="", encoding="utf-8") as csv_file:
        table = csv.DictWriter(csv_file, fieldnames=fieldnames)
        table.writeheader()
        table.writerows({name: row.get(name, "") for name in fieldnames} for row in rows)

    with jsonl_path.open("w", encoding="utf-8") as jsonl_file:
        jsonl_file.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)

    return csv_path, jsonl_path

# Run the matching extractor on every input file. A failure in one file is
# printed and the loop moves on to the next file (best effort).
all_results = []
for p in tqdm(input_paths, desc="Processing"):
    try:
        if p.suffix.lower() == ".pdf":
            all_results.extend(extract_from_pdf(p))
        elif p.suffix.lower() == ".pptx":
            all_results.extend(extract_from_pptx(p))
        else:
            print(f"[Skip] Unsupported: {p.name}")
    except Exception as e:
        print(f"[Error] {p.name}: {e}")

# Deduplicate (same file + page/slide + exact formula); the first occurrence wins
seen = set(); dedup = []
for r in all_results:
    key = (r["source"], r["format"], r["page_or_slide"], r["formula"])
    if key in seen: continue
    seen.add(key); dedup.append(r)

# Sort by file, format and page/slide; list.sort is stable, so hits on the
# same page keep their extraction order.
dedup.sort(key=lambda x: (x["source"], x["format"], x["page_or_slide"]))

# Writes outputs/formulas.csv and outputs/formulas.jsonl (overwritten on each run).
csv_path, jsonl_path = write_outputs(dedup, Path("outputs"))
print(f"\nSaved:\n- {csv_path}\n- {jsonl_path}\nTotal formulas: {len(dedup)}")

# Quick preview of first 10
for row in dedup[:10]:
    print(f"\n[{row['source']} @ {row['format']} #{row['page_or_slide']}] conf={row['confidence']}")
    print("Formula:", row['formula'])
    if row['context_before']: print("Before:", row['context_before'])
    if row['context_after']:  print("After :", row['context_after'])


Processing: 100%|██████████| 1/1 [00:00<00:00,  4.76it/s]


Saved:
- outputs/formulas.csv
- outputs/formulas.jsonl
Total formulas: 7

[dcfegs.pdf @ pdf #7] conf=0.435
Formula: Current Cashﬂow to Firm EBIT(1-t)=  5344 (1-.35)=    3474 - Nt CpX= 350 - Chg WC 691 = FCFF 2433 Reinvestment Rate = 1041/3474 =29.97% Return on capital = 25.19%
After : Expected Growth in EBIT (1-t) .30*.25=.075 7.5%

[dcfegs.pdf @ pdf #7] conf=0.435
Formula: Stable Growth g = 3%;  Beta = 1.10; Debt Ratio= 20%; Tax rate=35% Cost of capital = 6.76% ROC= 6.76%; Reinvestment Rate=3/6.76=44%
Before: Expected Growth in EBIT (1-t) .30*.25=.075 7.5%
After : Terminal Value5= 2645/(.0676-.03) = 70,409

[dcfegs.pdf @ pdf #8] conf=0.432
Formula: Current Cashﬂow to Firm EBIT(1-t)=  4810 (1-.35)=    3,180 - Nt CpX= 350 - Chg WC 691 = FCFF                         2139 Reinvestment Rate = 1041/3180
After : =33% Return on capital = 23.06%

[dcfegs.pdf @ pdf #8] conf=0.438
Formula: Stable Growth g = 3%;  Beta = 1.00;; ERP =4% Debt Ratio= 8%; Tax rate=35% Cost of capital = 7.55% ROC= 7.5




In [None]:
# Offer the two result files for download; only possible inside Colab.
# Relies on csv_path / jsonl_path set by the processing cell above.
if IN_COLAB:
    print("Preparing downloads…")
    files.download(str(csv_path))
    files.download(str(jsonl_path))


Preparing downloads…


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Choose your input method:

USE_UPLOAD_WIDGET = True   # set False if you prefer Google Drive mount

input_paths = []

if not IN_COLAB:
    # Neither the upload widget nor Drive exists outside Colab (importing
    # google.colab would fail), so pick up files from the working directory.
    input_paths = sorted(Path(".").glob("*.pdf")) + sorted(Path(".").glob("*.pptx"))
elif USE_UPLOAD_WIDGET:
    print("👇 Select one or more .pdf / .pptx files from your computer")
    uploaded = files.upload()  # opens a file chooser
    for fname, data in uploaded.items():
        # Write the uploaded bytes under the name Colab reported for the file.
        with open(fname, "wb") as f:
            f.write(data)
        input_paths.append(Path(fname))
else:
    # (Optional) mount Google Drive instead
    from google.colab import drive
    drive.mount('/content/drive')
    # Example: point to a folder in your Drive
    folder = Path("/content/drive/MyDrive/formula_inputs")
    input_paths = list(folder.glob("*.pdf")) + list(folder.glob("*.pptx"))

print("Files to process:", [p.name for p in input_paths])


👇 Select one or more .pdf / .pptx files from your computer


Saving valpacket1spr25.pdf to valpacket1spr25.pdf
Files to process: ['valpacket1spr25.pdf']


In [None]:
def write_outputs(rows: List[Dict[str, Any]], outdir: Path):
    """Save ``rows`` as ``formulas.csv`` and ``formulas.jsonl`` inside ``outdir``.

    The directory is created when missing. CSV cells come from a fixed column
    list (absent keys -> "", other keys ignored); the JSONL file stores every
    row in full, one JSON object per line.
    Returns the two paths as ``(csv_path, jsonl_path)``.
    """
    fieldnames = ["source", "format", "page_or_slide", "formula",
                  "context_before", "context_after", "confidence"]

    outdir.mkdir(parents=True, exist_ok=True)
    csv_path = outdir / "formulas.csv"
    jsonl_path = outdir / "formulas.jsonl"

    with csv_path.open("w", newline="", encoding="utf-8") as csv_file:
        table = csv.DictWriter(csv_file, fieldnames=fieldnames)
        table.writeheader()
        table.writerows({name: row.get(name, "") for name in fieldnames} for row in rows)

    with jsonl_path.open("w", encoding="utf-8") as jsonl_file:
        jsonl_file.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)

    return csv_path, jsonl_path

# Run the matching extractor on every input file. A failure in one file is
# printed and the loop moves on to the next file (best effort).
all_results = []
for p in tqdm(input_paths, desc="Processing"):
    try:
        if p.suffix.lower() == ".pdf":
            all_results.extend(extract_from_pdf(p))
        elif p.suffix.lower() == ".pptx":
            all_results.extend(extract_from_pptx(p))
        else:
            print(f"[Skip] Unsupported: {p.name}")
    except Exception as e:
        print(f"[Error] {p.name}: {e}")

# Deduplicate (same file + page/slide + exact formula); the first occurrence wins
seen = set(); dedup = []
for r in all_results:
    key = (r["source"], r["format"], r["page_or_slide"], r["formula"])
    if key in seen: continue
    seen.add(key); dedup.append(r)

# Sort by file, format and page/slide; list.sort is stable, so hits on the
# same page keep their extraction order.
dedup.sort(key=lambda x: (x["source"], x["format"], x["page_or_slide"]))

# Writes outputs/formulas.csv and outputs/formulas.jsonl (overwritten on each run).
csv_path, jsonl_path = write_outputs(dedup, Path("outputs"))
print(f"\nSaved:\n- {csv_path}\n- {jsonl_path}\nTotal formulas: {len(dedup)}")

# Quick preview of the first 20 results
for row in dedup[:20]:
    print(f"\n[{row['source']} @ {row['format']} #{row['page_or_slide']}] conf={row['confidence']}")
    print("Formula:", row['formula'])
    if row['context_before']: print("Before:", row['context_before'])
    if row['context_after']:  print("After :", row['context_after'])


Processing: 100%|██████████| 1/1 [00:00<00:00,  1.38it/s]


Saved:
- outputs/formulas.csv
- outputs/formulas.jsonl
Total formulas: 37

[valpacket1spr25.pdf @ pdf #10] conf=0.441
Formula: § Value of Equity = 50/1.13625 + 60/1.136252 + 68/1.136253 +
Before: § Cost of Equity = 13.625%
After : 76.2/1.136254 + (83.49+1603)/1.136255 = $1073

[valpacket1spr25.pdf @ pdf #10] conf=0.454
Formula: 76.2/1.136254 + (83.49+1603)/1.136255 = $1073
Before: § Value of Equity = 50/1.13625 + 60/1.136252 + 68/1.136253 +
After : § Method 2: Discount CF to Firm at Cost of Capital to get value

[valpacket1spr25.pdf @ pdf #10] conf=0.45
Formula: § Cost of Capital = 13.625% (1073/1873) + 5% (800/1873) = 9.94%
Before: § Cost of Debt = Pre-tax rate (1- tax rate) = 10% (1-.5) = 5%
After : § PV of Firm = 90/1.0994 + 100/1.09942 + 108/1.09943 + 116.2/1.09944

[valpacket1spr25.pdf @ pdf #10] conf=0.441
Formula: § PV of Firm = 90/1.0994 + 100/1.09942 + 108/1.09943 + 116.2/1.09944
Before: § Cost of Capital = 13.625% (1073/1873) + 5% (800/1873) = 9.94%
After : + (123.49+2363)/1




In [None]:
# Offer the two result files for download; only possible inside Colab.
# Relies on csv_path / jsonl_path set by the processing cell above.
if IN_COLAB:
    print("Preparing downloads…")
    files.download(str(csv_path))
    files.download(str(jsonl_path))


Preparing downloads…


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Choose your input method:

USE_UPLOAD_WIDGET = True   # set False if you prefer Google Drive mount

input_paths = []

if not IN_COLAB:
    # Neither the upload widget nor Drive exists outside Colab (importing
    # google.colab would fail), so pick up files from the working directory.
    input_paths = sorted(Path(".").glob("*.pdf")) + sorted(Path(".").glob("*.pptx"))
elif USE_UPLOAD_WIDGET:
    print("👇 Select one or more .pdf / .pptx files from your computer")
    uploaded = files.upload()  # opens a file chooser
    for fname, data in uploaded.items():
        # Write the uploaded bytes under the name Colab reported for the file.
        with open(fname, "wb") as f:
            f.write(data)
        input_paths.append(Path(fname))
else:
    # (Optional) mount Google Drive instead
    from google.colab import drive
    drive.mount('/content/drive')
    # Example: point to a folder in your Drive
    folder = Path("/content/drive/MyDrive/formula_inputs")
    input_paths = list(folder.glob("*.pdf")) + list(folder.glob("*.pptx"))

print("Files to process:", [p.name for p in input_paths])


👇 Select one or more .pdf / .pptx files from your computer


Saving valpacket2spr25.pdf to valpacket2spr25.pdf
Files to process: ['valpacket2spr25.pdf']


In [None]:
def write_outputs(rows: List[Dict[str, Any]], outdir: Path):
    """Persist extracted formula records as both CSV and JSON Lines.

    Args:
        rows: Records to write, one dict per formula.
        outdir: Target directory; created (with parents) when missing.

    Returns:
        A ``(csv_path, jsonl_path)`` tuple pointing at ``formulas.csv`` and
        ``formulas.jsonl`` inside ``outdir``.

    The CSV holds only the fixed column set below (absent keys become empty
    cells, additional keys are dropped); the JSONL file keeps every record
    exactly as given, one JSON object per line.
    """
    columns = ["source", "format", "page_or_slide", "formula",
               "context_before", "context_after", "confidence"]

    outdir.mkdir(parents=True, exist_ok=True)
    csv_path = outdir / "formulas.csv"
    jsonl_path = outdir / "formulas.jsonl"

    with csv_path.open("w", newline="", encoding="utf-8") as csv_file:
        # restval fills absent columns, extrasaction drops unknown keys
        writer = csv.DictWriter(csv_file, fieldnames=columns,
                                restval="", extrasaction="ignore")
        writer.writeheader()
        writer.writerows(rows)

    with jsonl_path.open("w", encoding="utf-8") as jsonl_file:
        jsonl_file.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in rows
        )

    return csv_path, jsonl_path

# Run each input file through the extractor that matches its extension.
# Errors are reported per file so one unreadable document does not abort the batch.
all_results = []
for p in tqdm(input_paths, desc="Processing"):
    try:
        suffix = p.suffix.lower()
        if suffix == ".pdf":
            all_results.extend(extract_from_pdf(p))
        elif suffix == ".pptx":
            all_results.extend(extract_from_pptx(p))
        else:
            print(f"[Skip] Unsupported: {p.name}")
    except Exception as e:
        print(f"[Error] {p.name}: {e}")

# Deduplicate (same file + page/slide + exact formula); the first occurrence wins
seen = set()
dedup = []
for r in all_results:
    key = (r["source"], r["format"], r["page_or_slide"], r["formula"])
    if key in seen:
        continue
    seen.add(key)
    dedup.append(r)

# Sort nicely: by file, then format, then page/slide
dedup.sort(key=lambda x: (x["source"], x["format"], x["page_or_slide"]))

csv_path, jsonl_path = write_outputs(dedup, Path("outputs"))
print(f"\nSaved:\n- {csv_path}\n- {jsonl_path}\nTotal formulas: {len(dedup)}")

# Quick preview of the first PREVIEW_LIMIT formulas
PREVIEW_LIMIT = 20
for row in dedup[:PREVIEW_LIMIT]:
    # Context and confidence are read with .get(): write_outputs() also treats
    # them as optional, so a record lacking them must not break the preview.
    print(f"\n[{row['source']} @ {row['format']} #{row['page_or_slide']}] conf={row.get('confidence', '')}")
    print("Formula:", row['formula'])
    before = row.get('context_before')
    after = row.get('context_after')
    if before:
        print("Before:", before)
    if after:
        print("After :", after)


Processing: 100%|██████████| 1/1 [00:00<00:00,  2.36it/s]


Saved:
- outputs/formulas.csv
- outputs/formulas.jsonl
Total formulas: 15

[valpacket2spr25.pdf @ pdf #22] conf=0.362
Formula: = PE= (FCFE/Earnings)*(1+gn)
Before: P0 EPS0
After : r-gn

[valpacket2spr25.pdf @ pdf #23] conf=0.367
Formula: P0=
Before: § Dividing both sides by the earnings per share:
After : EPS0*Payout Ratio*(1+g)* 1−(1+g)n

[valpacket2spr25.pdf @ pdf #24] conf=0.367
Formula: 𝑃𝐸=
Before: § Required rate of return = 1.5% + 1(5%)= 6.5%
After : .25 ∗1.15 ∗1 −1.15!

[valpacket2spr25.pdf @ pdf #36] conf=0.367
Formula: P0=
Before: the equation for the PE ratio. Dividing it again by the expected growth:
After : EPS0*Payout Ratio*(1+g)* 1−(1+g)n

[valpacket2spr25.pdf @ pdf #48] conf=0.403
Formula: EV =  EBITDA (1- t) + Depr (t) -  Cex  -  Δ Working Capital
Before: EV0 = FCFF1 WACC - g
After : WACC - g

[valpacket2spr25.pdf @ pdf #81] conf=0.411
Formula: PS = 30.61 - 2.77 ln(Rev) + 6.42 (Rev Growth) + 5.11 (Cash/Rev)
Before: higher cash balances should have a greater chance of s




In [None]:
# Hand the freshly written CSV and JSONL files to the browser for download.
# Outside Colab this cell does nothing.
if IN_COLAB:
    print("Preparing downloads…")
    for result_file in (csv_path, jsonl_path):
        # files.download is handed a plain string path
        files.download(str(result_file))


Preparing downloads…


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Choose your input method:
#   * Colab + USE_UPLOAD_WIDGET=True  -> pick files with the browser file chooser
#   * Colab + USE_UPLOAD_WIDGET=False -> read files from a mounted Google Drive folder
#   * outside Colab                   -> read files from a local "formula_inputs" folder

USE_UPLOAD_WIDGET = True   # set False if you prefer Google Drive mount

input_paths = []

if USE_UPLOAD_WIDGET and IN_COLAB:
    print("👇 Select one or more .pdf / .pptx files from your computer")
    uploaded = files.upload()  # opens a file chooser
    # `uploaded` is iterated as {file name: raw bytes}; persist each file to
    # the working directory so the extractors can open it by path.
    for fname, data in uploaded.items():
        with open(fname, "wb") as f:
            f.write(data)
        input_paths.append(Path(fname))
elif IN_COLAB:
    # (Optional) mount Google Drive instead
    from google.colab import drive
    drive.mount('/content/drive')
    # Example: point to a folder in your Drive
    folder = Path("/content/drive/MyDrive/formula_inputs")
    # sorted() gives a deterministic processing order (glob order is arbitrary)
    input_paths = sorted(folder.glob("*.pdf")) + sorted(folder.glob("*.pptx"))
else:
    # Outside Colab the google.colab package cannot be imported, so neither the
    # upload widget nor the Drive mount is available; fall back to a local folder.
    folder = Path("formula_inputs")
    input_paths = sorted(folder.glob("*.pdf")) + sorted(folder.glob("*.pptx"))

print("Files to process:", [p.name for p in input_paths])


👇 Select one or more .pdf / .pptx files from your computer


Saving dcfrates.pdf to dcfrates.pdf
Files to process: ['dcfrates.pdf']


In [None]:
def write_outputs(rows: List[Dict[str, Any]], outdir: Path):
    """Write the formula records to ``formulas.csv`` and ``formulas.jsonl``.

    Args:
        rows: One dict per extracted formula.
        outdir: Destination directory; it and any missing parents are created.

    Returns:
        ``(csv_path, jsonl_path)`` — the two files written inside ``outdir``.

    CSV rows are projected onto a fixed column list (a missing key yields an
    empty cell, unlisted keys are left out). Each JSONL line is the complete
    record serialized without ASCII escaping.
    """
    columns = ["source", "format", "page_or_slide", "formula",
               "context_before", "context_after", "confidence"]

    def project(record):
        # Keep only the CSV columns, defaulting absent ones to "".
        return {col: record.get(col, "") for col in columns}

    outdir.mkdir(parents=True, exist_ok=True)
    csv_path = outdir / "formulas.csv"
    jsonl_path = outdir / "formulas.jsonl"

    with csv_path.open("w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=columns)
        writer.writeheader()
        writer.writerows(project(record) for record in rows)

    with jsonl_path.open("w", encoding="utf-8") as jsonl_file:
        for record in rows:
            serialized = json.dumps(record, ensure_ascii=False)
            jsonl_file.write(serialized + "\n")

    return csv_path, jsonl_path

# Run each input file through the extractor that matches its extension.
# Errors are reported per file so one unreadable document does not abort the batch.
all_results = []
for p in tqdm(input_paths, desc="Processing"):
    try:
        suffix = p.suffix.lower()
        if suffix == ".pdf":
            all_results.extend(extract_from_pdf(p))
        elif suffix == ".pptx":
            all_results.extend(extract_from_pptx(p))
        else:
            print(f"[Skip] Unsupported: {p.name}")
    except Exception as e:
        print(f"[Error] {p.name}: {e}")

# Deduplicate (same file + page/slide + exact formula); the first occurrence wins
seen = set()
dedup = []
for r in all_results:
    key = (r["source"], r["format"], r["page_or_slide"], r["formula"])
    if key in seen:
        continue
    seen.add(key)
    dedup.append(r)

# Sort nicely: by file, then format, then page/slide
dedup.sort(key=lambda x: (x["source"], x["format"], x["page_or_slide"]))

csv_path, jsonl_path = write_outputs(dedup, Path("outputs"))
print(f"\nSaved:\n- {csv_path}\n- {jsonl_path}\nTotal formulas: {len(dedup)}")

# Quick preview of the first PREVIEW_LIMIT formulas
PREVIEW_LIMIT = 20
for row in dedup[:PREVIEW_LIMIT]:
    # Context and confidence are read with .get(): write_outputs() also treats
    # them as optional, so a record lacking them must not break the preview.
    print(f"\n[{row['source']} @ {row['format']} #{row['page_or_slide']}] conf={row.get('confidence', '')}")
    print("Formula:", row['formula'])
    before = row.get('context_before')
    after = row.get('context_after')
    if before:
        print("Before:", before)
    if after:
        print("After :", after)


Processing: 100%|██████████| 1/1 [00:00<00:00,  4.66it/s]


Saved:
- outputs/formulas.csv
- outputs/formulas.jsonl
Total formulas: 7

[dcfrates.pdf @ pdf #56] conf=0.44
Formula: § Sensex on 9/5/07 = 15446
Before: § Inputs for the computation
After : § Dividend yield on index = 3.05%

[dcfrates.pdf @ pdf #70] conf=0.39
Formula: § bL = bu (1+ ((1-t)D/E))
Before: market risk (has a beta of zero), the beta of equity alone can be written as a function of the unlevered beta and the debt-equity ratio
After : § In some versions, the tax effect is ignored and there is no (1-t) in the

[dcfrates.pdf @ pdf #70] conf=0.374
Formula: § bL = bu (1+ ((1-t)D/E)) − bdebt (1-t) (D/E)
Before: estimate the beta of debt, you can estimate the levered beta as follows:
After : § While the latter is more realistic, estimating betas for debt can be

[dcfrates.pdf @ pdf #75] conf=0.427
Formula: § Gross D/E Ratio for Embraer = 1953/11,042 = 18.95%
Before: § For Embraer, using the gross debt ratio
After : § Levered Beta using Gross Debt ratio = 1.07

[dcfrates.pdf @ pdf #7




In [None]:
# Colab-only step: push the two output files to the user's browser.
if IN_COLAB:
    print("Preparing downloads…")
    for result_file in (csv_path, jsonl_path):
        # files.download is handed a plain string path
        files.download(str(result_file))


Preparing downloads…


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>