In [None]:
# ============================================================
# Investor-Style Stock Report Generator (RAG + Personas)
# Colab-ready. Choose either local HF model or an API.
# ============================================================

!pip -q install yfinance pandas numpy python-docx jinja2 transformers accelerate sentencepiece bitsandbytes tiktoken

import os, re, math, datetime as dt
import pandas as pd
import numpy as np
import yfinance as yf
from jinja2 import Template

# OPTIONAL: choose your text model backend.
# USE_TRANSFORMERS_LOCAL gates the local Hugging Face model load below and is
# later passed to llm_generate_markdown() as `use_transformers`.
USE_TRANSFORMERS_LOCAL = True   # set False if you will use an API instead
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"  # small & capable; change as you like

if USE_TRANSFORMERS_LOCAL:
    # Imported lazily so an API-only run does not need to import transformers here.
    from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
    # 4-bit NF4 quantization config with double quantization enabled.
    bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4")
    tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
    # Reuse the EOS token for padding when the tokenizer defines no pad token.
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    # NOTE(review): the captured cell output reports "`torch_dtype` is deprecated!
    # Use `dtype` instead!" and "Device set to use cpu" -- confirm the intended
    # transformers version and runtime before relying on 4-bit loading.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME, device_map="auto",
        torch_dtype="auto", quantization_config=bnb
    )
    # Sampling text-generation pipeline; `gen` is the callable used by
    # llm_generate_markdown() to produce the memo text (up to 900 new tokens).
    gen = pipeline("text-generation", model=model, tokenizer=tok, max_new_tokens=900, do_sample=True, top_p=0.9, temperature=0.5)

# ==============
# Personas
# ==============
# Investor personas, keyed by the identifier passed around as `persona_key`.
# Each entry provides:
#   "name"       - display name, rendered in the memo title and section text
#   "principles" - list of bullet strings rendered under "Persona principles"
#   "angle"      - one-sentence emphasis rendered next to the name and in section 2
# All three fields are read by REPORT_PROMPT_TMPL when the prompt is rendered.
PERSONAS = {
    "buffett_like": {
        "name": "Quality & Moat",
        "principles": [
            "Prioritize durable competitive advantages (moat) and capable, shareholder-aligned management.",
            "Prefer predictable cash flows, high returns on capital, and conservative leverage.",
            "Value via discounted cash or simple yield heuristics; avoid speculation."
        ],
        "angle": "Focus on business quality, moat sources, capital allocation, and long-term FCF generation."
    },
    "lynch_like": {
        "name": "GARP Pragmatist",
        "principles": [
            "Look for understandable businesses with room to grow, reasonable P/E relative to growth (PEG).",
            "Favor category winners or niche compounders with expanding store/product units.",
            "Beware overhyped narratives; check inventory turns, unit economics, and insider ownership."
        ],
        "angle": "Emphasize growth runway, unit economics, and practical signals like PEG, store counts, product cycles."
    },
    "graham_like": {
        "name": "Deep Value & Safety",
        "principles": [
            "Insist on margin of safety; compare price to conservative intrinsic value estimates.",
            "Screen for balance-sheet strength, earnings quality, and normalized cash flow.",
            "Prefer statistical bargains over stories; avoid leverage and accounting red flags."
        ],
        "angle": "Lean on valuation ratios (P/B, EV/EBIT, FCF yield), working-capital strength, and reversion-to-mean cases."
    }
}

# ==============
# Helpers
# ==============
def pct(a, b):
    """Return the percentage change of *a* relative to *b*.

    Computes ``(a / b - 1) * 100``. Returns NaN when *b* is zero, when either
    argument is missing (per ``pd.isna``), or when the arithmetic raises.
    """
    try:
        unusable = b == 0 or pd.isna(a) or pd.isna(b)
        if unusable:
            return np.nan
        ratio = a / b
        return (ratio - 1.0) * 100.0
    except Exception:
        # Best-effort helper: any failure is reported as "no value".
        return np.nan

def safe(val, nd=2, pct_style=False):
    """Format *val* for display.

    Args:
        val: value to format; missing values (per ``pd.isna``) become ``"NA"``.
        nd: number of decimals used for percentages and floats.
        pct_style: when true, render as a fixed-point number followed by ``%``.

    Returns:
        A string: ``"NA"`` for missing values, a ``%``-suffixed number when
        *pct_style* is set, a comma-grouped integer for ints, a fixed-point
        float (comma-grouped once ``abs(val) >= 1000``) for floats, and
        ``str(val)`` for anything else.
    """
    if pd.isna(val):
        return "NA"
    if pct_style:
        return f"{val:.{nd}f}%"
    if isinstance(val, (int, np.integer)):
        return f"{val:,}"
    if not isinstance(val, (float, np.floating)):
        return str(val)
    # Floats: add thousands separators only for magnitudes of 1000 and above.
    spec = f",.{nd}f" if abs(val) >= 1000 else f".{nd}f"
    return format(val, spec)

def annualize_growth(series, years):
    """Return the compound annual growth rate between the ends of *series*.

    Args:
        series: indexable sequence ordered oldest to newest.
        years: number of periods over which the growth is annualized.

    Returns:
        ``(last / first) ** (1 / years) - 1`` as a fraction, or NaN when the
        sequence has fewer than two points, either endpoint is not positive,
        or the computation raises (for example ``years == 0``).
    """
    try:
        if len(series) < 2:
            return np.nan
        first = series[0]
        last = series[-1]
        if first <= 0 or last <= 0:
            return np.nan
        exponent = 1.0 / years
        return (last / first) ** exponent - 1.0
    except Exception:
        # Best-effort helper: any failure is reported as "no value".
        return np.nan

# ==============
# Data fetch
# ==============
def fetch_core(ticker: str):
    """Collect the raw yfinance objects needed for KPI computation.

    Args:
        ticker: symbol handed to ``yf.Ticker``.

    Returns:
        Tuple ``(tk, info, fast, hist, fin, bs, cf)``: the Ticker object, its
        ``info`` mapping, its ``fast_info`` (or ``{}``), five years of
        auto-adjusted price history, and the financials, balance-sheet and
        cash-flow frames (an empty DataFrame replaces any that is ``None``).
    """
    def _frame_or_empty(frame):
        # Downstream code expects a DataFrame with an ``index`` to probe.
        return pd.DataFrame() if frame is None else frame

    tk = yf.Ticker(ticker)
    info = tk.info if hasattr(tk, "info") else {}
    hist = tk.history(period="5y", auto_adjust=True)
    fin = _frame_or_empty(tk.financials)
    bs = _frame_or_empty(tk.balance_sheet)
    cf = _frame_or_empty(tk.cashflow)
    fast = getattr(tk, "fast_info", {}) or {}
    return tk, info, fast, hist, fin, bs, cf

def _pick_row(frame, labels):
    """Return the first row of *frame* whose index label is in *labels*, else an empty Series."""
    for label in labels:
        if label in frame.index:
            return frame.loc[label]
    return pd.Series(dtype=float)


def _latest(series):
    """Return the first value of *series* as a float, or NaN when it is empty."""
    return float(series.iloc[0]) if len(series) else np.nan


def _cagr_pct(series):
    """Return the CAGR of *series* in percent, using that series' own period count."""
    history = list(series.iloc[::-1].dropna().values)  # oldest -> newest
    periods = len(history) - 1
    if periods < 2:
        # Same threshold as before: at least three data points are required.
        return np.nan
    growth = annualize_growth(history, periods)
    return growth * 100 if pd.notna(growth) else np.nan


def compute_kpis(info, fast, hist, fin, bs, cf):
    """Derive headline KPIs from the raw yfinance objects.

    Args:
        info: mapping with ``marketCap``, ``trailingPE`` and ``priceToBook``.
        fast: mapping-like object queried for ``last_price``.
        hist: price history frame; its last ``Close`` is the price fallback.
        fin: income-statement frame (rows are line items, first column is
            read as the most recent period).
        bs: balance-sheet frame with the same layout.
        cf: cash-flow frame with the same layout.

    Returns:
        Dict with keys ``price``, ``market_cap``, ``shares_out``, ``last_rev``,
        ``last_oi``, ``last_ni``, ``last_fcf``, ``op_margin_pct``,
        ``net_margin_pct``, ``rev_cagr``, ``ni_cagr``, ``pe``, ``pb``, ``ps``,
        ``fcf_yield_pct``, ``ev_ebit``, ``debt`` and ``cash``. Any value that
        cannot be computed is NaN.
    """
    latest_price = fast.get("last_price") or (hist["Close"].iloc[-1] if len(hist) else np.nan)
    market_cap = info.get("marketCap", np.nan)
    has_cap = pd.notna(market_cap)
    shares = market_cap / latest_price if (has_cap and pd.notna(latest_price) and latest_price > 0) else np.nan

    # Income statement proxies
    rev = _pick_row(fin, ("Total Revenue",))
    op_income = _pick_row(fin, ("Operating Income",))
    net_income = _pick_row(fin, ("Net Income",))

    # Cash flow proxies. Row labels differ between yfinance releases, so the
    # original label is tried first and the alternative spelling second.
    ocf = _pick_row(cf, ("Total Cash From Operating Activities", "Operating Cash Flow"))
    capex = _pick_row(cf, ("Capital Expenditures", "Capital Expenditure"))
    if len(ocf) and len(capex):
        fcf = ocf + capex  # capex negative in yfinance
    else:
        fcf = pd.Series(dtype=float)

    # Balance sheet proxies (same label fallback as above)
    cash = _pick_row(bs, ("Cash", "Cash And Cash Equivalents"))
    debt = _pick_row(bs, ("Total Debt",))

    # Trailing/most recent values
    last_rev = _latest(rev)
    last_oi = _latest(op_income)
    last_ni = _latest(net_income)
    last_fcf = _latest(fcf)
    last_cash = _latest(cash)
    last_debt = _latest(debt)

    # Margins
    has_rev = pd.notna(last_rev) and last_rev > 0
    op_margin = (last_oi / last_rev) * 100 if (pd.notna(last_oi) and has_rev) else np.nan
    net_margin = (last_ni / last_rev) * 100 if (pd.notna(last_ni) and has_rev) else np.nan

    # Growth: each series is annualized over its own number of periods, so a
    # net-income history shorter than the revenue history is not mis-scaled.
    rev_cagr = _cagr_pct(rev)
    ni_cagr = _cagr_pct(net_income)

    # Simple valuation ratios
    pe = info.get("trailingPE", np.nan)
    pb = info.get("priceToBook", np.nan)
    ps = market_cap / last_rev if (has_cap and has_rev) else np.nan
    fcf_yield = (last_fcf / market_cap) * 100 if (pd.notna(last_fcf) and has_cap and market_cap > 0) else np.nan

    # EV/EBIT as a sanity check (approx EV = MC + Debt - Cash). Missing debt or
    # cash is treated as 0, but without a market cap EV is undefined, not 0.
    if has_cap:
        ev = market_cap + (last_debt if pd.notna(last_debt) else 0) - (last_cash if pd.notna(last_cash) else 0)
    else:
        ev = np.nan
    ebit = last_oi  # proxy
    ev_ebit = ev / ebit if (pd.notna(ev) and pd.notna(ebit) and ebit > 0) else np.nan

    return {
        "price": latest_price,
        "market_cap": market_cap,
        "shares_out": shares,
        "last_rev": last_rev,
        "last_oi": last_oi,
        "last_ni": last_ni,
        "last_fcf": last_fcf,
        "op_margin_pct": op_margin,
        "net_margin_pct": net_margin,
        "rev_cagr": rev_cagr,
        "ni_cagr": ni_cagr,
        "pe": pe, "pb": pb, "ps": ps, "fcf_yield_pct": fcf_yield, "ev_ebit": ev_ebit,
        "debt": last_debt, "cash": last_cash
    }

# ==============
# Prompt builder
# ==============
# System instruction shared by every memo request; llm_generate_markdown()
# passes it to the template as `system`.
BASE_SYSTEM = """You are a diligent equity research assistant. Write clear, structured, and reproducible investment memos.
Be factual, quantify claims, and keep a neutral, professional tone. Do not offer personalized investment advice."""

# Jinja2 template for the memo request. Variables supplied at render time:
#   system  - system instruction text
#   persona - one PERSONAS entry (uses .name, .angle, .principles)
#   ticker  - stock symbol shown in the task line and memo title
#   as_of   - date string shown in the task line and memo header
#   data    - KPI mapping (price, market_cap, shares_out, last_rev, last_oi,
#             last_ni, last_fcf, op_margin_pct, net_margin_pct, rev_cagr,
#             ni_cagr, pe, pb, ps, fcf_yield_pct, ev_ebit, debt, cash)
# The template source is stripped of leading/trailing whitespace before compilation.
REPORT_PROMPT_TMPL = Template("""
{{system}}

Persona principles:
- {{persona.name}}: {{persona.angle}}
{% for p in persona.principles %}- {{p}}
{% endfor %}

Task: Create a structured investor-style memo for the stock {{ticker}} (as of {{as_of}}), using the **Data** below.
Focus on: business quality, growth, profitability, cash generation, balance sheet, valuation, risks, catalysts, and a checklist.

Data (parsed):
- Price: {{data.price}}
- Market Cap: {{data.market_cap}}
- Shares Out: {{data.shares_out}}
- Last Revenue: {{data.last_rev}}
- Last Operating Income: {{data.last_oi}}
- Last Net Income: {{data.last_ni}}
- Last FCF: {{data.last_fcf}}
- Operating Margin (%): {{data.op_margin_pct}}
- Net Margin (%): {{data.net_margin_pct}}
- 3Y (approx) Revenue CAGR (%): {{data.rev_cagr}}
- 3Y (approx) Net Income CAGR (%): {{data.ni_cagr}}
- P/E: {{data.pe}} | P/B: {{data.pb}} | P/S: {{data.ps}}
- FCF Yield (%): {{data.fcf_yield_pct}} | EV/EBIT: {{data.ev_ebit}}
- Debt: {{data.debt}} | Cash: {{data.cash}}

Write the memo as Markdown with the following sections:

# {{ticker}} — Investor-Style Memo ({{persona.name}})
**Date:** {{as_of}}

## 1) Business & Moat Snapshot
- What they sell, core segments, geography, and revenue drivers.
- Moat sources (cost advantage, network effects, switching costs, brand, regulation), evidence from margins/returns.

## 2) Growth & Unit Economics
- Summarize growth (CAGR trends), drivers (volume vs price vs mix), and unit-level economics if relevant.
- For {{persona.name}} emphasis: {{persona.angle}}

## 3) Profitability & Cash Generation
- Operating margin, net margin, FCF conversion; seasonality and cyclicality.
- Capital intensity and reinvestment needs.

## 4) Balance Sheet & Liquidity
- Cash/debt posture, maturity considerations; any covenant or dilution risks.

## 5) Valuation Snapshot
- Discuss P/E, P/B, P/S, EV/EBIT, FCF yield in context (historical/peer if known).
- Provide a quick sanity triangulation (e.g., if FCF yield ~X%, implied payback ~1/X years).

## 6) Risks & Uncertainties
- At least 3 specific, *quantified where possible*.

## 7) Catalysts (6–18 months)
- Product launches, regulatory approvals, operating leverage, capital returns, cost takeouts.

## 8) Checklist (Persona-tilted)
- Bullet items that a {{persona.name}} investor would verify before capital allocation.

## 9) Monitoring Plan
- KPIs to track, update cadence, trigger conditions to re-evaluate thesis.

Use the data above faithfully; if something is unknown, say “insufficient data.”
Conclude with: “This memo is for research and education; not investment advice.”
""".strip())

def llm_generate_markdown(ticker, persona_key, data, use_transformers=True):
    """Render the memo prompt for *ticker* and generate the Markdown memo.

    Args:
        ticker: stock symbol; upper-cased before it is placed in the prompt.
        persona_key: key into ``PERSONAS``.
        data: KPI mapping as returned by ``compute_kpis``.
        use_transformers: when true, generate with the local ``gen`` pipeline.

    Returns:
        The generated memo text with surrounding whitespace stripped.

    Raises:
        KeyError: if *persona_key* is not a key of ``PERSONAS``.
        NotImplementedError: if *use_transformers* is false (no API backend
            is wired in).
    """
    persona = PERSONAS[persona_key]
    # Format numbers for the prompt (thousands separators, two decimals, "NA"
    # for missing values) instead of dumping raw floats and "nan".
    formatted = {key: safe(value) for key, value in data.items()}
    prompt = REPORT_PROMPT_TMPL.render(
        # The system text travels in the chat's system turn below; leaving it
        # blank here avoids sending it twice.
        system="" if use_transformers else BASE_SYSTEM,
        persona=persona,
        ticker=ticker.upper(),
        as_of=dt.date.today().isoformat(),
        data=formatted
    ).strip()
    if not use_transformers:
        raise NotImplementedError("Hook your preferred API here (OpenAI, etc.).")

    if getattr(tok, "chat_template", None):
        # Let the tokenizer apply the model's own chat format rather than
        # hand-written role tags the model was not trained on.
        messages = [
            {"role": "system", "content": BASE_SYSTEM},
            {"role": "user", "content": prompt},
        ]
        chat_text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    else:
        # Fallback for tokenizers without a chat template: plain role tags.
        chat_text = f"<|system|>\n{BASE_SYSTEM}\n<|user|>\n{prompt}\n<|assistant|>\n"

    # return_full_text=False makes the pipeline return only the completion, so
    # the prompt does not have to be stripped from the output afterwards.
    out = gen(chat_text, return_full_text=False)[0]["generated_text"]
    return out.strip()

# ==============
# DOCX writer (optional)
# ==============
from docx import Document
from docx.shared import Pt, Inches

def markdown_to_docx(md_text, path):
    """Write a minimal DOCX rendering of *md_text* to *path*.

    Only three Markdown constructs are recognised, line by line: ``# `` and
    ``## `` headings (bold runs at 16 pt and 14 pt) and ``- `` bullets (the
    'List Bullet' style). Every other line becomes a plain paragraph. The
    document's Normal style is set to 12 pt Times New Roman.
    """
    document = Document()
    normal = document.styles['Normal']
    normal.font.name = 'Times New Roman'
    normal.font.size = Pt(12)

    # Heading marker -> font size in points.
    heading_sizes = (("# ", 16), ("## ", 14))

    for raw in md_text.splitlines():
        text = raw.rstrip()
        for marker, size in heading_sizes:
            if text.startswith(marker):
                heading_run = document.add_paragraph().add_run(text[len(marker):].strip())
                heading_run.bold = True
                heading_run.font.size = Pt(size)
                break
        else:
            # Not a heading: either a bullet or an ordinary paragraph.
            if text.startswith("- "):
                bullet = document.add_paragraph(text[2:].strip(), style=None)
                bullet.style = document.styles['List Bullet']
            else:
                document.add_paragraph(text)
    document.save(path)

# ==============
# Orchestrator
# ==============
def build_report(ticker: str, persona_key="buffett_like", out_dir="/content/reports"):
    """Fetch data for *ticker*, generate the persona memo and save it.

    Args:
        ticker: stock symbol to report on.
        persona_key: key into ``PERSONAS`` selecting the memo's viewpoint.
        out_dir: directory for the outputs; created if it does not exist.

    Returns:
        Dict with the paths of the written files under ``"markdown"`` and
        ``"docx"`` and the first 1000 characters of the memo under
        ``"preview"``.
    """
    os.makedirs(out_dir, exist_ok=True)

    _tk, info, fast, hist, fin, bs, cf = fetch_core(ticker)
    kpis = compute_kpis(info, fast, hist, fin, bs, cf)
    memo = llm_generate_markdown(ticker, persona_key, kpis, use_transformers=USE_TRANSFORMERS_LOCAL)

    # Both outputs share the same "<TICKER>_<persona>" file stem.
    stem = os.path.join(out_dir, f"{ticker.upper()}_{persona_key}")
    md_path = stem + ".md"
    docx_path = stem + ".docx"

    with open(md_path, "w", encoding="utf-8") as handle:
        handle.write(memo)
    markdown_to_docx(memo, docx_path)

    return {"markdown": md_path, "docx": docx_path, "preview": memo[:1000]}

# ==============
# Example usage
# ==============
# Runs the full pipeline once when the cell executes: fetch data, generate the
# memo, write the .md and .docx files, then print where they were saved.
ticker = "AAPL"       # change to your stock
persona = "buffett_like"  # "lynch_like" or "graham_like"
res = build_report(ticker, persona)
print("Saved:", res["markdown"], "and", res["docx"])
# "preview" holds the first 1000 characters of the generated memo.
print("\n--- Preview ---\n", res["preview"])


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
!pip install PyPDF2 python-pptx openpyxl pandas openai-whisper

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting python-pptx
  Downloading python_pptx-1.0.2-py3-none-any.whl.metadata (2.5 kB)
Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting XlsxWriter>=0.5.7 (from python-pptx)
  Downloading xlsxwriter-3.2.9-py3-none-any.whl.metadata (2.7 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading python_pptx-1.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.8/472.8 kB[0m [31m28.9 MB/s[0m eta 

In [None]:
import zipfile
import os

# Path to your zip file
zip_path = "/content/lecture_pdf.zip"

# Destination folder where files will be extracted
extract_dir = "/content/damodaran_files"

# Make sure the directory exists
os.makedirs(extract_dir, exist_ok=True)

# Unzip every member of the archive into extract_dir.
# NOTE(review): if the archive contains a top-level folder, the files end up in
# a subdirectory of extract_dir rather than directly inside it -- confirm the
# resulting layout before pointing other cells at this folder.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f"Unzipped to: {extract_dir}")


Unzipped to: /content/damodaran_files


In [None]:
!pip install PyPDF2 python-pptx openpyxl pandas openai-whisper

import os, json
import PyPDF2
from pptx import Presentation
import pandas as pd
import whisper

DATA_DIR = "/content/damodaran_files"  # put all your files here
OUTPUT_JSONL = "/content/damodaran_dataset.jsonl"

def extract_pdf(path):
    """Return the text of every page of the PDF at *path*, concatenated.

    Pages for which ``extract_text()`` returns nothing contribute an empty
    string; no separator is inserted between pages.
    """
    with open(path, "rb") as pdf_file:
        pages = PyPDF2.PdfReader(pdf_file).pages
        # Extraction must happen while the file is still open.
        return "".join(page.extract_text() or "" for page in pages)

def extract_pptx(path):
    """Return the text of all shapes in the presentation at *path*.

    Shapes are visited slide by slide; only shapes exposing a ``text``
    attribute contribute, and their texts are joined with newlines.
    """
    collected = []
    for slide in Presentation(path).slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                collected.append(shape.text)
    return "\n".join(collected)

def extract_xlsx(path):
    """Return every sheet of the workbook at *path* rendered as text.

    Each sheet becomes ``"Sheet <name>:"`` followed by the frame's
    ``to_string()`` output; sheets are separated by a blank line.
    """
    # sheet_name=None loads all sheets into a {name: DataFrame} mapping.
    sheets = pd.read_excel(path, sheet_name=None)
    rendered = (
        f"Sheet {sheet_name}:\n{frame.to_string()}"
        for sheet_name, frame in sheets.items()
    )
    return "\n\n".join(rendered)

def extract_txt(path):
    """Return the contents of the text file at *path*.

    The file is decoded as UTF-8; bytes that cannot be decoded are dropped.
    """
    with open(path, mode="r", encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    return contents

# Loaded Whisper models keyed by model name, so that processing many media
# files does not reload the model for each one.
_WHISPER_MODELS = {}


def extract_audio(path, model_name="small"):
    """Transcribe the audio or video file at *path* with Whisper.

    Args:
        path: media file handed to the model's ``transcribe``.
        model_name: Whisper model to load; defaults to ``"small"``.

    Returns:
        The ``"text"`` field of the transcription result.
    """
    if model_name not in _WHISPER_MODELS:
        _WHISPER_MODELS[model_name] = whisper.load_model(model_name)
    result = _WHISPER_MODELS[model_name].transcribe(path)
    return result["text"]

# Lower-cased file extension -> extractor that turns the file into text.
EXTRACTORS = {
    ".pdf": extract_pdf,
    ".pptx": extract_pptx,
    ".xlsx": extract_xlsx,
    ".xls": extract_xlsx,
    ".txt": extract_txt,
    ".mp4": extract_audio,
    ".mp3": extract_audio,
    ".wav": extract_audio,
}

dataset = []
# Walk DATA_DIR recursively: the captured output of this cell reported
# "0 examples", presumably because the archive unpacked into a subfolder that
# a flat os.listdir() never descends into.
for root, dirs, files in os.walk(DATA_DIR):
    dirs.sort()  # deterministic traversal order
    for fn in sorted(files):
        # Match extensions case-insensitively so e.g. "Slides.PDF" is included.
        extractor = EXTRACTORS.get(os.path.splitext(fn)[1].lower())
        if extractor is None:
            continue
        path = os.path.join(root, fn)
        txt = extractor(path)
        if not txt.strip():
            # Nothing was extracted; an empty assistant turn is useless as an example.
            continue

        # Create training pair (system → user → assistant)
        dataset.append({
            "messages": [
                {"role": "system", "content": "You are a Damodaran-style investment analyst."},
                {"role": "user", "content": "Summarize this material in your valuation style."},
                {"role": "assistant", "content": txt[:2000]}  # use snippet as the 'desired style'
            ]
        })

# One JSON object per line (JSON Lines).
with open(OUTPUT_JSONL, "w", encoding="utf-8") as f:
    for row in dataset:
        f.write(json.dumps(row) + "\n")

print("Saved dataset:", OUTPUT_JSONL, "with", len(dataset), "examples")


Saved dataset: /content/damodaran_dataset.jsonl with 0 examples
