In [None]:

# 00_data_acquisition.ipynb
%pip install -q kaggle datasets gitpython pandas pyarrow tqdm

import os, json, zipfile
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm

# Everything is downloaded under <working dir>/data/raw; create that folder
# (and any missing parents) up front so later steps can write into it.
BASE = Path.cwd()
RAW = BASE.joinpath("data", "raw")
RAW.mkdir(parents=True, exist_ok=True)

# ---------- (One-time) configure Kaggle API ----------
# Paste the contents of your kaggle.json as a dict here, or keep None if the
# Kaggle credentials are already configured on this machine.
KAGGLE_TOKEN_JSON = None
if KAGGLE_TOKEN_JSON:
    kagdir = Path.home() / ".kaggle"
    kagdir.mkdir(parents=True, exist_ok=True)
    token_path = kagdir / "kaggle.json"
    # Create the credentials file with owner-only permissions from the start,
    # so the API key is never briefly readable by other users between the
    # write and a later chmod.
    fd = os.open(token_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "w") as f:
        json.dump(KAGGLE_TOKEN_JSON, f)
    # The mode given to os.open only applies when the file is newly created;
    # tighten a pre-existing file as well.
    os.chmod(token_path, 0o600)

# ---------- LendingClub (Kaggle) ----------
!kaggle datasets download -d wordsforthewise/lending-club -p {RAW} -q
# Unpack every zip archive currently sitting in RAW.
for z in RAW.glob("*.zip"):
    with zipfile.ZipFile(z, "r") as zip_ref:
        zip_ref.extractall(RAW)

# "**/*.csv" already matches CSVs directly under RAW, so a single recursive
# glob is enough; combining it with "*.csv" listed every top-level file twice
# and duplicated its rows in the merged output. Directories whose name ends in
# ".csv" are dropped, and sorting keeps the merged row order stable across runs.
cand = sorted(p for p in RAW.glob("**/*.csv") if p.is_file())
print("LendingClub CSV candidates:", len(cand))
LENDING_OUT = RAW / "lendingclub.csv"
if not LENDING_OUT.exists() and cand:
    # A file is merged only if it has at least one of these columns.
    wanted = {"loan_status", "desc", "title", "purpose"}
    frames = []
    for c in cand:
        try:
            # Peek at a few rows first so unrelated CSVs are not loaded in full.
            head = pd.read_csv(c, nrows=20, low_memory=False)
            if wanted.isdisjoint(head.columns):
                continue
            frames.append(pd.read_csv(c, low_memory=False))
        except Exception as exc:
            # Best effort: one unreadable file must not abort the merge, but
            # say which file was skipped instead of hiding the failure.
            print("Skipping unreadable CSV:", c, "-", exc)
    if frames:
        df = pd.concat(frames, ignore_index=True)
        # Guarantee a "description" column, taken from "desc", else "title",
        # else left empty.
        if "description" not in df.columns:
            if "desc" in df.columns:
                df["description"] = df["desc"]
            elif "title" in df.columns:
                df["description"] = df["title"].astype(str)
            else:
                df["description"] = ""
        df.to_csv(LENDING_OUT, index=False)
        print("Saved:", LENDING_OUT)

# ---------- German Credit (Kaggle) ----------
!kaggle datasets download -d prena0808/statlog-german-credit-data -p {RAW} -q
# Unpack the downloaded German Credit archive(s) into RAW, then list a few of
# the files in RAW whose name contains "german" as a quick sanity check.
for archive in RAW.glob("statlog-german-credit-data*.zip"):
    with zipfile.ZipFile(archive, "r") as zf:
        zf.extractall(RAW)
german_files = list(RAW.glob("*german*.*"))
print("German Credit files (sample):", german_files[:5])

# ---------- MultiFin (HF) ----------
from datasets import load_dataset
# Load every split of MultiFin and stack them into one frame; the split a row
# came from is not recorded.
mf = load_dataset("awinml/MultiFin")
mf_all = pd.concat([pd.DataFrame(mf[s]) for s in mf], ignore_index=True)
# (A former rename of "text" -> "text" was a no-op and has been removed.)
multifin_out = RAW / "multifin_text.csv"
mf_all.to_csv(multifin_out, index=False)
print("Saved:", multifin_out, "rows:", len(mf_all))

# ---------- MultiFinBen EN/ES OCR (HF) ----------
# The English OCR set is required: a failure to load it propagates.
mben_en = load_dataset("TheFinAI/MultiFinBen-EnglishOCR", split="train")
# The Spanish OCR set is optional: any load failure just means it is skipped.
try:
    mben_es = load_dataset("TheFinAI/MultiFinBen-SpanishOCR", split="train")
except Exception:
    mben_es = None

# Write only the "text" column of each dataset that was loaded, English first.
ocr_exports = (
    (mben_en, "multifinben_en_text.csv"),
    (mben_es, "multifinben_es_text.csv"),
)
for ocr_ds, out_name in ocr_exports:
    if ocr_ds is None:
        continue
    text_col = pd.DataFrame(ocr_ds)["text"]
    text_col.to_csv(RAW / out_name, index=False)

# ---------- MAEC (GitHub) ----------
%pip install -q gitpython
from git import Repo
# Clone the MAEC repository once; an existing directory is reused as-is.
MAEC_DIR = RAW / "MAEC"
if not MAEC_DIR.exists():
    Repo.clone_from("https://github.com/Earnings-Call-Dataset/MAEC-A-Multimodal-Aligned-Earnings-Conference-Call-Dataset-for-Financial-Risk-Prediction", MAEC_DIR)

# One row per transcript: the text plus the name of the folder it came from.
rows = []
# Sort the matches so the CSV row order is the same on every run.
for textf in sorted(MAEC_DIR.glob("MAEC_Dataset/*/text.txt")):
    try:
        # Undecodable bytes are dropped rather than failing the whole file.
        text = textf.read_text(encoding="utf-8", errors="ignore")
    except OSError as exc:
        # Best effort: skip files that cannot be read, but report them.
        print("Skipping unreadable transcript:", textf, "-", exc)
        continue
    rows.append({"text": text, "source": textf.parent.name})
# Name the columns explicitly so the CSV still gets its header row (and can
# be loaded with pd.read_csv) when no transcripts were found.
maec_out = RAW / "maec_transcripts.csv"
pd.DataFrame(rows, columns=["text", "source"]).to_csv(maec_out, index=False)
print("MAEC transcripts saved:", maec_out)

print("ALL DOWNLOADS COMPLETE.")
