<a href="https://colab.research.google.com/github/Yasaman-habibi/Modeling_Report/blob/main/GPU_06.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# config.py

from pathlib import Path
import torch

class Config:
    """Central configuration for the ESG chunk-labelling pipeline.

    Every setting is a plain class attribute, evaluated once when the class
    body executes. Two things happen at that moment besides assignments:
    the output directory is created, and a warning is printed if the score
    weights do not sum to 1.
    """

    # -----------------------------------------------------
    # 1. Paths (compatible with Linux / GPU servers)
    # -----------------------------------------------------
    BASE_DIR = Path(".").resolve()

    INPUT_DIR = BASE_DIR / "data" / "input"
    OUTPUT_DIR = BASE_DIR / "data" / "output"

    # Side effect at class-definition time: make sure the output folder exists.
    OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

    # -----------------------------------------------------
    # 2. Classes
    # -----------------------------------------------------
    CLASSES = ["E", "S", "G", "EC"]

    # -----------------------------------------------------
    # 3. Embedding / GPU
    # -----------------------------------------------------
    EMBEDDING_MODEL = "all-MiniLM-L6-v2"

    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # Half precision is only requested when running on CUDA.
    USE_FP16 = DEVICE == "cuda"

    BATCH_SIZE_GPU = 256
    BATCH_SIZE_CPU = 32

    BATCH_SIZE = BATCH_SIZE_GPU if DEVICE == "cuda" else BATCH_SIZE_CPU

    # -----------------------------------------------------
    # Seed terms per class (one entry for every name in CLASSES)
    # -----------------------------------------------------
    CLASS_SEEDS = {
        "E": [
            "emission", "carbon", "Environmental", "Environmental protection", "Environmental impact",
            "environment", "climate", "Climate mitigation", "Climate change", "Carbon footprint",
            "Carbon emissions", "Pollutants", "Greenhouse Gas Emissions", "Decarbonization",
            "Renewable energy", "Clean energy", "Energy efficiency", "Recycling", "Circular economy",
            "Waste management", "Zero waste", "Natural Resources", "Resource management",
            "Earth", "Air", "biodiversity", "Atmospheric", "Water", "pollution",  "Green technology",
            "Pollution reduction", "Drought", "Water conservation", "Ground Warming",
            "Global Warming", "Species extinction", "Ecosystem preservation", "Sustainable materials",
            "Life cycle assessment", "Eco-friendly", "Sustainable agriculture",
        ],
        "S": [
            "employee", "safety", "Human rights", "CSR", "Corporate", "social", "responsibility", "Society",
            "Responsible consumption", "Demographic changes", "Famine", "Better life", "Diversity", "Inclusion",
            "Equality", "Labor practices", "Worker health", "Sustainable supply chain",
        ],
        "G": [
            "Economic", "Economy", "Green economy", "governance", "Risk management", "Executive compensation",
            "Shareholder", "audit", "board", "Management structure", "Fiduciary duty", "Internal controls",
            "Ethics", "Compliance", "Regulation", "Anti-corruption", "Corruption", "Bribery", "Legal",
            "Code of conduct", "Disclosure", "Transparency", "Non-Financial", "Reporting", "Accountability",
            "Data security",
        ],
        "EC": [
            "economic", "financial performance", "profitability",
            "economic growth", "market risk", "capital allocation",
            "investment", "revenue", "cost efficiency",
        ],
    }

    # -----------------------------------------------------
    # 4. Seed
    # -----------------------------------------------------
    RANDOM_STATE = 42

    # -----------------------------------------------------
    # 5. Score weights
    # -----------------------------------------------------
    ALPHA = 0.5
    GAMMA = 0.3
    BETA = 0.2

    # Warn (do not fail) when the three weights do not add up to 1.
    if abs(ALPHA + GAMMA + BETA - 1.0) >= 1e-6:
        print("⚠️ هشدار: مجموع وزن‌ها برابر ۱ نیست")

    # -----------------------------------------------------
    # 6. Decision thresholds
    # -----------------------------------------------------
    THRESH_LABEL = 0.6
    THRESH_MULTI = 0.45
    THRESH_NONE = 0.4

    # -----------------------------------------------------
    # 7. Chunking
    # -----------------------------------------------------
    # Chunks shorter than this many characters are discarded by the readers
    # and the preprocessor; 10 matches the value in their fallback configs.
    MIN_CHUNK_CHARS = 10
    # NOTE(review): all-MiniLM-L6-v2 is documented with a 256 word-piece input
    # limit; confirm that 512-token chunks are intended for this model.
    MAX_CHUNK_TOKENS = 512
    SPLIT_SIZE = 256
    SPLIT_OVERLAP = 50

    # -----------------------------------------------------
    # 8. Lexicon (TF-IDF)
    # -----------------------------------------------------
    TFIDF_TOPN = 50
    TFIDF_MIN_DF = 2


In [None]:
# io_utils.py

import json
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm
from typing import Iterator, Dict, Any

# Use the package-level Config when this code is imported as part of a
# package; otherwise fall back to a minimal stand-in that carries only the
# attributes listed below.
# NOTE(review): a relative import raises ImportError outside a package, so when
# these cells are run one after another this stand-in rebinds the name
# ``Config`` and hides any fuller Config defined earlier — confirm intended.
try:
    from .config import Config
except ImportError:
    class Config:
        MIN_CHUNK_CHARS = 10
        CLASSES = ["E", "S", "G" , "EC"]


def iter_cleaned_chunks(input_dir: Path) -> Iterator[Dict[str, Any]]:
    """Lazily yield one record per usable chunk found in ``input_dir``.

    Every ``*.json`` file directly inside ``input_dir`` is expected to hold a
    JSON object whose ``chunk_*`` keys map to dicts with a ``chunk_text``
    entry. Chunks are streamed one at a time so the whole corpus never has to
    sit in memory.

    Args:
        input_dir: Directory containing the cleaned JSON files.

    Yields:
        A dict with ``chunk_id``, ``document_id`` (the file stem),
        ``chunk_text`` (stripped), ``item_code`` and zero-initialised
        ``E_score``/``S_score``/``G_score``/``EC_score`` plus
        ``final_label="None"``.

    Unreadable or malformed files are reported on stdout and skipped; a single
    malformed chunk is skipped without discarding the rest of its file.
    """
    # Sorted so the chunk order (and everything derived from it) is reproducible.
    json_files = sorted(input_dir.glob("*.json"))
    print(f"در حال پردازش {len(json_files)} فایل JSON...")

    # Tolerate a Config that does not define the threshold.
    min_chars = getattr(Config, "MIN_CHUNK_CHARS", 10)

    for file_path in tqdm(json_files, desc="Processing JSON Files"):
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except json.JSONDecodeError:
            print(f"⚠️ فایل {file_path} JSON معتبر نیست.")
            continue
        except Exception as e:
            # Best-effort boundary: one unreadable file must not stop the run.
            print(f"⚠️ خطا در {file_path}: {e}")
            continue

        if not isinstance(data, dict):
            print(f"⚠️ خطا در {file_path}: top-level JSON value is not an object")
            continue

        document_id = file_path.stem

        for key, chunk_data in data.items():
            if not (key.startswith("chunk_") and isinstance(chunk_data, dict)):
                continue

            raw_text = chunk_data.get("chunk_text", "")
            if not isinstance(raw_text, str):
                # null / non-text payload: skip this chunk, keep the others.
                continue

            text = raw_text.strip()
            if not text or len(text) < min_chars:
                continue

            yield {
                "chunk_id": chunk_data.get("chunk_id"),
                "document_id": document_id,
                "chunk_text": text,
                "item_code": chunk_data.get("item_code", "N/A"),
                "E_score": 0.0,
                "S_score": 0.0,
                "G_score": 0.0,
                "EC_score": 0.0,
                "final_label": "None",
            }


def load_cleaned_jsons(input_dir: Path, chunk_size: int = 50_000) -> pd.DataFrame:
    """Collect every chunk from ``input_dir`` into one DataFrame, built in batches.

    Rows coming from ``iter_cleaned_chunks`` are gathered ``chunk_size`` at a
    time into partial frames that are concatenated at the end.

    Args:
        input_dir: Directory with the cleaned JSON files.
        chunk_size: Number of rows per intermediate frame.

    Returns:
        The combined DataFrame, indexed by ``chunk_id`` when that column is
        present; an empty DataFrame (after printing a warning) when no chunk
        was found.
    """
    parts = []
    pending = []

    for record in iter_cleaned_chunks(input_dir):
        pending.append(record)
        if len(pending) < chunk_size:
            continue
        # Batch is full: freeze it into a frame and start a new one.
        parts.append(pd.DataFrame(pending))
        pending.clear()

    # Flush the last, partially filled batch.
    if pending:
        parts.append(pd.DataFrame(pending))

    if len(parts) == 0:
        print("هشدار: هیچ چانک معتبری پیدا نشد.")
        return pd.DataFrame()

    df = pd.concat(parts, ignore_index=True)
    if "chunk_id" in df.columns:
        df = df.set_index("chunk_id")

    print(f"\nتعداد کل چانک‌های استخراج شده: {len(df)}")
    return df


def save_scored_dataframe(df: pd.DataFrame, output_dir: Path, filename: str = "scored_chunks.csv"):
    """Write ``df`` (including its index) to ``output_dir / filename`` as UTF-8 CSV.

    Args:
        df: Frame to persist.
        output_dir: Target directory, as a ``Path`` or a string; it is created
            (with parents) when missing.
        filename: Name of the CSV file inside ``output_dir``.
    """
    output_dir = Path(output_dir)
    # Create the folder up front so a fresh checkout does not fail on first save.
    output_dir.mkdir(parents=True, exist_ok=True)

    output_path = output_dir / filename
    df.to_csv(output_path, index=True, encoding="utf-8")
    print(f"\nDataFrame در {output_path} ذخیره شد.")


def load_previous_scores(output_dir: Path, filename: str = "scored_chunks.csv") -> pd.DataFrame:
    """Load a previously saved score table, indexed by ``chunk_id``.

    Args:
        output_dir: Directory the CSV was saved to.
        filename: Name of the CSV file inside ``output_dir``.

    Returns:
        The stored DataFrame, or an empty DataFrame when the file is missing.
    """
    try:
        df = pd.read_csv(
            output_dir / filename,
            index_col="chunk_id",
            encoding="utf-8",
            # The pipeline stores the literal strings "N/A" (item_code) and
            # "None" (final_label); pandas' default NA list would turn them
            # into NaN, so only empty fields are treated as missing.
            keep_default_na=False,
            na_values=[""],
        )
        print(f"DataFrame قبلی با {len(df)} سطر بارگذاری شد.")
        return df
    except FileNotFoundError:
        print("DataFrame قبلی پیدا نشد.")
        return pd.DataFrame()


In [None]:
# preprocessing.py

import re
import json
from pathlib import Path
from tqdm import tqdm
from typing import List, Dict, Any

from transformers import AutoTokenizer

# Use the package-level Config when available; otherwise fall back to a
# stand-in carrying only the settings this module reads.
# NOTE(review): the fallback paths below ("./raw_data", "./results") differ
# from the data/input and data/output layout used by the full Config in this
# file — confirm which layout is intended.
try:
    from .config import Config
except ImportError:
    class Config:
        MIN_CHUNK_CHARS = 10
        MAX_CHUNK_TOKENS = 512
        SPLIT_OVERLAP = 50
        EMBEDDING_MODEL = "all-MiniLM-L6-v2"
        INPUT_DIR = Path("./raw_data")
        OUTPUT_DIR = Path("./results")


# --------------------------------
# 1. پاک‌سازی متن
# --------------------------------
def clean_text(text: str) -> str:
    """Normalise raw text: drop dash rules and collapse whitespace.

    Runs of three or more ``-`` are replaced by a space, every whitespace run
    becomes a single space, and the result is stripped.

    Args:
        text: Raw text; any non-string value is treated as empty.

    Returns:
        The cleaned text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Remove dash rules first, so the whitespace pass below also collapses the
    # gap they leave behind ("a ---- b" -> "a b" rather than "a   b").
    text = re.sub(r'-{3,}', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


# --------------------------------
# 2. Chunking واقعی بر اساس tokenizer مدل
# --------------------------------
def split_text_into_chunks_by_tokens(
    text: str,
    tokenizer: AutoTokenizer,
    max_tokens: int,
    overlap: int
) -> List[Dict[str, Any]]:
    """Split ``text`` into overlapping windows of at most ``max_tokens`` tokens.

    The text is encoded without special tokens and without truncation, then
    cut into windows that start every ``max_tokens - overlap`` tokens. Each
    window is decoded back to text.

    Args:
        text: Text to split.
        tokenizer: Object providing ``encode`` and ``decode``.
        max_tokens: Maximum number of tokens per chunk.
        overlap: Number of tokens shared by consecutive chunks.

    Returns:
        A list of dicts with ``chunk_text``, ``n_tokens`` and a consecutive
        ``chunk_index``. Windows whose decoded text is shorter than the
        configured minimum character count are dropped.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``max_tokens``.
    """
    # Validate before paying for tokenization.
    step = max_tokens - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than max_tokens")

    input_ids = tokenizer.encode(
        text,
        add_special_tokens=False,
        truncation=False
    )

    # Tolerate a Config that does not define the threshold.
    min_chars = getattr(Config, "MIN_CHUNK_CHARS", 10)
    total = len(input_ids)

    chunks = []
    for start in range(0, total, step):
        chunk_ids = input_ids[start:start + max_tokens]
        chunk_text = tokenizer.decode(chunk_ids)

        if len(chunk_text) >= min_chars:
            chunks.append({
                "chunk_text": chunk_text,
                "n_tokens": len(chunk_ids),
                "chunk_index": len(chunks)
            })

        # This window already reached the end of the text; a further window
        # would only repeat tokens contained in it.
        if start + max_tokens >= total:
            break

    return chunks


# --------------------------------
# 3. اجرای خط لوله پیش‌پردازش
# --------------------------------
def _load_tokenizer(model_name: str):
    """Load the tokenizer, retrying under ``sentence-transformers/`` for short names."""
    try:
        return AutoTokenizer.from_pretrained(model_name)
    except OSError:
        # Short SentenceTransformers names such as "all-MiniLM-L6-v2" live
        # under the "sentence-transformers/" namespace on the Hub.
        if "/" in model_name:
            raise
        return AutoTokenizer.from_pretrained(f"sentence-transformers/{model_name}")


def run_preprocessing_pipeline(
    input_dir: Path,
    output_dir: Path,
    config: Any
):
    """Clean and chunk every ``*.txt`` file in ``input_dir`` into JSON files.

    For each text file a ``cleaned_<stem>.json`` is written to ``output_dir``,
    holding one ``chunk_NNN`` entry per chunk with ``chunk_id``,
    ``chunk_text``, ``n_tokens`` and ``item_code`` (always ``"N/A"`` here).
    Files whose output already exists are skipped, so the run can be resumed.

    Args:
        input_dir: Directory with the raw ``.txt`` files.
        output_dir: Directory for the JSON output; created when missing.
        config: Object exposing ``EMBEDDING_MODEL``, ``MAX_CHUNK_TOKENS`` and
            ``SPLIT_OVERLAP``.

    Per-file failures are printed and do not stop the run.
    """
    tokenizer = _load_tokenizer(config.EMBEDDING_MODEL)

    # Sorted for a reproducible processing order.
    raw_files = sorted(input_dir.glob("*.txt"))
    print(f"شروع پیش‌پردازش برای {len(raw_files)} فایل متنی")

    output_dir.mkdir(parents=True, exist_ok=True)

    for file_path in tqdm(raw_files, desc="Preprocessing"):
        document_id = file_path.stem
        output_json_path = output_dir / f"cleaned_{document_id}.json"

        if output_json_path.exists():
            continue

        try:
            with open(file_path, "r", encoding="utf-8") as f:
                raw_text = f.read()

            chunks = split_text_into_chunks_by_tokens(
                clean_text(raw_text),
                tokenizer,
                config.MAX_CHUNK_TOKENS,
                config.SPLIT_OVERLAP
            )

            output_data = {
                f"chunk_{idx:03d}": {
                    "chunk_id": f"{document_id}_chunk{idx:03d}",
                    "chunk_text": chunk["chunk_text"],
                    "n_tokens": chunk["n_tokens"],
                    "item_code": "N/A"
                }
                for idx, chunk in enumerate(chunks)
            }

            # Write to a temporary name and rename afterwards: an interrupted
            # run must not leave a truncated file that the exists() check
            # above would then skip forever.
            tmp_path = output_json_path.with_name(output_json_path.name + ".tmp")
            with open(tmp_path, "w", encoding="utf-8") as f:
                json.dump(output_data, f, ensure_ascii=False, indent=2)
            tmp_path.replace(output_json_path)

        except Exception as e:
            # Best-effort boundary: report the file and continue with the next.
            print(f"خطا در {file_path.name}: {e}")

    print("پیش‌پردازش کامل شد.")


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import json
import re
from typing import Dict, List, Tuple


# --------------------------------------------------
# استخراج Lexicon مبتنی بر TF-IDF
# --------------------------------------------------
def extract_top_tfidf_by_item(
    df,
    text_col: str = "chunk_text",
    item_col: str = "item_code",
    topn: int = 50,
    min_df: int = 2
) -> Dict[str, List[Tuple[str, float]]]:
    """Build a TF-IDF lexicon for every distinct value of ``item_col``.

    For each item, a ``TfidfVectorizer`` over 1- to 3-grams is fitted on that
    item's texts and the terms are ranked by their summed TF-IDF weight.

    Args:
        df: Frame holding the text and item columns.
        text_col: Column with the chunk text; missing values count as "".
        item_col: Column used to group the chunks.
        topn: Number of top-ranked terms kept per item.
        min_df: Minimum document frequency passed to the vectorizer.

    Returns:
        ``{item: [(term, score), ...]}`` sorted by descending score. Items
        with fewer than two texts, or for which no term survives the
        vectorizer's filtering, map to an empty list.
    """
    out = {}

    for item in df[item_col].unique():
        texts = df.loc[df[item_col] == item, text_col].fillna("").tolist()

        if len(texts) < 2:
            out[item] = []
            continue

        vec = TfidfVectorizer(
            ngram_range=(1, 3),
            min_df=min_df,
            max_features=5000
        )

        try:
            X = vec.fit_transform(texts)
        except ValueError:
            # Raised by scikit-learn when the vocabulary ends up empty (for
            # example every term pruned by min_df). One sparse item must not
            # abort the extraction for all the others.
            out[item] = []
            continue

        scores = X.sum(axis=0).A1
        terms = vec.get_feature_names_out()

        ranked = sorted(
            zip(terms, scores),
            key=lambda pair: pair[1],
            reverse=True
        )[:topn]

        # Plain Python types keep the result JSON-friendly.
        out[item] = [(str(term), float(score)) for term, score in ranked]

    return out


# --------------------------------------------------
# ذخیره / بارگذاری Lexicon
# --------------------------------------------------
def save_lexicons(lexicon_dict: dict, path):
    """Serialise ``lexicon_dict`` to ``path`` as indented, non-ASCII-escaped UTF-8 JSON."""
    payload = json.dumps(lexicon_dict, ensure_ascii=False, indent=2)
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(payload)


def load_lexicons(path):
    """Read the UTF-8 JSON file at ``path`` and return the decoded object."""
    with open(path, "r", encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)


# --------------------------------------------------
# امتیازدهی Lexicon برای هر chunk
# --------------------------------------------------
def lexicon_score_for_chunk(
    chunk_text: str,
    lexicons: Dict[str, List[Tuple[str, float]]]
) -> Dict[str, float]:
    """Score a chunk against each class lexicon by counting matched terms.

    A term counts once when it occurs in the chunk as a whole word or phrase
    (case-insensitive). The count is divided by the number of
    whitespace-separated words in the chunk.

    Args:
        chunk_text: Text of the chunk.
        lexicons: ``{class: terms}``. Each term may be a plain string (seed
            lists) or a ``(term, weight)`` pair / two-item list (TF-IDF
            lexicons, also after a JSON round trip); the weight is not used.

    Returns:
        ``{class: matched_terms / word_count}``; the word count is at least 1
        so empty text yields 0.0 for every class.
    """
    text = chunk_text.lower()
    total_words = max(1, len(text.split()))
    scores = {}

    for cls, term_list in lexicons.items():
        hits = 0
        for entry in term_list:
            # Accept both bare terms and (term, weight) pairs.
            term = entry if isinstance(entry, str) else entry[0]
            term = term.lower()
            if not term:
                # An empty pattern would match at any word boundary.
                continue
            if re.search(rf"\b{re.escape(term)}\b", text):
                hits += 1
        scores[cls] = hits / total_words

    return scores


In [None]:
# embeddings.py

import os
import json
import hashlib
import numpy as np
from typing import List, Optional, Tuple
from tqdm import tqdm

import torch
from sentence_transformers import SentenceTransformer


def _hash_text(text: str) -> str:
    """Return the hexadecimal SHA-1 digest of ``text`` encoded as UTF-8."""
    digest = hashlib.sha1()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()


class SBERTEmbedder:
    """Wrapper around ``SentenceTransformer`` with an optional on-disk cache.

    Cached vectors are stored as ``<sha1(text)>.npy`` files inside
    ``cache_dir``.
    NOTE(review): the cache key depends on the text only, not on the model, so
    a cache directory should not be shared between different models.
    """

    def __init__(
        self,
        model_name: str = "all-MiniLM-L6-v2",
        device: Optional[str] = None,
        batch_size: int = 128,
        cache_dir: Optional[str] = None,
        use_fp16: bool = True
    ):
        """Load the model and prepare the cache directory.

        Args:
            model_name: Name handed to ``SentenceTransformer``.
            device: Target device; defaults to "cuda" when available, else "cpu".
            batch_size: Batch size used when encoding.
            cache_dir: Directory for cached vectors; ``None`` disables caching.
            use_fp16: Convert the model to half precision when on "cuda".
        """
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.batch_size = batch_size
        self.cache_dir = cache_dir

        self.model = SentenceTransformer(model_name, device=self.device)

        if self.device == "cuda" and use_fp16:
            self.model = self.model.half()

        self.dim = self.model.get_sentence_embedding_dimension()

        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
            self.index_path = os.path.join(cache_dir, "emb_index.json")
            self.index = self._load_index()
        else:
            self.index = None

        print(f"SBERT loaded on {self.device} | dim={self.dim}")

    # ----------------------------
    # Index handling
    # ----------------------------
    def _load_index(self):
        """Return the JSON index stored in the cache dir, or ``{}`` if absent or unreadable."""
        if os.path.exists(self.index_path):
            try:
                with open(self.index_path, "r", encoding="utf-8") as f:
                    return json.load(f)
            except Exception:
                # A corrupt index is treated like a missing one.
                return {}
        return {}

    def _save_index(self):
        """Write the in-memory index to the cache dir (no-op without a cache dir)."""
        if self.cache_dir:
            with open(self.index_path, "w", encoding="utf-8") as f:
                json.dump(self.index, f)

    # ----------------------------
    # Core embedding
    # ----------------------------
    def embed_texts(
        self,
        texts: List[str],
        show_progress: bool = True
    ) -> np.ndarray:
        """Encode ``texts`` and return a float32 array with one row per text.

        An empty input returns an array of shape ``(0, dim)``. The model is
        asked for normalized embeddings.
        """
        if not texts:
            return np.zeros((0, self.dim), dtype=np.float32)

        embeddings = self.model.encode(
            texts,
            batch_size=self.batch_size,
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=show_progress
        )

        return embeddings.astype(np.float32)

    # ----------------------------
    # Cache-aware embedding
    # ----------------------------
    def embed_with_cache(
        self,
        texts: List[str],
        ids: Optional[List[str]] = None,
        show_progress: bool = True
    ) -> np.ndarray:
        """Embed ``texts``, reusing vectors already stored in the cache dir.

        Args:
            texts: Texts to embed.
            ids: When ``None`` (or when no cache dir is configured) the cache
                is bypassed. The values themselves are not used: the cache key
                is the SHA-1 of each text.
            show_progress: Forwarded to ``embed_texts``.

        Returns:
            Array with one row per entry of ``texts``, in input order;
            shape ``(0, dim)`` for empty input.
        """
        if not self.cache_dir or ids is None:
            return self.embed_texts(texts, show_progress=show_progress)

        # np.vstack cannot stack an empty list.
        if not texts:
            return np.zeros((0, self.dim), dtype=np.float32)

        results = [None] * len(texts)
        # hash -> (text, positions): each distinct missing text is embedded once.
        pending = {}

        # Iterate over texts alone so a shorter ``ids`` list cannot leave
        # unfilled rows behind.
        for pos, text in enumerate(texts):
            h = _hash_text(text)
            if h in pending:
                pending[h][1].append(pos)
                continue

            fname = os.path.join(self.cache_dir, f"{h}.npy")
            if os.path.exists(fname):
                results[pos] = np.load(fname)
            else:
                pending[h] = (text, [pos])

        if pending:
            new_embs = self.embed_texts(
                [text for text, _ in pending.values()],
                show_progress=show_progress
            )
            for (h, (_, positions)), emb in zip(pending.items(), new_embs):
                np.save(os.path.join(self.cache_dir, f"{h}.npy"), emb)
                for pos in positions:
                    results[pos] = emb

        return np.vstack(results)


In [None]:
# embeddings_mpnet.py

import os
import json
import hashlib
import numpy as np
from typing import List, Optional
from tqdm import tqdm

import torch
from sentence_transformers import SentenceTransformer


def _hash_text(text: str) -> str:
    """Return the 40-character hex SHA-1 of the UTF-8 bytes of ``text``."""
    encoded = text.encode("utf-8")
    hasher = hashlib.sha1(encoded)
    return hasher.hexdigest()


class MPNetEmbedder:
    """Embedder dedicated to the all-mpnet-base-v2 model.

    Wraps ``SentenceTransformer`` and optionally caches vectors on disk as
    ``<sha1(text)>.npy`` files inside ``cache_dir``.
    NOTE(review): the cache key depends on the text only, not on the model, so
    a cache directory should not be shared between different models.
    """

    def __init__(
        self,
        model_name: str = "all-mpnet-base-v2",
        device: Optional[str] = None,
        batch_size: int = 32,
        cache_dir: Optional[str] = None,
        use_fp16: bool = True
    ):
        """Load the model and prepare the cache directory.

        Args:
            model_name: Name handed to ``SentenceTransformer``.
            device: Target device; defaults to "cuda" when available, else "cpu".
            batch_size: Batch size used when encoding.
            cache_dir: Directory for cached vectors; ``None`` disables caching.
            use_fp16: Convert the model to half precision when on "cuda".
        """
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.batch_size = batch_size
        self.cache_dir = cache_dir

        self.model = SentenceTransformer(model_name, device=self.device)

        if self.device == "cuda" and use_fp16:
            self.model = self.model.half()

        self.dim = self.model.get_sentence_embedding_dimension()

        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)

        print(
            f"MPNet loaded | device={self.device} | dim={self.dim} | batch={self.batch_size}"
        )

    # -------------------------------------------------
    # Core embedding
    # -------------------------------------------------
    def embed_texts(
        self,
        texts: List[str],
        show_progress: bool = True
    ) -> np.ndarray:
        """Encode ``texts`` and return a float32 array with one row per text.

        An empty input returns an array of shape ``(0, dim)``. The model is
        asked for normalized embeddings.
        """
        if not texts:
            return np.zeros((0, self.dim), dtype=np.float32)

        embeddings = self.model.encode(
            texts,
            batch_size=self.batch_size,
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=show_progress
        )

        return embeddings.astype(np.float32)

    # -------------------------------------------------
    # Embedding with cache (keyed by text)
    # -------------------------------------------------
    def embed_with_cache(
        self,
        texts: List[str],
        show_progress: bool = True
    ) -> np.ndarray:
        """Embed ``texts``, reusing vectors already stored in the cache dir.

        Args:
            texts: Texts to embed.
            show_progress: Forwarded to ``embed_texts``.

        Returns:
            Array with one row per entry of ``texts``, in input order;
            shape ``(0, dim)`` for empty input.
        """
        if not self.cache_dir:
            return self.embed_texts(texts, show_progress=show_progress)

        # np.vstack cannot stack an empty list.
        if not texts:
            return np.zeros((0, self.dim), dtype=np.float32)

        results = [None] * len(texts)
        # hash -> (text, positions): each distinct missing text is embedded once.
        pending = {}

        for pos, text in enumerate(texts):
            h = _hash_text(text)
            if h in pending:
                pending[h][1].append(pos)
                continue

            path = os.path.join(self.cache_dir, f"{h}.npy")
            if os.path.exists(path):
                results[pos] = np.load(path)
            else:
                pending[h] = (text, [pos])

        if pending:
            new_embs = self.embed_texts(
                [text for text, _ in pending.values()],
                show_progress=show_progress
            )
            for (h, (_, positions)), emb in zip(pending.items(), new_embs):
                np.save(os.path.join(self.cache_dir, f"{h}.npy"), emb)
                for pos in positions:
                    results[pos] = emb

        return np.vstack(results)


In [None]:
# priors.py


from typing import Dict, List
from pathlib import Path
import json

import numpy as np
from typing import Iterator, Dict, Any

# Use the package-level Config when available; otherwise fall back to a
# stand-in that only provides the class list.
# NOTE(review): outside a package the relative import raises ImportError, so
# this stand-in rebinds ``Config`` and hides any fuller Config defined
# earlier in the same session — confirm intended.
try:
    from .config import Config
except ImportError:
    class Config:
        CLASSES = ["E", "S", "G" , "EC"]

# Ordered class names; prior lists are interpreted in this order.
CLASSES = Config.CLASSES


# Safe fallback: the same weight for every class, summing to 1.
UNIFORM_PRIOR = [1.0 / len(CLASSES)] * len(CLASSES)


def load_priors(priors_path: Path) -> Dict[str, List[float]]:
    """Read the priors JSON file at ``priors_path`` and return its decoded content."""
    with open(priors_path, "r", encoding="utf-8") as handle:
        return json.loads(handle.read())


def _list_to_dict(lst: List[float]) -> Dict[str, float]:
    """Pair the values of ``lst`` with the names in ``CLASSES``, position by position.

    Classes without a corresponding list entry get 0.0; surplus list entries
    are ignored. With four classes, for example,
    [0.4, 0.2, 0.4] -> {'E': 0.4, 'S': 0.2, 'G': 0.4, 'EC': 0.0}.
    """
    available = len(lst)
    mapping = {}
    for position, cls in enumerate(CLASSES):
        if position < available:
            mapping[cls] = float(lst[position])
        else:
            mapping[cls] = 0.0
    return mapping


def normalize_prior(d: Dict[str, float]) -> Dict[str, float]:
    """Rescale the values of ``d`` so they sum to 1.

    When the total is zero or negative, every key receives the same share
    ``1 / len(d)`` instead.
    """
    total = sum(d.values())
    if total <= 0:
        share_count = len(d)
        return {key: 1.0 / share_count for key in d}

    rescaled = {}
    for key, value in d.items():
        rescaled[key] = value / total
    return rescaled


def get_prior_for_item(
    item_code: str,
    priors_map: Dict[str, List[float]],
) -> Dict[str, float]:
    """Return the normalised class prior that applies to ``item_code``.

    Lookup order, on the stripped and upper-cased code:
      1. exact match,
      2. its first two characters, then its first character (e.g. 1A -> 1),
      3. the ``"unknown"`` entry of ``priors_map``, or a uniform prior.

    Args:
        item_code: Item identifier. ``None``, NaN and other non-string values
            are accepted: missing values behave like an empty code, anything
            else is converted with ``str``.
        priors_map: ``{code: [prior per class, in CLASSES order]}``.

    Returns:
        ``{class: prior}`` with values summing to 1.
    """
    if isinstance(item_code, str):
        key = item_code
    elif item_code is None or item_code != item_code:
        # ``x != x`` is true only for NaN, which is what a missing item code
        # becomes after a CSV round trip through pandas.
        key = ""
    else:
        key = str(item_code)

    ku = key.strip().upper()

    # 1) exact, 2) two-character prefix, 3) one-character prefix
    candidates = [ku]
    if len(ku) >= 2:
        candidates.append(ku[:2])
    if len(ku) >= 1:
        candidates.append(ku[:1])

    for candidate in candidates:
        if candidate in priors_map:
            return normalize_prior(_list_to_dict(priors_map[candidate]))

    # Nothing matched: use the configured "unknown" prior, else uniform.
    return normalize_prior(
        _list_to_dict(priors_map.get("unknown", UNIFORM_PRIOR))
    )


In [None]:
# scoring.py

import numpy as np
from typing import Iterator, Dict, Any

# Use the package-level Config when available; otherwise fall back to a
# stand-in that only provides the class list.
# NOTE(review): outside a package the relative import raises ImportError, so
# this stand-in rebinds ``Config`` and hides any fuller Config defined
# earlier in the same session — confirm intended.
try:
    from .config import Config
except ImportError:
    class Config:
        CLASSES = ["E", "S", "G" , "EC"]

# Ordered class names used by every scoring function in this module.
CLASSES = Config.CLASSES


def l2_norm(vec: np.ndarray) -> np.ndarray:
    """Scale ``vec`` to unit Euclidean length; a zero vector is returned unchanged."""
    magnitude = np.linalg.norm(vec)
    return vec if magnitude == 0 else vec / magnitude


def compute_embedding_sims(
    chunk_vec: np.ndarray,
    class_reps: Dict[str, np.ndarray]
) -> Dict[str, float]:
    """Cosine similarity between a chunk vector and each class representative.

    Args:
        chunk_vec: Embedding of the chunk; ``None`` is replaced by a zero
            vector sized like the first representative.
        class_reps: ``{class: representative vector}``.

    Returns:
        ``{class: similarity}`` for every name in ``CLASSES``, clamped to
        [-1, 1]; classes without a representative get 0.0.
    """
    if chunk_vec is None:
        first_rep = next(iter(class_reps.values()))
        chunk_vec = np.zeros(first_rep.shape[0])

    unit_chunk = l2_norm(chunk_vec.astype(float))

    sims = {}
    for cls in CLASSES:
        rep = class_reps.get(cls)
        if rep is not None:
            unit_rep = l2_norm(rep.astype(float))
            cosine = float(np.dot(unit_chunk, unit_rep))
            # Clamp away floating-point overshoot.
            sims[cls] = max(-1.0, min(1.0, cosine))
        else:
            sims[cls] = 0.0

    return sims


def normalize_scores(d: Dict[str, float]) -> Dict[str, float]:
    """Min-max scale the values of ``d`` into [0, 1].

    When all values are (almost) identical — spread of at most 1e-9 — every
    key gets the same share ``1 / len(d)``. An empty dict stays empty.
    """
    if not d:
        return {}

    vals = np.array(list(d.values()), dtype=float)
    lowest, highest = vals.min(), vals.max()

    if highest - lowest <= 1e-9:
        return dict.fromkeys(d, 1.0 / len(d))

    scaled = {}
    for key, value in d.items():
        scaled[key] = float((value - lowest) / (highest - lowest))
    return scaled


def softmax_dict(d: Dict[str, float], temp: float = 1.0) -> Dict[str, float]:
    """Apply a temperature-scaled softmax to the values of ``d``.

    Values are divided by ``temp``, shifted by their maximum before
    exponentiation, and normalised to sum to 1. Keys keep their order.
    """
    scaled = np.array(list(d.values()), dtype=float) / float(temp)
    # Shifting by the maximum leaves the result unchanged and avoids overflow.
    scaled -= scaled.max()
    weights = np.exp(scaled)
    probabilities = weights / weights.sum()

    result = {}
    for key, probability in zip(d.keys(), probabilities):
        result[key] = float(probability)
    return result


def combine_scores(
    sim_scores: Dict[str, float],
    lex_scores: Dict[str, float],
    prior_scores: Dict[str, float],
    alpha: float = 0.5,
    gamma: float = 0.3,
    beta: float = 0.2,
    temp: float = 1.0
):
    """Blend similarity, lexicon and prior scores into class probabilities.

    Each input is restricted to ``CLASSES`` and min-max normalised, then
    combined as ``alpha * sim + gamma * lex + beta * prior`` with the weights
    rescaled to sum to 1. The blend is turned into probabilities by a softmax.

    Args:
        sim_scores: Embedding similarity per class (missing classes -> 0.0).
        lex_scores: Lexicon score per class (missing classes -> 0.0).
        prior_scores: Prior per class; ``None``/empty means uniform.
        alpha: Weight of the similarity score.
        gamma: Weight of the lexicon score.
        beta: Weight of the prior.
        temp: Softmax temperature. Because the blended scores lie in [0, 1],
            the top probability at ``temp=1.0`` is bounded by
            ``e / (e + N - 1)`` for N classes (about 0.475 for four classes);
            lower temperatures sharpen the distribution.

    Returns:
        ``(combined, probs)``: the blended score per class and the softmax
        probabilities per class.
    """
    prior_scores = prior_scores or {}
    uniform = 1.0 / len(CLASSES)

    sim_n = normalize_scores({k: sim_scores.get(k, 0.0) for k in CLASSES})
    lex_n = normalize_scores({k: lex_scores.get(k, 0.0) for k in CLASSES})
    prior_n = normalize_scores({
        k: float(prior_scores.get(k, uniform))
        for k in CLASSES
    })

    total_w = alpha + gamma + beta
    if total_w <= 0:
        # Degenerate weights: rely on the similarity score alone.
        alpha, gamma, beta = 1.0, 0.0, 0.0
        total_w = 1.0

    alpha, gamma, beta = alpha / total_w, gamma / total_w, beta / total_w

    combined = {
        k: alpha * sim_n[k] + gamma * lex_n[k] + beta * prior_n[k]
        for k in CLASSES
    }

    probs = softmax_dict(combined, temp=temp)
    return combined, probs


def decide_label(
    probs: Dict[str, float],
    thresh_label: float = 0.6,
    thresh_multi: float = 0.45,
    thresh_none: float = 0.4
):
    """Turn class probabilities into a label decision.

    Args:
        probs: ``{class: probability}``.
        thresh_label: Top probability at or above this -> confident single label.
        thresh_multi: Every class at or above this joins a multi-label result.
        thresh_none: Top probability below this -> no label.

    Returns:
        ``(label_type, labels)`` where ``label_type`` is
        ``"ambiguous"`` (empty list), ``"single"`` (one label) or
        ``"multi"`` (two or more labels, in the order of ``probs``).
    """
    if not probs:
        return "ambiguous", []

    # First-encountered maximum, i.e. dict order breaks ties.
    max_label, max_prob = max(probs.items(), key=lambda kv: kv[1])

    if max_prob < thresh_none:
        return "ambiguous", []

    if max_prob >= thresh_label:
        return "single", [max_label]

    labels = [k for k, v in probs.items() if v >= thresh_multi]
    # "multi" is only meaningful with at least two labels; a lone class above
    # thresh_multi is reported as a single label.
    if len(labels) >= 2:
        return "multi", labels

    return "single", [max_label]


In [None]:
# pipeline.py
import pandas as pd
import numpy as np


def build_class_representatives_from_seed(seed_lexicons, embedder):
    """Build one unit-length representative vector per class from its seed terms.

    Args:
        seed_lexicons: ``{class: [seed term, ...]}``.
        embedder: Object exposing ``dim`` and ``embed_texts(terms)``.

    Returns:
        ``{class: vector}``: the mean of the seed-term embeddings divided by
        its norm (left as is when the norm is zero). A class without terms
        gets a zero vector of length ``embedder.dim``.
    """
    def _centroid(terms):
        if not terms:
            return np.zeros(embedder.dim)
        mean_vec = embedder.embed_texts(terms).mean(axis=0)
        length = np.linalg.norm(mean_vec)
        return mean_vec / (length if length > 0 else 1.0)

    return {cls: _centroid(terms) for cls, terms in seed_lexicons.items()}


def run_pipeline(
    input_dir=None,
    output_dir=None,
    priors_map=None,
    lexicon_path=None,
    extract_seeds=False,
):
    """Label every cleaned chunk and write the result to ``chunks_esg_labels.csv``.

    Steps: load the chunks, clean their text, build class representatives from
    the seed lexicons, embed the chunks, blend similarity / lexicon / prior
    scores, and decide a label per chunk.

    Args:
        input_dir: Directory with the cleaned JSON files; defaults to
            ``Config.INPUT_DIR``.
        output_dir: Directory for the CSV; defaults to ``Config.OUTPUT_DIR``
            and is created when missing.
        priors_map: ``{item_code: [prior per class]}``; required.
        lexicon_path: Optional JSON lexicon file. Without it the seeds come
            from ``Config.CLASS_SEEDS`` when defined, otherwise from the
            module-level ``E``/``S``/``G``/``core_esg_seeds`` lists.
        extract_seeds: Accepted so existing calls keep working; currently
            ignored.

    Returns:
        DataFrame with columns ``file_id``, ``item_code``, ``chunk_id``,
        ``label_type``, ``labels`` and ``probs`` (empty when no chunk was
        found).

    Raises:
        ValueError: If ``priors_map`` is not provided.
    """
    # Local import: this module only imports pandas and numpy at the top.
    from pathlib import Path

    if priors_map is None:
        raise ValueError("priors_map must be provided")

    def _setting(name, default):
        # Settings live on Config; fall back when a reduced Config is in use.
        return getattr(Config, name, default)

    if input_dir is None:
        input_dir = _setting("INPUT_DIR", Path("data") / "input")
    if output_dir is None:
        output_dir = _setting("OUTPUT_DIR", Path("data") / "output")
    input_dir, output_dir = Path(input_dir), Path(output_dir)

    result_columns = [
        "file_id", "item_code", "chunk_id", "label_type", "labels", "probs",
    ]

    # ---------- load data ----------
    df = load_cleaned_jsons(input_dir)
    print(f"Loaded {len(df)} chunks")
    if df.empty:
        return pd.DataFrame(columns=result_columns)

    df["chunk_text_clean"] = df["chunk_text"].fillna("").apply(clean_text)

    # ---------- lexicons ----------
    if lexicon_path:
        raw_lexicons = load_lexicons(lexicon_path)
    else:
        raw_lexicons = getattr(Config, "CLASS_SEEDS", None)
        if raw_lexicons is None:
            raw_lexicons = {
                "E": E + core_esg_seeds,
                "S": S + core_esg_seeds,
                "G": G + core_esg_seeds,
            }

    # Entries may be bare terms or (term, weight) pairs; the embedder needs
    # plain strings, the lexicon scorer is given pairs.
    seed_terms = {
        cls: [entry if isinstance(entry, str) else entry[0] for entry in entries]
        for cls, entries in raw_lexicons.items()
    }
    scoring_lexicons = {
        cls: [(term, 1.0) for term in terms]
        for cls, terms in seed_terms.items()
    }

    # ---------- embedder (GPU inside) ----------
    embedder = SBERTEmbedder(
        model_name=_setting("EMBEDDING_MODEL", "all-MiniLM-L6-v2"),
        batch_size=_setting("BATCH_SIZE", 32),
        cache_dir="emb_cache",
    )

    class_reps = build_class_representatives_from_seed(seed_terms, embedder)

    # ---------- embeddings ----------
    # The loader moves chunk_id into the index; support both layouts.
    if "chunk_id" in df.columns:
        ids = df["chunk_id"].tolist()
    else:
        ids = df.index.tolist()
    texts = df["chunk_text_clean"].tolist()

    print("Computing embeddings (GPU if available)...")
    all_embs = embedder.embed_with_cache(texts, ids)

    # ---------- scoring ----------
    n = len(df)
    if "item_code" in df.columns:
        item_codes = df["item_code"].tolist()
    else:
        item_codes = ["unknown"] * n

    # The loader names the source-file column "document_id".
    if "file_id" in df.columns:
        file_ids = df["file_id"].tolist()
    elif "document_id" in df.columns:
        file_ids = df["document_id"].tolist()
    else:
        file_ids = [None] * n

    alpha = _setting("ALPHA", 0.5)
    gamma = _setting("GAMMA", 0.3)
    beta = _setting("BETA", 0.2)
    thresh_label = _setting("THRESH_LABEL", 0.6)
    thresh_multi = _setting("THRESH_MULTI", 0.45)
    thresh_none = _setting("THRESH_NONE", 0.4)

    results = []
    for chunk_id, text, item_code, file_id, emb in zip(
        ids, texts, item_codes, file_ids, all_embs
    ):
        sim_scores = compute_embedding_sims(emb, class_reps)
        lex_scores = lexicon_score_for_chunk(text, scoring_lexicons)
        prior_dict = get_prior_for_item(item_code, priors_map)

        _, probs = combine_scores(
            sim_scores,
            lex_scores,
            prior_dict,
            alpha=alpha,
            beta=beta,
            gamma=gamma,
        )

        label_type, labels = decide_label(
            probs,
            thresh_label=thresh_label,
            thresh_multi=thresh_multi,
            thresh_none=thresh_none,
        )

        results.append(
            {
                "file_id": file_id,
                "item_code": item_code,
                "chunk_id": chunk_id,
                "label_type": label_type,
                "labels": labels,
                "probs": probs,
            }
        )

    outdf = pd.DataFrame(results, columns=result_columns)
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / "chunks_esg_labels.csv"
    outdf.to_csv(output_path, index=False)

    print(f"Saved results to {output_path}")
    return outdf


In [None]:
# evaluate.py
import pandas as pd
import random

def sample_for_manual_review(labels_csv, n=200, stratify_col="item_code"):
    """Draw a reproducible, roughly stratified sample of labelled chunks.

    Each group of ``stratify_col`` contributes about ``n * group_share`` rows
    (at least one, never more than it holds). The combined sample is shuffled,
    capped at ``n`` rows, and written next to the input as
    ``<name>.sample_<n>.csv``.

    Args:
        labels_csv: Path (string or ``Path``) of the labels CSV.
        n: Target sample size.
        stratify_col: Column to stratify on; missing values form their own
            group.

    Returns:
        The sampled DataFrame (it can hold fewer than ``n`` rows).
    """
    df = pd.read_csv(labels_csv)
    total = len(df)

    # Stratified sample by item_code (simple). dropna=False keeps rows whose
    # stratum was parsed as missing, e.g. the literal "N/A" item code.
    parts = []
    for _, group in df.groupby(stratify_col, dropna=False):
        quota = max(1, int(n * len(group) / total))
        parts.append(group.sample(min(quota, len(group)), random_state=42))

    if parts:
        sample = pd.concat(parts).reset_index(drop=True)
    else:
        sample = df.iloc[0:0]

    # Flooring the quotas can leave fewer than n rows; never ask for more
    # rows than the stratified sample actually contains.
    sample = sample.sample(min(n, len(sample)), random_state=42)

    # Export the sample for annotation.
    sample.to_csv(str(labels_csv).replace(".csv", f".sample_{n}.csv"), index=False)
    print("Sample saved for manual review.")
    return sample


In [None]:
# from embeddings import SBERTEmbedder
# from pipeline import build_class_representatives_from_seed, run_pipeline
# from config import EMBEDDING_MODEL, BATCH_SIZE, CLASSES

# Smoke test: build the class representatives on CPU, then run the whole
# pipeline on the configured input folder.
from pathlib import Path

# Settings are read from Config, with fallbacks for a reduced Config that
# does not carry them.
embedding_model = getattr(Config, "EMBEDDING_MODEL", "all-MiniLM-L6-v2")
batch_size = getattr(Config, "BATCH_SIZE", 32)
input_dir = getattr(Config, "INPUT_DIR", Path("data") / "input")
output_dir = getattr(Config, "OUTPUT_DIR", Path("data") / "output")

embedder = SBERTEmbedder(
    model_name=embedding_model,
    device="cpu",
    batch_size=batch_size,
    cache_dir="emb_cache_test",
)

# Terms shared by every class.
core_esg_seeds = ["Sustainable", "Sustainability", "ESG", "Development", "Sustainable development",
                  "SDGs", "Environmental, social, and governance", "Sustainable finance",
                  "Sustainable innovation", "Economic sustainability", "Economy", "Green economy",
                  "Crisis", "Future Needs"]

E = ["emission", "carbon", "Environmental", "Environmental protection", "Environmental impact",
     "environment", "climate", "Climate mitigation", "Climate change", "Carbon footprint",
     "Carbon emissions", "Pollutants", "Greenhouse Gas Emissions", "Decarbonization",
     "Renewable energy", "Clean energy", "Energy efficiency", "Recycling", "Circular economy",
     "Waste management", "Zero waste", "Natural Resources", "Resource management",
     "Earth", "Air", "biodiversity", "Atmospheric", "Water", "pollution",  "Green technology",
     "Pollution reduction", "Drought", "Water conservation", "Ground Warming",
     "Global Warming", "Species extinction", "Ecosystem preservation", "Sustainable materials",
     "Life cycle assessment", "Eco-friendly", "Sustainable agriculture"]

S = ["employee", "safety", "Human rights", "CSR", "Corporate", "social", "responsibility", "Society",
     "Responsible consumption", "Demographic changes", "Famine", "Better life", "Diversity", "Inclusion",
     "Equality", "Labor practices", "Worker health", "Sustainable supply chain"]

G = ["Economic", "Economy", "Green economy", "governance", "Risk management", "Executive compensation",
     "Shareholder", "audit", "board", "Management structure", "Fiduciary duty", "Internal controls",
     "Ethics", "Compliance", "Regulation", "Anti-corruption", "Corruption", "Bribery", "Legal",
     "Code of conduct", "Disclosure", "Transparency", "Non-Financial", "Reporting", "Accountability",
     "Data security"]

# The class list also contains "EC"; without seeds it would have no
# representative vector and always score 0.0 on similarity.
EC = ["economic", "financial performance", "profitability",
      "economic growth", "market risk", "capital allocation",
      "investment", "revenue", "cost efficiency"]

seed = {
    "E": E + core_esg_seeds,
    "S": S + core_esg_seeds,
    "G": G + core_esg_seeds,
    "EC": EC + core_esg_seeds,
}

reps = build_class_representatives_from_seed(seed, embedder)
print("class reps dim:", {k:v.shape for k,v in reps.items()})

# Then run the pipeline on the (small) sample folder.
# run_pipeline requires a priors map and no priors file is configured here,
# so a uniform prior is used as a placeholder.
# TODO(review): replace with load_priors(<path>) once a priors file exists.
priors_map = {"unknown": UNIFORM_PRIOR}
out = run_pipeline(input_dir, output_dir, priors_map=priors_map, lexicon_path=None)

print(out.head())