In [None]:
# wikipedia_scoring_pipeline.py
"""
Pipeline pour calculer les scores Heat, Quality, ActorRisk et le score global
à partir des scripts Jupyter existants.

Hypothèses :
- Chaque notebook a été factorisé en un module Python exposant une fonction
  qui renvoie la métrique principale sous forme de Pandas Series ou scalaire.
  * ano_editor   -> get_anon_edit_share(pages, start, end)
  * edit         -> get_revert_rate(pages, start, end),
                    get_author_revertrisk(pages, start, end)
  * pageviews    -> get_pageview_spikes(pages, start, end)
  * protection   -> get_protection_level(pages)
  * taille_talk  -> get_talk_activity(pages, start, end)
  * ref          -> get_citation_gap(pages)
  * readability  -> get_readability_score(pages)
- Toutes les fonctions acceptent une liste de titres d’articles et renvoient
  un DataFrame indexé par "page" avec au moins la colonne "value".

Étapes :
1. Récupération des métriques unitaires.
2. Mise à l’échelle 0‑1 par min‑max.
3. Agrégation pondérée en trois pôles : Heat, Quality, ActorRisk.
4. Score global = α*Heat + β*ActorRisk – γ*Quality.

Vous pouvez ajuster les poids HEAT_WEIGHTS, QUALITY_WEIGHTS, RISK_WEIGHTS
et GLOBAL_WEIGHTS selon la validation empirique.
"""

from __future__ import annotations
from dataclasses import dataclass
from typing import List, Dict
import pandas as pd
import numpy as np

# ────────────────────────────────────────────  Poids (modifiable) ─────────────
# Per-metric weights of the Heat pole (they add up to 1.0).
HEAT_WEIGHTS = {
    "revert_rate": 0.3,
    "protection_level": 0.2,
    "pageview_spike": 0.3,
    "talk_intensity": 0.2,
}

# Per-metric weights of the Quality pole. Metrics that degrade quality carry
# a negative weight.
QUALITY_WEIGHTS = {
    # negative: a larger citation gap means lower quality
    "citation_gap": -0.3,
    "article_quality": 0.4,
    # negative: older sources are penalised
    "source_age": -0.1,
    "readability": 0.2,
    "unreliable_refs": -0.2,
}

# Per-metric weights of the ActorRisk pole (they add up to 1.0).
RISK_WEIGHTS = {
    "anon_share": 0.25,
    "sockpuppet_score": 0.25,
    "micro_edits": 0.2,
    "author_revertrisk": 0.2,
    "automod_flag": 0.1,
}

# Weights of the three poles in the global score; quality is subtracted.
GLOBAL_WEIGHTS = {
    "heat": 0.4,
    "quality": -0.3,
    "risk": 0.3,
}

# ────────────────────────────────────────────  Helpers ────────────────────────

def min_max_scale(series: pd.Series) -> pd.Series:
    """Rescale a series linearly so that its values span [0, 1].

    An empty series is returned as-is. When every value is identical the
    span is zero, so a divisor of 1 is used and every value maps to 0.
    """
    if series.empty:
        return series

    lowest = series.min()
    span = series.max() - lowest
    if not span:
        # Constant series: avoid dividing by zero.
        span = 1
    return (series - lowest) / span

@dataclass
class MetricResult:
    """A named metric paired with its per-page values.

    NOTE(review): not referenced anywhere in the visible part of this file.
    """
    name: str
    values: pd.Series  # index = page, value = raw metric

@dataclass
class ScoringResult:
    """Bundle of the four score series returned by ``compute_scores``."""
    heat: pd.Series  # weighted sum of the HEAT_WEIGHTS metrics
    quality: pd.Series  # weighted sum of the QUALITY_WEIGHTS metrics
    risk: pd.Series  # weighted sum of the RISK_WEIGHTS metrics
    global_score: pd.Series  # heat/quality/risk combined with GLOBAL_WEIGHTS

# ────────────────────────────────────────────  Pipeline ───────────────────────

# Metrics whose provider modules are not wired into the pipeline yet (their
# imports were disabled in the original code). Each entry maps the metric name
# to (module name, function name, whether the function takes start/end).
_OPTIONAL_PROVIDERS = {
    "article_quality": ("artqual", "get_article_quality", False),
    "source_age": ("val", "get_source_age_avg", False),
    "unreliable_refs": ("val", "get_unreliable_ref_share", False),
    "sockpuppet_score": ("sockdetect", "get_sockpuppet_score", False),
    "micro_edits": ("val", "get_micro_edit_rate", True),
    "automod_flag": ("automod", "get_automod_flag_count", False),
}


def _load_optional_provider(module_name: str, func_name: str):
    """Return the provider callable, or None (with a warning) if unavailable."""
    import importlib
    import warnings

    try:
        return getattr(importlib.import_module(module_name), func_name)
    except (ImportError, AttributeError) as exc:
        warnings.warn(
            f"metric provider {module_name}.{func_name} is unavailable "
            f"({exc}); the metric is left out of the scores",
            RuntimeWarning,
            stacklevel=3,
        )
        return None


def _as_series(result, pages: List[str]) -> pd.Series:
    """Coerce a provider result to a Series of per-page values.

    A DataFrame contributes its "value" column, a Series is used unchanged,
    and anything else (e.g. a scalar) is spread over ``pages``.
    """
    if isinstance(result, pd.DataFrame):
        return result["value"]
    if isinstance(result, pd.Series):
        return result
    return pd.Series(result, index=pages)


def compute_scores(pages: List[str], start: str, end: str) -> ScoringResult:
    """Compute the heat, quality, risk and global scores for ``pages``.

    Args:
        pages: Article titles handed to every metric provider.
        start: Start of the observation period, forwarded unchanged to the
            providers that take a period.
        end: End of the observation period, forwarded the same way.

    Returns:
        A ScoringResult holding one Series per score.

    Raises:
        ImportError: If one of the core provider modules (ano_editor, edit,
            pageviews, protection, taille_talk, ref, readability) is missing.

    Metrics listed in ``_OPTIONAL_PROVIDERS`` are skipped with a
    RuntimeWarning when their module cannot be imported. A skipped metric
    contributes nothing to its pole and the remaining weights are left
    unchanged, so results are the same as a full run whenever every provider
    is available.
    """
    # 1. Collect the unit metrics. The core providers are hard requirements.
    from ano_editor import get_anon_edit_share        # share of anonymous edits
    from edit import get_author_revertrisk            # author_revertrisk
    from edit import get_revert_rate                  # revert_rate
    from pageviews import get_pageview_spikes         # pageview_spike
    from protection import get_protection_level       # protection_level
    from readability import get_readability_score     # readability
    from ref import get_citation_gap                  # citation_gap
    from taille_talk import get_talk_activity         # talk_intensity

    raw = {
        "revert_rate": get_revert_rate(pages, start, end),
        "protection_level": get_protection_level(pages),
        "pageview_spike": get_pageview_spikes(pages, start, end),
        "talk_intensity": get_talk_activity(pages, start, end),
        "citation_gap": get_citation_gap(pages),
        "readability": get_readability_score(pages),
        "anon_share": get_anon_edit_share(pages, start, end),
        "author_revertrisk": get_author_revertrisk(pages, start, end),
    }

    # Calling the optional providers unconditionally raised NameError, because
    # their imports are disabled; resolve them dynamically instead.
    for metric, (module_name, func_name, takes_period) in _OPTIONAL_PROVIDERS.items():
        provider = _load_optional_provider(module_name, func_name)
        if provider is None:
            continue
        if takes_period:
            raw[metric] = provider(pages, start, end)
        else:
            raw[metric] = provider(pages)

    df = pd.DataFrame(
        {metric: _as_series(values, pages) for metric, values in raw.items()}
    )

    # 2. Scale every metric to [0, 1].
    df_norm = df.apply(min_max_scale, axis=0)

    # 3. Aggregate per pole. reindex() adds an all-NaN column for each metric
    #    that is missing, and sum() skips NaN, so a missing metric adds zero.
    def weighted_pole(weights: Dict[str, float]) -> pd.Series:
        columns = df_norm.reindex(columns=list(weights))
        return (columns * pd.Series(weights)).sum(axis=1)

    heat = weighted_pole(HEAT_WEIGHTS)
    quality = weighted_pole(QUALITY_WEIGHTS)
    risk = weighted_pole(RISK_WEIGHTS)

    # 4. Global score: weighted combination of the three poles.
    poles = pd.concat(
        [heat.rename("heat"), quality.rename("quality"), risk.rename("risk")],
        axis=1,
    )
    global_score = (poles * pd.Series(GLOBAL_WEIGHTS)).sum(axis=1)

    return ScoringResult(heat, quality, risk, global_score)

# ────────────────────────────────────────────  Exécution directe ─────────────
# Command-line entry point: score the given pages and print the result.
# NOTE: inside a Jupyter kernel sys.argv holds the kernel's own arguments, so
# argparse exits with "the following arguments are required: pages"; call
# compute_scores(...) directly from a notebook instead.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Calcule les scores Wikipédia PROMPT.")
    parser.add_argument("pages", nargs="+", help="Titres d’articles Wikipédia avec _")
    parser.add_argument("--start", default="2023-01-01", help="Date début YYYY-MM-DD")
    parser.add_argument("--end",   default="2023-12-31", help="Date fin YYYY-MM-DD")
    parser.add_argument("--json",  action="store_true", help="Sortie JSON plutôt que tableau")
    args = parser.parse_args()

    res = compute_scores(args.pages, args.start, args.end)
    # One row per page, one column per score.
    output = pd.DataFrame({
        "heat": res.heat,
        "quality": res.quality,
        "risk": res.risk,
        "global": res.global_score
    })

    if args.json:
        print(output.to_json(orient="index", indent=2, force_ascii=False))
    else:
        rounded = output.round(3)
        try:
            print(rounded.to_markdown())
        except ImportError:
            # to_markdown() needs the optional "tabulate" package; fall back
            # to the plain-text table when it is not installed.
            print(rounded.to_string())


usage: ipykernel_launcher.py [-h] [--start START] [--end END] [--json]
                             pages [pages ...]
ipykernel_launcher.py: error: the following arguments are required: pages


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
