In [2]:
!pip install pandas transformers accelerate torch



In [3]:
!pip install spacy
!pip install cupy-cuda12x



In [4]:
!python -m spacy download fr_dep_news_trf

Collecting fr-dep-news-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_dep_news_trf-3.8.0/fr_dep_news_trf-3.8.0-py3-none-any.whl (397.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m397.7/397.7 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_dep_news_trf')


In [7]:
import spacy
import torch
from tqdm import tqdm

# GPU acceleration is left switched off (call below is commented out), so the
# torch device handle is pinned to the CPU.
# spacy.prefer_gpu()
device = torch.device("cpu")

# French transformer-based pipeline, shared by every cell below as `nlp`.
nlp = spacy.load(name="fr_dep_news_trf")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Smoke test of the pipeline. The loaded model is French-only, so the sample
# must be French for the tags printed in the next cell to be meaningful.
doc = nlp("Ceci est une phrase.")

In [None]:
# One line per token: surface form, coarse POS tag, dependency label and the
# surface form of the syntactic head, separated by single spaces.
for token in doc:
    print(" ".join((token.text, token.pos_, token.dep_, token.head.text)))

In [None]:
# Passive-voice sample sentence used to inspect the dependency labels that
# the extraction function relies on.
text = "La victime a été agressée par son compagnon."
doc = nlp(text)

# Fixed-width columns: token | dependency label | head token | POS tag.
for token in doc:
    print("{:12} | {:10} | {:12} | {:6}".format(
        token.text, token.dep_, token.head.text, token.pos_
    ))

In [8]:
# Lemmas of the verbs that make a sentence a candidate for extraction.
_VERBES_VSS = frozenset({
    "agresser", "violer", "harceler", "frapper", "tuer", "insulter",
    "abuser", "menacer", "étrangler", "gifler", "molester", "violenter",
})

# Pronoun forms eligible for the naive coreference step. Elided forms are
# stored with a straight apostrophe; mentions are normalised before lookup so
# that both "l'" and "l’" (typographic apostrophe) match.
_PRONOMS = frozenset({
    "il", "elle", "lui", "leur", "ce", "cela", "ça", "qui", "on",
    "l'", "la", "le", "me", "m'", "y",
})


def _est_pronom(mention):
    """Return True when `mention` is one of the pronoun forms in `_PRONOMS`."""
    return mention.lower().replace("’", "'") in _PRONOMS


def _sujet_ancetre(verbe, noun_chunks_map, ent_map):
    """Climb from `verbe` towards the root and return the first `nsubj` found.

    Returns None when no ancestor has an `nsubj` child.
    """
    ancestor = verbe
    # The walk stops at the token that is its own head (the root).
    while ancestor.head != ancestor:
        ancestor = ancestor.head
        for child in ancestor.children:
            if child.dep_ == "nsubj":
                return noun_chunks_map.get(child, ent_map.get(child.i, child.text))
    return None


def _analyser_verbe(verbe, noun_chunks_map, ent_map):
    """Read victim, aggressor, voice and negation off one target verb.

    Only the direct children of `verbe` are inspected, plus the ancestors'
    subjects when the verb has no aggressor of its own.
    """
    victime = None
    agresseur = None
    voix = "autre"
    negation = False

    for child in verbe.children:
        if child.dep_ == "advmod" and child.lemma_ == "pas":
            negation = True
        elif child.dep_ == "nsubj:pass":
            # Passive subject: the entity undergoing the action.
            victime = noun_chunks_map.get(child, child.text)
            voix = "passive"
        elif child.dep_ == "obj":
            victime = noun_chunks_map.get(child, child.text)
            voix = "active"
        elif child.dep_ == "obl:agent":
            # Agent complement of a passive ("par ...").
            agresseur = noun_chunks_map.get(child, child.text)
        elif child.dep_ == "nsubj":
            agresseur = noun_chunks_map.get(child, ent_map.get(child.i, child.text))
            voix = "active"

    if not agresseur:
        # E.g. a coordinated or embedded verb whose subject is attached higher up.
        agresseur = _sujet_ancetre(verbe, noun_chunks_map, ent_map)

    return {
        "victime": victime,
        "agresseur": agresseur,
        "verbe": verbe.text,
        "voix": voix,
        "negation": negation,
    }


def analyser_phrase(text, contexte=None):
    """Extract victim / aggressor information from one sentence.

    Each verb whose lemma is in `_VERBES_VSS` is analysed on its own, and the
    first one that has a victim is reported. Victim, aggressor, voice and
    negation therefore always belong to the verb returned under "verbe".

    Parameters
    ----------
    text : str or parsed sentence
        Either raw text, which is parsed with the module-level `nlp`
        pipeline, or an already-parsed object. A parsed object must iterate
        over tokens and expose `.noun_chunks`, `.ents` and `.text`. Passing
        the parsed sentence avoids running the pipeline a second time.
    contexte : dict, optional
        Mutable memory shared across the sentences of one document. The keys
        "last_person", "last_female" and "last_male" are read to resolve
        pronoun mentions and are updated in place from the "PER" entities of
        this sentence.

    Returns
    -------
    dict or None
        A dict with the keys "phrase", "victime", "agresseur", "verbe",
        "voix" and "negation". "voix" is "active", "passive" or "autre", and
        "agresseur" may be None. Returns None when no target verb with a
        victim is found.
    """
    if isinstance(text, str):
        doc = nlp(text)
        phrase = text
    else:
        doc = text
        phrase = text.text

    noun_chunks_map = {chunk.root: chunk.text for chunk in doc.noun_chunks}

    # Index every token of a person entity, not only its first one, so that a
    # lookup by token index succeeds wherever the subject sits in the name.
    # NOTE(review): the `fr_dep_news_trf` package appears to ship without an
    # NER component; if so `doc.ents` is always empty and neither this
    # fallback nor the context update below ever triggers. Confirm.
    ent_map = {}
    for ent in doc.ents:
        if ent.label_ == "PER":
            for index in range(ent.start, ent.end):
                ent_map[index] = ent.text

    # --- Per-verb analysis: keep the first target verb that has a victim ---
    resultat = None
    for token in doc:
        if token.pos_ != "VERB" or token.lemma_ not in _VERBES_VSS:
            continue
        candidat = _analyser_verbe(token, noun_chunks_map, ent_map)
        if candidat["victime"]:
            resultat = candidat
            break

    # --- Naive coreference: replace pronouns with the last person seen ---
    if resultat and contexte:
        if _est_pronom(resultat["victime"]):
            victime_coref = contexte.get("last_female") or contexte.get("last_person")
            if victime_coref:
                resultat["victime"] = victime_coref
        if resultat["agresseur"] and _est_pronom(resultat["agresseur"]):
            agresseur_coref = contexte.get("last_male") or contexte.get("last_person")
            if agresseur_coref:
                resultat["agresseur"] = agresseur_coref

    # --- Update the context (after resolution, so that a sentence never
    # resolves its pronouns against its own entities) ---
    if contexte is not None:
        for ent in doc.ents:
            if ent.label_ == "PER":
                contexte["last_person"] = ent.text
                # Naive gender guess from the final letter of the name.
                if ent.text.lower().endswith(("e", "a")):
                    contexte["last_female"] = ent.text
                else:
                    contexte["last_male"] = ent.text

    if resultat is None:
        return None

    return {
        "phrase": phrase,
        "victime": resultat["victime"],
        "agresseur": resultat["agresseur"],
        "verbe": resultat["verbe"],
        "voix": resultat["voix"],
        "negation": resultat["negation"],
    }


In [9]:
import pandas as pd

# Load the corpus from a CSV file located relative to the working directory.
articles = pd.read_csv(filepath_or_buffer='Data/data_faits_divers.csv')
# Uncomment to iterate on a 10-row random subset instead of the full file.
# articles = articles.sample(10)

In [None]:
from tqdm import tqdm
import torch

# Result columns, filled in by the extraction loop. They are created with an
# explicit object dtype because they later receive str, None ("agresseur")
# and bool ("negation") values.
for colonne in ("victime", "agresseur", "verbe", "voix", "negation", "phrase_extraite"):
    articles[colonne] = pd.Series("", index=articles.index, dtype=object)

# One independent coreference memory per article.
contexte_par_article = [{} for _ in range(len(articles))]

# nlp.pipe only accepts str/Doc inputs: a missing value (NaN) in "texte"
# would abort the whole run, so missing texts are parsed as empty strings.
textes = articles["texte"].fillna("").astype(str)

with torch.no_grad():
    # Materialise every parsed document; the next cell indexes them by position.
    docs = list(tqdm(nlp.pipe(textes, batch_size=32), total=len(articles)))


  0%|          | 0/39263 [00:00<?, ?it/s]

In [None]:
# Key returned by analyser_phrase -> positional index of the column storing it.
# Rows are addressed by POSITION (iat): `docs` and `contexte_par_article` are
# positional lists, whereas the DataFrame index labels are not guaranteed to
# be 0..n-1 (e.g. after sampling or filtering the articles).
colonnes_resultat = {
    "victime": articles.columns.get_loc("victime"),
    "agresseur": articles.columns.get_loc("agresseur"),
    "verbe": articles.columns.get_loc("verbe"),
    "voix": articles.columns.get_loc("voix"),
    "negation": articles.columns.get_loc("negation"),
    "phrase": articles.columns.get_loc("phrase_extraite"),
}

for i in tqdm(range(len(articles)), leave=True, ncols=100):
    doc = docs[i]
    contexte = contexte_par_article[i]

    # Sentences are visited in order so the context accumulates; only the
    # first sentence that yields a result is stored for the article.
    for sent in doc.sents:
        res = analyser_phrase(sent.text, contexte)
        if res:
            for cle, position in colonnes_resultat.items():
                articles.iat[i, position] = res[cle]
            break
