In [1]:
import pandas as pd
# Load the news-item dataset from a CSV file in the working directory;
# a later cell reads its 'texte_clean' column.
faits_divers = pd.read_csv('data_faits_divers.csv')

In [3]:
# Download the French spaCy pipeline that is loaded later with spacy.load("fr_core_news_md").
!python -m spacy download fr_core_news_md

Collecting fr-core-news-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.8.0/fr_core_news_md-3.8.0-py3-none-any.whl (45.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 45.8/45.8 MB 94.8 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('fr_core_news_md')


In [None]:
import spacy 

# French spaCy pipeline shared by analyse_phrase and analyse_phrase_ner.
nlp = spacy.load("fr_core_news_md")

# Lemmas treated as markers of a violent act; compared against token.lemma_.lower().
violence_keywords = [
    "agression",
    "violer",
    "violence",
    "harceler",
    "frapper",
    "attaquer",
    "abuser",
    "battre",
]

# Cue words taken to denote a victim.
victim_keywords = {
    "victime",
    "femme",
    "fille",
    "enfant",
    "mineur",
    "jeune fille",
    "personne",
    "ado",
    "étudiante",
}

# Cue words taken to denote a perpetrator.
perp_keywords = {
    "agresseur",
    "homme",
    "compagnon",
    "père",
    "ex",
    "partenaire",
    "harceleur",
    "violenteur",
}


def detect_role(noun_text, victim_lexicon=None, perp_lexicon=None):
    """Classify a noun (phrase) as victim, perpetrator or undetermined.

    The text is lower-cased and searched for each lexicon entry as a plain
    substring. Victim entries are tested first, so a text matching both
    lexicons is reported as "victime".

    Args:
        noun_text: Text of the noun or noun phrase to classify. ``None`` or
            an empty string yields "indéterminé".
        victim_lexicon: Optional iterable of victim cue words. When omitted,
            the module-level ``victim_keywords`` is used.
        perp_lexicon: Optional iterable of perpetrator cue words. When
            omitted, the module-level ``perp_keywords`` is used.

    Returns:
        "victime", "auteur" or "indéterminé".
    """
    if not noun_text:
        return "indéterminé"

    # Resolve the defaults lazily so the module-level keyword sets are only
    # looked up when the caller did not supply its own lexicons.
    if victim_lexicon is None:
        victim_lexicon = victim_keywords
    if perp_lexicon is None:
        perp_lexicon = perp_keywords

    # Entries are compared against lower-cased text, so they are expected to
    # be lower case themselves.
    noun_text = noun_text.lower()
    if any(word in noun_text for word in victim_lexicon):
        return "victime"
    if any(word in noun_text for word in perp_lexicon):
        return "auteur"
    return "indéterminé"

def _contains_whole_word(text, word):
    """Return True if ``word`` occurs in ``text`` with no alphanumeric neighbour."""
    if not word.strip():
        # An empty or whitespace-only token would otherwise match any text.
        return False
    start = text.find(word)
    while start != -1:
        end = start + len(word)
        left_ok = start == 0 or not text[start - 1].isalnum()
        right_ok = end == len(text) or not text[end].isalnum()
        if left_ok and right_ok:
            return True
        start = text.find(word, start + 1)
    return False


def get_named_entity_role(ent_text, sent_doc):
    """Infer the role of a named entity from the dependency label of its tokens.

    The sentence is scanned in order; the first token whose text appears in
    ``ent_text`` as a whole word and carries a subject or agent dependency
    decides the result.

    Args:
        ent_text: Surface text of the entity (e.g. ``ent.text``).
        sent_doc: Iterable of tokens exposing ``.text`` and ``.dep_``.

    Returns:
        "victime" for a token labelled ``nsubj`` or ``nsubj:pass``,
        "auteur" for a token labelled ``obl:agent``,
        "indéterminé" when no matching token carries one of these labels.
    """
    for token in sent_doc:
        # Whole-word comparison: a bare substring test would let unrelated
        # short tokens (e.g. "il" inside "Camille") decide the role.
        if not _contains_whole_word(ent_text, token.text):
            continue
        # NOTE(review): an active-voice subject (nsubj) is also mapped to
        # "victime" here; confirm this is the intended heuristic.
        if token.dep_ in ("nsubj", "nsubj:pass"):
            return "victime"
        if token.dep_ == "obl:agent":
            return "auteur"
    return "indéterminé"

def analyse_phrase(phrase):
    """Report violence-related tokens found in each sentence of ``phrase``.

    The text is parsed with the shared ``nlp`` pipeline and split into
    sentences; each sentence is then parsed again on its own. Every token
    whose lower-cased lemma is in ``violence_keywords`` produces one record
    describing the token, whether it looks passive, and its subject / agent
    children. A sentence without any such token produces a single record
    flagged as non-violent.

    Args:
        phrase: Raw text to analyse (may contain several sentences).

    Returns:
        A list of dicts. Violent records carry the keys "phrase",
        "violence_detectée" (True), "verbe", "voix_passive", "sujet" and
        "agent"; non-violent records carry only "phrase" and
        "violence_detectée" (False).
    """
    subject_deps = ("nsubj", "nsubj:pass")
    passive_deps = ("aux:pass", "obl:agent")
    rapport = []

    for sentence in nlp(phrase).sents:
        count_before = len(rapport)

        for tok in nlp(sentence.text):
            if tok.lemma_.lower() not in violence_keywords:
                continue

            dependants = list(tok.children)

            # Passive voice is inferred from a passive auxiliary or an agent
            # complement attached to the matched token.
            passive = any(d.dep_ in passive_deps for d in dependants)

            # When several children qualify, the last one in order is kept.
            subjects = [d for d in dependants if d.dep_ in subject_deps]
            agents = [d for d in dependants if d.dep_ == "obl:agent"]
            sujet = subjects[-1] if subjects else None
            agent = agents[-1] if agents else None

            rapport.append({
                "phrase": sentence.text,
                "violence_detectée": True,
                "verbe": tok.lemma_,
                "voix_passive": passive,
                "sujet": sujet.text if sujet else None,
                "agent": agent.text if agent else None
            })

        # Nothing appended for this sentence means no violence keyword matched.
        if len(rapport) == count_before:
            rapport.append({
                "phrase": sentence.text,
                "violence_detectée": False
            })

    return rapport

def analyse_phrase_ner(phrase):
    """Report violence-related tokens per sentence, with roles for the people involved.

    Works like a keyword scan over each sentence: every token whose
    lower-cased lemma is in ``violence_keywords`` yields one record. Each
    record additionally lists the sentence's ``PER`` entities with a role
    from ``get_named_entity_role``. When the sentence has no such entity,
    the subject / agent children of the matched token are classified with
    ``detect_role`` instead.

    Args:
        phrase: Raw text to analyse (may contain several sentences).

    Returns:
        A list of dicts. Violent records carry the keys "phrase",
        "violence_detectée" (True), "verbe", "voix_passive", "sujet",
        "agent" and "entités_et_roles" (a list of
        ``{"entité": ..., "rôle": ...}`` dicts); non-violent records carry
        only "phrase" and "violence_detectée" (False).
    """
    doc = nlp(phrase)
    results = []

    for sent in doc.sents:
        # Each sentence is parsed again in isolation, so dependencies and
        # entities below are relative to the sentence alone.
        sent_doc = nlp(sent.text)
        violence_detected = False

        for token in sent_doc:
            if token.lemma_.lower() not in violence_keywords:
                continue
            violence_detected = True

            children = list(token.children)
            # Passive voice is inferred from a passive auxiliary or an agent
            # complement attached to the matched token.
            is_passive = any(child.dep_ in ("aux:pass", "obl:agent") for child in children)

            # First subject / agent child, if any.
            sujet = next((child for child in children if child.dep_ in ("nsubj", "nsubj:pass")), None)
            agent = next((child for child in children if child.dep_ == "obl:agent"), None)

            # Named entities labelled PER (person) in this sentence.
            entites_personnes = [ent for ent in sent_doc.ents if ent.label_ == "PER"]
            roles = [
                {"entité": ent.text, "rôle": get_named_entity_role(ent.text, sent_doc)}
                for ent in entites_personnes
            ]

            # Lexicon-based fallback when the sentence has no person entity.
            if not entites_personnes:
                if sujet:
                    roles.append({"entité": sujet.text, "rôle": detect_role(sujet.text)})
                if agent:
                    roles.append({"entité": agent.text, "rôle": detect_role(agent.text)})

            results.append({
                "phrase": sent.text,
                "violence_detectée": True,
                "verbe": token.lemma_,
                "voix_passive": is_passive,
                "sujet": sujet.text if sujet else None,
                "agent": agent.text if agent else None,
                "entités_et_roles": roles
            })

        if not violence_detected:
            results.append({
                "phrase": sent.text,
                "violence_detectée": False
            })

    return results


In [None]:
# Draw one entry of the cleaned-text column at random. No random_state is
# passed here, and the result is a length-1 pandas object, not a plain string.
article = faits_divers["texte_clean"].sample(n=1)