In [None]:
import os
# Display the current working directory: the CSV in the next cell is read
# through a relative path, so it must be resolvable from here.
os.getcwd()

In [None]:
!pip install pandas transformers tiktoken SentencePiece

In [None]:
import pandas as pd
# Load the news-item corpus; the path is relative to the working directory shown above.
faits_divers = pd.read_csv('Data/data_faits_divers.csv')
# NOTE(review): leftover switch, kept disabled. Presumably needed when the
# notebook is started from a sub-folder of the project; confirm before removing.
#os.chdir("..")

In [None]:
# Work on a small random subset of the corpus.
# - random_state makes the draw identical on every "Restart & Run All".
# - min(...) avoids the ValueError raised by sample() when the corpus has
#   fewer rows than requested.
# - .copy() makes the subset an independent frame, since later cells add columns to it.
SAMPLE_SIZE = 30
SAMPLE_SEED = 42
articles = faits_divers.sample(
    n=min(SAMPLE_SIZE, len(faits_divers)),
    random_state=SAMPLE_SEED,
).copy()

In [None]:
!pip install accelerate

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from tqdm import tqdm

import torch

# Single place to change the checkpoint used for both tokenizer and model.
NER_MODEL_NAME = "Jean-Baptiste/camembert-ner-with-dates"

tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME, use_fast=False)
model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_NAME)

# Use the first GPU when one is available, otherwise fall back to CPU (-1)
# instead of failing on machines without CUDA.
ner_device = 0 if torch.cuda.is_available() else -1

# aggregation_strategy="simple" groups word pieces into whole entities, which is
# why later cells read the 'entity_group' and 'word' keys of each result.
nlp = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
    device=ner_device,
)

In [None]:
from tqdm import tqdm
# Register tqdm with pandas so that the .progress_apply calls used below are available.
tqdm.pandas()
def safe_ner(text):
    """Run the NER pipeline on ``text`` without ever raising.

    Parameters
    ----------
    text : object
        Article text. Anything that is not a non-blank string (``None``,
        ``NaN``, ``""``) is treated as "no entities" and the model is not called.

    Returns
    -------
    list
        Whatever the global ``nlp`` pipeline returns for ``text``, or ``[]``
        when the input is unusable or the pipeline raises.

    Notes
    -----
    Failures are reported through a ``RuntimeWarning`` instead of being
    swallowed silently, so an article that yields no entities because of an
    error can be told apart from one that genuinely contains none.
    """
    import warnings

    if not isinstance(text, str) or not text.strip():
        return []
    try:
        return nlp(text)
    except Exception as exc:
        # Best effort: one bad article must not abort the whole apply().
        warnings.warn(
            f"NER failed on a text of {len(text)} characters: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return []

# Run NER on the raw 'texte' column, one article at a time, with a progress bar.
articles['NER'] = articles['texte'].progress_apply(safe_ner)

In [None]:
def extract_persons(ner_output):
    """Return the distinct person names found in one NER result.

    Parameters
    ----------
    ner_output : iterable of dict
        Entities carrying at least the keys ``'word'`` and ``'entity_group'``.

    Returns
    -------
    list of str
        Lower-cased, stripped names of the entities whose group is ``'PER'``,
        without duplicates or blank entries, sorted alphabetically so the
        result is the same on every run (a bare ``list(set(...))`` is not).
    """
    names = {
        ent['word'].lower().strip()
        for ent in ner_output
        if ent['entity_group'] == 'PER'
    }
    # A 'word' made only of whitespace normalises to '' and is not a name.
    names.discard('')
    return sorted(names)
# Keep, for each article, only the person names out of its NER result.
articles['personnes'] = articles['NER'].apply(extract_persons)

In [None]:
# Articles for which no person name was extracted.
nul = articles.loc[articles['personnes'].map(len).eq(0)]

In [None]:
import re

# Generic (non-named) ways an article can refer to a person involved in the case.
# Order matters: detect_generic_mentions reports hits in this order.
exp = [
    "concubin", "concubine",
    "victime", "suspect", "agresseur",
    "policier", "homme", "femme", 'épouse', "copain", "copine", "petite amie",
    "adolescent", "fillette", "témoin", "procureur",
    "le prévenu", "auteur des faits", "les forces de l'ordre", "mari", "compagnon", "père", "amant"
]


def detect_generic_mentions(text, expressions=exp):
    """List the generic person expressions that occur in ``text``.

    Matching is case-insensitive on the text side and done on whole words,
    with an optional trailing plural ``s``. Plain substring matching reported
    "mari" for every "Marie", "amant" for "diamant" and "père" for "espère".

    Parameters
    ----------
    text : object
        Article text. Non-string values (``None``, ``NaN``) yield ``[]``.
    expressions : iterable of str, optional
        Lower-case expressions to look for; defaults to ``exp``.

    Returns
    -------
    list of str
        The expressions found, without duplicates, in the order in which they
        appear in ``expressions``.
    """
    if not isinstance(text, str):
        return []

    lower_text = text.lower()
    found = []
    for expr in expressions:
        if expr in found:
            continue
        # (?<!\w) / (?!\w) act as word boundaries that also behave correctly
        # next to accented letters and for multi-word expressions.
        pattern = r"(?<!\w)" + re.escape(expr) + r"s?(?!\w)"
        if re.search(pattern, lower_text):
            found.append(expr)
    return found

# Scan each article for generic person expressions.
# NOTE(review): this reads 'texte_clean' while the NER step reads 'texte';
# 'texte_clean' is not created in the cells shown here. Presumably it is a
# column of the CSV; confirm.
articles['mentions_generiques'] = articles['texte_clean'].apply(detect_generic_mentions)

In [None]:
def combine_mentions(row):
    """Concatenate a row's named persons and its generic mentions.

    ``row`` must expose the keys ``'personnes'`` and ``'mentions_generiques'``;
    the result is a new sequence with the persons first.
    """
    named_people = row['personnes']
    generic_terms = row['mentions_generiques']
    return named_people + generic_terms

# Row-wise (axis=1): every article gets one list holding both kinds of mention.
articles['sujets'] = articles.apply(combine_mentions, axis=1)

In [None]:
from random import randint
# Spot-check one random article. The upper bound follows the real number of
# rows instead of a hard-coded 29, which broke as soon as the sample size changed.
i = randint(0, len(articles) - 1)
print(articles.iloc[i]['sujets'])
print(articles.iloc[i]['texte'])

In [None]:
!pip install spacy

In [None]:
!python -m spacy download fr_core_news_md

In [None]:
# Module-level cue phrases for each role.
# NOTE(review): detect_roles below does not read these two lists; it relies on
# its own, longer indicator lists.
victime_indicateurs = [
    "victime",
    "été agressée",
    "été violée",
    "subi",
    "a dénoncé",
    "harcelée",
    "plaintes",
    "été tuée",
    "battue",
    "frappée",
    "violée",
    "tuée",
    "assassinée",
    "agressée",
]

agresseur_indicateurs = [
    "coupable",
    "suspect",
    "suspecté",
    "violent",
    "a agressé",
    "a violé",
    "harceleur",
    "accusé",
    "mis en examen",
    "a frappé",
    "condamné",
    "a avoué avoir tué",
    "a avoué avoir frappé",
    "a avoué",
    "a tué",
    "accusé de",
    "accusé d'avoir",
    "suspecté de",
    "suspecté d'avoir",
]

import re
import spacy
# Load the French spaCy pipeline fetched by the download cell above;
# detect_roles reads its named entities and dependency labels.
nlp_fr = spacy.load("fr_core_news_md")

# Lemmas of the verbs whose grammatical arguments are read as aggressor / victim.
_VIOLENT_VERB_LEMMAS = frozenset({"tuer", "violer", "agresser", "harceler", "frapper"})

# Cue phrases searched (as substrings) in the words surrounding a subject.
_VICTIM_INDICATORS = (
    "victime", "été agressée", "été violée", "subi", "a dénoncé", "harcelée",
    "plaintes", "été tuée", "battue", "frappée", "violée", "tuée", "assassinée", "agressée",
)
_AGGRESSOR_INDICATORS = (
    "coupable", "suspect", "suspecté",
    "violent", "a agressé", "a violé", "harceleur", "accusé", "mis en examen", "a frappé",
    "condamné", "a avoué avoir tué", "a avoué avoir frappé", "a avoué", "a tué",
    "accusé de", "accusé d'avoir", "suspecté de", "suspecté d'avoir", "placé en garde à vue",
    "mis en garde à vue", "incriminé", "a affirmé être l'auteur",
)

# Number of whitespace-separated words kept on each side of a subject.
_CONTEXT_WINDOW = 10


def _role_from_dependencies(doc, sujet_lower):
    """Return 'victime' / 'agresseur' from the dependency parse, or None if undecided."""
    for ent in doc.ents:
        if ent.text.lower() != sujet_lower:
            continue

        token = ent.root
        head = token.head
        governed_by_violent_verb = head.lemma_ in _VIOLENT_VERB_LEMMAS

        # 1. Passive subject of a violent verb -> victim
        if governed_by_violent_verb and token.dep_ == "nsubj:pass":
            return "victime"
        # 2. Active subject of a violent verb -> aggressor
        if governed_by_violent_verb and token.dep_ == "nsubj":
            return "agresseur"
        # 3. Direct object of a violent verb -> victim
        if governed_by_violent_verb and token.dep_ == "obj":
            return "victime"
        # 4. The head has a passive-agent dependent naming the subject -> aggressor.
        #    Returning here stops the scan, so a later entity cannot overwrite the role.
        for child in head.children:
            if child.dep_ == "obl:agent" and sujet_lower in child.text.lower():
                return "agresseur"
    return None


def _role_from_context(tokens, sujet_tokens, window=_CONTEXT_WINDOW):
    """Return a role from cue phrases near an occurrence of the subject, or None."""
    width = len(sujet_tokens)
    if width == 0:
        # An empty subject would "match" at every position.
        return None

    for i in range(len(tokens) - width + 1):
        if tokens[i:i + width] != sujet_tokens:
            continue
        context = ' '.join(tokens[max(0, i - window):i + width + window])
        # Victim cues take precedence over aggressor cues within one context.
        if any(ind in context for ind in _VICTIM_INDICATORS):
            return 'victime'
        if any(ind in context for ind in _AGGRESSOR_INDICATORS):
            return 'agresseur'
        # No cue around this occurrence: try the next one.
    return None


def detect_roles(text, sujets):
    """Guess, for each subject, whether the article presents it as victim or aggressor.

    For every subject the dependency parse is tried first: a spaCy entity whose
    text equals the subject and whose root is attached to one of the verbs in
    ``_VIOLENT_VERB_LEMMAS``. If that is inconclusive, cue phrases are searched
    in a window of ``_CONTEXT_WINDOW`` words around each occurrence of the subject.

    Parameters
    ----------
    text : object
        Article text; any non-string value returns ``{}``.
    sujets : iterable of str
        Person names and generic mentions to classify.

    Returns
    -------
    dict
        Maps each subject (as given) to ``'victime'``, ``'agresseur'`` or ``'inconnu'``.

    Notes
    -----
    Uses the module-level spaCy pipeline ``nlp_fr``.
    NOTE(review): the fallback compares whitespace-separated words, so a subject
    directly followed by punctuation ("paul,") is not found. Confirm whether
    that is acceptable.
    """
    if not isinstance(text, str):
        return {}

    doc = nlp_fr(text)
    tokens = text.lower().split()

    roles = {}
    for sujet in sujets:
        sujet_lower = sujet.lower()
        role = _role_from_dependencies(doc, sujet_lower)
        if role is None:
            role = _role_from_context(tokens, sujet_lower.split())
        roles[sujet] = role if role is not None else 'inconnu'

    return roles


In [None]:
# Infer a role for every subject of every article, row by row, with a progress bar.
# NOTE(review): the text passed is 'texte_clean', not the 'texte' column used for NER; confirm this is intended.
articles['roles_detectes'] = articles.progress_apply(
    lambda row: detect_roles(row['texte_clean'], row['sujets']),
    axis=1
)

In [None]:
from random import randint
# Spot-check one random article. The upper bound follows the real number of
# rows instead of a hard-coded 29, which broke as soon as the sample size changed.
i = randint(0, len(articles) - 1)
print(articles.iloc[i]['sujets'])
print(articles.iloc[i]['roles_detectes'])
print(articles.iloc[i]['texte'])
