In [1]:
import pandas as pd
import re

def import_annotations_to_dataframe(file_path):
    """Parse a plain-text annotation file into a DataFrame, one row per article.

    The file is read as a sequence of blocks. Each block starts with a header
    made of a 64-character lowercase hexadecimal identifier followed by
    `` : `` and an empty line; everything up to the next header is the body of
    that block. Inside a body, an annotation is looked up as ``name : value``
    (spaces around the colon are optional) and the value runs to the end of
    the line.

    Parameters
    ----------
    file_path : str or path-like
        Path of the UTF-8 encoded annotation file.

    Returns
    -------
    pandas.DataFrame
        An ``identifiant`` column followed by one column per annotation
        variable. Values are the stripped strings found after the colon; a
        variable that does not appear in a block is ``None``. The columns are
        present even when the file contains no article block.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # re.split with a capturing group yields
    # [preamble, id1, body1, id2, body2, ...]; dropping the preamble leaves a
    # strictly alternating (hence even-length) list of identifiers and bodies.
    parts = re.split(r"([a-f0-9]{64}) : \n\n", content)[1:]
    article_chunks = zip(parts[0::2], parts[1::2])

    variables = [
        "article_vss", "sujet_victime", "sujet_agresseur", "fait_agression",
        "voc_violence", "hierar", "portrait_victime", "portrait_auteur",
        "relation", "meanisme_violence", "stat", "ressource"
    ]

    # Compiled once instead of once per article. The leading \b keeps a short
    # name such as "stat" from matching the tail of a longer word that happens
    # to precede a colon (e.g. "constat : ..." inside a quoted value).
    # The match is still not anchored to the start of a line.
    patterns = {
        var: re.compile(rf"\b{re.escape(var)} ?: ?(.*)") for var in variables
    }

    data = []
    for identifiant, article_text in article_chunks:
        record = {"identifiant": identifiant}
        for var, pattern in patterns.items():
            match = pattern.search(article_text)
            record[var] = match.group(1).strip() if match else None
        data.append(record)

    # Explicit columns so that an empty result still exposes the full schema.
    return pd.DataFrame(data, columns=["identifiant", *variables])


In [2]:
# Parse the annotation file (path relative to the working directory) into a
# DataFrame with one row per article identifier.
annotated = import_annotations_to_dataframe('annotations.txt')

In [3]:
# Show the parsed annotations; the bare last expression is rendered as the
# cell output.
annotated

Unnamed: 0,identifiant,article_vss,sujet_victime,sujet_agresseur,fait_agression,voc_violence,hierar,portrait_victime,portrait_auteur,relation,meanisme_violence,stat,ressource
0,3a4723d9d754ca30c68ba8e420cd6683548f2af7ff7af3...,oui,"""nathalie""","""quatre jeunes de 19 à 23 ans""","""viol collectif""",1,1,0,1,0,0,-1,-1
1,2b6ce23c0fbfd6e213b7cd196a4b12eda4323e322f5f35...,oui,"""une jeune femme"" ""une femme"" ; ""valérie hitt...","""son mari""","""tuée""",0,1,0,0,1,-1,-1,-1
2,5b65c6d7e4f5ad08803aeb5e7921b6cb6321f1fef50b7f...,non,,,,,,,,,,,
3,d82fc0248b80e33ca1fc2a0da41f749d51f53f2db569b4...,non,,,,,,,,,,,
4,2293f5c041bec1dcdfe52bbe2655e0bec9a449310f534b...,non,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
93,90c97873a3028ff7d25616acbf7f6d662f255d37985f6b...,non,,,,,,,,,,,
94,cdd9ae89b1a8738a4e2eca205b6918093b627b7e8275d1...,oui,"""une jeune femme pompier""","""un homme de 77 ans""","""agressée sexuellement""",1,1,1,0,1,1,1,-1
95,533587a067dfeb0b1b2b14e238ff6320f28845770f2ed0...,non,,,,,,,,,,,
96,a3c235da28477a6045b1e97abd2b5491b1aa772c273ae4...,non,,,,,,,,,,,


In [14]:
import pandas as pd


def calcul_score(df):
    """Return a copy of ``df`` with an extra ``score`` column.

    Seven annotation columns are converted to numbers; any value that cannot
    be parsed, including a missing one, counts as 0. The score of a row is the
    sum of those seven values divided by seven, i.e. their plain average.
    The frame passed in is not modified.
    """
    scoring_columns = [
        'voc_violence', 'hierar', 'portrait_victime',
        'portrait_auteur', 'relation', 'meanisme_violence', 'ressource'
    ]

    # Build the numeric version of every scoring column first ...
    numeric = {
        name: pd.to_numeric(df[name], errors='coerce').fillna(0)
        for name in scoring_columns
    }

    # ... then attach them to a new frame, leaving the caller's data as it was.
    result = df.assign(**numeric)

    # Plain average over the scoring columns.
    totals = result[scoring_columns].sum(axis=1)
    result['score'] = totals / len(scoring_columns)

    return result

# Keep only the rows whose article_vss annotation is 'oui', score them, and
# list the distinct score values.
# NOTE(review): `annotated` is rebound to the filtered subset, so the full
# table displayed by an earlier cell no longer matches this variable.
annotated = annotated.loc[annotated['article_vss'].eq('oui')]
df = calcul_score(annotated)
df['score'].unique()

array([ 0.28571429,  0.        , -0.57142857, -0.85714286,  0.14285714,
       -0.28571429, -0.71428571, -0.42857143,  1.        , -0.14285714,
        0.42857143,  0.71428571,  0.85714286,  0.57142857])

In [15]:
# Two-column view of the scored frame: article identifier and its score.
corpus_scores = df.loc[:, ['identifiant', 'score']]

In [18]:
# Show the identifier/score table; the index labels are those of the filtered
# rows, so they are not consecutive.
corpus_scores

Unnamed: 0,identifiant,score
0,3a4723d9d754ca30c68ba8e420cd6683548f2af7ff7af3...,0.285714
1,2b6ce23c0fbfd6e213b7cd196a4b12eda4323e322f5f35...,0.0
19,d2e6b1c3da136d5869a4576be1c3aa085d40d683ec5f82...,-0.571429
20,5a00f9cf3fd9c21934332eff096343b6b8a501c9072b48...,-0.857143
28,dcddfab8f7f5a4d8f9444093513006cd3ae3bb96401a13...,0.142857
30,04b4c0483c703b28e5d35bef6ecf1f7a3c287c163506e6...,-0.285714
31,0f27d76856c7cacd100dc896e531a28762744fe5b8707b...,-0.285714
32,20a90459871077184755b4067570f09c740e29a963675e...,-0.714286
34,292e307098f454fe1165f488d1a03ddbe081236e61cea6...,-0.428571
36,9cb4c377f1d450a9adcfa34e7577fcfd970c716405887c...,-0.285714
