In [None]:
# !pip install pandas

In [None]:
from lxml import etree
from os import listdir, getcwd
from os.path import join, dirname
import re
import pandas as pd

# Assemblée nationale

In [None]:
def _element_text(node, default=""):
    """Return the stripped text content of ``node``, children included.

    ``with_tail=False`` keeps the text that follows the closing tag (which
    belongs to the parent element) out of the result. ``default`` is returned
    when ``node`` is ``None``, i.e. when the element was not found.
    """
    if node is None:
        return default
    return etree.tostring(
        node, method="text", encoding="unicode", with_tail=False
    ).strip()


def parseSeance(path, document: str):
    """Parse one session report (XML) into a list of speech records.

    Parameters
    ----------
    path
        Directory that contains the report.
    document : str
        File name of the report inside ``path``; stored as ``nom_fichier``.

    Returns
    -------
    list of dict
        One dict per ``orateur`` element whose grandparent is a ``paragraphe``
        element, with the keys ``legislature``, ``nom_fichier``, ``date``,
        ``orateur_name``, ``orateur_id``, ``ordre_absolu_seance``,
        ``valeur_ptsodj``, ``ordinal_prise``, ``code_grammaire`` and ``content``.
        A missing speaker name or text node yields an empty string; a speaker
        without ``id`` or ``acteurRef`` gets the string ``"None"`` as id.

    Raises
    ------
    AttributeError
        If the document has no ``legislature`` or ``dateSeance`` element.
    TypeError, ValueError
        If a paragraph's ``ordre_absolu_seance`` or ``valeur_ptsodj`` attribute
        is absent or not an integer.
    """
    namespace = "{http://schemas.assemblee-nationale.fr/referentiel}"
    full_path = join(path, document)  # generate full path
    # a linter may report a missing argument on this call; it can be ignored
    seance = etree.parse(full_path)

    # extract metadata shared by every record of this document
    legislature = seance.find(f".//{namespace}legislature").text
    date = seance.find(f".//{namespace}dateSeance").text

    text_list = []

    # loop over all speakers
    for orateur in seance.findall(f".//{namespace}orateur"):
        # the paragraph is two nodes up from the speaker
        container = orateur.getparent()
        paragraphe = container.getparent() if container is not None else None
        # anything that is not a paragraph (e.g. a "vote" result block) is skipped
        if paragraphe is None or paragraphe.tag != f"{namespace}paragraphe":
            continue

        # speaker id: <id> when present, otherwise <acteurRef>
        # (the original author notes acteurRef is needed for the 15th legislature)
        id_node = orateur.find(f"{namespace}id")
        if id_node is None:
            id_node = orateur.find(f"{namespace}acteurRef")

        talking = {
            "legislature": legislature,
            "nom_fichier": document,
            "date": date,
            "orateur_name": _element_text(orateur.find(f"{namespace}nom")),
            "orateur_id": _element_text(id_node, default="None"),
            "ordre_absolu_seance": int(paragraphe.get("ordre_absolu_seance")),
            "valeur_ptsodj": int(paragraphe.get("valeur_ptsodj")),
            "ordinal_prise": paragraphe.get("ordinal_prise"),
            "code_grammaire": paragraphe.get("code_grammaire"),
            "content": _element_text(paragraphe.find(f"{namespace}texte")),
        }
        text_list.append(talking)
    return text_list  # returns a list of all speech nodes

In [None]:
# Accumulator for the speech records of every parsed session report.
all_cr = list()

In [None]:
# Input layout: unzip the Assemblée nationale open data, keep only the
# "compteRendu" folders and put them in "data/assemblee_nationale".
folders = [f"{legislature}_compteRendu" for legislature in (15, 16, 17)]
# Project root: two directory levels above the current working directory.
root_folder = dirname(dirname(getcwd()))

In [None]:
# Parse every session report of every legislature folder into one flat list.
# The accumulator is reset here so that re-running this cell does not append
# the same records a second time.
all_cr = []
# this file is a duplicate, so we exclude it
excluded_files = {"CRSJOCGR5L15S2017E1N001.xml"}
for folder in folders:
    path = join(root_folder, "data", "assemblee_nationale", folder)
    # sorted() makes the parsing order independent of the file system
    for f in sorted(listdir(path)):
        # only XML files are parsed; this also skips .DS_Store and other stray files
        if not f.lower().endswith(".xml") or f in excluded_files:
            continue
        cr = parseSeance(path, f)
        all_cr.extend(cr)

# Dataframe

In [None]:
# Build the DataFrame: one row per parsed speech record.
df = pd.DataFrame(data=all_cr)

In [None]:
# cleaning
# Keep the first 12 characters of the raw date string and parse them
# (presumably YYYYMMDDHHMM; verify against the source files).
df["date"] = df["date"].str[:12]
df["datetime"] = pd.to_datetime(df["date"])

# Insert the missing space after punctuation glued to the next word ("fin.Le" -> "fin. Le").
# The leading negative lookahead leaves "." and "," alone when they sit between two digits,
# so numbers such as "1,5" or "3.14" are not split.
match_weird_punctuation = re.compile(r"(?!(?<=\d)[.,]\d)([!.,)?])(?=\w)")
df["content"] = df["content"].str.replace(match_weird_punctuation, r"\1 ", regex=True)

# Replace non-breaking spaces by regular spaces; regex=False makes the literal
# replacement explicit instead of relying on the pandas default.
df["content"] = df["content"].str.replace("\xa0", " ", regex=False)

# Both spellings exist in the data. Only the whole word is normalised, so
# derived words such as "gouvernemental" keep their original case.
df["content"] = df["content"].str.replace(r"\bgouvernement\b", "Gouvernement", regex=True)

In [None]:
# Create a grouped df to take into account interruptions
# Groups by "ordinal_prise" (speech) and only if it's not an "interruption"

# Rows without a code_grammaire are treated as regular speech (na=False).
is_interruption = df["code_grammaire"].str.contains("interruption", case=False, na=False)
df_paroles = df.loc[~is_interruption]
df_rest = df.loc[is_interruption]

# One row per speech: texts are joined in row order and the individual
# "ordre_absolu_seance" values are kept as a list. "date" and "valeur_ptsodj" are
# carried through so grouped rows are not left with NaN in the columns used
# for sorting below.
# NOTE(review): groupby drops rows whose key is missing (e.g. no "ordinal_prise")
# by default -- confirm whether such rows occur in the data.
df_grouped = (
    df_paroles.groupby(
        ["legislature", "datetime", "nom_fichier", "orateur_id", "ordinal_prise"],
        as_index=False,
    )
    .agg(
        {
            "date": "first",
            "orateur_name": "first",
            "content": " ".join,
            "ordre_absolu_seance": list,
            "valeur_ptsodj": "first",
            "code_grammaire": "first",
        }
    )
)


def _sort_position(value):
    """Return the first order number of a grouped speech, else the value as float."""
    if isinstance(value, list) and len(value) > 0:
        return value[0]
    return float(value)


# Put the interruptions back and restore the chronological order. After the
# concat, "ordre_absolu_seance" holds lists (grouped speeches) and plain numbers
# (interruptions), hence the dedicated sort key for that column.
df_grouped = (
    pd.concat([df_grouped, df_rest], ignore_index=True)
    .sort_values(
        by=["datetime", "valeur_ptsodj", "ordre_absolu_seance"],
        key=lambda col: (
            col.map(_sort_position) if col.name == "ordre_absolu_seance" else col
        ),
    )
    .reset_index(drop=True)
)

In [None]:
# fun stats
# df.loc[df["code_grammaire"].str.contains("PAROLE")]["content"].value_counts()

In [None]:
# filter by keywords
# A row is kept when its text contains at least one keyword as a
# case-insensitive substring. The keywords are escaped and combined into a
# single alternation, so the column is scanned once instead of once per keyword.
KEYWORDS = [
    "vidéo-surveillance",
    "vidéo-protection",
    "vidéosurveillance",
    "vidéoprotection",
    "vie privée",
    "RGPD",
    "VSA",
    "surveillance",
    "protection des données",
    " data ",  # surrounding spaces are part of the keyword
    "open-data",
    "opendata",
]
keyword_pattern = "|".join(re.escape(keyword) for keyword in KEYWORDS)

# reset_index() keeps the previous row labels in an "index" column
df_vsa = df_grouped.loc[
    df_grouped["content"].str.contains(keyword_pattern, case=False, regex=True)
].reset_index()

In [None]:
# Unbind the intermediate frames; only df_vsa is used below.
del df, df_grouped, df_paroles, df_rest

In [None]:
# Unbind the raw parsed records as well.
del cr, all_cr

In [None]:
# Persist the keyword-filtered frame next to the input data.
keywords_pickle = join(root_folder, "data", "assemblee_nationale", "df_mots_cles.pickle")
df_vsa.to_pickle(keywords_pickle)

# Parsing

In [None]:
# !pip install wtpsplit

In [None]:
# Project root: two directory levels above the current working directory.
working_dir = getcwd()
root_folder = dirname(dirname(working_dir))

In [None]:
# Reload the keyword-filtered frame from disk.
# Pickle files can execute code on load: only read files produced by this project.
df_vsa = pd.read_pickle(
    join(root_folder, "data", "assemblee_nationale", "df_mots_cles.pickle")
)

In [None]:
from wtpsplit import SaT

In [None]:
import torch  # wtpsplit models run on PyTorch

# Load the segmentation model and move it to the best available device
# instead of assuming an Apple GPU ("mps") is present.
sat = SaT("sat-3l-sm")
mps_backend = getattr(torch.backends, "mps", None)  # absent in older torch versions
if mps_backend is not None and mps_backend.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# half precision is only applied on an accelerator; on CPU the model stays as loaded
if device != "cpu":
    sat.half()
sat.to(device)

In [None]:
# Try the paragraph segmentation on a single text. The original subscript was
# empty (a syntax error); the first row is used as the sample -- change the
# position to inspect another text.
sat.split(
    df_vsa["content"].iloc[0],
    do_paragraph_segmentation=True,
    paragraph_threshold=0.7,
)

In [None]:
# Flag the rows whose text contains "Bonjour" (case-sensitive match).
contains_bonjour = df_vsa["content"].str.contains("Bonjour")
df_vsa["bool"] = contains_bonjour

In [None]:
# Segment every text with the SaT model, one call per row.
df_vsa["segmented"] = df_vsa["content"].apply(sat.split)

# Timeseries & QBert comparison

In [None]:
# Load the full dataset file from the activetigger data folder.
raw_data_path = "../../data/activetigger/sicss-an-spacy1000_data_all.parquet"
raw_data = pd.read_parquet(raw_data_path)

In [None]:
# Load the model predictions stored next to the dataset file.
predictions_path = (
    "../../data/activetigger/"
    "predictions_sicss-an-spacy1000_camembertbase 9 EP__sicss_schreiber__"
    "sicss-an-spacy1000__default__02-07-2025_22h08_all.csv"
)
at_model_predictions = pd.read_csv(predictions_path)

In [None]:
# Join texts and predictions on their shared "id" column (default inner join).
merges = raw_data.merge(at_model_predictions, on="id")

In [None]:
# Export the merged table as CSV (the row index is written as well).
merged_csv_path = "../../data/activetigger/merged_at_annotations.csv"
merges.to_csv(merged_csv_path)

In [None]:
# Count the rows predicted as RELEVANT per "dataset_datetime" value.
relevant_rows = merges.loc[merges["prediction"] == "RELEVANT"]
grouped_by_date = relevant_rows.groupby(["dataset_datetime", "prediction"]).size()

In [None]:
# Export the per-date counts two directory levels above the working directory.
counts_csv_path = "../../grouped_by_date_relevant_only.csv"
grouped_by_date.to_csv(counts_csv_path)

In [None]:
# Load the QAmembert results file used for the comparison.
qbert_path = "../../data/activetigger/QAmembert_spacy1000.csv"
qbert = pd.read_csv(qbert_path)

In [None]:
# Non-null counts per prediction label, restricted to rows with a score of at least 0.8.
high_score_rows = qbert.loc[qbert["score"] >= 0.8]
high_score_rows.groupby("prediction").count()