In [75]:
import pandas as pd 
import numpy as np
import json

In [76]:
# Load the annotated corpus from a CSV file in the current working directory.
data = pd.read_csv("data_annotated.csv")
# Show the available columns (last expression of the cell is displayed).
data.columns

Index(['doc_id', 'url', 'cache', 'fulltext', 'nature', 'published',
       'entity_name', 'entity_type', 'geo_path', 'extracted_text',
       'embeddings', 'typed_embeddings', 'km_cluster_labels'],
      dtype='object')

In [77]:
# Distinct values of the "nature" column, in the sorted order np.unique returns.
# NOTE(review): assumes the column holds no missing values - confirm.
categories = list(np.unique(data["nature"]))

In [78]:
import spacy

# Load the spaCy pipeline "fr_core_news_md"; tokenize_french runs texts through it.
nlp = spacy.load("fr_core_news_md")

In [79]:
from nltk.corpus import stopwords

# NLTK's French stop-word list; create_tfidf passes it to TfidfVectorizer.
stop_words = stopwords.words('french')

In [80]:
def tokenize_french(text):
    """Return the lemmas of the alphabetic tokens found in ``text``.

    The text is processed by the module-level spaCy pipeline ``nlp``.
    Tokens whose ``is_alpha`` flag is false are skipped; for every other
    token its ``lemma_`` is collected, in document order.
    """
    lemmas = []
    for tok in nlp(text):
        if tok.is_alpha:
            lemmas.append(tok.lemma_)
    return lemmas

In [81]:
def filter_data(df, category):
    """Select the complete rows of ``df`` whose "nature" equals ``category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least a "nature" column. It is not modified.
    category
        Value compared (with ``==``) against the "nature" column.

    Returns
    -------
    tuple
        ``(n_samples, subset)`` where ``subset`` holds the rows that have no
        missing value in any column and match ``category``, and
        ``n_samples`` is ``len(subset)``.
    """
    # dropna() returns a new frame; using inplace=True here would silently
    # strip rows from the caller's DataFrame on every call.
    complete_rows = df.dropna()
    subset = complete_rows[complete_rows["nature"] == category]

    return len(subset), subset

In [82]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [83]:
def create_tfidf(X_train, X_test):
    """Fit a TF-IDF vectorizer on ``X_train`` and apply it to both splits.

    The vectorizer lowercases its input, tokenizes with ``tokenize_french``,
    uses the module-level ``stop_words`` list and keeps at most 200 features.

    Returns
    -------
    tuple
        ``(X_tf_train, X_tf_test, feature_names, tfidf)``: the transformed
        training and test matrices, the output of
        ``get_feature_names_out()``, and the fitted vectorizer itself.
    """
    tfidf = TfidfVectorizer(
        tokenizer=tokenize_french,
        stop_words=stop_words,
        lowercase=True,
        max_features=200,
    )

    # Vocabulary and IDF weights come from the training split only.
    X_tf_train = tfidf.fit_transform(X_train)

    return X_tf_train, tfidf.transform(X_test), tfidf.get_feature_names_out(), tfidf
    
    

In [93]:
def explain_cluster_tfidf(df, category, min_samples=30):
    """Rank TF-IDF terms by their importance for predicting cluster labels.

    The rows of ``df`` for ``category`` are selected with ``filter_data``.
    Their "extracted_text" is split 80/20, vectorized with ``create_tfidf``,
    and a random forest is trained to predict "km_cluster_labels". The F1
    score on the held-out split is printed, and the forest's feature
    importances are written to ``"{category}_explainability_tfidf.json"``
    in the current working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns "nature", "extracted_text" and
        "km_cluster_labels".
    category
        Value of the "nature" column to analyse; also used as the prefix of
        the output file name.
    min_samples : int, default 30
        Minimum number of rows required after filtering. With fewer rows
        nothing is trained and nothing is written.

    Returns
    -------
    dict or str
        Mapping of feature name to importance, or the string
        ``"not enough data"`` when fewer than ``min_samples`` rows remain.
    """
    n_samples, df = filter_data(df, category)
    if n_samples < min_samples:
        return "not enough data"

    X = df["extracted_text"]
    y = df["km_cluster_labels"]

    # Fixed seeds for the split and the forest keep reruns comparable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    X_tf_train, X_tf_test, feature_names, tfidf = create_tfidf(X_train, X_test)

    rf = RandomForestClassifier(criterion="entropy", random_state=42)
    rf.fit(X_tf_train, y_train)

    y_pred = rf.predict(X_tf_test)
    # NOTE(review): f1_score is called with its default averaging;
    # presumably the cluster labels are binary - confirm.
    f1 = f1_score(y_test, y_pred)
    print("RF f1-score:", f1)

    importances = rf.feature_importances_

    # feature_names and importances are aligned position by position.
    dict_feature_importance_tfidf = dict(zip(feature_names, importances))
    with open(f"{category}_explainability_tfidf.json", "w") as f:
        json.dump(dict_feature_importance_tfidf, f)

    return dict_feature_importance_tfidf

In [None]:
# Run the explanation for every category. Each call prints its F1 score and
# writes one JSON file. Entries of `results` are importance dicts, or the
# string "not enough data" for categories that explain_cluster_tfidf skips.
results = []
for category in categories:
    dict_feature_importance_tfidf = explain_cluster_tfidf(data, category)
    results.append(dict_feature_importance_tfidf)



RF f1-score: 0.8888888888888888




RF f1-score: 0.9151670951156813




RF f1-score: 0.5




RF f1-score: 1.0




RF f1-score: 1.0




RF f1-score: 0.3333333333333333




In [16]:
import os
# Collect the importance files written by explain_cluster_tfidf. Matching the
# full suffix skips unrelated JSON files, and sorting makes the order
# deterministic (os.listdir returns entries in arbitrary order).
cf = os.listdir()
list_files = sorted(
    file_ for file_ in cf if file_.endswith("_explainability_tfidf.json")
)
print(list_files)

['acte.arrete_explainability_tfidf.json', 'acte.delib_explainability_tfidf.json', 'acte.raa_explainability_tfidf.json', 'bdj_explainability_tfidf.json', 'comm_explainability_tfidf.json', 'dlao.autres_explainability_tfidf.json']


In [1]:
import json

# Inspect a single importance file. Note that this rebinds `data`, which the
# loading cell at the top of the notebook bound to the annotated DataFrame;
# from here on it is the dict parsed from the JSON file.
json_file = "acte.arrete_explainability_tfidf.json"

with open(json_file, "r") as f:
    data = json.load(f)

In [10]:
import pandas as pd
# Wrapping the dict in a list gives a one-row frame with one column per key;
# transposing turns that into one row per key and a single column labelled 0.
df = pd.DataFrame([data]).T
df.columns


RangeIndex(start=0, stop=1, step=1)

In [19]:
# Order the rows by the value in column 0, largest first (modifies df in place).
df.sort_values(by=0, ascending=False, inplace=True)

In [24]:
# Index labels of the first ten rows, i.e. the ten highest-valued keys after sorting.
df.head(10).index

Index(['région', 'direction', 'subdélégation', 'recueil', 'agrément',
       'relatif', 'mission', 'transport', 'arrêter', 'mois'],
      dtype='object')

In [40]:
# For every importance file, keep the raw mapping in `data_results` and a
# summary of its 20 highest-valued keys in `top_20_per_cat`.
data_results = []
top_20_per_cat = []
for json_file in list_files:
    with open(json_file, "r") as f:
        data = json.load(f)
    data_results.append(data)

    # One row per key, single column 0 with its value, largest first.
    df = pd.DataFrame([data]).T.sort_values(by=0, ascending=False)
    top_20 = df.index[:20].tolist()
    # The document type is the file name without the fixed suffix.
    dict_doc_results = dict(
        doc_type=json_file.replace("_explainability_tfidf.json", ""),
        important_words=top_20,
    )
    top_20_per_cat.append(dict_doc_results)

In [41]:
# Display the list of {"doc_type", "important_words"} summaries.
top_20_per_cat

[{'doc_type': 'acte.arrete',
  'important_words': ['région',
   'direction',
   'subdélégation',
   'recueil',
   'agrément',
   'relatif',
   'mission',
   'transport',
   'arrêter',
   'mois',
   'département',
   'décret',
   'national',
   'devoir',
   'e',
   'être',
   'signer',
   'maire',
   'faire',
   'délai']},
 {'doc_type': 'acte.delib',
  'important_words': ['séance',
   'délibération',
   'municipal',
   'conseil',
   'présent',
   'Monsieur',
   'avoir',
   'être',
   'id',
   'monsieur',
   'exercice',
   'préfecture',
   'maire',
   'code',
   'recevoir',
   'publier',
   'voir',
   'envoyer',
   'relatif',
   'après']},
 {'doc_type': 'acte.raa',
  'important_words': ['Monsieur',
   'faire',
   'sou',
   'présent',
   'article',
   'ville',
   'général',
   'ci',
   'travail',
   'municipal',
   'autre',
   'police',
   'objet',
   'droit',
   'total',
   'activité',
   'étude',
   'national',
   'arrêté',
   'loi']},
 {'doc_type': 'bdj',
  'important_words': ['local',

In [44]:
# Print each document type followed by its important words on one line.
# The loop variable is not called `results`, so it does not overwrite the
# `results` list built by the category loop earlier in the notebook.
for cat_result in top_20_per_cat:
    words = cat_result["important_words"]
    string = ", ".join(words)
    print(cat_result['doc_type'])
    print(string)

acte.arrete
région, direction, subdélégation, recueil, agrément, relatif, mission, transport, arrêter, mois, département, décret, national, devoir, e, être, signer, maire, faire, délai
acte.delib
séance, délibération, municipal, conseil, présent, Monsieur, avoir, être, id, monsieur, exercice, préfecture, maire, code, recevoir, publier, voir, envoyer, relatif, après
acte.raa
Monsieur, faire, sou, présent, article, ville, général, ci, travail, municipal, autre, police, objet, droit, total, activité, étude, national, arrêté, loi
bdj
local, avoir, fonds, travail, investissement, compte, service, être, opération, personnel, subvention, public, recette, général, année, entrer, devoir, dépense, budget, charge
comm
avoir, être, tout, projet, travail, public, faire, permettre, pouvoir, plus, énergie, entrer, service, place, local, commune, nouveau, mettre, territoire, depuis
dlao.autres
e, règlement, mettre, immeuble, porter, qualité, avoir, lier, applicable, arrêté, article, ensemble, règle, c