# Annotation

In [None]:
# Show INFO-level messages from the root logger in the notebook output.
import logging
logging.getLogger().setLevel(logging.INFO)
# Put the parent directory on the import path
# (presumably so the `dissidentia` package imports below resolve — TODO confirm).
import sys
sys.path.append("..")

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re

## Annotation helpers

In [None]:
import os
from getpass import getpass

from dissidentia.infrastructure.doccano import DoccanoDataset

# The Doccano credentials are placed in environment variables before
# DoccanoDataset() is created (presumably where the client reads them
# from — TODO confirm).
# The login below is only a fallback: a DOCCANO_LOGIN that is already exported
# in the environment takes precedence.
os.environ.setdefault("DOCCANO_LOGIN", "yotta-charles-henri")
# Never store the password in the notebook, where it ends up in version
# control: take it from the environment, or ask for it without echoing it.
if not os.environ.get("DOCCANO_PASSWORD"):
    os.environ["DOCCANO_PASSWORD"] = getpass("Doccano password: ")
dds = DoccanoDataset()

In [None]:
# Load the Doccano data with only_approved=False
# (presumably this keeps not-yet-approved annotations as well — TODO confirm).
df_dds = dds.load_data(only_approved=False)

In [None]:
# Load the sentences exposed by GDAnswers; the cells below read their `text` column.
from dissidentia.infrastructure.grand_debat import GDAnswers
sentences = GDAnswers().load_sentences()

In [None]:
# Inspect the loaded sentences (columns, dtypes, non-null counts).
sentences.info()

### with sentiment analysis

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# The tokenizer and the classifier are loaded from the same checkpoint.
SENTIMENT_CHECKPOINT = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(SENTIMENT_CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_CHECKPOINT)

In [None]:
def predict_huggingface(test_sentence):
    """Score a sentence with the sentiment classifier, normalized to [0, 1].

    The sentence is tokenized with the module-level ``tokenizer`` and passed
    to the module-level ``model``. The softmax of the logits is collapsed into
    the expected class index, which is divided by 4 so that a 5-class output
    maps onto [0, 1].

    Args:
        test_sentence: Text to score.

    Returns:
        The normalized expected class index, or ``-1`` when the model call
        raises an exception.
    """
    inputs = tokenizer(test_sentence, return_tensors="pt")

    with torch.no_grad():
        try:
            logits = model(**inputs).logits
        except Exception:
            # Best-effort scoring: a sentence the model cannot process is
            # reported with the -1 sentinel instead of aborting a whole batch.
            # `Exception` (rather than a bare `except:`) lets KeyboardInterrupt
            # and SystemExit through, so a long scoring loop stays interruptible.
            return -1

    softmax = torch.nn.Softmax(dim=1)
    probas = softmax(logits)[0].detach().numpy()

    # np.arange(5) hard-codes a 5-class output; 4 is the largest class index.
    # NOTE(review): presumably class 0 is the most negative rating — confirm
    # against the model card.
    return np.sum(probas * np.arange(5)) / 4

# Quick manual check of the scorer on a short, clearly negative French sentence.
predict_huggingface("c'est archi nul !")

In [None]:
# Score the first 100 sentences, with a tqdm progress bar over the loop.
first_sentences = sentences.text[:100]
hf_predict = [predict_huggingface(text) for text in tqdm(first_sentences)]

In [None]:
# Put each of the first 100 sentences next to its sentiment score.
df_gd = pd.DataFrame(
    {
        "sentences": sentences.text[:100],
        "hf_pred": hf_predict,
    }
)
df_gd.head()

### with terms matching

In [None]:
# Vocabulary whose presence marks a sentence as a candidate for annotation.
dissident_voc = ["copinage", "corrompu", "incompétent"]
# One alternation pattern lets pandas scan the column in a single vectorized
# pass; re.escape keeps every term a literal substring.
dissident_pattern = "|".join(re.escape(word) for word in dissident_voc)
# Case-sensitive substring match, exactly as the previous row-wise lambda.
# na=False counts missing text as "no match" instead of raising TypeError.
dissident_candidate = sentences.text.str.contains(
    dissident_pattern, regex=True, na=False
)
dissident_terms = sentences.loc[dissident_candidate]

In [None]:
# Send the keyword-matched sentences to Doccano through safe_upload_data and keep its return value.
# What the method checks before uploading is defined in DoccanoDataset, not visible here.
df = dds.safe_upload_data(dissident_terms)

### with sentence similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the sentence-embedding model used by the similarity search below.
# NOTE(review): this rebinds the global `model`, which predict_huggingface reads.
# After this cell runs, the sentiment scorer calls the embedding model instead of the
# classifier until the sentiment-analysis cell is executed again.
from sentence_transformers import SentenceTransformer
model =  SentenceTransformer("dangvantuan/sentence-camembert-base")

In [None]:
# Register tqdm with pandas so that `progress_apply` (used below) is available.
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Replace `sentences` with a random sample of 1000 rows.
# No random_state is given, so every run draws a different sample.
sentences = GDAnswers().load_sentences().sample(1000)

In [None]:
# Inspect the sampled sentences (columns, dtypes, non-null counts).
sentences.info()

In [None]:
# Encode every sampled sentence one at a time. The model's own progress bar is disabled
# because progress_apply already shows one for the whole column.
embedings = sentences.text.progress_apply(lambda x : model.encode(x, show_progress_bar=False))

In [None]:
# Show the texts already labelled "dissident" in the Doccano data
# (presumably to pick a query sentence for the similarity search — TODO confirm).
df_dds.loc[df_dds.label == "dissident", "text"].values

In [None]:
# Number of most similar sentences to display (and to upload in the next cell).
HEAD_NUM = 100

# Query sentence, encoded as a single-row matrix for cosine_similarity.
sent = "Une gestion catastrophique de l'argent public."
sent_enc = model.encode(sent, show_progress_bar=False).reshape(1, -1)


def _similarity_to_query(embedding):
    """Return the cosine similarity between one sentence embedding and the query."""
    return cosine_similarity(embedding.reshape(1, -1), sent_enc)[0][0]


cos_sim = embedings.apply(_similarity_to_query)

# Attach the scores as a "similarity" column and rank from most to least similar.
similarity_column = cos_sim.rename("similarity")
sorted_sents = pd.concat([sentences, similarity_column], axis=1)
sorted_sents = sorted_sents.sort_values(by="similarity", ascending=False)

sorted_sents.reset_index().head(HEAD_NUM).text.to_dict()

In [None]:
# Upload the HEAD_NUM most similar sentences to Doccano.
# NOTE(review): the meaning of proceed=True is defined in DoccanoDataset, not visible here.
dds.safe_upload_data(sorted_sents.head(HEAD_NUM), proceed=True)

## Performance de la labelisation

### Retrieve doccano labelling

First, download the Doccano labels from the web interface; the cells below read the resulting JSON files (one per annotator).

In [None]:
# Scratch check: a DataFrame built without data reports `.empty` as True.
pd.DataFrame().empty

In [None]:
import json
import os

def retrieve_labelling(json_file):
    """Load one JSON label export into a DataFrame indexed by ``id``.

    Args:
        json_file: Path to a JSON file whose content ``pd.DataFrame`` accepts
            (for example a list of records). Non-empty content needs an
            ``id`` field and a ``label`` field holding a sequence of labels.

    Returns:
        A DataFrame indexed by ``id`` whose ``label`` column holds the first
        label of each record, or a missing value when the record has no
        label. Empty content yields an empty frame with the columns
        ``sentendes`` and ``label``.
    """
    # JSON is UTF-8 by specification: read it as such instead of relying on
    # the locale default, and close the file as soon as it has been parsed.
    with open(json_file, encoding="utf-8") as handle:
        ret = pd.DataFrame(json.load(handle))
    if ret.empty:
        # NOTE(review): "sentendes" looks like a typo; kept unchanged because
        # code outside this function may rely on the column name.
        return pd.DataFrame(columns=["sentendes", "label"])
    ret = ret.set_index("id")
    # Keep only the first label of each record.
    ret["label"] = ret["label"].apply(
        lambda labels: labels[0] if len(labels) > 0 else None
    )
    return ret

In [None]:
# Folder holding the per-annotator JSON files (machine-specific absolute path).
root_path = "/Users/charlesprat/Downloads/0ba0c385-e509-4a9a-8824-d27d7dabf93a"


def _load_annotator_export(file_name):
    """Load one annotator's JSON file from ``root_path``."""
    return retrieve_labelling(os.path.join(root_path, file_name))


amir = _load_annotator_export("yotta-amir.json")
charles = _load_annotator_export("yotta-charles-henri.json")
moindze = _load_annotator_export("yotta-moindze.json")

In [None]:
# One row per id from charles' export, with one label column per annotator.
amir_labels = amir.label.rename("amir")
moindze_labels = moindze.label.rename("moindze")
dfm = charles.join(amir_labels).join(moindze_labels)
# charles' own labels arrive in the "label" column; name it after him.
dfm = dfm.rename(columns={"label": "charles"})

In [None]:
# Save the merged labels in the working directory; the performance section reads this file back.
dfm.to_csv("labels_merge.csv")

In [None]:
# Start from moindze's labels as the reference column.
dfm["final"] = dfm["moindze"]

# Overwrite it, for the ids listed in the corrections file, with that file's
# "final" value (machine-specific absolute path).
corrections_path = "/Users/charlesprat/Downloads/labels_errors - labels_errors.csv"
final_v2 = pd.read_csv(corrections_path, index_col="id")
dfm.loc[final_v2.index, "final"] = final_v2["final"]

dfm.to_csv("../data/labels_v2.csv")

In [None]:
# Preview both label files stacked one after the other.
# The result is only displayed: it is neither assigned nor saved.
pd.concat([pd.read_csv("../data/labels_v1.csv"),
           pd.read_csv("../data/labels_v2.csv")])


### Performance

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Labels used for the agreement table: the merged per-annotator file written earlier.
# The commented-out line is the alternative input (labels_v1.csv).
#label_v1 = pd.read_csv("../data/labels_v1.csv")
label_v1 = pd.read_csv("labels_merge.csv")

def compute_accuracy(i_v1, i_v2):
    """Measure the agreement between two annotators' label series.

    Rows where either annotator has no label are ignored. Among the remaining
    rows, those where either annotator answered "inclassable" are excluded
    from the accuracy and counted against the annotation yield.

    Args:
        i_v1: Labels of the first annotator.
        i_v2: Labels of the second annotator.

    Returns:
        A tuple ``(accuracy, annotation_yield)``: the accuracy score over the
        retained rows, and the share of jointly labelled rows on which
        neither annotator answered "inclassable".
    """
    both_labelled = ~(i_v1.isnull() | i_v2.isnull())
    first = i_v1[both_labelled]
    second = i_v2[both_labelled]

    unclassifiable = (first == "inclassable") | (second == "inclassable")
    retained = ~unclassifiable

    annotation_yield = 1 - unclassifiable.mean()
    agreement = accuracy_score(first[retained], second[retained])
    return agreement, annotation_yield

# Every pair of annotators to compare.
annotators = [("moindze", "amir"), ("moindze", "charles"), ("amir", "charles")]

# One (accuracy, yield) tuple per pair, in the same order as `annotators`.
pair_scores = [
    compute_accuracy(label_v1[first], label_v1[second])
    for first, second in annotators
]

# "rendement" is the annotation yield returned by compute_accuracy.
agreement_table = pd.DataFrame(
    pair_scores,
    columns=["accuracy", "rendement"],
    index=list(annotators),
)
agreement_table.style.format(precision=2)


In [None]:
# Keep the rows of dfm without any missing value, then the ones on which the
# three annotators do not all agree.
errors = dfm.dropna()
no_error = (errors.charles == errors.amir) & (errors.charles == errors.moindze)
errors = errors.loc[~no_error]
# Disagreements where at least one annotator answered "inclassable".
inclassable = (errors.charles == 'inclassable') | (errors.amir == 'inclassable') | (errors.moindze == "inclassable")
# The export has to come after `errors` and `inclassable` are defined: when it
# sat in its own cell above them, a top-to-bottom run raised NameError.
errors.loc[~inclassable].to_csv("labels_errors.csv")
display(errors.loc[~inclassable])
errors