# Annotation

In [122]:
import logging
logging.getLogger().setLevel(logging.INFO)
import sys
sys.path.append("..")

In [123]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import re

## Performance de la labelisation

### Retrieve doccano labelling

first import doccano labels from web interface

In [162]:
import json
import os

def retrieve_labelling(json_file):
    ret = pd.DataFrame(json.load(open(json_file))).set_index("id")
    ret.label = ret.label.apply(lambda x: x[0] if len(x)>0 else None)
    return ret

In [163]:
root_path = "/Users/charlesprat/Downloads/03d3a4cf-165c-41a9-8650-b3ecdbfe00dd"
amir = retrieve_labelling(os.path.join(root_path,"yotta-amir.json"))
charles = retrieve_labelling(os.path.join(root_path,"yotta-charles-henri.json"))
moindze = retrieve_labelling(os.path.join(root_path,"yotta-moindze.json"))

In [164]:
dfm = moindze.join(amir.label.rename("amir")).join(charles.label.rename("charles")).rename(
    columns={"label": "moindze"}
)

In [165]:
dfm.to_csv("labels_merge.csv")

### Performance

In [129]:
from sklearn.metrics import accuracy_score

In [166]:
#label_v1 = pd.read_csv("../data/labels_v1.csv")
label_v1 = pd.read_csv("labels_merge.csv")

def compute_accuracy(v1,v2):
    drop_val = (v1 == "inclassable") | (v2=="inclassable")
    annotation_yield = 1-drop_val.mean()
    return accuracy_score(v1[~drop_val], v2[~drop_val]), annotation_yield

annotators = [("moindze", "amir"), ("moindze", "charles"), ("amir", "charles")]

pd.DataFrame([compute_accuracy(label_v1[an1], label_v1[an2]) 
              for an1,an2 in annotators], 
             columns=["accuracy", "rendement"],
             index=[(an1,an2) for an1, an2 in annotators]
            ).style.format(precision=2)


Unnamed: 0,accuracy,rendement
"('moindze', 'amir')",0.75,0.63
"('moindze', 'charles')",0.73,0.74
"('amir', 'charles')",0.7,0.69


## Generation d'un set de labellisation

In [3]:
from dissidentia.infrastructure.grand_debat import GDAnswers

answs = GDAnswers().load_data(1000)

INFO:root:answers.csv imported on the disk


In [41]:
def preprocess(text):
    text = re.sub("\.+", ".", text)
    test = re.sub(r"\.([A-Z])", r". \1", text)
    return text

In [43]:
from nltk import tokenize

sentences = np.sum([tokenize.sent_tokenize(preprocess(ans)) for ans in tqdm(answs)])



100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 772/772 [00:00<00:00, 9077.13it/s]
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [153]:
len(sentences)

2515

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

2022-11-15 20:49:28.503740: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
import torch

test_sentence = "c'est archi nul !"

def predict_huggingface(test_sentence):
    inputs = tokenizer(test_sentence, return_tensors="pt")

    with torch.no_grad():
        try:
            logits = model(**inputs).logits
        except:
            return -1
        
    predicted_class_id = logits.argmax().item()
    softmaxFunc = torch.nn.Softmax(dim=1)
    softmaxScores = softmaxFunc(logits)
    probas = softmaxScores[0].detach().numpy()
    
    return np.sum(probas * np.arange(5)) / 4

predict_huggingface(test_sentence)

0.01940380927408114

In [45]:
hf_predict = [predict_huggingface(sentence) for sentence in tqdm(sentences)]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 2515/2515 [03:40<00:00, 11.41it/s]


In [46]:
df_gd = pd.DataFrame({"sentences": sentences, "hf_pred": hf_predict})

In [154]:
df_gd.sort_values(by="hf_pred").sentences.iloc[100:200].rename("text").to_csv("second_ds.csv", index=False)