In [None]:
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# Load the annotated sentences: the input file holds one JSON object per line.
df_sentencias = pd.read_json(
    'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.jsonl',
    lines=True,
)
# Binary label: 1 when the 'highlight' entry of a row is non-empty, 0 otherwise.
df_sentencias['bias'] = [int(len(highlight) > 0) for highlight in df_sentencias['highlight']]
df_sentencias

In [None]:
from genbit.genbit_metrics import GenBitMetrics

In [None]:
# Compute GenBit metrics separately for every sentence: a fresh GenBitMetrics
# object is created per row so that no data accumulates across sentences.
language_code = 'es'

docs = df_sentencias['doc'].values
pages = df_sentencias['page'].values
texts = df_sentencias['text'].values
labels = df_sentencias['bias'].values

listi = []
for doc, page, text, bias in tqdm(zip(docs, pages, texts, labels), total=len(df_sentencias)):
    genbit = GenBitMetrics(language_code, context_window=3, distance_weight=0.95, percentile_cutoff=80)
    genbit.add_data(text, tokenized=False)
    # Row metadata first, then whatever keys get_metrics() returns.
    metrics = {'doc': doc, 'page': page, 'text': text, 'bias': bias}
    metrics.update(genbit.get_metrics(output_statistics=True, output_word_list=False))
    listi.append(metrics)

df_microsoft = pd.DataFrame(listi)
df_microsoft.to_pickle('df_microsoft.pickle')
df_microsoft

In [None]:
# Flatten the two dict-valued columns into one column per key; each new column
# is prefixed with the name of the column it came from.
aa = pd.json_normalize(df_microsoft['additional_metrics']).add_prefix('additional_metrics__')
bb = pd.json_normalize(df_microsoft['statistics']).add_prefix('statistics__')

# Put the flattened columns next to the original ones and drop the raw dict columns.
cc = pd.concat([df_microsoft, aa, bb], axis=1).drop(columns=['additional_metrics', 'statistics'])
cc.to_pickle('df_microsoft_normalized.pickle')

In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger

# Load the pretrained flair sequence tagger; the NER loop in the next cell calls
# tagger.predict() on every sentence.
tagger = SequenceTagger.load("aymurai/flair-ner-spanish-judicial")

In [None]:
# Run the NER tagger on every sentence and keep the predicted spans as dicts.
docs = df_sentencias['doc'].values
pages = df_sentencias['page'].values
texts = df_sentencias['text'].values
labels = df_sentencias['bias'].values

listi = []
for doc, page, text, bias in tqdm(zip(docs, pages, texts, labels), total=len(df_sentencias)):
    sentence = Sentence(text)
    tagger.predict(sentence)  # annotates `sentence` in place
    listi.append({
        'doc': doc,
        'page': page,
        'text': text,
        'bias': bias,
        'NER': [span.to_dict() for span in sentence.get_spans('ner')],
    })

df_ner = pd.DataFrame(listi)
df_ner.to_pickle('df_ner.pickle')
df_ner

In [None]:
from collections import defaultdict

# Turn the per-sentence NER spans into two wide tables with one column per
# label value:
#   * dd     -> how many times each label value was predicted in the sentence
#   * dd_avg -> mean confidence of each label value in the sentence
# Row metadata is read from df_ner itself (it carries doc/page/text/bias), so
# the result does not depend on df_sentencias still being row-aligned with it.
META_COLUMNS = ('doc', 'page', 'text', 'bias')

listi = []
listi_avg = []
for i in range(len(df_ner)):
    base = {column: df_ner[column].values[i] for column in META_COLUMNS}

    # Collect every confidence reported for each label value of this sentence.
    confidences = defaultdict(list)
    for entity in df_ner['NER'].values[i]:
        for label in entity['labels']:
            confidences[label['value']].append(label['confidence'])

    # Aggregate once per sentence, after all entities have been collected
    # (previously this ran again after every single entity).
    metrics = dict(base)
    metrics_avg = dict(base)
    for label_value, scores in confidences.items():
        metrics[label_value] = len(scores)
        metrics_avg[label_value] = sum(scores) / len(scores)

    listi.append(metrics)
    listi_avg.append(metrics_avg)

dd_avg = pd.DataFrame(listi_avg)
dd_avg.to_pickle('df_NER_normalized_avg.pickle')
dd_avg.to_csv('df_NER_normalized_avg.csv',index=False)

dd = pd.DataFrame(listi)
dd.to_pickle('df_NER_normalized.pickle')
dd.to_csv('df_NER_normalized.csv',index=False)

In [None]:
from transformers import BertModel, BertTokenizer

# Load the Spanish BERT tokenizer and encoder; the next cell uses the names
# `tokenizer` and `model` to embed every sentence.
# NOTE(review): the checkpoint name ends in "uncased" yet do_lower_case=False is
# passed to the tokenizer — confirm that skipping lower-casing is intended here.
tokenizer = BertTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased", do_lower_case=False)
model = BertModel.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")

In [None]:
import torch
from torch import IntTensor

# Embed every sentence with the BERT model loaded above and store the hidden
# state of the last token next to the row metadata.
#
# Inputs longer than 512 tokens are shortened to their first 128 and last 384
# tokens (head+tail truncation, https://arxiv.org/pdf/1905.05583.pdf).
# The truncation is done with tensor slicing only, so this cell no longer needs
# `Tensor` / `transformers`, which are imported only in a later cell and raised
# NameError here on a fresh kernel.
listi = []
listi_embeddings = []
for i in tqdm(range(0,len(df_sentencias))):
    x = df_sentencias['text'].values[i]
    metrics = {}
    metrics['doc'] = df_sentencias['doc'].values[i]
    metrics['page'] = df_sentencias['page'].values[i]
    metrics['text'] = x
    metrics['bias'] = df_sentencias['bias'].values[i]

    tokens = tokenizer(x, return_tensors='pt')
    if tokens['input_ids'].shape[-1] > 512:
        # Every encoded field has shape (1, seq_len); cut all of them the same way.
        tokens = {
            key: torch.cat([value[:, :128], value[:, -384:]], dim=1)
            for key, value in tokens.items()
        }

    # Inference only: skip building the autograd graph for each forward pass.
    with torch.no_grad():
        outputs = model(**tokens)
    # [-1][-1] selects the last token of the (single) sequence in the batch.
    # NOTE(review): this is the final position, not the first ([CLS]) one — confirm intended.
    outputs = outputs['last_hidden_state'][-1][-1]

    listi.append(metrics)
    listi_embeddings.append(outputs.tolist())

df_bert = pd.concat([pd.DataFrame(listi),pd.DataFrame(listi_embeddings)],axis=1)
df_bert.to_pickle('df_bert.pickle')
df_bert

In [None]:
from sentence_transformers import SentenceTransformer
# Example input; no other cell in this notebook reads `sentences`.
sentences = ["This is an example sentence", "Each sentence is converted"]

# NOTE: this rebinds `model`, which an earlier cell bound to the BertModel
# instance; cells run after this one see the SentenceTransformer instead.
model = SentenceTransformer('espejelomar/sentece-embeddings-BETO')

In [None]:
# Encode every sentence with the sentence-transformers model and store the
# embedding vector next to the row metadata.
docs = df_sentencias['doc'].values
pages = df_sentencias['page'].values
texts = df_sentencias['text'].values
labels = df_sentencias['bias'].values

listi = []
listi_embeddings = []
for doc, page, text, bias in tqdm(zip(docs, pages, texts, labels), total=len(df_sentencias)):
    listi.append({'doc': doc, 'page': page, 'text': text, 'bias': bias})
    listi_embeddings.append(model.encode(text))

# Metadata columns first, then one column per embedding dimension.
df_st = pd.concat([pd.DataFrame(listi), pd.DataFrame(listi_embeddings)], axis=1)
df_st.to_pickle('df_st.pickle')
df_st

In [None]:
# Drop every row whose document name starts with the excluded prefix.
has_excluded_prefix = df_sentencias['doc'].str.startswith('xxxxxxxxxxxxxxxxxxxxxxxxxxxx')
df_sentencias = df_sentencias[~has_excluded_prefix]
df_sentencias

In [None]:
# Keep only the documents that contain at least one sentence labelled bias == 1.
bias_per_doc = df_sentencias.groupby('doc')['bias'].sum()
aa = set(bias_per_doc[bias_per_doc > 0].index)
df_sentencias_with = df_sentencias[df_sentencias['doc'].isin(aa)]
df_sentencias_with

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import StratifiedGroupKFold

In [None]:
import pickle
def get_splits(df_sentencias, name, n_splits=5, ss=None):
    """Materialise the folds produced by a splitter and pickle them.

    Parameters
    ----------
    df_sentencias : pandas.DataFrame
        Frame to split; its 'bias' column is passed to the splitter as ``y``.
    name : str
        Label printed to stdout and embedded in the output file names.
    n_splits : int
        Only used to build the output file names; the number of folds actually
        produced is decided by ``ss``.
    ss : object
        Splitter exposing ``split(X, y=...)`` and yielding
        ``(train_positions, test_positions)`` pairs.

    Returns
    -------
    list of (list, list)
        For every fold, the index labels of the train rows and of the test rows.

    Side effects
    ------------
    Writes ``split_{name}_{n_splits}_ids.pickle`` (the returned list) and
    ``split_{name}_{n_splits}_rows.pickle`` (for every fold, the train and test
    rows as tuples of the values at column positions 3, 0 and 1).

    Raises
    ------
    ValueError
        If no splitter is given.
    """
    if ss is None:
        raise ValueError('ss must be a splitter object exposing split(X, y=...)')

    def _rows(frame):
        # Columns are addressed by position; .iloc makes that explicit instead
        # of relying on the deprecated integer-key fallback of Series[...].
        return [(row.iloc[3], row.iloc[0], row.iloc[1]) for _, row in frame.iterrows()]

    listi_ids = []
    listi_rows = []
    print(name)
    for train_pos, test_pos in tqdm(ss.split(df_sentencias, y=df_sentencias['bias'])):
        # Select each partition once, by position, so repeated index labels
        # cannot pull rows from the other partition.
        train = df_sentencias.iloc[train_pos]
        test = df_sentencias.iloc[test_pos]
        listi_ids.append((list(train.index), list(test.index)))
        listi_rows.append((_rows(train), _rows(test)))

    with open(f'split_{name}_{n_splits}_ids.pickle','wb') as file:
        pickle.dump(listi_ids,file)

    with open(f'split_{name}_{n_splits}_rows.pickle','wb') as file:
        pickle.dump(listi_rows,file)
    return listi_ids

In [None]:
# Plain and stratified 5-fold splits, first over all documents and then over
# the documents that contain at least one biased sentence.
split_plan = [
    ('KFold_alldocs', KFold(n_splits=5,random_state=None), df_sentencias),
    ('StratifiedKFold_alldocs', StratifiedKFold(n_splits=5,random_state=None), df_sentencias),
    ('KFold_with', KFold(n_splits=5,random_state=None), df_sentencias_with),
    ('StratifiedKFold_with', StratifiedKFold(n_splits=5,random_state=None), df_sentencias_with),
]

# `listi` ends up holding the folds of the last entry ('StratifiedKFold_with').
for name, ss, frame in split_plan:
    listi = get_splits(frame, name, n_splits=5, ss=ss)

In [None]:
# Show the test rows of one fold. `listi` holds one (train_ids, test_ids) pair
# per fold, so valid positions are 0 .. len(listi) - 1; the previous value 5
# was out of range for the 5 folds generated above.
i = len(listi) - 1
df_sentencias_with[df_sentencias_with.index.isin(listi[i][1])]

In [None]:
def get_group_splits(df_sentencias, name, n_splits=5, ss=None):
    """Materialise the folds of a group-aware splitter and pickle them.

    The splitter receives the frame index as ``X``, the 'bias' column as ``y``
    and the 'doc' column as ``groups``.

    Parameters
    ----------
    df_sentencias : pandas.DataFrame
        Frame to split; must contain the columns 'bias' and 'doc'.
    name : str
        Label printed to stdout and embedded in the output file names.
    n_splits : int
        Only used to build the output file names; the number of folds actually
        produced is decided by ``ss``.
    ss : object
        Splitter exposing ``split(X, y, groups=...)`` and yielding
        ``(train_positions, test_positions)`` pairs.

    Returns
    -------
    list of (list, list)
        For every fold, the index labels of the train rows and of the test rows.

    Side effects
    ------------
    Writes ``split_{name}_{n_splits}_ids.pickle`` (the returned list) and
    ``split_{name}_{n_splits}_rows.pickle`` (for every fold, the train and test
    rows as tuples of the values at column positions 3, 0 and 1).

    Raises
    ------
    ValueError
        If no splitter is given.
    """
    if ss is None:
        raise ValueError('ss must be a splitter object exposing split(X, y, groups=...)')

    def _rows(frame):
        # Columns are addressed by position; .iloc makes that explicit instead
        # of relying on the deprecated integer-key fallback of Series[...].
        return [(row.iloc[3], row.iloc[0], row.iloc[1]) for _, row in frame.iterrows()]

    listi_ids = []
    listi_rows = []
    print(name)
    for train_index, test_index in ss.split(df_sentencias.index, df_sentencias['bias'],groups=df_sentencias['doc']):
        # Select each partition once, by position, so repeated index labels
        # cannot pull rows from the other partition.
        train = df_sentencias.iloc[train_index]
        test = df_sentencias.iloc[test_index]
        listi_ids.append((list(train.index), list(test.index)))
        listi_rows.append((_rows(train), _rows(test)))

    with open(f'split_{name}_{n_splits}_ids.pickle','wb') as file:
        pickle.dump(listi_ids,file)

    with open(f'split_{name}_{n_splits}_rows.pickle','wb') as file:
        pickle.dump(listi_rows,file)
    return listi_ids

In [None]:
# Group-aware 5-fold splits (groups = documents), so that all sentences of a
# document land in the same partition. Every output name now uses the splitter
# it is named after: 'StratifiedGroupKFold_alldocs' was previously built with
# StratifiedKFold and 'GroupKFold_with' with KFold, both of which ignore
# `groups` and therefore let sentences of one document leak across partitions.
ss = GroupKFold(n_splits=5)
name = 'GroupKFold_alldocs'
get_group_splits(df_sentencias, name, n_splits=5, ss=ss)

name = 'StratifiedGroupKFold_alldocs'
ss = StratifiedGroupKFold(n_splits=5,random_state=None)
listi = get_group_splits(df_sentencias, name, n_splits=5, ss=ss)

ss = GroupKFold(n_splits=5)
name = 'GroupKFold_with'
get_group_splits(df_sentencias_with, name, n_splits=5, ss=ss)

name = 'StratifiedGroupKFold_with'
ss = StratifiedGroupKFold(n_splits=5,random_state=None)
get_group_splits(df_sentencias_with, name, n_splits=5, ss=ss)

In [None]:
# Try parsing sentences_with_annotations_20220323a.txt and all_sentencias_ss.html:
# those are known to contain every paragraph we need.
import re
# BeautifulSoup was used below without being imported anywhere in the notebook.
from bs4 import BeautifulSoup

# Matches from the first '<<' to the last '>>' of a context (greedy).
ww = re.compile('<<.*>>')

# The file names should all match here!
listi = []
with open('all_sentencias_ss.html') as fp:
    soup = BeautifulSoup(fp, 'html.parser')

for x in tqdm(soup.find('body').find_all(class_='tabla_termino')):
    context_raw = x.find(class_='tabla_termino_columna_contexto').text
    marked = ww.search(context_raw)

    sesgo = {}
    sesgo['concepto'] = x.find(class_='tabla_termino_columna_concepto').text
    # The context is stored without the '<<' / '>>' markers.
    sesgo['context'] = context_raw.replace('<<','').replace('>>','')
    # A context without any '<<...>>' marker yields an empty 'words' value
    # instead of failing on a None match.
    sesgo['words'] = marked.group().replace('<<','').replace('>>','') if marked else ''
    listi.append(sesgo)

df_themis_all = pd.DataFrame(listi)
df_themis_all.to_pickle('df_themis_all.pickle')
df_themis_all.to_csv('df_themis_all.csv',index=False)
df_themis_all

In [None]:
# Parse the plain-text dump: a line starting with '####' opens a new document
# (its title is whatever follows the first five characters); any other line
# longer than one character is a paragraph of the current document.
listi = []
title = None
with open('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.txt','r',encoding='utf-8') as file:
    for line in file:
        if line.startswith('####'):
            title = line[5:].replace('\n','')
            continue
        # Paragraph line: skip lines that hold at most a single character.
        if len(line) > 1:
            listi.append({'doc': title, 'text': line.replace('\n','')})

df_all_sentencias = pd.DataFrame(listi)
df_all_sentencias

In [None]:
# Match the paragraphs against the THEMIS contexts: a paragraph gets themis = 1
# when at least one non-empty context string occurs inside its text, else 0.
contexts = list(df_themis_all['context'])

ll = []
for text in tqdm(df_all_sentencias['text'].values):
    # `ctx and ...` keeps empty contexts from matching every paragraph.
    found = any(ctx and ctx in text for ctx in contexts)
    ll.append(1 if found else 0)

df_all_sentencias['themis'] = ll
df_all_sentencias

In [None]:
# Attach the THEMIS flag computed on df_all_sentencias to df_sentencias by
# matching rows on the (doc, text) pair.
# Build the lookup once instead of scanning the whole frame for every row; when
# a pair occurs several times the first occurrence wins, as before.
themis_by_key = {}
for doc, text, themis in zip(df_all_sentencias['doc'],
                             df_all_sentencias['text'],
                             df_all_sentencias['themis']):
    themis_by_key.setdefault((doc, text), themis)

listi = []
for doc, text in tqdm(zip(df_sentencias['doc'].values, df_sentencias['text'].values),
                      total=len(df_sentencias)):
    key = (doc, text)
    if key not in themis_by_key:
        # Fail with the offending row rather than an opaque IndexError.
        raise LookupError(f'no paragraph in df_all_sentencias matches doc={doc!r}, text={text!r}')
    listi.append(themis_by_key[key])

# assign() returns a new frame, so the column is not written into a filtered
# slice of an earlier frame (avoids SettingWithCopyWarning).
df_sentencias = df_sentencias.assign(themis=listi)
df_sentencias.to_pickle('df_themis.pickle')
df_sentencias.to_csv('df_themis.csv',index=False)
df_sentencias

In [None]:
from transformers import RobertaTokenizer, RobertaModel
import torch

# Load the RoBERTalex tokenizer and encoder.
# NOTE: this rebinds `tokenizer` and `model`, which earlier cells bound to other
# models; the embedding loop in the next cell uses these new bindings.
tokenizer = RobertaTokenizer.from_pretrained('PlanTL-GOB-ES/RoBERTalex')
model = RobertaModel.from_pretrained('PlanTL-GOB-ES/RoBERTalex')

In [None]:
from torch import Tensor, IntTensor
import transformers

# Embed every sentence with the RoBERTalex model and keep the hidden state of
# the last token together with the row metadata.
listi = []
listi_embeddings = []
for i in tqdm(range(len(df_sentencias))): # 314
    x = df_sentencias['text'].values[i]
    metrics = {
        'doc': df_sentencias['doc'].values[i],
        'page': df_sentencias['page'].values[i],
        'text': x,
        'bias': df_sentencias['bias'].values[i],
    }

    tokens = tokenizer(x, return_tensors='pt')
    if tokens['input_ids'].shape[-1] > 512:
        # Head+tail truncation (https://arxiv.org/pdf/1905.05583.pdf):
        # keep the first 128 and the last 384 tokens of the sequence.
        ids = tokens['input_ids'][-1]
        mask = tokens['attention_mask'][-1]
        tokens = transformers.BatchEncoding({
            'input_ids': torch.cat([ids[:128], ids[-384:]]).unsqueeze(0).int(),
            'attention_mask': torch.cat([mask[:128], mask[-384:]]).unsqueeze(0).to(torch.int64),
        })

    embeddings = model(**tokens).last_hidden_state[-1][-1]
    listi.append(metrics)
    listi_embeddings.append(embeddings.tolist())
    # Release the per-row tensors before the next iteration.
    del tokens
    del embeddings

In [None]:
# Combine the metadata rows and the embeddings collected by the previous cell.
# NOTE: this reuses the name `df_st`, which an earlier cell bound to the
# sentence-transformers embeddings frame.
df_st = pd.concat([pd.DataFrame(listi),pd.DataFrame(listi_embeddings)],axis=1)
df_st

In [None]:
# Persist the frame currently bound to df_st as 'df_robertalex.pickle'.
df_st.to_pickle('df_robertalex.pickle')

In [None]:
# Display the text of the second row (position 1) of df_sentencias.
df_sentencias['text'].values[1]

In [None]:
import spacy
# spaCy pipeline bound to `nlp`; the TF-IDF preprocessor defined further down
# calls nlp(text) to tokenise each document.
nlp = spacy.load('es_core_news_sm')

In [None]:
# WARNING: do not use this before the training/test split; this is example code only.
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

# NLTK's Spanish stop words, checked in addition to spaCy's own stop-word flag.
stop_words = set(nltk.corpus.stopwords.words('spanish'))


def preproc(text):
    """Return the lower-cased lemmas of the content tokens of `text`, space-joined.

    A token is kept only when spaCy flags it as neither a stop word nor
    punctuation, it is purely alphabetic, and its surface form is not in
    `stop_words`.
    """
    lemmas = [
        token.lemma_.lower()
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
        and token.is_alpha
        and token.text not in stop_words
    ]
    return ' '.join(lemmas)


vectorizer = TfidfVectorizer(
    lowercase=True,  # max_features and a few other options can be added here if wanted
    preprocessor=preproc,
)

In [None]:
# Fit the TF-IDF vectorizer on every sentence and store the dense matrix with
# one column per vocabulary term, keeping the row index of df_sentencias.
X = vectorizer.fit_transform(df_sentencias['text'])
feature_names = vectorizer.get_feature_names_out()
df_tf = pd.DataFrame(X.toarray(), index=df_sentencias.index, columns=feature_names)
df_tf.to_pickle('df_tfidf_full.pickle')
df_tf

In [None]:
# TF-IDF rows for the sentences of df_sentencias_with only.
# X holds one row per row of df_sentencias, so labelling the whole matrix with
# the (shorter) index of df_sentencias_with fails with a shape mismatch. Select
# the matching rows of X instead; the vocabulary and weights stay those of the
# full fit, and the vectorizer is not refitted.
with_mask = df_sentencias.index.isin(df_sentencias_with.index)
df_tf = pd.DataFrame(
    X[with_mask].toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=df_sentencias.index[with_mask],
)
df_tf.to_pickle('df_tfidf_with.pickle')
df_tf

In [None]:
import os
dir_path = './'

# Write an '__anon__' copy of every 'df*.pickle' frame that has a 'doc' column:
# the identifying columns are removed, 'label' is renamed to 'bias', and only
# the rows whose index appears in df_sentencias_with are kept.
hidden_columns = {'doc', 'page', 'text', 'highlight'}

for ff in os.listdir(dir_path):
    if not (ff.startswith('df') and ff.endswith('.pickle')):
        continue

    df = pd.read_pickle(dir_path + ff)
    if 'doc' not in df.columns:
        continue

    print(ff)

    visible_columns = [column for column in df.columns if column not in hidden_columns]
    df = df[visible_columns].rename(columns={'label':'bias'})
    df = df[df.index.isin(df_sentencias_with.index)]

    df.to_pickle(dir_path + '__anon__' + ff)

In [None]:
# Keep the label plus the selected explanation columns under shorter names and
# write the result back to the same file.
# The rename runs BEFORE the column selection and the selection uses the new
# names: rename() ignores names that are absent, so re-running this cell on the
# already rewritten file works instead of failing with KeyError.
expli_column_names = {
    'responses_ZS_expli_gemma2b':'zero-shot__gemma2b',
    'responses_ZS_expli_gpt-3.5-turbo':'zero-shot__gpt-3.5-turbo',
    'responses_ZS_expli_llama27b-chat':'zero-shot__llama27b-chat',
    'responses_ZS_expli_mistral7b-instruct':'zero-shot__mistral7b-instruct',
    'responses_bert_FS_expli_gpt-3.5-turbo_dynamic_all_False_4':'few-shot_dynamic__gpt-3.5-turbo',
    'responses_bert_FS_expli_mistral7b-instruct_dynamic_all_False_4':'few-shot_dynamic__mistral7b-instruct',
    'responses_bert_FS_expli_gemma2b_dynamic_all_False_4':'few-shot_dynamic__gemma2b',
}

df_expli = pd.read_pickle(dir_path + '__anon__df_explicaciones.pickle')
df_expli = df_expli.rename(columns=expli_column_names)
df_expli = df_expli[['bias', *expli_column_names.values()]]
df_expli.to_pickle(dir_path + '__anon__df_explicaciones.pickle')

In [None]:
# Split every few-shot gpt-3.5-turbo response into its first element ('lala')
# and its second element ('expli'), then keep only the rows whose first element
# equals the 'bias' label.
# assign() builds a new frame, so the new columns are not written into a column
# slice of the previous frame (avoids the chained-assignment warning).
responses = df_expli['few-shot_dynamic__gpt-3.5-turbo']
df_expli = df_expli[['bias','few-shot_dynamic__gpt-3.5-turbo']].assign(
    lala=[response[0] for response in responses],
    expli=[response[1] for response in responses],
)
df_expli = df_expli[df_expli['bias'] == df_expli['lala']]

In [None]:
# List the 'expli' values of the rows of df_expli whose 'bias' equals 1.
list(df_expli[df_expli['bias'] == 1]['expli'].values)