In [1]:
import pandas as pd
import torch
from prep_input_data import get_labelled_data
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig

  df = pd.read_csv(corpus_file, dtype={'year': str},comment='#')


985


In [2]:
# Location of the annotated corpus, relative to this notebook.
corpus_file = '../../data/democracy_reports_corpus_annelisa_9_fixed.csv'

# Load the corpus, discard rows that have no `source`, and keep only the
# rows whose `source` contains the substring 'freedom'.
df = (
    get_labelled_data(corpus_file, all=True)
    .dropna(subset=['source'])
    .loc[lambda frame: frame['source'].str.contains('freedom')]
)

# Show one random row as a quick sanity check of the columns.
print(df.sample(1))

  df = pd.read_csv(corpus_file, dtype={'year': str},comment='#')


                                                sentence  country  year  \
56838  "However, this revenue was obtained mainly fro...  moldova  2018   

                              source dimension1 dimension2  backsliding  \
56838  freedomhouse_nations-transit"        NaN        NaN          NaN   

      cat_4_sentence_nuance  start_idea comments undefined0 undefined1  \
56838                   NaN         NaN      NaN        NaN        NaN   

      consensus  
56838       NaN  


In [3]:
# Sequence length handed to the tokenizer and the torch device used for inference.
MAX_LEN = 512
device = 'cpu'

# Label names: the distinct non-null values of `dimension1`, in sorted order.
# NOTE(review): these are derived from the filtered `df`; the loaded classifiers
# presumably expect the label order used at training time - confirm they match.
annotated_dimensions = df['dimension1'].dropna()
labels = sorted(annotated_dimensions.unique())
NO_LABELS = len(labels)
print(f'Labels: {labels}')

Labels: ['ambiguous', 'civil society', 'direct democracy', 'elections', 'electoral', 'equality', 'freedoms', 'liberal institutions', 'media', 'open government', 'participatory', 'political competition']


In [4]:
class ROBERTAClass(torch.nn.Module):
    """RoBERTa encoder ('roberta-base') with a dropout + linear classification head.

    Attributes:
        l1: the pretrained encoder.
        l2: dropout (p=0.3) applied to the encoder's second output.
        l3: linear layer mapping 768 features to ``NO_LABELS`` logits.
    """

    def __init__(self):
        super().__init__()
        # Use the class imported via `from transformers import RobertaModel`;
        # the bare `transformers` module name is not in scope in this notebook.
        self.l1 = RobertaModel.from_pretrained('roberta-base')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, NO_LABELS)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw (pre-sigmoid) logits for a batch of encoded inputs.

        Args:
            ids: input token ids.
            mask: attention mask, forwarded as ``attention_mask``.
            token_type_ids: segment ids, forwarded unchanged.

        Returns:
            Output of the linear head ``l3``.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output, per the transformers docs) is used.
        _, output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output
        
class LEGALBERTClass(torch.nn.Module):
    """Legal-BERT encoder ('nlpaueb/legal-bert-base-uncased') with a dropout + linear head.

    Attributes:
        l1: the pretrained encoder.
        l2: dropout (p=0.3) applied to the encoder's second output.
        l3: linear layer mapping 768 features to ``NO_LABELS`` logits.
    """

    def __init__(self):
        # Zero-argument super() binds to this class; the previous
        # `super(BERTClass, self)` named a different class and raised a
        # TypeError whenever a LEGALBERTClass was instantiated.
        super().__init__()
        # Use the class imported via `from transformers import BertModel`;
        # the bare `transformers` module name is not in scope in this notebook.
        self.l1 = BertModel.from_pretrained('nlpaueb/legal-bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, NO_LABELS)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw (pre-sigmoid) logits for a batch of encoded inputs.

        Args:
            ids: input token ids.
            mask: attention mask, forwarded as ``attention_mask``.
            token_type_ids: segment ids, forwarded unchanged.

        Returns:
            Output of the linear head ``l3``.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output, per the transformers docs) is used.
        _, output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

class BERTClass(torch.nn.Module):
    """BERT encoder ('bert-base-uncased') with a dropout + linear classification head.

    Attributes:
        l1: the pretrained encoder.
        l2: dropout (p=0.3) applied to the encoder's second output.
        l3: linear layer mapping 768 features to ``NO_LABELS`` logits.
    """

    def __init__(self):
        super().__init__()
        # Use the class imported via `from transformers import BertModel`;
        # the bare `transformers` module name is not in scope in this notebook.
        self.l1 = BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, NO_LABELS)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw (pre-sigmoid) logits for a batch of encoded inputs.

        Args:
            ids: input token ids.
            mask: attention mask, forwarded as ``attention_mask``.
            token_type_ids: segment ids, forwarded unchanged.

        Returns:
            Output of the linear head ``l3``.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element (the pooled output, per the transformers docs) is used.
        _, output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

In [5]:
# One tokenizer per classifier, matching the checkpoint each model wraps.
tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
tokenizer_legal_bert = BertTokenizer.from_pretrained('nlpaueb/legal-bert-base-uncased', do_lower_case = True)
tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case = True)

# SECURITY: torch.load unpickles whole Python objects, which can execute
# arbitrary code - only load checkpoint files from a trusted source.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects fully pickled models; pass weights_only=False explicitly if needed.
#
# map_location places the weights on `device` even if the checkpoint was saved
# from another device. `.eval()` returns the module itself after switching it
# to inference mode, so the Dropout layers are disabled during prediction.
model_bert = torch.load('BERT_classifier_democracy.pth', map_location=device).eval()
model_legal_bert = torch.load('LEGAL_BERT_classifier_democracy.pth', map_location=device).eval()
model_roberta = torch.load('ROBERTA_classifier_democracy.pth', map_location=device).eval()

# (column name, model, tokenizer) triples consumed by the classification loop.
models = [('bert', model_bert, tokenizer_bert), ('legal_bert', model_legal_bert, tokenizer_legal_bert), ('roberta_base', model_roberta, tokenizer_roberta)]

In [7]:
import math
def get_dimension_from_prediction(v, l=None, t=0.5):
    """Map a list of per-label scores to a single label name.

    Args:
        v: list of scores, one per label, in the same order as ``l``.
        l: label names; when omitted, the notebook-level ``labels`` list is
            looked up at call time, so re-running the labels cell is picked up.
        t: minimum score the best label must reach.

    Returns:
        The label with the highest score (the first one on ties), or the
        string "NoClass" when that highest score is below ``t``.
    """
    if l is None:
        # Resolved here rather than in the signature: a default argument is
        # evaluated once at definition time and would go stale.
        l = labels
    m = max(v)
    if m < t:
        return "NoClass"
    return l[v.index(m)]
    
def predict_single_sentence(sentence, tokenizer, model):
    """Classify one sentence with the given tokenizer/model pair.

    Args:
        sentence: text to classify; it is lower-cased before tokenization.
        tokenizer: tokenizer exposing ``encode_plus``.
        model: callable taking ``(input_ids, mask, token_type_ids)`` and
            returning logits with one row per input.

    Returns:
        The value produced by ``get_dimension_from_prediction`` for the
        sigmoid scores of the first row: a label name or "NoClass".
    """
    # Encode the sentence, padded/truncated to exactly MAX_LEN tokens
    encoding = tokenizer.encode_plus(
        sentence.lower(),
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_token_type_ids=True,
        return_tensors='pt',
    )

    input_ids = encoding["input_ids"].to(device, dtype=torch.long)
    mask = encoding["attention_mask"].to(device, dtype=torch.long)
    token_type_ids = encoding["token_type_ids"].to(device, dtype=torch.long)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        output = torch.sigmoid(model(input_ids, mask, token_type_ids))

    predictions = output[0].tolist()
    return get_dimension_from_prediction(predictions)

In [8]:
import time
import math
# Time the whole sampling + classification run; the elapsed time is reused
# afterwards to extrapolate the cost of classifying the full corpus.
start_time = time.time()

sample_size = 1000
df_cleaned = df.dropna(subset=['sentence'])
# Never ask for more rows than exist: DataFrame.sample raises otherwise.
sample_size = min(sample_size, len(df_cleaned))
# Fixed seed so that re-running the notebook classifies the same sentences.
sample_df = df_cleaned.sample(sample_size, random_state=42).copy()

# Add one prediction column per model, named after the model.
for name, model, tokenizer in models:
    print(f'Classifying using {name}.')
    sample_df[name] = [
        predict_single_sentence(sentence, tokenizer, model)
        for sentence in sample_df['sentence']
    ]

end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

Classifying using bert.
Classifying using legal_bert.
Classifying using roberta_base.
Execution time: 534.6859068870544 seconds


In [9]:
# Linearly extrapolate the measured sample runtime to every row of `df`.
estimated_time = (execution_time * len(df)) / sample_size
hours, remaining_seconds = divmod(estimated_time, 3600)
minutes = remaining_seconds // 60
# Zero-pad the minutes so that e.g. 13 h 5 min prints as 13:05 rather than 13:5.
print(f"estimated execution time: {int(hours)}:{int(minutes):02d}")

estimated execution time: 13:55


In [10]:
# Persist the classified sample as CSV; the sample size is part of the file
# name and the DataFrame index is not written.
file_name = f"sample_bert_legal_roberta_fh_{sample_size}.csv"
sample_df.to_csv(file_name, index=False)