In [2]:
import pandas as pd
import torch
from prep_input_data import get_labelled_data
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig

In [3]:
corpus_file = '../../data/democracy_reports_corpus_merged_040624.csv'
model_file = '../../data/ROBERTA_merged_classifier_democracy.pth'

In [4]:
df = pd.read_csv(corpus_file)
df['predicted_class_roberta'] = None
print(df.sample(1))

                                                 sentence  country  year  \
267531  whenever possible, it provides examples of eth...  romania  2017   

       source dimension1_r1 dimension2_r1 backsliding_r1  start_idea_r1  \
267531  greco           NaN           NaN            NaN            NaN   

       dimension0_r2 dimension1_r2  ... dimension2_r3  backsliding_r3  \
267531           NaN           NaN  ...           NaN             NaN   

       start_idea_r2 start_idea_r3 dimension0_r1  correct_dimension  \
267531           NaN           NaN           NaN                NaN   

        expalained_ambiquous Unnamed: 21 dimension0 predicted_class_roberta  
267531                   NaN         NaN        NaN                    None  

[1 rows x 24 columns]


  df = pd.read_csv(corpus_file)


In [5]:
MAX_LEN = 512
device = 'cpu'
# Get label names
labels = sorted(df['dimension0'].dropna().unique())
print(f'Labels: {labels}')
NO_LABELS = len(labels)

Labels: ['ambiguous', 'democracy', 'electoral', 'liberal', 'media', 'participatory']


In [6]:
class ROBERTAClass(torch.nn.Module):
    def __init__(self):
        super(ROBERTAClass, self).__init__()
        self.l1 = transformers.RobertaModel.from_pretrained('roberta-base')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, NO_LABELS)
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

In [7]:
#class LEGALBERTClass(torch.nn.Module):
#    def __init__(self):
#        super(BERTClass, self).__init__()
#        self.l1 = transformers.BertModel.from_pretrained('nlpaueb/legal-bert-base-uncased')
#        self.l2 = torch.nn.Dropout(0.3)
#        self.l3 = torch.nn.Linear(768, NO_LABELS)
#    
#    def forward(self, ids, mask, token_type_ids):
#        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
#        output_2 = self.l2(output_1)
#        output = self.l3(output_2)
#        return output

In [8]:
#class BERTClass(torch.nn.Module):
#    def __init__(self):
#        super(BERTClass, self).__init__()
#        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
#        self.l2 = torch.nn.Dropout(0.3)
#        self.l3 = torch.nn.Linear(768, NO_LABELS)
#    
#    def forward(self, ids, mask, token_type_ids):
#        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
#        output_2 = self.l2(output_1)
#        output = self.l3(output_2)
#        return output

In [9]:
#tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
#tokenizer_legal_bert = BertTokenizer.from_pretrained('nlpaueb/legal-bert-base-uncased', do_lower_case = True)
tokenizer_roberta = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case = True)
#model_bert = torch.load('BERT_classifier_democracy.pth')
#model_legal_bert = torch.load('LEGAL_BERT_classifier_democracy.pth')
model_roberta = torch.load(model_file)
#models = [('bert', model_bert, tokenizer_bert), ('legal_bert', model_legal_bert, tokenizer_legal_bert), ('roberta_base', model_roberta, tokenizer_roberta)]
#tokenizers = [tokenizer_bert, tokenizer_legal_bert, tokenizer_roberta]
models = [('roberta_base', model_roberta, tokenizer_roberta)]

In [14]:
import math
def get_dimension_from_prediction(v, l=labels, t=0.5):
    m = max(v)
    
    index = v.index(m)  
    if (m < t):
        #return f"bt:{l[index]}:{round(m,3)}"
        return "NoClass"
    return l[index]
    
def predict_single_sentence(sentence, tokenizer, model):
    # Encode the sentence
    encoding = tokenizer.encode_plus(
      sentence.lower(),
      add_special_tokens=True,
      max_length=MAX_LEN,
      padding='max_length',
      truncation=True,
      return_token_type_ids=True,
      return_tensors='pt'
    )
    
    input_ids = encoding["input_ids"]
    mask = encoding["attention_mask"]
    token_type_ids = encoding["token_type_ids"]
    input_ids = input_ids.to(device, dtype=torch.long)
    mask = mask.to(device, dtype=torch.long)
    token_type_ids = token_type_ids.to(device, dtype=torch.long)
    
    # Inference
    output = torch.sigmoid(model(input_ids, mask, token_type_ids))
    predictions = output[0].tolist()
    #for label, prediction in zip(labels, predictions):
    #    print(f"{label}: {prediction}")
    predicted_class = get_dimension_from_prediction(predictions)
    #print(predicted_class)
    return predicted_class

In [None]:
import time
import math
start_time = time.time()
df_cleaned = df.dropna(subset=['sentence'])

for name, model, tokenizer in models:
    no_rows = len(df_cleaned)
    print(f'Classifying using {name} number of rows {no_rows}.')
    
    for index, row in df_cleaned.iterrows():
        if (index % 100 == 0):
            time_stamp = time.time()
            execution_time = time_stamp - start_time
            estimated_time = (execution_time * (no_rows - (index + 1))) / (index + 1)
            hours = estimated_time // 3600
            remaining_seconds = estimated_time % 3600
            minutes = remaining_seconds // 60
            #print(f"estimated execution time: {int(hours)}:{int(minutes)}")
            print(f"Progress at row {index}: {round((100 * index)/no_rows, 3)}%. Time remaining {int(hours)}h {int(minutes)}m.")            
            
        sentence = row['sentence']
        df_cleaned.at[index, 'predicted_class_roberta'] = predict_single_sentence(sentence, tokenizer, model)
        
        
    
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

Classifying using roberta_base number of rows 459931.
Progress at row 0: 0.0%. Time remaining 38h 0m.
Progress at row 100: 0.022%. Time remaining 22h 15m.


In [19]:
#estimated_time = (execution_time * len(df)) / sample_size
#hours = estimated_time // 3600
#remaining_seconds = estimated_time % 3600
#minutes = remaining_seconds // 60
#print(f"estimated execution time: {int(hours)}:{int(minutes)}")

In [18]:
# save sample
from datetime import datetime

# Get current date
current_date = datetime.now()

# Format the date
formatted_date = current_date.strftime('%d%m%y')


file_name = f"../../data/democracy_reports_corpus_merged_predictions_{formatted_date}.csv"
print(file_name)
df_cleaned.to_csv(file_name, index=False)

../../data/democracy_reports_corpus_merged_predictions_110624.csv
