In [213]:
import numpy as np
import pandas as pd
import pyarrow
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

In [214]:
from torch import cuda

# Run on the GPU when CUDA reports one as available, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Using device: " + device)

Using device: cpu


In [215]:
import os
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` has been loaded, so `urllib.request.urlretrieve` could
# fail with AttributeError on a fresh kernel.
import urllib.request

# Local corpus file name and the URL it is fetched from
corpus_file = 'democracy_reports_corpus.csv'
corpus_file_url = "https://github.com/backdem/democracy-datasets/raw/main/democracy_reports_corpus.csv"

# Download the dataset only if it is not already present locally
if not os.path.exists(corpus_file):
    urllib.request.urlretrieve(corpus_file_url, corpus_file)

In [216]:
# Read the corpus CSV into a DataFrame, forcing 'year' and 'sentence' to str.
# NOTE(review): comment='#' makes pandas stop parsing a line at the first '#',
# so any sentence containing '#' would be cut short — confirm this is intended.
df = pd.read_csv(corpus_file, dtype={'year': str, 'sentence': str}, comment='#')
# Print five randomly sampled rows (not the first row)
print(df.sample(5))

                                                 sentence  \
403487  in particular, the oversight role to be perfor...   
287463  there have been no such criminal cases in rece...   
250805  greco concludes that recommendation xi has bee...   
216381  it furthermore took note of a planned amendmen...   
36625   During the year, civil society organizations (...   

                                                  section  country  year  \
403487                                               none    italy  2011   
287463                                               none   sweden  2013   
250805                                               none  andorra  2020   
216381                                               none  germany  2021   
36625   ['Executive Summary', 'At a Glance', 'Assesses...  armenia  2022   

                              source  
403487                         greco  
287463                         greco  
250805                         greco  
216381              

In [217]:
# Generate random labels for classification
def generate_label_vector(x, num_labels=6):
    """Return a random multi-hot label vector.

    Args:
        x: Ignored. Present only so the function can be used with
            ``Series.apply``, which passes each element as the first argument.
        num_labels: Length of the returned vector. Defaults to 6.

    Returns:
        A 1-D NumPy integer array of length ``num_labels`` whose entries are
        each 0 or 1.

    Note:
        Values are drawn from NumPy's global random state, so results are only
        reproducible if that state is seeded before this is called.
    """
    return np.random.randint(2, size=num_labels)

In [218]:
# Start a new frame that holds only the 'sentence' column of the corpus
df_sentences_labels = pd.DataFrame(df['sentence'])
# Insert string label column into DF
# NOTE(review): `df_labels` is not defined in any cell visible in this notebook;
# on a fresh kernel this line raises NameError. It presumably came from a cell
# that was deleted or run out of order — TODO restore or replace it.
# `insert` also raises if 'label_str' already exists, so this cell cannot be re-run as is.
df_sentences_labels.insert(loc=1,column="label_str", value=df_labels)
# Convert string label colum to numerical column
#df_sentences_labels['label'] = pd.factorize(df_sentences_labels['label_str'])[0]
# The label is a random vector: generate_label_vector ignores the 'label_str' value it receives.
df_sentences_labels['label'] = df_sentences_labels['label_str'].apply(generate_label_vector)


In [223]:
# Use new_df in place of df from here onwards (same object as df_sentences_labels, not a copy)
new_df = df_sentences_labels
# Display a random sample of 10 rows, showing only the sentence and label columns
new_df.sample(10)[['sentence', 'label']]

Unnamed: 0,sentence,label
366301,it is answerable to the minister for the civil...,"[0, 1, 1, 0, 1, 0]"
430509,the parliament of georgia is unicameral.,"[0, 1, 0, 1, 0, 1]"
10440,"In 2020, about 16,000 Venezuelans resided on C...","[0, 0, 0, 1, 1, 0]"
286398,high-profile cases allegedly rarely reached th...,"[0, 1, 0, 1, 1, 1]"
242515,greco welcomes that such a system is now in pl...,"[1, 0, 1, 0, 1, 1]"
414926,police and sbgs officers may be dismissed for ...,"[0, 1, 0, 1, 1, 1]"
123282,Meanwhile programs helped to stimulate the hou...,"[0, 1, 0, 0, 1, 0]"
238028,"in the meantime, the newly elected president h...","[0, 1, 1, 1, 0, 1]"
87589,"For example, In April 2022, a court sentenced ...","[0, 1, 1, 0, 0, 1]"
376945,les partis politiques doivent être enregistrés...,"[0, 1, 1, 1, 0, 0]"


In [224]:
from transformers import BertTokenizer
# Sequence length every sentence is padded/truncated to when tokenized
MAX_LEN = 512
# Batch sizes passed to the training and testing DataLoaders
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
# Number of passes over the training loader
EPOCHS = 1
# Learning rate handed to the optimizer
LEARNING_RATE = 1e-05
# Pretrained uncased BERT tokenizer; input text is lower-cased
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

In [233]:
class CustomDataset(Dataset):
    """Dataset that tokenizes sentences on access and pairs them with label vectors.

    Args:
        dataframe: DataFrame exposing a ``sentence`` column (text) and a
            ``label`` column (one label vector per row).
        tokenizer: Object providing ``encode_plus``; it is called with the
            keyword arguments shown in ``__getitem__``.
        max_len: Length each tokenized sentence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.sentence
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Return the tokenized sentence and target tensor at position ``index``.

        Rows are looked up by position (``.iloc``) rather than by index label,
        so the dataset also works when the DataFrame does not carry a fresh
        0..n-1 index (e.g. after sampling without ``reset_index``).

        Returns:
            dict with ``ids``, ``mask`` and ``token_type_ids`` as long tensors
            and ``targets`` as a float tensor.
        """
        comment_text = str(self.comment_text.iloc[index])
        # Collapse any run of whitespace into a single space
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,  # no second sentence (text pair)
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [241]:
# Fraction of rows sampled into the training set; all remaining rows go to the test set.
# NOTE(review): with 0.1 the test set is nine times larger than the training set — confirm this is intended.
train_size = 0.1
# Fixed random_state makes the sampled split repeatable
train_dataset=new_df.sample(frac=train_size,random_state=200)
# Test set = every row not sampled for training; both frames get a fresh 0..n-1 index
test_dataset=new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Wrap both frames so sentences are tokenized on access
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (460728, 3)
TRAIN Dataset: (46073, 3)
TEST Dataset: (414655, 3)


In [242]:
# Keyword arguments for the two DataLoaders; both shuffle and load in the main process.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

# Batch iterators over the training and testing datasets
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [243]:
# Customized model: a dropout and a dense layer on top of the pretrained BERT encoder produce the final output.

class BERTClass(torch.nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear output layer.

    Args:
        num_labels: Number of outputs of the final linear layer. Defaults to 6.
        dropout: Dropout probability applied before the linear layer.
            Defaults to 0.3.
        pretrained_name: Name passed to ``BertModel.from_pretrained``.
            Defaults to 'bert-base-uncased'.

    The submodule names ``l1``/``l2``/``l3`` are kept so previously saved
    state dicts still load.
    """

    def __init__(self, num_labels=6, dropout=0.3, pretrained_name='bert-base-uncased'):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained(pretrained_name)
        self.l2 = torch.nn.Dropout(dropout)
        # Take the input width from the encoder's config instead of hard-coding 768
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return the linear layer's raw outputs (no sigmoid applied) for a batch.

        Args:
            ids: Token id tensor passed to the encoder.
            mask: Attention mask tensor.
            token_type_ids: Token type id tensor.
        """
        # With return_dict=False the encoder returns a tuple; only its second element is used.
        _sequence_output, output_1 = self.l1(
            ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False
        )
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

# Build the model and move its parameters to the selected device.
# As the last expression of the cell, model.to(device) also displays the module structure.
model = BERTClass()
model.to(device)

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [244]:
def loss_fn(outputs, targets):
    """Binary cross-entropy on raw logits, averaged over all elements."""
    bce_with_logits = torch.nn.functional.binary_cross_entropy_with_logits
    return bce_with_logits(outputs, targets)

In [245]:
# Adam over every model parameter (the BERT encoder is trained too, not only the linear layer)
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [246]:
def train(epoch):
    """Run one pass over ``training_loader``, updating ``model`` after every batch.

    Uses the module-level ``model``, ``training_loader``, ``optimizer``,
    ``loss_fn`` and ``device``.

    Args:
        epoch: Epoch number; used only in the progress message.
    """
    model.train()
    for step, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)

        # Report the loss of the first batch and of every 5000th batch after it
        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Clear gradients once per step (the original did this twice), then backprop and update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# Train: one full pass over the training loader per epoch
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  0.7036323547363281


In [None]:
def validation(epoch):
    """Run the model over ``testing_loader`` without gradient tracking.

    Args:
        epoch: Not used inside the function.

    Returns:
        ``(fin_outputs, fin_targets)``: two lists of per-sample lists, holding
        the sigmoid of the model outputs and the matching targets.
    """
    model.eval()
    fin_targets, fin_outputs = [], []
    with torch.no_grad():
        for batch in testing_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(ids, mask, token_type_ids)
            probabilities = torch.sigmoid(logits)

            # Move to CPU and convert to plain Python lists before accumulating
            fin_targets += targets.cpu().detach().numpy().tolist()
            fin_outputs += probabilities.cpu().detach().numpy().tolist()
    return fin_outputs, fin_targets

In [None]:
# Evaluate on the test loader and report accuracy and F1 scores.
# NOTE(review): validation is repeated once per epoch here although no training
# happens between iterations — confirm the loop is needed.
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    # Turn sigmoid outputs into boolean predictions using a 0.5 cut-off
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")