In [213]:
import numpy as np
import pandas as pd
import pyarrow
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

In [214]:
from torch import cuda

# Run on the GPU when CUDA reports one, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Using device: " + device)

Using device: cpu


In [309]:
import json
import os
# `import urllib` alone does not load the `request` submodule; import it
# explicitly so `urllib.request.urlretrieve` works on a fresh kernel.
import urllib.request

# Corpus and dictionary files to use
corpus_file = 'democracy_reports_corpus.csv'
dictionary_file = 'dimension_dictionary.json'
corpus_file_url = "https://github.com/backdem/democracy-datasets/raw/main/democracy_reports_corpus.csv"
dictionary_file_url = "https://raw.githubusercontent.com/backdem/democracy-datasets/main/dimension_dictionary.json"

# Download datasets if not already downloaded (existing local copies are reused as-is)
if not os.path.exists(corpus_file):
    urllib.request.urlretrieve(corpus_file_url, corpus_file)
if not os.path.exists(dictionary_file):
    urllib.request.urlretrieve(dictionary_file_url, dictionary_file)

def load_json_dict(dict_file):
    """Load the dimension dictionary from a JSON file and add a fallback entry.

    Args:
        dict_file: path to a JSON file whose top-level value is a list
            (entries are later read through their 'name' and 'words' keys).

    Returns:
        The parsed list with an extra ``{'name': 'no_dimension', 'words': []}``
        entry appended as the last element.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(dict_file, 'r', encoding='utf-8') as file:
        dictionary = json.load(file)
    # Catch-all topic for sentences that match no keyword.
    dictionary.append({
        'name': 'no_dimension',
        'words': []
    })
    return dictionary

# Load the dimension dictionary and report which topic names it defines.
dimension_dictionary = load_json_dict(dictionary_file)
print('Found topics: {}'.format([dim["name"] for dim in dimension_dictionary]))
def generate_label_vector(sentence, dict=None):
    """Label a sentence with the first dictionary dimension whose keyword it contains.

    Naive approach: dimensions are tried in dictionary order and the first one
    with a keyword occurring in the sentence (case-insensitive substring match)
    wins. The result is a one-hot style vector such as [0, 0, 0, 0, 0, 1] whose
    indices follow the order of the dictionary entries, e.g.
    ['electoral', 'participatory', 'media', 'liberal_institution',
    'liberal_rights', 'no_dimension'].

    Args:
        sentence: text to label. Non-string values (for example NaN coming from
            a pandas column) are treated as matching no keyword.
        dict: list of ``{'name': ..., 'words': [...]}`` entries. When omitted,
            the module-level ``dimension_dictionary`` is looked up at call time.
            (The name shadows the builtin; it is kept so keyword callers still work.)

    Returns:
        A list of ints with one position per dictionary entry; the position of
        the matched entry (or of 'no_dimension' when nothing matched) is 1.
    """
    dimensions = dimension_dictionary if dict is None else dict
    # Lower-case once instead of once per keyword.
    text = sentence.lower() if isinstance(sentence, str) else ''
    matched_dim = 'no_dimension'
    for dim in dimensions:
        # Skip empty keywords: '' is a substring of every string and would
        # otherwise make this dimension match every sentence.
        if any(w and w.lower() in text for w in dim['words']):
            matched_dim = dim['name']
            break
    return [int(dim['name'] == matched_dim) for dim in dimensions]
        

Found topics: ['electoral', 'participatory', 'media', 'liberal_institution', 'liberal_rights', 'no_dimension']


In [310]:
# Load the corpus CSV into a DataFrame, forcing 'year' and 'sentence' to be
# read as strings; '#' is passed to pandas as the comment character.
df = pd.read_csv(
    corpus_file,
    comment='#',
    dtype={'year': str, 'sentence': str},
)
# Show five randomly sampled rows
print(df.sample(5))

                                                 sentence section  country  \
424987  for those reasons, greco is of the firm opinio...    none  hungary   
449753  no official study has been made of the extent ...    none   greece   
429339  the code was debated upon in three joint sessi...    none   greece   
349006  su has a separate budget from the general budg...    none   sweden   
311493                            v) privatisation agency    none   latvia   

        year source  
424987  2008  greco  
449753  2001  greco  
429339  2017  greco  
349006  2018  greco  
311493  2002  greco  


In [311]:
# Build the modelling frame: one row per sentence with a readable topic name
# ('label_str') and the matching label vector ('label').
# NOTE(review): the original cell inserted a variable `df_labels` that is not
# defined anywhere in this notebook, so it failed on Restart & Run All. Labels
# are now derived from the sentence text itself — confirm this matches what
# `df_labels` used to contain.
_topic_names = [dim['name'] for dim in dimension_dictionary]
# Missing sentences are labelled as empty text so the keyword matcher is never
# handed a NaN.
_label_vectors = df['sentence'].fillna('').apply(generate_label_vector)
df_sentences_labels = pd.DataFrame({
    'sentence': df['sentence'],
    'label_str': _label_vectors.apply(
        lambda vec: _topic_names[vec.index(1)] if 1 in vec else 'no_dimension'
    ),
    'label': _label_vectors,
})


In [319]:
from transformers import BertTokenizer

# --- Experiment configuration ---
FRACTION_OF_DS_TO_USE = 0.1   # share of the labelled corpus that is sampled for this run
MAX_LEN = 256                 # max_length handed to the tokenizer (pad/truncate target)
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-5

# Lower-casing tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [320]:
# Draw a fixed-seed random subset of the labelled sentences to work with
new_df = df_sentences_labels.sample(
    frac=FRACTION_OF_DS_TO_USE,
    random_state=200,
)
# Preview ten random rows (text and label vector only)
new_df[['sentence', 'label']].sample(10)

Unnamed: 0,sentence,label
22470,"In practice, each has links to the ruling part...","[0, 0, 0, 0, 0, 1]"
249126,the functions of judge and prosecutor are inco...,"[0, 0, 0, 0, 0, 1]"
194272,Particularly during the period prior to the 20...,"[0, 0, 0, 1, 0, 0]"
247373,the right to institute the procedure becomes s...,"[0, 0, 0, 0, 0, 1]"
215569,the salaries of ptef are of a public nature an...,"[0, 0, 0, 0, 0, 1]"
257751,"finally, the get met with representatives of t...","[0, 0, 0, 0, 0, 1]"
233115,the prosecutor has the right to decide on the ...,"[0, 0, 0, 0, 0, 1]"
262432,"this concerns in particular, the insufficientl...","[0, 0, 0, 0, 0, 1]"
235385,(2) if with the crime from item 1 a larger pro...,"[0, 0, 0, 0, 0, 1]"
143376,"The next prime minister, Andris Ðíçle 1999 200...","[0, 0, 0, 0, 0, 1]"


In [321]:
class CustomDataset(Dataset):
    """Dataset that tokenizes one sentence per item and pairs it with its label vector.

    Args:
        dataframe: frame with a ``sentence`` column (text) and a ``label``
            column (one numeric sequence per row).
        tokenizer: object exposing ``encode_plus(...)`` that returns a mapping
            with 'input_ids', 'attention_mask' and 'token_type_ids'.
        max_len: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.sentence
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Return the tokenized sentence and float label tensor at position ``index``.

        Rows are addressed by position (``.iloc``), so the dataset also works
        when the frame's index is not a fresh 0..n-1 range (e.g. after
        ``sample``/``drop`` without ``reset_index``).
        """
        comment_text = str(self.comment_text.iloc[index])
        # Collapse every run of whitespace into a single space.
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

In [322]:
# Hold out 20% of the sampled rows for testing (fixed seed keeps the split stable)
train_size = 0.8
train_dataset = new_df.sample(frac=train_size, random_state=200)
# Everything not drawn for training becomes the test set; both frames get a
# fresh 0..n-1 index.
test_dataset = new_df.drop(index=train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print(f"FULL Dataset: {new_df.shape}")
print(f"TRAIN Dataset: {train_dataset.shape}")
print(f"TEST Dataset: {test_dataset.shape}")

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (46073, 3)
TRAIN Dataset: (36858, 3)
TEST Dataset: (9215, 3)


In [323]:
# DataLoader settings: both loaders shuffle and load in the main process
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [324]:
# Customized model: a dropout and a dense layer on top of BERT's pooled output
# produce the final per-label logits.

class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        num_labels: number of output logits (default 6, as before).
        dropout: dropout probability applied to the pooled output (default 0.3).
        pretrained_name: checkpoint name passed to ``BertModel.from_pretrained``
            (default 'bert-base-uncased').
    """

    def __init__(self, num_labels=6, dropout=0.3, pretrained_name='bert-base-uncased'):
        super().__init__()
        self.l1 = transformers.BertModel.from_pretrained(pretrained_name)
        self.l2 = torch.nn.Dropout(dropout)
        # Size the head from the encoder's own config rather than hard-coding 768.
        self.l3 = torch.nn.Linear(self.l1.config.hidden_size, num_labels)

    def forward(self, ids, mask, token_type_ids):
        """Return the raw logits (no sigmoid applied) for a batch of token ids."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element is used here.
        _, output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

# Build the classifier and move it to the selected device (`.to` returns the
# module itself); the bare expression displays the module structure.
model = BERTClass().to(device)
model

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [325]:
def loss_fn(outputs, targets):
    """Return the BCE-with-logits loss between raw model outputs and float targets."""
    criterion = torch.nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)

In [326]:
# Adam over every model parameter (encoder and classification head alike)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [327]:
def train(epoch, log_every=5000):
    """Run one pass over ``training_loader``, updating ``model`` with ``optimizer``.

    Args:
        epoch: epoch number, used only in the progress message.
        log_every: print the current batch loss every this many steps
            (step 0 is always printed). Defaults to 5000.
    """
    model.train()
    for step, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if step % log_every == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Clear gradients once per step, right before back-propagation.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# Train: run one pass over the training data per configured epoch
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  0.6795785427093506


In [None]:
def validation(epoch):
    """Run the model over ``testing_loader`` without gradient tracking.

    Args:
        epoch: accepted for symmetry with ``train``; not used in the body.

    Returns:
        ``(fin_outputs, fin_targets)``: two lists of per-sample lists, holding
        the sigmoid of the model outputs and the target vectors respectively,
        in the order the loader yielded them.
    """
    model.eval()
    fin_targets, fin_outputs = [], []
    with torch.no_grad():
        for data in testing_loader:
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)

            logits = model(ids, mask, token_type_ids)
            probabilities = torch.sigmoid(logits)

            fin_targets += targets.cpu().tolist()
            fin_outputs += probabilities.cpu().tolist()
    return fin_outputs, fin_targets

In [None]:
# Score the model on the test loader: probabilities are thresholded at 0.5 per
# label before accuracy and F1 are computed.
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.asarray(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print("Accuracy Score = {}".format(accuracy))
    print("F1 Score (Micro) = {}".format(f1_score_micro))
    print("F1 Score (Macro) = {}".format(f1_score_macro))