In [1]:
import numpy as np
import pandas as pd
#import pyarrow
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print("Using device: " + device)

Using device: cpu


In [3]:
import urllib
import os
import json

# Corpus and dictionary files to use
#corpus_file = 'democracy_reports_corpus.csv'
corpus_file = '../../data/democracy_reports_corpus_annelisa_fixed.csv'
#dictionary_file = 'dimension_dictionary.json'
#corpus_file_url = "https://github.com/backdem/democracy-datasets/raw/main/democracy_reports_corpus.csv"
#dictionary_file_url = "https://raw.githubusercontent.com/backdem/democracy-datasets/main/dimension_dictionary.json"

# Download datsets if not already downloaded
#if not os.path.exists(corpus_file):
#    urllib.request.urlretrieve(corpus_file_url, corpus_file)
#if not os.path.exists(dictionary_file):
#    urllib.request.urlretrieve(dictionary_file_url, dictionary_file)

#def load_json_dict(dict_file):
#    with open(dict_file, 'r') as file:
#        dictionary = json.load(file)
#        dictionary.append({
#           'name': 'no_dimension',
#            'words': []
#        })
#        return dictionary


        

In [7]:
# Read csv file into Dataframe
df = pd.read_csv(corpus_file, dtype={'year': str},comment='#')
# Filter for labelled data
df_labelled = df[(df['dimension1'].notnull()) | (df['dimension2'].notnull()) | (df['dimension3'].notnull()) | (df['dimension4'].notnull())] 
# Print sample rows
print(df_labelled.sample(5))

                                              sentence   country  year  \
192  "The PP and its allies also took 130 directly ...     spain  2018   
516  "However, a significant portion of the general...  slovenia  2021   
402  "Several mass-casualty incidents were reported...   germany  2021   
107  "Police use force to break up unsanctioned pro...    turkey  2021   
450  "Parties winning less than 4 percent but more ...  slovenia  2021   

                          source            dimension1 dimension2 dimension3  \
192  freedomhouse_freedom-world"             electoral        NaN        NaN   
516  freedomhouse_freedom-world"  liberal institutions        NaN        NaN   
402  freedomhouse_freedom-world"             ambiguous        NaN        NaN   
107  freedomhouse_freedom-world"  liberal institutions        NaN        NaN   
450  freedomhouse_freedom-world"             electoral        NaN        NaN   

    dimension4 backsliding cat_4_sentence_nuance  \
192        NaN        

  df = pd.read_csv(corpus_file, dtype={'year': str},comment='#')


In [16]:
# Get label names
labels = df_labelled['dimension1'].unique()
print(f'Found labels: {labels}')
# Fix typos in labels
def find_replace_in_column(df, column_name, string_to_match, new_value):
    df.loc[df[column_name] == string_to_match, column_name] = new_value
    return df
df_labelled = find_replace_in_column(df_labelled, 'dimension1', 'media ', 'media')
df_labelled = find_replace_in_column(df_labelled, 'dimension1', 'electoral?', 'electoral')
df_labelled = find_replace_in_column(df_labelled, 'dimension1', 'liberal rights?', 'liberal rights')
df_labelled = find_replace_in_column(df_labelled, 'dimension1', 'liberal rights ', 'liberal rights')
df_labelled = find_replace_in_column(df_labelled, 'dimension1', 'liberal righ', 'liberal rights')
df_labelled = find_replace_in_column(df_labelled, 'dimension1', 'media l', 'media')
labels = sorted(df_labelled['dimension1'].unique())
print(f'Fixed labels: {labels}')


Found labels: ['electoral' 'liberal institutions' 'participatory' 'liberal rights'
 'ambiguous' 'media']
Fixed labels: ['ambiguous', 'electoral', 'liberal institutions', 'liberal rights', 'media', 'participatory']


In [22]:
# Create label vector column
# Label sentences with topics, producing 
# a masked label vector of the form [0, 0, 0, 0, 0, 1] where the 
# indicies match topics ['ambiguous', 'electoral', 'liberal institutions', 'liberal rights', 'media', 'participatory']
def generate_label_vector(row, l):
    label = row['dimension1']
    return [int(l == label) for l in labels]
    

#df_sentences_labels = pd.DataFrame(df['sentence'])
df_labelled['label_vector'] = df_labelled.apply(generate_label_vector, l=labels, axis=1)
print(df_labelled.sample(5))

                                              sentence   country  year  \
68   "During the trial, Turkish-Iranian businessman...    turkey  2021   
36   "The government has also resorted to arresting...    turkey  2021   
186  "Either the PP or the PSOE have typically held...     spain  2018   
481  "Some critics assert that the change is aimed ...  slovenia  2021   
169  "Private property rights are legally enshrined...    turkey  2021   

                          source            dimension1            dimension2  \
68   freedomhouse_freedom-world"  liberal institutions                   NaN   
36   freedomhouse_freedom-world"             electoral                   NaN   
186  freedomhouse_freedom-world"             electoral                   NaN   
481  freedomhouse_freedom-world"                 media  liberal institutions   
169  freedomhouse_freedom-world"  liberal institutions                   NaN   

    dimension3 dimension4 backsliding cat_4_sentence_nuance comments  \
68

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_labelled['label_vector'] = df_labelled.apply(generate_label_vector, l=labels, axis=1)


In [39]:
from transformers import BertTokenizer
FRACTION_OF_DS_TO_USE = 1
MAX_LEN = 256
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 3
LEARNING_RATE = 1e-05
MAX_LOSS = 0.02
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)

In [40]:
# Use part of the dataset
new_df = df_labelled.sample(frac=FRACTION_OF_DS_TO_USE, random_state=200)
# Display a sample of the dataset
new_df.sample(10)[['sentence', 'label_vector']]

Unnamed: 0,sentence,label_vector
448,"""The constitutional right to organize in diffe...","[0, 1, 0, 0, 0, 0]"
652,"""The Serb List has been accused of harassing r...","[0, 1, 0, 0, 0, 0]"
108,"""Pandemic-related rules on social distancing w...","[0, 0, 0, 0, 0, 1]"
82,"""The Committee to Protect Journalists reported...","[0, 0, 0, 0, 1, 0]"
234,"""Legal safeguards to ensure government transpa...","[0, 0, 1, 0, 0, 0]"
334,"""Germany is obligated to enhance legal protect...","[0, 0, 1, 0, 0, 0]"
441,"""The legislature is composed of the 40-seat Na...","[0, 1, 0, 0, 0, 0]"
316,"""The constitution gives all citizens age 18 or...","[0, 1, 0, 0, 0, 0]"
402,"""Several mass-casualty incidents were reported...","[1, 0, 0, 0, 0, 0]"
220,"""Elected officials are generally free to make ...","[0, 0, 1, 0, 0, 0]"


In [41]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.sentence
        self.targets = self.data.label_vector
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [42]:
train_size = 0.8
train_dataset=new_df.sample(frac=train_size,random_state=200)
test_dataset=new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (665, 15)
TRAIN Dataset: (532, 15)
TEST Dataset: (133, 15)


In [43]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [44]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [45]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [46]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [47]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        if loss.item() < MAX_LOSS:
            break
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss

In [None]:
# Train
for epoch in range(EPOCHS):
    loss = train(epoch)
    print(f'Epoch {epoch} done')
    if loss.item() < MAX_LOSS:
        break

Epoch: 0, Loss:  0.6473258137702942
Epoch 0 done
Epoch: 1, Loss:  0.46624812483787537


In [350]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [351]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.9306565382528487
F1 Score (Micro) = 0.9369202226345084
F1 Score (Macro) = 0.8762412181303967


In [409]:
# Test our inputs
# Choose a sentence
sample_row = new_df.sample()[['sentence', 'label']]
sample_text = sample_row['sentence'].values[0]

def get_dimension_from_prediction(v, dict=dimension_dictionary):
    dims = [dim["name"] for dim in dimension_dictionary]    
    index = v.index(max(v))    
    return dims[index]
    
# Encode the sentence
encoding = tokenizer.encode_plus(
  sample_text.lower(),
  add_special_tokens=True,
  max_length=MAX_LEN,
  padding='max_length',
  truncation=True,
  return_token_type_ids=True,
  return_tensors='pt'
)

input_ids = encoding["input_ids"]
mask = encoding["attention_mask"]
token_type_ids = encoding["token_type_ids"]
input_ids = input_ids.to(device, dtype=torch.long)
mask = mask.to(device, dtype=torch.long)
token_type_ids = token_type_ids.to(device, dtype=torch.long)

# Inference
output = model(input_ids, mask, token_type_ids)
predictions = output[0].tolist()
print(predictions)

[-2.3982105255126953, 1.8183958530426025, -2.7816433906555176, -2.939051628112793, -2.5525059700012207, -2.1630332469940186]


In [411]:
result_dim = get_dimension_from_prediction(predictions)
print(f'Statment "{sample_text}" was classified as: {result_dim}.')

Statment "Because of some limited repression against the political opposition and pro-democracy NGOs, the protests of 2011 and 2012 were not repeated in 2016." was classified as: participatory.


In [413]:
# Save model
torch.save(model, 'BERT_classifier_democracy.pth')
torch.save(model.state_dict(), 'BERT_classifier_democracy_state_dict.pth')