In [33]:
import json
import os
import urllib.request

import torch
from transformers import BertTokenizer, BertModel, BertConfig

In [34]:
# Inference settings shared by the cells below.
MAX_LEN = 256   # token length every encoded sentence is padded / truncated to
device = 'cpu'  # torch device the input tensors are moved to before inference

# Tokenizer for the 'bert-base-uncased' checkpoint, configured to lower-case its input.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [50]:
# Dimension dictionary: local cache path and the URL it is downloaded from.
dictionary_file = 'dimension_dictionary.json'
dictionary_file_url = "https://raw.githubusercontent.com/backdem/democracy-datasets/main/dimension_dictionary.json"

# Download only when no local copy exists. The payload is first written to a
# temporary '.part' file and then moved into place, so an interrupted download
# can never leave a truncated file that later runs would mistake for a valid
# cached dictionary. (urllib.request is imported in the notebook's import cell.)
if not os.path.exists(dictionary_file):
    partial_file = dictionary_file + '.part'
    urllib.request.urlretrieve(dictionary_file_url, partial_file)
    os.replace(partial_file, dictionary_file)

def load_json_dict(dict_file):
    """Load the dimension dictionary from a JSON file and add a fallback entry.

    Args:
        dict_file: Path of the JSON file to read. Its top-level value must be
            a list, because the result is extended with ``append``.

    Returns:
        The parsed list with one extra trailing entry
        ``{'name': 'no_dimension', 'words': []}``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8 by specification; decode it explicitly instead of
    # relying on the platform's default encoding.
    with open(dict_file, 'r', encoding='utf-8') as file:
        dictionary = json.load(file)

    # Catch-all class for sentences that match none of the listed dimensions.
    dictionary.append({
        'name': 'no_dimension',
        'words': []
    })
    return dictionary

# Load the dimension dictionary once for the rest of the notebook;
# load_json_dict appends a trailing 'no_dimension' entry to the parsed list.
dimension_dictionary = load_json_dict(dictionary_file)

In [51]:
# Create the customized model by adding a dropout and a dense layer on top of BERT to produce the final output of the model.

class BERTClass(torch.nn.Module):
    """'bert-base-uncased' encoder followed by dropout and a linear head.

    The head maps a 768-wide vector to 6 output scores (presumably one per
    classification dimension -- TODO confirm against the training notebook).
    Attribute names ``l1``/``l2``/``l3`` are kept unchanged so previously
    saved instances and state dicts still match.
    """

    def __init__(self):
        super().__init__()
        # Use the BertModel name imported at the top of the notebook; the bare
        # `transformers` module is never imported, so `transformers.BertModel`
        # raised NameError on a fresh kernel.
        self.l1 = BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)

    def forward(self, ids, mask, token_type_ids):
        """Return the classification scores for a batch of encoded inputs.

        Args:
            ids: Token id tensor passed to the encoder.
            mask: Attention mask passed to the encoder as ``attention_mask``.
            token_type_ids: Segment id tensor passed to the encoder.

        Returns:
            The output of the linear head applied to the (dropout-regularised)
            second element of the encoder's output tuple.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element is used (the pooled representation per the Transformers docs).
        _, pooled_output = self.l1(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        dropped = self.l2(pooled_output)
        return self.l3(dropped)

In [52]:
# SECURITY: torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints that come from a trusted source.
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU machine still loads on this CPU-only setup.
# NOTE(review): the file presumably holds a whole pickled BERTClass instance
# (hence the class definition above) -- recent PyTorch releases default to
# weights_only=True and may need weights_only=False here; confirm against the
# installed version.
model = torch.load('BERT_classifier_democracy.pth', map_location=device)

In [53]:
def get_dimension_from_prediction(v, dictionary=None):
    """Map a vector of per-dimension scores to the best-scoring dimension name.

    Defined at module level (it used to be nested inside ``predict_sentence``)
    so that other notebook cells can call it.

    Args:
        v: Sequence of scores, one per dictionary entry, in dictionary order.
        dictionary: List of entries that each have a ``"name"`` key. Defaults
            to the notebook-level ``dimension_dictionary``.

    Returns:
        The ``"name"`` of the entry at the position of the highest score; on
        ties the first highest score wins.

    Raises:
        ValueError: If ``v`` is empty.
        IndexError: If the best score's position has no dictionary entry.
    """
    if dictionary is None:
        # Resolved at call time so this function can be defined before the
        # dictionary has been loaded.
        dictionary = dimension_dictionary
    scores = list(v)
    if not scores:
        raise ValueError("prediction vector is empty")
    dims = [dim["name"] for dim in dictionary]
    return dims[scores.index(max(scores))]


def predict_sentence(sentence, model, max_length=256):
    """Run the classifier on one sentence and return its output scores.

    Relies on the notebook-level ``tokenizer`` and ``device``.

    Args:
        sentence: Text to classify.
        model: Callable torch module taking ``(input_ids, mask, token_type_ids)``.
            It is switched to evaluation mode by this call.
        max_length: Length the encoded sentence is padded / truncated to.

    Returns:
        The first row of the model output converted to a plain Python list.
    """
    # Encode the sentence that was passed in (not a notebook-level variable).
    encoding = tokenizer.encode_plus(
        sentence.lower(),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_token_type_ids=True,
        return_tensors='pt'
    )

    input_ids = encoding["input_ids"].to(device, dtype=torch.long)
    mask = encoding["attention_mask"].to(device, dtype=torch.long)
    token_type_ids = encoding["token_type_ids"].to(device, dtype=torch.long)

    # Inference: evaluation mode disables dropout so predictions are
    # deterministic, and no_grad avoids building an autograd graph.
    model.eval()
    with torch.no_grad():
        output = model(input_ids, mask, token_type_ids)
    return output[0].tolist()

In [54]:
# Demo: classify one example sentence and print the predicted dimension.
sample_text = "Because of some limited repression against the political opposition and pro-democracy NGOs, the protests of 2011 and 2012 were not repeated in 2016."
# predict_sentence returns the model's first output row as a plain list of scores.
predictions = predict_sentence(sample_text, model, MAX_LEN)
# NOTE(review): get_dimension_from_prediction must be defined at module scope
# for this call to resolve on a fresh kernel.
result_dim = get_dimension_from_prediction(predictions)
print(f'Statment "{sample_text}" was classified as: {result_dim}.')

Statment "Because of some limited repression against the political opposition and pro-democracy NGOs, the protests of 2011 and 2012 were not repeated in 2016." was classified as: participatory.
