In [8]:
import tensorflow as tf
import torch

In [9]:
# Choose the compute device once: CUDA when PyTorch reports one, CPU otherwise.
gpu_found = torch.cuda.is_available()
device = torch.device("cuda" if gpu_found else "cpu")

# Report what was picked.
if gpu_found:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')


No GPU available, using the CPU instead.


In [22]:
import os
# `import urllib` alone does not load the `request` submodule; import it explicitly
# so the download does not depend on some other library having imported it first.
import urllib.request

# Corpus and dictionary files to use
corpus_file = 'democracy_reports_corpus.csv'
corpus_file_url = "https://github.com/backdem/democracy-datasets/raw/main/democracy_reports_corpus.csv"

# Download the dataset if it is not already present.
# The file is fetched under a temporary name and renamed only on success, so an
# interrupted download never leaves a truncated file that the existence check
# would mistake for the complete corpus on the next run.
if not os.path.exists(corpus_file):
    partial_file = corpus_file + '.part'
    urllib.request.urlretrieve(corpus_file_url, partial_file)
    os.replace(partial_file, corpus_file)

In [30]:
import pandas as pd
import pyarrow

# Load the corpus CSV into a DataFrame, keeping 'year' and 'sentence' as strings.
# NOTE(review): comment='#' tells pandas not to parse the remainder of a line
# after '#' - confirm no sentence in the corpus contains that character.
column_types = {'year': str, 'sentence': str}
df = pd.read_csv(corpus_file, comment='#', dtype=column_types)
# Show only the first record
print(df.head(1))

                                            sentence  \
0  The president is directly elected for up to tw...   

                                             section country  year  \
0  ['Political Rights', 'Electoral Process', 'Was...  turkey  2021   

                       source  
0  freedomhouse_freedom-world  


In [110]:
# Use first 2 section names as label.
# `section` is a stringified list, e.g. "['Political Rights', 'Electoral Process', ...]":
# drop the brackets, then split on commas into one column per section name.
# regex=False makes the literal replacement explicit: '[' is a regex
# metacharacter and the default of `regex` differs between pandas versions.
s = (
    df['section']
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
    .str.split(',', expand=True)
)


def _strip_quotes_and_spaces(names):
    """Remove every single quote and every space from each string in `names`."""
    return names.str.replace("'", '', regex=False).str.replace(" ", '', regex=False)


# Concatenate the first two cleaned names; the result overwrites column 0.
# A row with fewer than two names yields NaN here because the missing part is null.
s[0] = _strip_quotes_and_spaces(s[0]) + _strip_quotes_and_spaces(s[1])

# Rows that ended up without a label (NaN above) get the "Nothing" label
df_labels = s[0].fillna("Nothing")

In [111]:
# Inspect the derived labels; pandas abbreviates a long Series to its first and last rows.
print(df_labels)

0         PoliticalRightsElectoralProcess
1         PoliticalRightsElectoralProcess
2         PoliticalRightsElectoralProcess
3         PoliticalRightsElectoralProcess
4         PoliticalRightsElectoralProcess
                       ...               
460723                            Nothing
460724                            Nothing
460725                            Nothing
460726                            Nothing
460727                            Nothing
Name: 0, Length: 460728, dtype: object


In [112]:
# Encode each distinct string label as an integer code
label_codes = pd.factorize(df_labels)[0]
# Assemble the sentence, its string label and its numeric label side by side
df_sentences_labels = df[['sentence']].assign(label_str=df_labels, label=label_codes)



In [119]:
# From here onwards `df` refers to the sentence/label frame
df = df_sentences_labels

# Show five randomly chosen rows that carry label 0
df.loc[df['label'] == 0, ['sentence', 'label']].sample(5)

Unnamed: 0,sentence,label
28391,The nationalist Alliance for the Union of Roma...,0
26216,Members of the Electoral Commission (EC) are a...,0
14993,"The GPS made gains, while the SP sustained los...",0
27621,The nationalist Alliance for the Union of Roma...,0
23698,Parliament maintains a direct role in electora...,0


In [126]:
from transformers import BertTokenizer
# Load the pretrained 'bert-base-uncased' tokenizer with lower-casing enabled
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# Store each sentence's token list in a new 'tokenized' column
df['tokenized'] = df['sentence'].map(tokenizer.tokenize)


In [139]:
# Keep rows whose sentence has fewer than 510 tokens and whose label is not "Nothing"
short_enough = df['tokenized'].str.len() < 510
has_label = df['label_str'] != 'Nothing'
df = df[short_enough & has_label]

# Plain arrays of the remaining sentences and their numeric labels
sentences = df['sentence'].values
labels = df['label'].values

# Tokenize the first sentence once and reuse the result for both printouts
first_tokens = tokenizer.tokenize(sentences[0])

# Print the original sentence.
print(' Original: ', sentences[0])

# Print the sentence split into tokens.
print('Tokenized: ', first_tokens)

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(first_tokens))

 Original:  The president is directly elected for up to two five-year terms, but is eligible to run for a third term if the parliament calls for early elections during the president’s initial terms.
Tokenized:  ['the', 'president', 'is', 'directly', 'elected', 'for', 'up', 'to', 'two', 'five', '-', 'year', 'terms', ',', 'but', 'is', 'eligible', 'to', 'run', 'for', 'a', 'third', 'term', 'if', 'the', 'parliament', 'calls', 'for', 'early', 'elections', 'during', 'the', 'president', '’', 's', 'initial', 'terms', '.']
Token IDs:  [1996, 2343, 2003, 3495, 2700, 2005, 2039, 2000, 2048, 2274, 1011, 2095, 3408, 1010, 2021, 2003, 7792, 2000, 2448, 2005, 1037, 2353, 2744, 2065, 1996, 3323, 4455, 2005, 2220, 3864, 2076, 1996, 2343, 1521, 1055, 3988, 3408, 1012]


In [None]:
# Tokenize all of the sentences and map the tokens to their word IDs.
# Each sentence yields one tensor of IDs and one attention mask; both are
# collected in lists.
input_ids = []
attention_masks = []
for sent in sentences:
    # Call the tokenizer directly: `encode_plus` is deprecated in favour of `__call__`.
    encoded_dict = tokenizer(
        sent,
        add_special_tokens=True,     # add the model's special start/end tokens
        padding='max_length',        # pad every sentence to the same length
        truncation=True,             # and never exceed it, so all rows have equal length
        return_attention_mask=True,  # mask distinguishes real tokens from padding
        return_tensors='pt',         # return PyTorch tensors
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

In [138]:
# Join each list of per-sentence tensors along the first dimension,
# and turn the label array into a tensor as well.
input_ids, attention_masks = (
    torch.cat(pieces, dim=0) for pieces in (input_ids, attention_masks)
)
labels = torch.tensor(labels)

# Show the first sentence next to its encoded ID sequence.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Original:  The president is directly elected for up to two five-year terms, but is eligible to run for a third term if the parliament calls for early elections during the president’s initial terms.
Token IDs: tensor([ 101, 1996, 2343, 2003, 3495, 2700, 2005, 2039, 2000, 2048, 2274, 1011,
        2095, 3408, 1010, 2021, 2003, 7792, 2000, 2448, 2005, 1037, 2353, 2744,
        2065, 1996, 3323, 4455, 2005, 2220, 3864, 2076, 1996, 2343, 1521, 1055,
        3988, 3408, 1012,  102,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    