In [93]:
import pandas as pd
import csv
import json
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer #PorterStemmer
from nltk.tokenize import word_tokenize



# Download stopwords and initialize stemmer
nltk.download('stopwords')
nltk.download('punkt')
stopwords = set(stopwords.words('english'))
stemmer = LancasterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/reggie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/reggie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [94]:
# load data setof all countries, years and sources
data_file = '../data/all_countries_0.0.2.csv'
all_countries_data = pd.read_csv(data_file, dtype={'year': str}, comment='#')
# cast sentence column to string
all_countries_data['sentence'] = all_countries_data['sentence'].astype(str)

In [151]:
# load dictionary of topics
# strcuture [{"name": TOPIC_NAME, "words": NGRAMS_OF_KEYWORDS}, ...]
dict_file = '../data/dict_2.json'
dictionary = None
with open(dict_file, 'r') as file:
     dictionary = json.load(file)
    
def get_seed_lists(dictionary, ngram_size):
    # create list of topics wit max ngram_size
    seeds = []
    for topic in dictionary:
        seed = [w for w in topic["words"] if len(w.split()) <= ngram_size]
        seeds.append(seed)
    return seeds

In [152]:
# choose a country and year
country = ["sweden"]
year = "2020"
df = pd.DataFrame(all_countries_data)
country_data = df[(df['year'] == year) & (df['country'].isin(country))]
# reset index; needed for proper parsing by BERT
country_data = country_data.reset_index(drop=True)



In [153]:
def get_stemmed_tokens(data_list):
    all_tokens = []
  
    for text in data_list:
        text = re.sub(r'\d+', '', text)
        translator = str.maketrans('', '', string.punctuation)
        text = text.translate(translator)
        # Tokenize the sentence
        tokens = word_tokenize(text)

        # Remove stopwords and perform stemming
        filtered_tokens = [stemmer.stem(token) for token in tokens if token.lower() not in stopwords and token.isalnum()]
        all_tokens += filtered_tokens
    return all_tokens

In [154]:
# Preprocessed tokens (list of strings)
data_list = np.squeeze(country_data[["sentence"]].to_numpy())
tokens = get_stemmed_tokens(data_list)
seed_list = np.squeeze(get_seed_lists(dictionary, 1))
stemmed_seeds = []
for l in seed_list:
    seed_tokens = get_stemmed_tokens(l)
    stemmed_seeds.append(seed_tokens)
#print(stemmed_seeds)



Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.



In [157]:
def get_dict_counts(tokens, stemmed_seeds):
    counts = np.zeros(len(stemmed_seeds))
    for token in tokens:
        for i in range(len(stemmed_seeds)):
            if token in stemmed_seeds[i]:
                counts[i] += 1
    norms = [c/len(tokens) for c in counts]
    return (counts, norms)

counts, norms = get_dict_counts(tokens, stemmed_seeds)
print(norms)
print(counts)

[0.01383399209486166, 0.022480237154150196, 0.008399209486166008, 0.011610671936758894]
[56. 91. 34. 47.]


In [158]:
def get_vocab_counts(tokens):
    vocab_counts = {}
    for token in tokens:
        if token not in vocab_counts.keys():
            vocab_counts[token] = 1
        else:
            vocab_counts[token] += 1
    return dict(sorted(vocab_counts.items(), key=lambda x: x[1], reverse=True))

sorted_dict = get_vocab_counts(tokens)
print(sorted_dict)
    

{'swed': 119, 'govern': 58, 'court': 52, 'auth': 47, 'report': 43, 'law': 41, 'med': 41, 'nat': 40, 'publ': 37, 'parlia': 35, 'independ': 33, 'corrupt': 32, 'cas': 32, 'reg': 30, 'act': 30, 'commit': 26, 'right': 24, 'includ': 24, 'admin': 23, 'council': 23, 'also': 22, 'off': 22, 'constitut': 22, 'journ': 21, 'system': 20, 'legisl': 20, 'prev': 20, 'inquiry': 20, 'process': 20, 'freedom': 19, 'crim': 19, 'protect': 19, 'elect': 18, 'propos': 18, 'concern': 18, 'judg': 18, 'europ': 17, 'gen': 17, 'inform': 17, 'provid': 17, 'stat': 16, 'memb': 16, 'party': 16, 'spec': 16, 'anticorrupt': 16, 'rul': 16, 'leg': 15, 'crimin': 15, 'numb': 15, 'min': 14, 'democr': 14, 'rel': 14, 'press': 14, 'institut': 14, 'jud': 14, 'framework': 14, 'eu': 14, 'requir': 14, 'org': 13, 'pol': 13, 'judicy': 13, 'plac': 13, 'ag': 13, 'ombudsm': 13, 'addit': 13, 'cod': 13, 'review': 13, 'appoint': 12, 'howev': 12, 'form': 12, 'increas': 12, 'investig': 12, 'prosecut': 12, 'submit': 12, 'repres': 11, 'peopl': 11