In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import string
import nltk
from nltk.corpus import stopwords
# Per-language stop-word sets taken from the NLTK stopwords corpus.
# The unpacking order on the left matches the corpus names on the right.
stopwords_en, stopwords_fr, stopwords_de, stopwords_nl, stopwords_pt = (
    set(stopwords.words(corpus_name))
    for corpus_name in ('english', 'french', 'german', 'dutch', 'portuguese')
)
from collections import defaultdict
import pickle
import re

In [2]:
# Folder with the raw per-language article dumps (articles_{lang}.txt); the
# wiki_train_{lang}_prep.txt / _unprep.txt outputs are written here as well.
dataset_folder = '../../wiki_data/'
# Folder with the per-language vocabulary files (wiki_train_{lang}_vocab.txt).
ctm_data_folder = '../contextualized_topic_models/data/wiki/'
# Scratch list; it is reassigned for every language when the corpora are loaded.
texts = []

In [9]:
# vocab is a file with one token per line
# we use vocabs consisting of the 5,000 most frequent tokens per-language
def load_vocab(vocab_file):
    """Read a vocabulary file into a set of tokens.

    Args:
        vocab_file: path to a text file holding one token per line.

    Returns:
        A set with every non-empty line of the file, stripped of surrounding
        whitespace. Duplicate lines collapse into a single entry.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # Explicit UTF-8: the vocabularies contain accented tokens, and the
    # platform default encoding is not guaranteed to decode them correctly.
    with open(vocab_file, 'r', encoding='utf-8') as vocab_txt:
        stripped_lines = (line.strip() for line in vocab_txt)
        # Blank lines would otherwise add a meaningless '' token to the vocab.
        return {token for token in stripped_lines if token}

# Per-language inputs, keyed by language code:
#   vocabs[lang]   -> set of vocabulary tokens
#   articles[lang] -> list of article texts, one per line of the corpus file
vocabs = {}
articles = {}
for lang in ('en', 'fr', 'de', 'pt', 'nl'):
    vocabs[lang] = load_vocab(ctm_data_folder + f'/wiki_train_{lang}_vocab.txt')
    # Explicit UTF-8: the corpora are multilingual, so relying on the platform
    # default encoding can corrupt or reject non-ASCII characters.
    with open(dataset_folder + f'/articles_{lang}.txt', 'r', encoding='utf-8') as corpus:
        texts = [line.strip() for line in corpus]
    # Keep only the last 120,000 lines of each corpus file.
    texts = texts[-120000:]
    articles[lang] = texts
    
def aligned_preprocessing(articles, vocabs, langs=('en', 'fr', 'de', 'pt', 'nl'), max_words=200):
    """Preprocess parallel corpora and keep only rows that survive in every language.

    For each language the documents are cleaned of backslashes, truncated to
    the first ``max_words`` space-separated words (this is the "unpreprocessed"
    version), then lower-cased, stripped of ASCII punctuation and filtered down
    to in-vocabulary words (the "preprocessed" version). Position ``i`` is
    treated as the same article across all languages; a position is kept only
    if its preprocessed text is non-empty in every language.

    The input ``articles`` dict and its lists are left untouched, so the
    function gives the same result when the notebook cell is re-run.

    Args:
        articles: dict mapping language code -> list of document strings.
        vocabs: dict mapping language code -> container of allowed tokens
            (membership is tested with ``in``).
        langs: language codes to process; each must be a key of both dicts.
        max_words: number of leading space-separated words kept per document.

    Returns:
        ``(new_corpora, new_unprepr_corpora)``: two ``defaultdict(list)``
        objects mapping language code -> list of documents, index-aligned with
        each other and across languages.

    Raises:
        KeyError: if a language in ``langs`` is missing from ``articles`` or
            ``vocabs``.
    """
    # Build the punctuation -> space table once instead of once per document.
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    unprepr_corpora, prepr_corpora = {}, {}
    for lang in langs:
        # NOTE(review): str.replace matches the literal 3-character sequence
        # backslash-n-asterisk, not a regex. A literal "\n" in the text therefore
        # becomes " n" after the second replace. Behaviour kept as-is so existing
        # outputs stay reproducible -- confirm whether a regex was intended.
        cleaned = [doc.replace("\\n*", " ").replace("\\", " ") for doc in articles[lang]]

        truncated = [' '.join(doc.split(" ")[:max_words]) for doc in cleaned]
        unprepr_corpora[lang] = truncated

        vocab = vocabs[lang]
        prepr_corpora[lang] = [
            ' '.join(w for w in doc.lower().translate(punct_to_space).split() if w in vocab)
            for doc in truncated
        ]

    new_corpora, new_unprepr_corpora = defaultdict(list), defaultdict(list)
    # zip stops at the shortest corpus, so trailing unaligned documents are dropped.
    for i, docs in enumerate(zip(*(prepr_corpora[lang] for lang in langs))):
        if all(len(doc) > 0 for doc in docs):
            for lang, doc in zip(langs, docs):
                new_corpora[lang].append(doc)
                new_unprepr_corpora[lang].append(unprepr_corpora[lang][i])

    return new_corpora, new_unprepr_corpora

preprocessed_corpora, unpreprocessed_corpora = aligned_preprocessing(articles, vocabs)

# Quick visual check: for each language, show the first five preprocessed
# documents followed by the first five unpreprocessed ones.
for lang in ('en', 'fr', 'de', 'pt', 'nl'):
    for corpora_by_lang in (preprocessed_corpora, unpreprocessed_corpora):
        print(corpora_by_lang[lang][:5])

['article history republic macedonia history wider region see history macedonia region antiquity territory republic macedonia included kingdom populated people origins also parts ancient inhabited various peoples populated ancient greek tribes none fixed boundaries sometimes subject kings sometimes broke away bc philip ii conquered upper macedonia including northern part southern lie within republic macedonia philip son alexander great conquered remainder region empire romans included republic province macedonia northernmost parts lay time subdivided republic split macedonia medieval period period area divided line populated', 'spring one four conventional temperate seasons following winter summer various technical definitions spring local usage term varies according local climate cultures customs spring northern hemisphere autumn southern hemisphere spring days approximately hours long day length increasing season spring refer season also ideas subtropical tropical areas better descri

In [10]:
# Write the last 100,000 aligned documents of each language to disk, one
# document per line: *_prep.txt holds the preprocessed text and *_unprep.txt
# the matching unpreprocessed text.
for lang in ('en', 'fr', 'de', 'pt', 'nl'):
    outputs = (('prep', preprocessed_corpora), ('unprep', unpreprocessed_corpora))
    for suffix, corpora_by_lang in outputs:
        # Explicit UTF-8 so non-ASCII characters are written the same way on
        # every platform, regardless of the default locale encoding.
        out_path = dataset_folder + f'/wiki_train_{lang}_{suffix}.txt'
        with open(out_path, 'w', encoding='utf-8') as out_file:
            for doc in corpora_by_lang[lang][-100000:]:
                out_file.write(doc.strip() + "\n")