In [None]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
import nltk

# Minimum number of occurrences a token needs in the corpus to be kept as its
# own vocabulary entry (getVocabulary keeps tokens with count >= this value).
MIN_VOCAB_COUNT = 8
# Placeholder token: getVocabulary stores the pooled count of the dropped
# tokens under this key, and removeOOV substitutes it for every word that is
# not in the vocabulary.
OOV_TOKEN = "UNK"

# load doc into memory
def load_doc(filename):
    """Read a whole UTF-8 text file and return its contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The complete file contents as a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises (for
    # example on a decoding error); the manual close() was skipped in that case.
    with open(filename, mode='r', encoding='utf-8') as file:
        return file.read()

def RepresentsInt(s):
    """Tell whether ``int(s)`` succeeds.

    Returns True when the conversion works and False when it raises
    ValueError. Any other exception raised by ``int`` is not caught.
    """
    try:
        int(s)
    except ValueError:
        return False
    else:
        return True

# Number detection. Numbers may be written like 100, 1090,200 or 2.123, so
# punctuation is stripped first and the remainder is tested as an integer.
def isNumber(word):
    """Return True if ``word`` is an integer once punctuation is removed.

    Every character that is neither a word character nor whitespace is
    deleted, and the result is passed to ``RepresentsInt``.
    """
    stripped = re.sub(r'[^\w\s]', '', word)
    return bool(RepresentsInt(stripped))

#tokenizes raw strings
def getTokenized(corpus):
    """Split a '~~'-delimited corpus into sentences and tokenize each one.

    Each sentence is tokenized with ``nltk.word_tokenize``. Tokens that are
    exactly one punctuation character (a member of ``string.punctuation``)
    are dropped; every other token is lower-cased, and tokens recognised by
    ``isNumber`` are replaced by the placeholder "###".

    Args:
        corpus: Raw text whose sentences are separated by "~~".

    Returns:
        A list with one list of token strings per sentence, in corpus order.
        A sentence without any surviving token yields an empty list.
    """
    lines = corpus.strip().split('~~')
    # A corpus that ends with the delimiter leaves one empty trailing segment;
    # drop only that. The previous unconditional lines[:-1] silently discarded
    # the last sentence whenever the corpus did not end with '~~'.
    if not lines[-1]:
        lines.pop()
    exclude = set(string.punctuation)
    words_list = []
    for line in lines:
        tokens = []
        for word in nltk.word_tokenize(line.strip()):
            # NOTE(review): only exact single-character matches are skipped;
            # multi-character tokens such as "..." are kept. Confirm intended.
            if word in exclude:
                continue
            word = word.lower()
            if isNumber(word):
                word = "###"
            tokens.append(word)
        # Appended even when empty so that sentence positions are preserved
        # for callers that pair sentences by index.
        words_list.append(tokens)
    return words_list


# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: The object to serialise (any picklable value).
        filename: Path of the output file; it is opened in binary write mode,
            so an existing file is overwritten.

    Raises:
        OSError: If the file cannot be opened for writing.
        pickle.PicklingError: If ``sentences`` cannot be pickled.
    """
    # Use a context manager so the file is flushed and closed before the
    # confirmation is printed; the inline open() was never closed explicitly.
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)

def getVocabulary(tokenized_corpus, min_count=None, oov_token=None):
    """Count tokens and build a frequency-thresholded vocabulary.

    Args:
        tokenized_corpus: Iterable of sentences, each an iterable of tokens.
        min_count: Tokens occurring at least this many times are kept.
            Defaults to the module constant ``MIN_VOCAB_COUNT``.
        oov_token: Key under which the counts of all dropped tokens are
            pooled. Defaults to the module constant ``OOV_TOKEN``.

    Returns:
        A tuple ``(vocab, word2id, id2word)``:
        ``vocab`` maps each kept token (plus ``oov_token``) to its count,
        ``word2id`` maps token -> integer id and ``id2word`` is the inverse.
        Ids follow the insertion order of ``vocab``. ``oov_token`` is always
        present, with count 0 when nothing was dropped.
    """
    # Resolve the defaults at call time so that changing the notebook-level
    # constants takes effect without redefining this function.
    if min_count is None:
        min_count = MIN_VOCAB_COUNT
    if oov_token is None:
        oov_token = OOV_TOKEN

    counts = {}
    for sentence in tokenized_corpus:
        for token in sentence:
            counts[token] = counts.get(token, 0) + 1

    new_dict = {}
    oov_count = 0
    # remove infrequent words, pooling their counts
    for word, count in counts.items():
        if count >= min_count:
            new_dict[word] = count
        else:
            oov_count += count

    # Merge rather than overwrite: if the corpus already contains oov_token as
    # a frequent token (e.g. the output of removeOOV is fed back in), its own
    # count must not be lost.
    new_dict[oov_token] = new_dict.get(oov_token, 0) + oov_count

    word2id = {w: idx for idx, w in enumerate(new_dict)}
    id2word = dict(enumerate(new_dict))
    return new_dict, word2id, id2word

def removeOOV(sentences, vocab, oov_token=None):
    """Replace every word that is not in ``vocab`` with the OOV placeholder.

    Args:
        sentences: Iterable of sentences, each an iterable of words.
        vocab: Container supporting ``in`` (e.g. the dict returned by
            getVocabulary); membership decides which words are kept.
        oov_token: Replacement for unknown words. Defaults to the module
            constant ``OOV_TOKEN``.

    Returns:
        A new list of new lists with the same shape as ``sentences``; the
        input is not modified.
    """
    # Resolved at call time so a changed OOV_TOKEN constant is picked up.
    if oov_token is None:
        oov_token = OOV_TOKEN
    return [
        [word if word in vocab else oov_token for word in sentence]
        for sentence in sentences
    ]

In [None]:
# Legacy loader for a single tab-separated pairs file, kept for reference only.
# NOTE(review): it calls to_pairs and clean_pairs, which are not defined in the
# cells shown here, so it will not run if uncommented as-is.
# # load dataset
# filename = 'data/deu.txt'
# doc = load_doc(filename)
# # split into english-german pairs
# pairs = to_pairs(doc)
# # clean sentences
# clean_pairs = clean_pairs(pairs)
# # save clean pairs to file
# save_clean_data(clean_pairs, 'english-german.pkl')
# # spot check
# for i in range(100):
#     print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

# Read the English and German corpus files completely into memory as strings.
eng = load_doc('data/news_2.en')
ger = load_doc('data/news_2.de')

In [None]:
# Tokenize the English corpus into one token list per '~~'-separated sentence.
tokenized_eng = getTokenized(eng)

In [None]:
# Tokenize the German corpus into one token list per '~~'-separated sentence.
tokenized_ger = getTokenized(ger)

In [None]:
# Keep the German vocabulary under its own names: the English cell that
# follows assigns `vocabulary`, `word2id` and `id2word`, which previously
# overwrote these German results before they could be used.
vocabulary_ger, word2id_ger, id2word_ger = getVocabulary(tokenized_ger)
# NOTE(review): OOV replacement is left disabled for German, as in the
# original. While it stays disabled, German sentences never contain the OOV
# placeholder, so any later "no UNK" filtering is effectively English-only.
# tokenized_ger = removeOOV(tokenized_ger, vocabulary_ger)

In [6]:
# Build the English vocabulary, then replace every English token that is not
# in it with OOV_TOKEN.
# NOTE: the second line rebinds tokenized_eng to its own output, so running
# this cell again recomputes the vocabulary from already-replaced tokens.
vocabulary, word2id, id2word = getVocabulary(tokenized_eng)
tokenized_eng = removeOOV(tokenized_eng, vocabulary)

In [7]:
# Sentence counts of both corpora (German first). The pairing cells below index
# both lists with the same position, so the two numbers are expected to match.
print(len(tokenized_ger), len(tokenized_eng))

216190 216190


In [8]:
# Pair each English sentence with the German sentence at the same position.
pairs = [
    [tokenized_eng[idx], tokenized_ger[idx]]
    for idx in range(len(tokenized_eng))
]

In [22]:
# Keep only the sentence pairs in which neither side contains the OOV
# placeholder, then pickle them. The placeholder is taken from OOV_TOKEN
# instead of a second hard-coded 'UNK', so the filter cannot drift out of sync
# with removeOOV if the constant is changed.
# NOTE(review): getTokenized lower-cases its tokens, so the German-side check
# only has an effect if removeOOV has also been applied to tokenized_ger.
pairs_new = [
    [tokenized_eng[i], tokenized_ger[i]]
    for i in range(len(tokenized_eng))
    if OOV_TOKEN not in tokenized_eng[i] and OOV_TOKEN not in tokenized_ger[i]
]
save_clean_data(pairs_new, 'english-german-nounk.pkl')

Saved: english-german-nounk.pkl


In [23]:
# Number of sentence pairs that passed the OOV filter.
len(pairs_new)

102864

In [14]:
# Summary statistics over the vocabulary counts: the sum of all counts, the
# largest single count (0 for an empty vocabulary) and how many entries have
# a count below 3.
total_tokens = sum(vocabulary.values())
n_small = sum(1 for count in vocabulary.values() if count < 3)
max_v = max([0, *vocabulary.values()])

In [15]:
# Largest count of any single vocabulary entry.
print(max_v)

297026


In [16]:
# Number of vocabulary entries whose count is below 3.
n_small

0