In [1]:
import pandas as pd
import os
import re
import sys
from collections import OrderedDict
import warnings
warnings.filterwarnings('ignore')

In [51]:
# Path of the raw input text file. Later cells derive sibling files from it by
# appending suffixes ('.clean', '.clean.samples').
dataFile = 'data/dummy-training-monolingual-europarl-en'

In [52]:
# Load the raw corpus as a one-column DataFrame (column 0), one row per
# non-blank line. The file is read directly instead of through
# pd.read_csv(sep="\n"): recent pandas releases reject "\n" as a separator,
# and CSV quoting rules would mangle sentences that contain double quotes.
with open(dataFile, encoding="utf8") as rawFile:
    rawLines = [rawLine.rstrip("\n") for rawLine in rawFile]
df = pd.DataFrame({0: [rawLine for rawLine in rawLines if rawLine.strip()]})
print(df)

                                                    0
0                           Resumption of the session
1   I declare resumed the session of the European ...
2   Although, as you will have seen, the dreaded '...
3   You have requested a debate on this subject in...
4   In the meantime, I should like to observe a mi...
5      Please rise, then, for this minute' s silence.
6   (The House rose and observed a minute' s silence)
7               Madam President, on a point of order.
8   You will be aware from the press and televisio...
9   One of the people assassinated very recently i...
10  Would it be appropriate for you, Madam Preside...
11  Yes, Mr Evans, I feel an initiative of the typ...
12  If the House agrees, I shall do as Mr Evans ha...
13              Madam President, on a point of order.
14  I would like your advice about Rule 143 concer...
15  My question relates to something that will com...
16  The Cunha report on multiannual guidance progr...
17  It says that this should

In [53]:
# Functions and constants for data cleaning.

# Punctuation characters treated as "dot-like" delimiters. The string is
# interpolated into regex character classes by cleanData and sampleData.
DOT_LIKE = ',;.!?'
# Same characters plus a space; cleanData uses it as the set of characters
# allowed right after the "' s" pattern it repairs.
DOT_LIKE_AND_SPACE = ',;.!? '

def cleanData(inputFile):
    """Normalize the raw corpus and write it to ``inputFile + '.clean'``.

    The substitutions below are applied to every line, in the listed order, by
    ``regexProcess``. Order matters: e.g. apostrophes are unified first so the
    later apostrophe-based patterns only have to deal with ``'``.

    :param inputFile: path of the raw text file.
    :return: path of the cleaned file (``inputFile + '.clean'``).
    """
    sys.stderr.write("Cleaning data " + inputFile + "\n")
    # Raw string literals are used wherever a backslash appears so that regex
    # escapes (\g<1>, \(, \.) are not treated as (invalid) Python string escapes.
    mappings = OrderedDict([
        (re.compile(r"['’]"), "'"),  # Unify typographic and ASCII apostrophes.
        (re.compile("' s([" + DOT_LIKE_AND_SPACE + "])"), r"'s\g<1>"),  # Removes strange text mistake pattern in europarl data.
        (re.compile("n't"), " n't"),  # Split the "n't" contraction off its verb.
        (re.compile(" '([^" + DOT_LIKE + "']*)'"), r'. \g<1>.'),  # Remove quoting apostrophes.
        (re.compile("'([^t])"), r" '\g<1>"),  # Separate tokens like "'s" "'ll" and so on.
        (re.compile(r'\([^)]*\)'), ''),  # Removes bracketed.
        (re.compile('[-—]'), ' '),  # Dash to space.
        (re.compile(r"[^a-z0-9A-Z',\.?! ]"), ' '),  # Other unknown to space.
        (re.compile(r'^$|^\.$'), ''),  # Blank out empty / lone-dot lines; regexProcess then drops them.
    ])
    cleanFile = inputFile + '.clean'
    regexProcess(mappings, inputFile, cleanFile)
    return cleanFile

def regexProcess(mappings, inputFile, outputFile):
    """Run every regex substitution in ``mappings`` over each line of a file.

    Each input line is right-stripped and passed through all substitutions in
    the mapping's iteration order. Lines that end up empty are dropped; the
    others are written to ``outputFile`` followed by a single space (no
    newlines are written, so the output is one long line).

    :param mappings: dict-like of compiled pattern -> replacement string.
    :param inputFile: path of the UTF-8 text file to read.
    :param outputFile: path of the UTF-8 text file to create or overwrite.
    :return: ``outputFile``.
    """
    with open(outputFile, 'w', encoding="utf8") as sink, \
            open(inputFile, encoding="utf8") as source:
        for rawLine in source:
            text = rawLine.rstrip()
            for regex, substitute in mappings.items():
                text = regex.sub(substitute, text)
            if text:
                sink.write(text + " ")
    return outputFile

In [54]:
# cleanData returns the path of the file it wrote; use that value instead of
# rebuilding the ".clean" suffix by hand, so the two cannot drift apart.
cleaned_data = cleanData(dataFile)
print(cleaned_data)

data/dummy-training-monolingual-europarl-en.clean


Cleaning data data/dummy-training-monolingual-europarl-en


In [None]:
# functions for data sampling

def sampleData(
        sampleCount,
        inputFile,
        weighted=True,
        testPercentage=0.8):
    """Slide a fixed-size word window over ``inputFile`` and write labelled samples.

    Every window of ``WORDS_PER_SAMPLE_SIZE`` consecutive words is a candidate
    sample, written as ``"<words joined by spaces> <label>"`` on its own line.
    The label is ``True`` when the word at ``window[-DETECTION_INDEX]``
    contains one of the ``DOT_LIKE`` characters, ``False`` otherwise.

    The first ``int(sampleCount * testPercentage)`` samples are written to
    ``inputFile + '.samples'``; the following ones, up to ``sampleCount`` in
    total, are written to ``inputFile + '.samples.test'``.

    :param sampleCount: maximum total number of samples written (both files).
    :param inputFile: path of the whitespace-separated UTF-8 text to sample.
    :param weighted: when true, each candidate training window is randomly
        dropped with probability 0.3; windows destined for the test file are
        never dropped.
    :param testPercentage: despite its name, the fraction of ``sampleCount``
        that goes to the *training* file; the remainder goes to the test file.
    :return: ``(labels, samples)`` for the training samples only: a list of
        bools and a list of the space-joined windows.
    """
    from collections import deque
    from random import randint

    outputFile = inputFile + ".samples"
    sys.stderr.write("Sampling data " + inputFile + ' into ' + outputFile + "\n")
    LOG_SAMPLE_NUM_STEP = 10000
    READ_CHUNK_SIZE = 1 << 16
    DOT_LIKE_REGEX = re.compile('.*[' + DOT_LIKE + ']')

    def readwords(mfile):
        """ Yield whitespace-separated words, reading the file in chunks.

        The input may be one very long line, so it is never read line by line.
        """
        pending = ''
        while True:
            chunk = mfile.read(READ_CHUNK_SIZE)
            if not chunk:
                break
            chunk = pending + chunk
            words = chunk.split()
            # A chunk that does not end in whitespace may have cut its last
            # word in half: hold that word back and prepend it to the next chunk.
            pending = '' if chunk[-1].isspace() else words.pop()
            for word in words:
                yield word
        if pending:
            yield pending

    def write(output, window, label):
        output.write(' '.join(window) + ' ' + str(label) + '\n')

    def skip(weighted):
        """ Skips for more diverse input. """
        return weighted and randint(0, 9) < 3

    # Samples with index below trainCount go to the training file.
    trainCount = int(sampleCount * testPercentage)
    samples = []
    labels = []
    with open(outputFile, 'w', encoding="utf8") as output, \
            open(outputFile + ".test", 'w', encoding="utf8") as testOutput, \
            open(inputFile, 'r', encoding="utf8") as source:
        # A bounded deque drops its oldest word automatically, so every word
        # read from the file enters the window exactly once and the windows
        # stay contiguous (no word is consumed without being appended).
        window = deque(maxlen=WORDS_PER_SAMPLE_SIZE)
        sampleNum = 0
        for word in readwords(source):
            if sampleNum >= sampleCount:
                break
            window.append(word)
            if len(window) < WORDS_PER_SAMPLE_SIZE:
                continue
            isTestSample = sampleNum >= trainCount
            # Random skipping only applies while filling the training file.
            if not isTestSample and skip(weighted):
                continue
            label = DOT_LIKE_REGEX.match(window[-DETECTION_INDEX]) is not None
            if isTestSample:
                write(testOutput, window, label)
            else:
                samples.append(' '.join(window))
                labels.append(label)
                write(output, window, label)
            sampleNum += 1
            if sampleNum % LOG_SAMPLE_NUM_STEP == 0:
                sys.stderr.write('sampleNum: ' + str(sampleNum) + "\n")
    return labels, samples

def loadSamples(samplesCount, source):
    """Load at most ``samplesCount`` labelled samples from a samples file.

    Each non-blank line is expected to look like ``"<text> <label>"``: the
    part after the last space is the label (``True`` only when it is exactly
    the string ``"True"``), everything before it is the sample text.

    :param samplesCount: maximum number of samples to return.
    :param source: path of the UTF-8 samples file.
    :return: ``(labels, samples)`` - a list of bools and a list of strings of
        equal length.
    """
    sys.stderr.write('Loading maximum ' + str(samplesCount) + ' samples from ' + source + "\n")
    samples = []
    labels = []
    with open(source, 'r', encoding="utf8") as sampleFile:
        for fullLine in sampleFile:
            # Test the cap before appending so that no more than samplesCount
            # samples are ever returned (including samplesCount == 0).
            if len(samples) >= samplesCount:
                break
            line = fullLine.rstrip()
            if not line:
                # A blank line carries no sample; do not record an empty one.
                continue
            text, _, flag = line.rpartition(' ')
            samples.append(text)
            labels.append(flag == "True")
    return labels, samples

In [None]:
# Build the sample files from the cleaned corpus, then load labels and samples
# back from the written ".clean.samples" file.
sampleLimit = 5000000
cleanedPath = dataFile + ".clean"
samplesPath = dataFile + ".clean.samples"

print("labels, samples = sampleData(5000000, dataFile + .clean, weighted=False)")
labels, samples = sampleData(sampleLimit, cleanedPath, weighted=False)

print("labels, samples = loadSamples(5000000, dataFile + .clean.samples)")
labels, samples = loadSamples(sampleLimit, samplesPath)

In [None]:
# functions for Word Index

def saveWordIndex(samples):
    """Fit a ``Tokenizer`` on ``samples`` and persist a truncated word index.

    Only the first ``MAX_NB_WORDS - 1`` entries of the tokenizer's
    ``word_index`` (in its iteration order) are kept. The resulting dict is
    stored through ``saveObject`` under the name ``'wordIndex'`` and returned.

    :param samples: texts handed to ``Tokenizer.fit_on_texts``.
    :return: dict mapping word -> index.
    """
    sys.stderr.write('Building word index.' + "\n")
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(samples)
    keepCount = MAX_NB_WORDS - 1
    wordIndex = {
        word: index
        for position, (word, index) in enumerate(tokenizer.word_index.items())
        if position < keepCount
    }
    saveObject(wordIndex, 'wordIndex')
    sys.stderr.write('Found %s unique tokens.' % len(wordIndex) + "\n")
    return wordIndex

def loadWordIndex():
    """Return the object stored under the name ``'wordIndex'`` (see ``loadObject``).

    :rtype: dict
    """
    wordIndex = loadObject('wordIndex')
    return wordIndex

def loadObject(name):
    """Load the Python object stored in ``<MODEL_DATA_DIR>/<name>.npy``.

    The file is expected to hold a single pickled object wrapped in a
    zero-dimensional NumPy array; ``.item()`` unwraps it.

    :param name: file name without the ``.npy`` extension.
    :rtype: dict
    """
    path = os.path.join(MODEL_DATA_DIR, name + '.npy')
    # Pass allow_pickle directly instead of temporarily monkeypatching np.load:
    # the patch was global and was never restored if loading raised.
    # SECURITY: unpickling can execute arbitrary code - only load trusted files.
    return np.load(path, allow_pickle=True).item()

In [None]:
# functions for tokenization

def tokenize(labels, samples, wordIndex):
    """Turn text samples and their labels into model-ready arrays.

    Samples are mapped to index sequences with ``texts_to_sequences`` (using
    ``wordIndex`` and ``MAX_NB_WORDS``) and padded to ``WORDS_PER_SAMPLE_SIZE``;
    labels are converted with ``to_categorical``. Both resulting shapes are
    logged to stderr.

    :param labels: sequence of labels, one per sample.
    :param samples: sequence of sample texts.
    :param wordIndex: word -> index mapping passed to ``texts_to_sequences``.
    :return: ``(tokenizedLabels, paddedSamples)``.
    """
    print('Tokenizing samples.', file=sys.stderr)

    sequences = texts_to_sequences(wordIndex, samples, MAX_NB_WORDS)
    paddedSamples = pad_sequences(sequences, maxlen=WORDS_PER_SAMPLE_SIZE)

    labelArray = np.asarray(labels)
    tokenizedLabels = to_categorical(labelArray)

    print('Shape of paddedSamples tensor:' + str(paddedSamples.shape), file=sys.stderr)
    print('Shape of tokenizedLabels tensor:' + str(tokenizedLabels.shape), file=sys.stderr)

    return tokenizedLabels, paddedSamples

In [None]:
# functions for splitting the data into a training set and a validation set

def splitTrainingAndValidation(labels, samples, validationSplit=None):
    """Shuffle samples and labels together and split off a validation tail.

    :param labels: NumPy array of labels, indexed along axis 0 like ``samples``.
    :param samples: NumPy array of samples; ``samples.shape[0]`` is the count.
    :param validationSplit: fraction of the data to use for validation.
        Defaults to the module-level ``VALIDATION_SPLIT`` when omitted.
    :return: ``(xTrain, yTrain, xVal, yVal)``.
    """
    if validationSplit is None:
        validationSplit = VALIDATION_SPLIT
    sampleTotal = samples.shape[0]
    indices = np.arange(sampleTotal)
    np.random.shuffle(indices)
    samples = samples[indices]
    labels = labels[indices]
    nb_validation_samples = int(validationSplit * sampleTotal)
    nb_validation_samples = min(max(nb_validation_samples, 0), sampleTotal)

    # Split on a positive index. Slicing with a negative count breaks when the
    # count is 0: x[:-0] is empty and x[-0:] is the whole array, which would
    # hand every sample to validation and none to training.
    splitAt = sampleTotal - nb_validation_samples
    xTrain = samples[:splitAt]
    yTrain = labels[:splitAt]
    xVal = samples[splitAt:]
    yVal = labels[splitAt:]
    return xTrain, yTrain, xVal, yVal

In [None]:
# functions for creating model with or without wordIndex

def createModel(wordIndex=None):
    """Build and compile the sequential network.

    Layer stack, in order: the layer returned by ``createEmbeddingLayer``,
    a ``Conv1D(512, 3)`` with ReLU, a ``Dropout(0.25)`` (only when
    ``wordIndex`` is given), ``Flatten``, and a softmax ``Dense`` layer with
    ``LABELS_COUNT`` units.

    :param wordIndex: optional word index forwarded to ``createEmbeddingLayer``;
        its presence also enables the dropout layer.
    :return: the compiled model.
    """
    print('Creating model.', file=sys.stderr)
    model = Sequential()

    layers = [
        createEmbeddingLayer(wordIndex),
        Conv1D(512, 3, activation='relu'),
    ]
    if wordIndex is not None:
        layers.append(Dropout(0.25))
    layers.append(Flatten())
    layers.append(Dense(LABELS_COUNT, activation='softmax'))
    for layer in layers:
        model.add(layer)

    # alternative optimizer: rmsprop, adam
    trackedMetrics = ['acc', precision, recall, fbeta_score]
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=trackedMetrics)
    return model