### Task 1: POS Tagging
Introduction

There is an ongoing discussion about whether the problem of part-of-speech tagging is already solved, at least for English (see Manning 2011), since taggers reach accuracy similar to or higher than the human inter-annotator agreement, which is ca. 97%. In the case of languages with rich morphology, such as Polish, there is however no doubt that the accuracies of around 91% delivered by taggers leave much to be desired and more work is needed before this task can be proclaimed solved.

The aim of this proposed task is therefore to stimulate research into potentially new approaches to the problem of POS tagging of Polish, which will help close the gap between the tagging accuracy of systems available for English and for languages with rich morphology.

### Task definition
Subtask (A): Morphosyntactic disambiguation and guessing

Given a sequence of segments, each with a set of possible morphosyntactic interpretations, the goal of the task is to select the correct interpretation for each of the segments and to provide an interpretation for segments for which only the 'ign' interpretation has been given (segments unknown to the morphosyntactic dictionary).

In [None]:
### ALL IMPORTS ###
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import tensorflow as tf
from itertools import islice, chain
import re
from collections import Counter
import itertools
import time
import progressbar
from lxml import objectify, etree

In [None]:
### DATA PREPARATION ###
def read_training_data(filename):
    """Read an XCES corpus file and return the bodies of its sentence chunks.

    The file must consist of the XML prolog, the ``cesAna`` DOCTYPE and a
    ``<cesAna>`` element wrapping a single ``<chunkList>``.

    Args:
        filename: path of the XML file to read.

    Returns:
        A list with the inner text of every ``<chunk type="s">`` element, or
        ``None`` when the file does not have the expected envelope.
    """
    # The prolog being matched declares UTF-8, so do not depend on the
    # platform's default encoding.
    with open(filename, encoding='utf-8') as f:
        content = f.read()

    # Raw strings: the patterns contain backslash escapes that are not valid
    # in ordinary string literals.
    envelope = (
        r'<\?xml version="1\.0" encoding="UTF-8"\?>\s*'
        r'<\!DOCTYPE cesAna SYSTEM "xcesAnaIPI\.dtd">\s*'
        r'<cesAna xmlns\:xlink="http\:\/\/www\.w3\.org\/1999\/xlink" version="1\.0" type="lex disamb">\s*'
        r'<chunkList>\s*(?P<chunks>[\W\s\d\w]+)<\/chunkList>\s*<\/cesAna>'
    )
    chunks_block = re.search(envelope, content)
    if chunks_block is None:
        return None

    sentence = r'<chunk type="s">\s*(?P<chunk>[.\w\W\s]+?)<\/chunk>\s*'
    return re.findall(sentence, chunks_block.group('chunks'))

def create_dictionary_train(chunks):
    """Map every word form to the interpretations listed for it.

    Args:
        chunks: list of sentence bodies (strings containing ``<tok>`` elements).

    Returns:
        dict mapping the ``<orth>`` text of a token to a one-element list
        holding the list of ``(base, ctag)`` tuples found in that token's
        plain ``<lex>`` entries. A later token with the same word form
        overwrites an earlier one.

    Raises:
        AttributeError: if a token has no ``<orth>`` element.
    """
    print("Number of chunks: {0}".format(len(chunks)))
    # Raw strings avoid invalid-escape warnings; compiling once keeps the
    # patterns out of the per-token loop.
    token_re = re.compile(r'(?P<token><tok>\s*(?:[\w\W\d.]+?)<\/tok>\s*?)(?:<ns\/>)?')
    orth_re = re.compile(r'<orth>(?P<orth>.+)<\/orth>\s*(?:[\w\W\d.]+)')
    lex_re = re.compile(r'<lex><base>(?P<base>.+)<\/base><ctag>(?P<ctag>.+)<\/ctag><\/lex>\s*')

    words = {}
    for chunk in chunks:
        for tok in token_re.findall(chunk):
            word = orth_re.search(tok).group('orth')
            words[word] = [lex_re.findall(tok)]
    return words
        
    
def create_dictionary_gold(chunks):
    """Map every word form to its gold-standard (disambiguated) tag.

    Args:
        chunks: list of sentence bodies (strings containing ``<tok>`` elements).

    Returns:
        dict mapping the ``<orth>`` text of a token to a one-element list with
        the ctag of the first ``<lex disamb="1">`` entry of that token. A later
        token with the same word form overwrites an earlier one.

    Raises:
        AttributeError: if a token has no ``<orth>`` element.
        IndexError: if a token has no ``<lex disamb="1">`` entry.
    """
    print("Number of chunks: {0}".format(len(chunks)))
    # Raw strings avoid invalid-escape warnings; compiling once keeps the
    # patterns out of the per-token loop.
    token_re = re.compile(r'(?P<token><tok>\s*(?:[\w\W\d.]+?)<\/tok>\s*?)(?:<ns\/>)?')
    orth_re = re.compile(r'<orth>(?P<orth>.+)<\/orth>\s*(?:[\w\W\d.]+)')
    gold_lex_re = re.compile(
        r'<lex disamb="1"><base>(?P<base>.+)<\/base><ctag>(?P<ctag>.+)<\/ctag><\/lex>\s*')

    words = {}
    for chunk in chunks:
        for tok in token_re.findall(chunk):
            word = orth_re.search(tok).group('orth')
            lexes = gold_lex_re.findall(tok)
            # lexes holds (base, ctag) tuples; keep the ctag of the first one.
            words[word] = [lexes[0][1]]
    return words

def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list."""
    head = islice(iterable, n)
    return list(head)

In [None]:
% time chunks = read_training_data('train-gold.xml')
% time words = create_dictionary_gold(chunks)
print('Data size %d' % len(words))
print("Unique words: {0}".format(len(Counter(words))))
words_sample = take(5, words)
print("\n".join(["{0} -> {1}".format(x,words[x][0]) for x in words_sample]))
chunks_train = read_training_data('train-analyzed.xml')
words_train = create_dictionary_train(chunks_train)
print('Data size %d' % len(words_train))
reference_words = list(words_train.keys()) # words list in order
chunks = pd.read_csv('pl-embeddings-skip_pure_words.txt', chunksize=1000000, delimiter=' ', header=None, encoding='utf-8')
embeddings_df = pd.DataFrame()
%time embeddings_df = pd.concat(chunk for chunk in chunks).sort_values(0)
del embeddings_df[101]
subset_of_embeddings = embeddings_df.loc[embeddings_df[0].isin(words_train.keys())]
print(len(subset_of_embeddings))
tmp = subset_of_embeddings
subset_of_embeddings['interpretation'] =  [words[word][0] for word in tmp[0]]
subset_of_embeddings['disamb'] = [False for i in range(len(subset_of_embeddings))]
word_list_with_duplicates = []
interpretation = []
disamb = []
def create_series_for_df():
    """Expand every analysed word into one record per candidate interpretation.

    For each word in ``reference_words`` and each ``(base, ctag)`` candidate in
    ``words_train``, appends to the module-level lists:

    * ``word_list_with_duplicates`` -- the word,
    * ``interpretation`` -- the candidate ctag,
    * ``disamb`` -- 1 if the candidate equals the gold tag in ``words``, else 0.

    Words that have no entry in the gold dictionary ``words`` are skipped.
    """
    i = 0
    bar = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
    print('A' in reference_words)
    for word_train in reference_words:
        try:
            gold_tag = words[word_train][0].strip()
        except KeyError:
            # No gold tag for this word. Only this case is skipped; any other
            # error is a real bug and must surface.
            continue

        for k in words_train[word_train][0]: # iterate over interpretations
            disamb.append(1 if gold_tag == k[1].strip() else 0)
            word_list_with_duplicates.append(word_train)
            interpretation.append(k[1])

        i += 1
        bar.update(i)

%time create_series_for_df()
words_count = Counter(word_list_with_duplicates)

subset_of_embeddings['Count'] = subset_of_embeddings[0].map(words_count)
subset_of_embeddings.Count = subset_of_embeddings.Count.fillna(0).astype(int)
subset_with_repetitions = pd.DataFrame(np.repeat(subset_of_embeddings.values, subset_of_embeddings['Count'].values, axis=0))
data_tuples = list(zip(word_list_with_duplicates, interpretation, disamb))
print(data_tuples[:10])
sorted_repetitions_df = subset_with_repetitions.sort([0])
data_tuples = sorted(data_tuples)
test_df = pd.DataFrame([(i[1],i[2]) for i in data_tuples], )
test_df.head()
subset_with_repetitions['interpretation'] = test_df[0]
subset_with_repetitions['disamb'] = test_df[1]
print("One-hot representation of morphosynthactic forms")
result = pd.concat([subset_with_repetitions,pd.get_dummies(subset_with_repetitions['interpretation'])], axis=1)
del result[101]
del result[102]
del result[103]
del result['interpretation']
tmp = result['disamb']
del result['disamb']
result = pd.concat([result, tmp],axis=1)
result.head()
with open('all_columns','w') as f:
    f.write(str(result.columns.tolist()))
    
result.to_csv('input-output-dataset.csv', encoding='utf-8')

In [None]:
### SPLIT DATA INTO TRAINING AND TEST SETS ###
# Every fifth chunk of the dataset becomes test data and the rest training
# data; each part is further split into correct (disamb == 1) and incorrect
# (disamb == 0) interpretations. Pieces are collected in lists and
# concatenated once, instead of re-copying a growing frame on every chunk.
correct_parts, non_correct_parts = [], []
correct_test_parts, non_correct_test_parts = [], []

bar_all = progressbar.ProgressBar(max_value=progressbar.UnknownLength)
for j, chunk in enumerate(pd.read_csv('input-output-dataset.csv', chunksize=10000)):
    # Drop the word itself and the row index written by to_csv.
    del chunk['0']
    del chunk['Unnamed: 0']

    positives = chunk[chunk['disamb'] == 1]
    negatives = chunk[chunk['disamb'] == 0]
    if j % 5 == 0:
        correct_test_parts.append(positives)
        non_correct_test_parts.append(negatives)
    else:
        correct_parts.append(positives)
        non_correct_parts.append(negatives)
    bar_all.update(j)

# pd.concat([]) raises, so fall back to an empty frame when a part got no chunk.
correct = pd.concat(correct_parts) if correct_parts else pd.DataFrame()
non_correct = pd.concat(non_correct_parts) if non_correct_parts else pd.DataFrame()
correct_test = pd.concat(correct_test_parts) if correct_test_parts else pd.DataFrame()
non_correct_test = pd.concat(non_correct_test_parts) if non_correct_test_parts else pd.DataFrame()

# The label is implied by the file a row is stored in.
del correct['disamb']
del non_correct['disamb']
del correct_test['disamb']
del non_correct_test['disamb']

# Written without header and index: readers must pass header=None.
correct.to_csv('in-out_correct.csv', header=False, index=False)
non_correct.to_csv('in-out_non-correct.csv', header=False, index=False)
correct_test.to_csv('in-out_correct_test.csv', header=False, index=False)
non_correct_test.to_csv('in-out_non-correct_test.csv', header=False, index=False)

In [None]:
### MODEL CREATION AND TEACHING ###
def weight_variable(shape):
    """Return a TensorFlow variable of ``shape`` initialised from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
    """Return a TensorFlow variable of ``shape`` with every element set to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))

def add_missing_dummy_columns( d, columns ):
    """Add an all-zero column to ``d`` for every name in ``columns`` it lacks.

    ``d`` is modified in place. New columns are appended in the order in which
    they appear in ``columns``, so the result is deterministic.
    """
    present = set( d.columns )
    for c in columns:
        if c not in present:
            d[c] = 0
            present.add( c )


def fix_columns( d, columns ):
    """Bring the columns of ``d`` in line with ``columns``, in place.

    * Columns listed in ``columns`` but missing from ``d`` are added as zeros.
    * Columns of ``d`` that are not listed are deleted, unless their name is
      one of the strings ``'0'`` .. ``'101'``.
    * The remaining columns are reordered: first those named in ``columns``,
      in that order, then the kept extras in their existing order. A value
      matrix taken from ``d`` afterwards therefore has a stable column order.

    Args:
        d: pandas DataFrame to modify.
        columns: iterable of expected column names.
    """
    columns = list( columns )
    add_missing_dummy_columns( d, columns )

    wanted = set( columns )
    # make sure we have all the columns we need
    assert wanted <= set( d.columns )

    always_kept = {str( n ) for n in range( 0, 102 )}
    for c in list( d.columns ):
        if c not in wanted and c not in always_kept:
            del d[c]

    # Popping and re-adding a column moves it to the end, so doing that for
    # every column in the target order reorders the frame without rebinding it.
    target_order = list( dict.fromkeys( columns ) ) + [c for c in d.columns if c not in wanted]
    for c in target_order:
        d[c] = d.pop( c )

def _labelled_batch(chunk_corr, chunk_non_corr):
    """Stack a chunk of correct and a chunk of incorrect rows into (features, labels).

    Rows of ``chunk_corr`` get the one-hot label ``[1, 0]``, rows of
    ``chunk_non_corr`` get ``[0, 1]``.
    """
    features = np.vstack((chunk_corr.values, chunk_non_corr.values))
    is_correct = np.concatenate((np.ones(len(chunk_corr)), np.zeros(len(chunk_non_corr))))
    labels = np.column_stack((is_correct, 1.0 - is_correct))
    return features, labels


def create_model(n_features=1398, hidden_units=140, epochs=1, chunksize=10000):
    """Train the two-layer classifier, evaluate it and save it to ``./model.ckpt``.

    Training data is read from ``in-out_correct.csv`` and
    ``in-out_non-correct.csv``, test data from ``in-out_correct_test.csv`` and
    ``in-out_non-correct_test.csv``. The network outputs on the test data are
    written to ``output_test``.

    Args:
        n_features: width of one input row.
        hidden_units: size of the hidden layer.
        epochs: number of passes over the training files.
        chunksize: number of rows read from each file per batch.

    The defaults are the values that were previously hard-coded.
    """
    tf.reset_default_graph()

    x = tf.placeholder(tf.float32, shape=[None, n_features])
    y_train = tf.placeholder(tf.float32, shape=[None, 2])

    # first layer
    W_1 = weight_variable([n_features, hidden_units])
    b_1 = bias_variable([hidden_units])
    h_1 = tf.nn.sigmoid(tf.add(tf.matmul(x, W_1), b_1))

    # second layer
    W_2 = weight_variable([hidden_units, 2])
    b_2 = bias_variable([2])
    # NOTE(review): h_2 is already squashed by a sigmoid but is passed below
    # as `logits`. Left unchanged so the trained model stays the same; confirm
    # whether raw pre-activations were intended.
    h_2 = tf.nn.sigmoid(tf.add(tf.matmul(h_1, W_2), b_2))

    # Train and evaluate the model
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_train, logits=h_2))
    train_step = tf.train.AdamOptimizer(0.01).minimize(cross_entropy)
    correct_prediction = tf.equal(tf.argmax(h_2, 1), tf.argmax(y_train, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    bar = progressbar.ProgressBar(max_value=epochs)

    init_op = tf.global_variables_initializer()

    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(init_op)
        for i in range(epochs):
            print("Step: {0}".format(i))
            # The split files are written without a header row, so the first
            # line must not be consumed as column names.
            corr_chunks = pd.read_csv('in-out_correct.csv', chunksize=chunksize, delimiter=',', header=None)
            non_corr_chunks = pd.read_csv('in-out_non-correct.csv', chunksize=chunksize, delimiter=',', header=None)

            # NOTE(review): zip() stops at the shorter file, so surplus rows
            # of the longer one are never used.
            for chunk_corr, chunk_non_corr in zip(corr_chunks, non_corr_chunks):
                batch_x, batch_y = _labelled_batch(chunk_corr, chunk_non_corr)

                # shuffle rows and labels with the same permutation
                order = np.random.permutation(len(batch_x))
                batch_x, batch_y = batch_x[order], batch_y[order]

                train_step.run(feed_dict={x: batch_x, y_train: batch_y})

                print("Step accuracy.")
                train_accuracy = accuracy.eval(feed_dict={
                    x: batch_x, y_train: batch_y})
                print('step {0}, train accuracy {1:.10}'.format(i, train_accuracy))
            bar.update(i)

        test_corr = pd.read_csv('in-out_correct_test.csv', chunksize=chunksize, delimiter=',', header=None)
        test_non_corr = pd.read_csv('in-out_non-correct_test.csv', chunksize=chunksize, delimiter=',', header=None)

        output = []
        for chunk_corr_tst, chunk_non_corr_tst in zip(test_corr, test_non_corr):
            test_x, test_y = _labelled_batch(chunk_corr_tst, chunk_non_corr_tst)
            test_accuracy = accuracy.eval(feed_dict={x: test_x, y_train: test_y})
            print('test accuracy {0:.10f}'.format(test_accuracy))
            output.append(h_2.eval(feed_dict={x: test_x}))

        # One row per test sample: stack the per-chunk (n, 2) outputs.
        df = pd.DataFrame(np.vstack(output) if output else np.empty((0, 2)))
        df.to_csv('output_test')
        # Save the variables to disk.
        save_path = saver.save(sess, "./model.ckpt")
        print("Model saved in file: %s" % save_path)

In [None]:
# Build the network, train it on the split CSV files and save it to ./model.ckpt.
create_model()

In [None]:
### TEST DATA PREPARATION ###
def get_xcef_chunks(filename):
    """Read an XCES corpus file and return the bodies of its sentence chunks.

    The file must consist of the XML prolog, the ``cesAna`` DOCTYPE and a
    ``<cesAna>`` element wrapping a single ``<chunkList>``.

    Args:
        filename: path of the XML file to read.

    Returns:
        A list with the inner text of every ``<chunk type="s">`` element, or
        ``None`` when the file does not have the expected envelope.
    """
    print("Reading chunks of data from file: {0}".format(filename))
    # The prolog being matched declares UTF-8, so do not depend on the
    # platform's default encoding.
    with open(filename, encoding='utf-8') as f:
        content = f.read()

    # Raw strings: the patterns contain backslash escapes that are not valid
    # in ordinary string literals.
    envelope = (
        r'<\?xml version="1\.0" encoding="UTF-8"\?>\s*'
        r'<\!DOCTYPE cesAna SYSTEM "xcesAnaIPI\.dtd">\s*'
        r'<cesAna xmlns\:xlink="http\:\/\/www\.w3\.org\/1999\/xlink" version="1\.0" type="lex disamb">\s*'
        r'<chunkList>\s*(?P<chunks>[\W\s\d\w]+)<\/chunkList>\s*<\/cesAna>'
    )
    chunks_block = re.search(envelope, content)
    if chunks_block is None:
        return None

    sentence = r'<chunk type="s">\s*(?P<chunk>[.\w\W\s]+?)<\/chunk>\s*'
    return re.findall(sentence, chunks_block.group('chunks'))

def create_dict_of_words(chunks):
    """Map every word form to its candidate tags and its token position.

    Args:
        chunks: list of sentence bodies (strings containing ``<tok>`` elements).

    Returns:
        dict mapping the ``<orth>`` text of a token to a tuple
        ``([ctags], position)``, where ``ctags`` is the list of ctag strings of
        the token's plain ``<lex>`` entries and ``position`` is the running
        token number across all chunks. A later token with the same word form
        overwrites an earlier one, so the position is that of the last
        occurrence.

    Raises:
        AttributeError: if a token has no ``<orth>`` element.
    """
    print("Creating dictionary of words from chunks.")
    print("Number of chunks: {0}".format(len(chunks)))
    # Raw strings avoid invalid-escape warnings; compiling once keeps the
    # patterns out of the per-token loop.
    token_re = re.compile(r'(?P<token><tok>\s*(?:[\w\W\d.]+?)<\/tok>\s*?)(?:<ns\/>)?')
    orth_re = re.compile(r'<orth>(?P<orth>.+)<\/orth>\s*(?:[\w\W\d.]+)')
    ctag_re = re.compile(r'<lex><base>(?:.+)<\/base><ctag>(?P<ctag>.+)<\/ctag><\/lex>\s*')

    words = {}
    position = 0 # index for each word
    for chunk in chunks:
        for tok in token_re.findall(chunk):
            word = orth_re.search(tok).group('orth')
            # save for each word every possible tag
            words[word] = ([ctag_re.findall(tok)], position)
            position += 1
    return words

def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list."""
    leading = islice(iterable, n)
    return list(leading)

def prepare_training_data(file_test):
    """Build the model input for an analysed corpus and save it to ``input_test.csv``.

    Every word of ``file_test`` that has an embedding yields one row per
    candidate interpretation: the word, its embedding, the token index and a
    one-hot encoding of the candidate tag.

    Args:
        file_test: path of the analysed XML corpus.
    """
    chunks = get_xcef_chunks(file_test)
    words = create_dict_of_words(chunks)
    first_5 = take(5, words)
    print("First 5 words:")
    for i in first_5:
        print("     {0}   -> {1}".format(i, words[i]))

    print('Number of words: {0}'.format(len(words)))
    print("Unique words: {0}".format(len(Counter(words))))

    ### READ IN EMBEDDINGS ###
    embedding_chunks = pd.read_csv('pl-embeddings-skip_pure_words.txt', chunksize=1000000, delimiter=' ', header=None, encoding='utf-8')
    embeddings_df = pd.concat(chunk for chunk in embedding_chunks).sort_values(0)
    # NOTE(review): column 101 is presumably an empty column produced by a
    # trailing space on every line of the embeddings file -- confirm.
    del embeddings_df[101]

    ### GET SUBSET OF EMBEDDINGS FOR ANALYZED DATA
    # .copy() makes this an independent frame, so the column assignments
    # below are not writes to a slice of embeddings_df.
    subset_of_embeddings = embeddings_df.loc[embeddings_df[0].isin(words.keys())].copy()
    print("Subset of embeddings length: {0}".format(len(subset_of_embeddings)))
    subset_of_embeddings['interpretation'] = [words[word][0][0] for word in subset_of_embeddings[0]]
    subset_of_embeddings['index'] = [words[word][1] for word in subset_of_embeddings[0]]
    print("Subset of embeddings head: {0}".format(subset_of_embeddings.head()))

    # One (word, candidate tag, token index) record per interpretation.
    print("Create series for df:")
    word_list_with_duplicates = []
    interpretation = []
    index = []
    for word, (lexes, position) in words.items():
        for ctag in lexes[0]: # iterate over interpretations
            word_list_with_duplicates.append(word)
            interpretation.append(ctag)
            index.append(position)

    ### PREPARE TRAINING DATA WITH EXTENSION OF OTHER INTERPRETATION ###
    words_count = Counter(word_list_with_duplicates)
    subset_of_embeddings['Count'] = subset_of_embeddings[0].map(words_count)
    subset_of_embeddings.Count = subset_of_embeddings.Count.fillna(0).astype(int)

    subset_with_repetitions = pd.DataFrame(np.repeat(subset_of_embeddings.values, subset_of_embeddings['Count'].values, axis=0))
    print("Subset with repetitions head: {0}".format(subset_with_repetitions.head()))

    # Order the embedding rows by token index (positional column 102) and
    # actually use the sorted frame, so row k belongs to the same word as
    # interpretation record k. DataFrame.sort() no longer exists in pandas.
    subset_with_repetitions = subset_with_repetitions.sort_values(102, kind='mergesort').reset_index(drop=True)
    print("Subset with repetitions sorted by index head: {0}".format(subset_with_repetitions.head()))

    # Words without an embedding have no row above; their records would shift
    # every following interpretation onto the wrong embedding.
    embedded_words = set(subset_of_embeddings[0])
    data_tuples = [t for t in zip(word_list_with_duplicates, interpretation, index) if t[0] in embedded_words]
    data_tuples = sorted(data_tuples, key=lambda x: x[2])
    print("Tupled data: {0}".format(data_tuples[:15]))
    test_df = pd.DataFrame([[i[1], i[2]] for i in data_tuples])
    subset_with_repetitions['interpretation'] = test_df[0]
    subset_with_repetitions['index'] = test_df[1]

    print("One-hot representation of morphosynthactic forms")
    result = pd.concat([subset_with_repetitions,pd.get_dummies(subset_with_repetitions['interpretation'])], axis=1)
    # Positional columns 101 and 102 are the raw interpretation list and the
    # token index; the named 'index' column is kept.
    del result[101]
    del result[102]
    del result['interpretation']
    print(result.head())
    input_file = 'input_test.csv'
    result.to_csv(input_file, encoding='utf-8')
    print("Test data saved successfully in: {0}".format(input_file))

In [None]:
### MODEL EVALUATION ON TEST DATA ###
def write_tags_input_data(test_file, output_tags):
    """Flatten the per-chunk model outputs and write them to ``output_tags.csv``.

    ``output_tags`` is a sequence of per-chunk row collections; all rows are
    written to a single table. ``test_file`` is accepted but not used.
    """
    rows = list(itertools.chain.from_iterable(output_tags))
    pd.DataFrame(rows).to_csv("output_tags.csv")


def check_model_unknown_output(test_file='input_test.csv'):
    """Run the saved model on prepared test data and store its raw outputs.

    Rebuilds the network, restores its weights from ``./model.ckpt``, evaluates
    it chunk by chunk on ``test_file`` and hands the outputs to
    ``write_tags_input_data``.

    Args:
        test_file: CSV of prepared test rows. Defaults to ``'input_test.csv'``,
            the file that was previously always read.

    Raises:
        ValueError: if the column list stored in ``all_columns`` does not
            yield as many feature columns as the network has inputs.
    """
    import ast  # local import: only needed to parse the stored column list

    tf.reset_default_graph()
    chunksize = 10000
    n_features = 1398

    # The variables must be created in the same order as in create_model() so
    # that their names match the checkpoint. Training ops are not needed here.
    # first layer
    hid_layer = 140
    x = tf.placeholder(tf.float32, shape=[None, n_features])

    W_1 = weight_variable([n_features, hid_layer])
    b_1 = bias_variable([hid_layer])

    h_1 = tf.nn.sigmoid(tf.add(tf.matmul(x,W_1),b_1))

    # second layer
    W_2 = weight_variable([hid_layer, 2])
    b_2 = bias_variable([2])

    h_2 =  tf.nn.sigmoid(tf.add(tf.matmul(h_1,W_2),b_2))

    # Add ops to save and restore all the variables.
    saver = tf.train.Saver()

    # 'all_columns' holds str(list_of_column_labels). Parse it as a Python
    # literal; splitting on ', ' would leave quotes and brackets in the names.
    with open('all_columns') as f:
        all_columns = [str(c) for c in ast.literal_eval(f.read())]
    # The word ('0') and the label ('disamb') are removed before training, so
    # they are not network inputs.
    feature_columns = [c for c in all_columns if c not in ('0', 'disamb')]
    if len(feature_columns) != n_features:
        raise ValueError("all_columns describes {0} features, the model expects {1}".format(
            len(feature_columns), n_features))

    with tf.Session() as sess:
        # Restore variables from disk.
        saver.restore(sess, "./model.ckpt")
        print("Model restored.")
        output = []

        for chunk in pd.read_csv(test_file, chunksize=chunksize, delimiter=','):
            del chunk['Unnamed: 0']
            del chunk['0']

            fix_columns( chunk, feature_columns )
            # Select explicitly so the features are in training order.
            test_x = chunk[feature_columns].values
            print("Size of test_x for chunk {0}".format(len(test_x)))
            test_out = h_2.eval(feed_dict={x: test_x})
            print("Test output size: {0}".format(len(test_out)))
            output.append(test_out)

        write_tags_input_data(test_file, output)

In [None]:
# Evaluate the saved model on the prepared test input. The file name is passed
# explicitly because the function takes the test file as its argument.
check_model_unknown_output('input_test.csv')

In [None]:
### CHECK IF SIZES ARE THE SAME ###
# Every row of the prepared test input must have produced one model output row.
out = pd.read_csv('output_tags.csv')
input_tst = pd.read_csv('input_test.csv')
n_outputs = len(out)
n_inputs = input_tst.shape[0]
assert n_outputs == n_inputs
print("Length of output: {0}, shape of input data: {1}".format(n_outputs, input_tst.shape))

In [None]:
def get_random():
    """Return ``True`` or ``False``, picked at random."""
    options = [True, False]
    return random.choice(options)

In [None]:
### TAG INPUT FILE WITH CORRECT DISAMBIGUATIONS ###
class Disambiguer():
    """Apply the model decisions in ``output_tags.csv`` to an analysed XML corpus."""

    def __init__(self, xml):
        """Parse ``xml`` (the corpus as bytes or text) and load the model outputs."""
        self.root = objectify.fromstring(xml)
        # output_tags.csv is written together with its row index. Reading that
        # index as data would make column 0 the row number instead of a score.
        self.output = pd.read_csv('output_tags.csv', index_col=0).values

    def get_sentences(self):
        """Print the word forms of the corpus and return the nested chunk elements."""
        self.sentences = []
        self.chunks = []
        for j in self.root.chunkList.chunk:
            self.chunks.append(j.chunk)
            for i in j.chunk.tok:
                self.sentences.append(i.orth.text)
        print(' '.join(self.sentences))

        return self.chunks

    def get_ctags(self):
        """Return every ``<ctag>`` element of the corpus in document order."""
        # NOTE(review): tokens are read directly from the children of
        # <chunkList>, while get_sentences() reads them one chunk level
        # deeper -- confirm which nesting the input really has.
        self.ctags = []
        for chunk in self.root.chunkList.chunk:
            for i in chunk.tok:
                for j in i.lex:
                    self.ctags.append(j.ctag)
        return self.ctags

    def get_disambiguation(self, n, word, interpretation):
        """Decide whether the ``n``-th interpretation is accepted.

        Returns ``True`` when the first value of output row ``n`` is at least
        as large as the second. When the model produced no row ``n``, the
        decision is random.
        """
        print(word + "  " + interpretation + " " + str(n))
        if n >= len(self.output):
            return get_random()
        return float(self.output[n][0]) >= float(self.output[n][1])

    def get_list_of_lexems(self):
        """Prune rejected interpretations and write ``tagged_test_file.xml``.

        Returns:
            The list of all ``<lex>`` elements that were examined, including
            those removed from the tree.
        """
        lexems = []
        ctags = self.get_ctags()
        print(len(ctags))
        for n, ctag in enumerate(ctags):
            lex = ctag.getparent()
            tok = lex.getparent()
            lexems.append(lex)
            disamb = self.get_disambiguation(n, tok.orth.text, ctag.text)
            if not disamb:
                tok.remove(lex)
            elif len(tok.lex) > 1:
                # NOTE(review): an accepted interpretation is dropped while
                # the token still has other candidates, so only the last
                # candidate of a token can survive -- confirm this is intended.
                tok.remove(lex)
            else:
                # The corpus marks the chosen interpretation as disamb="1".
                lex.set('disamb', '1')

        out = etree.tostring(self.root, pretty_print=True, encoding='UTF-8', xml_declaration=False)
        with open('tagged_test_file.xml', 'wb') as f:
            f.write(out)

        return lexems

class DataReader():
    """Read the raw bytes of the file at a fixed path."""

    def __init__(self, path):
        """Remember ``path``; the file is opened only by ``read_file``."""
        self.path = path

    def read_file(self):
        """Return the whole content of the file as bytes."""
        with open(self.path, 'rb') as f:
            return f.read()

In [None]:
# Read the analysed test corpus as bytes, parse it together with the model
# outputs and prune the interpretations; get_list_of_lexems() writes the
# result to tagged_test_file.xml.
data_path = "./test-analyzed.xml"
data = DataReader(data_path)
disamb = Disambiguer(data.read_file())
#ctags = set(disamb.get_ctags())
#print("Unique ctags: {0}".format(len(ctags)))
#print(sorted(list(map(lambda x: x.text,ctags))))
disamb.get_list_of_lexems()
#for i in disamb.get_list_of_lexems():
#    print(i.attrib['disamb'])