In [3]:
import os
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt

from torch.utils.data import DataLoader

# torch.manual_seed(1)

In [None]:
# def process_task2_files(filepath):
#     with open(filepath, 'r', encoding='utf-8') as f:
#         source = f.read()

#     out       = []
#     sentences = source.strip().split('\n')

#     for sentence in sentences:
#         line  = sentence.strip().split('\t')
#         line  = line[1:4]
#         out.append('\t'.join(line))

#     with open(filepath.replace('task2', 'task2p'), 'w+') as f:
#         f.write('\n'.join(out))

In [None]:
# common_path = '../data/files/'

# for lang in ['arabic', 'finnish', 'georgian', 'german', 'hungarian', 'maltese', 'navajo', 'russian', 'spanish', 'turkish']:
    
#     n_p = common_path + '{}-task2-'.format(lang)
    
#     for t in ['train', 'test', 'dev']:
#         filepath = n_p + t
        
#         process_task2_files(filepath)

In [5]:
from dataset import Vocabulary, MorphologyDatasetTask3


# Instantiate the project's Vocabulary and Task 3 dataset (both come from dataset.py).
# NOTE(review): 'LANG' looks like a placeholder language name — confirm the intended value.
vocab      = Vocabulary('LANG')
morph_data = MorphologyDatasetTask3(test=False, language='LANG', vocab=vocab)
# One sample per batch, in dataset order, fetched by two worker processes.
dataloader = DataLoader(morph_data, batch_size=1, shuffle=False, num_workers=2)

         0                                 1          2
0     play    pos=verb,tense=present,mod=ind     played
1  trivial                 pos=adj,color=neg  untrivial
2  decided        pos=verb,tense=past,mod=cc     decide
0   played    pos=verb,tense=present,mod=ind       play
1    files  pos=adj,color=pos,tense=whatever      files
2  running         pos=verb,tense=now,mod=pp        run


In [7]:
# Smoke test: walk the loader once and print the index of every batch it yields.
for i_batch, sample_batched in enumerate(dataloader):
    print(i_batch)

0
1
2
3
4
5


In [None]:
# Number of entries in the vocabulary's char_2_idx and desc_2_idx mappings.
print(len(vocab.char_2_idx))
print(len(vocab.desc_2_idx))
# print(vocab.idx_2_desc)

In [None]:
# Scratch check: length of the first (and only) dimension of a 1-D tensor.
a = torch.tensor([1, 2, 3])
# a = a.permute(1, 0)   # would fail: permute(1, 0) needs a 2-D tensor
print(a.shape[0])
# a = a.unsqueeze()     # would fail: unsqueeze requires a `dim` argument

In [None]:
# Load the saved attention weights.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only open
# pickles that were produced by this project.
with open('../../results/attentions.pkl', 'rb') as f:
    attentions = pickle.load(f)

# Remove dimension 1 before plotting.
# NOTE(review): presumably dimension 1 is a singleton axis — confirm.
attentions = attentions.squeeze(1)

# Render the weights as a matrix image; plt.show() displays the figure
# (the previous argument-less plt.plot() call drew nothing).
plt.matshow(attentions)
plt.show()

In [None]:
# NOTE(review): the wildcard import hides which names this notebook takes from
# dataset.py; in this cell only MorphologyDatasetTask3 is referenced. Later cells may
# rely on other names it brings in — verify before narrowing it.
from dataset import *
from helper import load_file

# Load the pickled lookup tables stored under the 'english' prefix.
idx_2_char = load_file('../data/pickles/{}-idx_2_char'.format('english'))
char_2_idx = load_file('../data/pickles/{}-char_2_idx'.format('english'))
idx_2_desc = load_file('../data/pickles/{}-idx_2_desc'.format('english'))
desc_2_idx = load_file('../data/pickles/{}-desc_2_idx'.format('english'))
# msd_types  = load_file('../data/pickles/{}-msd_options'.format('english'))  # label types

print(char_2_idx)


# Dataset over the English task 3 test file.
# NOTE(review): these keyword arguments (csv_file=, root_dir=) differ from the earlier
# cell that passes test= and vocab= — presumably the two cells target different
# revisions of dataset.py; confirm which signature is current.
morph_data = MorphologyDatasetTask3(csv_file='../data/files/english-task3-test', language='english', 
                                    root_dir='../data/files')
# The final argument is None; presumably it is the slot for the msd_types table that is
# commented out above — TODO confirm.
morph_data.set_vocabulary(char_2_idx, idx_2_char, desc_2_idx, idx_2_desc, None)

# One sample per batch, in dataset order, fetched by two worker processes.
dataloader = DataLoader(morph_data, batch_size=1, shuffle=False, num_workers=2)


In [None]:
# Print every batch the loader yields, for visual inspection.
for i_batch, sample_batched in enumerate(dataloader):
    print(sample_batched)

In [None]:
def process_MSD(msd):
    '''
    Process msd in the input sentence
    Args:
        msd: string containing different MSDs, written as comma-separated
             `feature=value` pairs (e.g. 'pos=verb,tense=present,mod=ind')
    Returns:
        out: dict mapping each feature name to its value; empty when `msd`
             contains no pairs
    Raises:
        ValueError: if a non-empty entry has no '=' separator
    '''

    out = {}

    for m in msd.strip().split(','):
        entry = m.strip()

        # An empty string, a trailing comma or a doubled comma produces empty
        # fragments; skip them instead of failing on the missing value.
        if not entry:
            continue

        # Split on the first '=' only, so a value that itself contains '='
        # is kept whole.
        feature, separator, value = entry.partition('=')

        if not separator:
            raise ValueError(
                'Malformed MSD entry (expected feature=value): {!r}'.format(entry)
            )

        out[feature] = value

    return out

In [None]:
# Parse the task 1 test file into a list of {'lemma', 'MSD', 'target_form'} dicts.
path = '../data/task1_test'

out  = []

with open(path, 'r', encoding='utf-8') as f:
    source = f.read()

sentences = source.strip().split('\n')

for sentence in sentences:
    line = sentence.strip().split('\t')

    # A record needs exactly three tab-separated fields. Checking only for
    # "more than three" let short or empty lines through, which then crashed
    # on line[1] / line[2] below.
    if len(line) != 3:
        print('Something wrong with line: {}'.format(sentence))
        continue

    current_word = {
        'lemma'      : line[0],
        'MSD'        : process_MSD(line[1]),
        'target_form': line[2]
    }

    out.append(current_word)

In [None]:
# print(out[0])
# Show the full list of parsed entries as the cell output.
out

In [None]:
# Load the pickled task 3 test data.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open('../data/pickles/task3_test.pkl', 'rb') as f:
    n_out = pickle.load(f)

In [None]:
# Show the unpickled object as the cell output.
n_out

In [None]:
def prepare_sequence(seq, to_ix):
    '''
    Map every token of a sequence to its integer index
    Args:
        seq  : iterable of tokens
        to_ix: mapping from token to integer index
    Returns:
        1-D torch.long tensor with the indices, in input order

    Note: a later cell defines another prepare_sequence (three parameters);
    whichever cell ran last is the one in effect.
    '''
    indices = []

    for token in seq:
        indices.append(to_ix[token])

    return torch.tensor(indices, dtype=torch.long)


# Toy tagging corpus: each item is a (tokens, tags) pair.
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]

# Give each distinct word the next free index, in order of first appearance.
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        word_to_ix.setdefault(word, len(word_to_ix))
print(word_to_ix)

tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 6



In [None]:
def load_file(path):
    '''
    Read one pickled object from disk
    Args:
        path: location of the pickle file
    Returns:
        the object stored in the file

    Note: this definition replaces the load_file imported from helper in an
    earlier cell once this cell has run.
    '''

    with open(path, 'rb') as handle:
        return pickle.load(handle)


In [None]:
# Load and print three pickled objects, separated by '---' lines, for inspection.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open('../data/pickles/desc_2_idx', 'rb') as f:
    desc_2_idx = pickle.load(f)
    
print(desc_2_idx)
print("---")

with open('../data/pickles/msd_options', 'rb') as f:
    msd_options = pickle.load(f)
    
print(msd_options)
print("---")

with open('../data/pickles/task3_test.pkl', 'rb') as f:
    task3_test = pickle.load(f)

print(task3_test)

In [None]:
def prepare_msd(msd, idx_2_desc, msd_options):
    '''
    Encode an MSD dict as one flat vector

    For every description index i in 0 .. len(idx_2_desc) - 1 the value of that
    description is looked up in `msd`, converted to a class index through
    msd_options[i] (class 0 when the description is absent from `msd`), passed
    through to_categorical and the per-description results are concatenated.

    Args:
        msd        : e.g. {'pos': 'verb', 'tense': 'present', 'mod': 'ind'}
        idx_2_desc : description name for each index
        msd_options: for each index, mapping from option value to class index
    Returns:
        result of np.concatenate over the per-description encodings

    NOTE(review): to_categorical is not defined in the visible cells; it is
    presumably a one-hot encoder supplied by an import elsewhere — confirm.
    '''
    encoded_parts = []

    for position in range(len(idx_2_desc)):
        value   = msd.get(idx_2_desc[position])
        classes = msd_options[position]

        # A description missing from msd falls back to class 0.
        class_idx = 0 if value is None else classes[value]

        encoded_parts.append(to_categorical([class_idx], num_classes=len(classes))[0])

    return np.concatenate(encoded_parts, axis=0)

def prepare_sequence(sequence, char_2_idx, max_seq_len):
    '''
    Encode a character sequence as indices, terminate it with <END> and
    right-pad it with <PAD> up to max_seq_len

    A sequence that is already max_seq_len or longer after <END> is added is
    returned unpadded and untruncated.
    '''
    encoded = [char_2_idx[symbol] for symbol in sequence]
    encoded.append(char_2_idx['<END>'])

    while len(encoded) < max_seq_len:
        encoded.append(char_2_idx['<PAD>'])

    return encoded

In [None]:
# Load the pickled tables passed to prepare_msd / prepare_sequence in the next cell.
idx_2_desc  = load_file('../data/pickles/idx_2_desc')
char_2_idx  = load_file('../data/pickles/char_2_idx')
msd_options = load_file('../data/pickles/msd_options')

In [None]:
# Alternative sample inputs, kept for quick switching:
# msd = {'pos': 'verb', 'tense': 'present', 'mod': 'ind'}
# msd = {'pos': 'adj', 'color': 'neg'}
msd       = {'pos': 'verb', 'tense': 'past', 'mod': 'cc'}
# Encode the sample MSD and the word 'playing' (max_seq_len=10), then print both
# together with the size of the character vocabulary.
labels    = prepare_msd(msd, idx_2_desc, msd_options)
data_test = prepare_sequence('playing', char_2_idx, 10)

print('labels    : {}'.format(labels))
print('data_test : {}'.format(data_test))
print('vocab size: {}'.format(len(char_2_idx)))

In [None]:
# Encode the index sequence with to_categorical and wrap the result in a tensor.
# NOTE(review): to_categorical is not defined or imported in the visible cells —
# presumably it comes from a wildcard import or from Keras; confirm.
x_s = torch.from_numpy(to_categorical(data_test, num_classes=len(char_2_idx)))
print(x_s.size())
print(x_s[0])

# x_s = torch.unsqueeze(x_s, 0)
# print(x_s.size())


# Log-softmax taken along dim=1.
print(F.log_softmax(x_s, dim=1))

In [None]:
# Draw one sample from a uniform 4-way categorical distribution and print its
# log-probability.
uniform_probs = torch.tensor([0.25, 0.25, 0.25, 0.25])
m = torch.distributions.Categorical(probs=uniform_probs)

sample = m.sample()

print(m.log_prob(sample))