In [1]:
import csv
import math
import itertools
from io import open
from conllu import parse_incr

In [2]:
# print the structure of the fine-tuning MLP for WSD
def print_fine_tuning_MLP(model, param):
	"""Print every module stored under ``model.layers[param]`` between two banner lines.

	Args:
		model: object exposing a ``layers`` mapping (indexed with ``param``).
		param: key of the task whose fine-tuning modules are listed, e.g. 'WSD'.

	Returns:
		None. Everything is written to stdout.
	"""
	banner_top = '\n******************* fine-tuning MLP structure ***********************'
	banner_bottom = '**********************************************************************'

	print(banner_top)
	print('Current Task: {}'.format(param))

	# iterate the container registered for this task and show each entry
	for layer in model.layers[param]:
		print(layer)

	print(banner_bottom)

In [3]:
# print the whole model 
def print_whole_model(model):
	"""Print the name and size of each trainable parameter, then the model itself.

	Args:
		model: object exposing ``named_parameters()`` that yields
			``(name, parameter)`` pairs; each parameter must provide
			``requires_grad`` and ``size()``.

	Returns:
		None. Everything is written to stdout.
	"""
	print('\nAll parameters in the model:')

	# lazily keep only the parameters that take part in gradient updates
	trainable = (
		(label, tensor)
		for label, tensor in model.named_parameters()
		if tensor.requires_grad
	)
	for label, tensor in trainable:
		print(label, tensor.size())

	print(model)

In [4]:
def get_sentences_and_senses(wsd_data, train_data, test_data, dev_data, sen_num):
	"""Collect lemmatized sentences and sense annotations for the first WSD rows.

	Each processed annotation row is routed to the train, test, or dev split
	according to its ``Sentence.ID`` (a row whose id contains neither 'train'
	nor 'test' is treated as dev). The sentence is looked up in the matching
	EUD split and reduced to the list of its token lemmas.

	Args:
		wsd_data: indexable sequence of row mappings read with ``.get`` for the
			keys 'Sentence.ID', 'Arg.Token', 'Arg.Lemma', 'Synset' and
			'Sense.Response'. The last space-separated field of 'Sentence.ID'
			and the value of 'Arg.Token' are parsed as 1-based integers.
		train_data: indexable sequence of sentences for the train split; each
			sentence is an iterable of token mappings with a 'lemma' entry.
		test_data: same as ``train_data`` for the test split.
		dev_data: same as ``train_data`` for the dev split.
		sen_num: maximum number of annotation rows to process. Values larger
			than ``len(wsd_data)`` are clamped so that no row index is out of
			range.

	Returns:
		tuple of 13 items, in this order:
		train_sentences, train_word_sense, train_word_index,
		test_sentences, test_word_sense, test_word_index,
		dev_sentences, dev_word_sense, dev_word_index,
		train_response, test_response, dev_response, all_senses.
		The word indices are 0-based. ``all_senses`` maps each target lemma to
		the set of senses seen for it across all processed rows.

	Raises:
		IndexError: if a row refers to a sentence number outside its split.
		ValueError: if a sentence number or token index is not an integer.
	"""
	corpora = {'train': train_data, 'test': test_data, 'dev': dev_data}

	# one accumulator per split for every returned field
	sentences = {split: [] for split in corpora}
	word_indices = {split: [] for split in corpora}
	word_senses = {split: [] for split in corpora}
	responses = {split: [] for split in corpora}

	# all senses seen for each target lemma
	all_senses = {}

	# lemma of the first train example; only needed for the summary print
	first_train_lemma = None

	# never read past the end of the annotation list
	row_count = max(0, min(sen_num, len(wsd_data)))

	for i in range(row_count):
		row = wsd_data[i]
		sentence_id = row.get('Sentence.ID')

		# sentence numbers and token positions are 1-based in the annotations
		sentence_number = int(sentence_id.split(' ')[-1]) - 1
		word_index = int(row.get('Arg.Token')) - 1
		word_lemma = row.get('Arg.Lemma')
		word_sense = row.get('Synset')
		response = row.get('Sense.Response')

		# anything that is neither train nor test is treated as dev
		if 'train' in sentence_id:
			split = 'train'
		elif 'test' in sentence_id:
			split = 'test'
		else:
			split = 'dev'

		sentence = corpora[split][sentence_number]

		if split == 'train' and not sentences['train']:
			first_train_lemma = word_lemma

		# the clean sentence as a list of lemmas
		sentences[split].append([word_dict.get('lemma') for word_dict in sentence])
		word_indices[split].append(word_index)
		word_senses[split].append(word_sense)
		responses[split].append(response)

		# add the sense to the lemma's set, creating the set on first sight
		all_senses.setdefault(word_lemma, set()).add(word_sense)

	print('Processed {} sentences for test purpose.'.format(row_count))
	print('\n******************* Data Example ***********************')
	if sentences['train']:
		print('Sentence: {}'.format(sentences['train'][0]))
		print('Target Word Index: {}'.format(word_indices['train'][0]))
		print('Target Word Sense (index in WordNet 3.1): {}'.format(word_senses['train'][0]))
		print('Annotator Response: {}'.format(responses['train'][0]))
		print('All senses for the target word: {}'.format(all_senses[first_train_lemma]))
	else:
		# the processed rows may contain no train sentence at all
		print('No training example among the processed rows.')
	print('********************************************************')

	return (
		sentences['train'], word_senses['train'], word_indices['train'],
		sentences['test'], word_senses['test'], word_indices['test'],
		sentences['dev'], word_senses['dev'], word_indices['dev'],
		responses['train'], responses['test'], responses['dev'],
		all_senses,
	)
			

In [5]:
# parse the WSD dataset first
# and retrieve all sentences from the EUD

'''
Copyright@
White, A. S., D. Reisinger, K. Sakaguchi, T. Vieira, S. Zhang, R. Rudinger, K. Rawlins, & B. Van Durme. 2016. 
[Universal decompositional semantics on universal dependencies]
(http://aswhite.net/media/papers/white_universal_2016.pdf). 
To appear in *Proceedings of the Conference on Empirical Methods in Natural Language Processing 2016*.
'''

def parse_data():
	"""Load the WSD annotations and the three UD English-EWT splits from disk.

	Reads ``data/wsd/wsd_eng_ud1.2_10262016.tsv`` as a tab-delimited file with
	``csv.DictReader`` and parses the train, test and dev ``.conllu`` files
	with ``parse_incr``. All paths are relative to the current working
	directory. One count line is printed per file.

	Every file is opened in a ``with`` block so its handle is closed as soon
	as the content has been read into memory.

	Returns:
		tuple: ``(wsd_data, train_data, test_data, dev_data)``. ``wsd_data`` is
		the list of rows produced by the ``DictReader``; the other three are
		the lists built from ``parse_incr`` for the corresponding split.

	Raises:
		OSError: if one of the data files cannot be opened.
	"""

	# read in the tsv by White et al., 2016; one row mapping per annotation
	with open('data/wsd/wsd_eng_ud1.2_10262016.tsv', mode = 'r') as wsd_file:
		tsv_reader = csv.DictReader(wsd_file, delimiter = '\t')
		wsd_data = [row for row in tsv_reader]

	# make sure all data are parsed
	print('Parsed {} word sense data from White et. al., 2016.'.format(len(wsd_data)))

	# parse the EUD-EWT conllu files and retrieve the sentences;
	# list(...) drains the incremental parser before the file is closed
	with open("data/UD_English-EWT/en_ewt-ud-train.conllu", "r", encoding="utf-8") as train_file:
		train_data = list(parse_incr(train_file))
	print('Parsed {} training data from UD_English-EWT/en_ewt-ud-train.conllu.'.format(len(train_data)))

	with open("data/UD_English-EWT/en_ewt-ud-test.conllu", "r", encoding="utf-8") as test_file:
		test_data = list(parse_incr(test_file))
	print('Parsed {} testing data from UD_English-EWT/en_ewt-ud-test.conllu.'.format(len(test_data)))

	with open("data/UD_English-EWT/en_ewt-ud-dev.conllu", "r", encoding="utf-8") as dev_file:
		dev_data = list(parse_incr(dev_file))
	print('Parsed {} dev data from UD_English-EWT/en_ewt-ud-dev.conllu.'.format(len(dev_data)))

	return wsd_data, train_data, test_data, dev_data


In [6]:
# load the WSD annotations and the three EUD-EWT splits
wsd_data, train_data, test_data, dev_data = parse_data()

# collect the lemma sentences, target word indices, senses and annotator
# responses for train, test, and dev; only the first 20 rows are processed
(
	train_sentences, train_word_sense, train_word_index,
	test_sentences, test_word_sense, test_word_index,
	dev_sentences, dev_word_sense, dev_word_index,
	train_response, test_response, dev_response,
	all_senses,
) = get_sentences_and_senses(wsd_data, train_data, test_data, dev_data, 20)


Parsed 439312 word sense data from White et. al., 2016.
Parsed 12543 training data from UD_English-EWT/en_ewt-ud-train.conllu.
Parsed 2077 testing data from UD_English-EWT/en_ewt-ud-test.conllu.
Parsed 2002 dev data from UD_English-EWT/en_ewt-ud-dev.conllu.
Processed 20 sentences for test purpose.

******************* Data Example ***********************
Sentence: ['on', 'August', '9', ',', '2004', ',', 'it', 'be', 'announce', 'that', 'in', 'the', 'spring', 'of', '2001', ',', 'a', 'man', 'name', 'El', '-', 'Shukrijumah', ',', 'also', 'know', 'as', 'Jafar', 'the', 'Pilot', ',', 'who', 'be', 'part', 'of', 'a', '"', 'second', 'wave', ',', '"', 'have', 'be', 'case', 'New', 'York', 'City', 'helicopter', '.']
Target Word Index: 12
Target Word Sense (index in WordNet 3.1): spring.n.01
Annotator Response: True
All senses for the target word: {'spring.n.04', 'spring.n.02', 'leap.n.01', 'give.n.01', 'spring.n.01', 'spring.n.03'}
********************************************************


In [7]:
from allennlp.commands.elmo import ElmoEmbedder
from model import *

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [8]:
# ELMo setup: the embedder is handed to Model together with the sense inventory
# (ELMo is tuned to a lower dimension, 256, by an MLP in Model)
elmo = ElmoEmbedder()
model = Model(
	elmo_class=elmo,
	all_senses=all_senses,
)

# list every trainable parameter, then the module tree
print_whole_model(model)

# show the modules of the fine-tuning MLP registered for the WSD task
print_fine_tuning_MLP(model, 'WSD')


All parameters in the model:
layers.WSD.0.weight torch.Size([300, 512])
layers.WSD.0.bias torch.Size([300])
dimension_reduction_MLP.weight torch.Size([256, 3072])
dimension_reduction_MLP.bias torch.Size([256])
wsd_lstm.weight_ih_l0 torch.Size([1024, 256])
wsd_lstm.weight_hh_l0 torch.Size([1024, 256])
wsd_lstm.bias_ih_l0 torch.Size([1024])
wsd_lstm.bias_hh_l0 torch.Size([1024])
wsd_lstm.weight_ih_l0_reverse torch.Size([1024, 256])
wsd_lstm.weight_hh_l0_reverse torch.Size([1024, 256])
wsd_lstm.bias_ih_l0_reverse torch.Size([1024])
wsd_lstm.bias_hh_l0_reverse torch.Size([1024])
wsd_lstm.weight_ih_l1 torch.Size([1024, 512])
wsd_lstm.weight_hh_l1 torch.Size([1024, 256])
wsd_lstm.bias_ih_l1 torch.Size([1024])
wsd_lstm.bias_hh_l1 torch.Size([1024])
wsd_lstm.weight_ih_l1_reverse torch.Size([1024, 512])
wsd_lstm.weight_hh_l1_reverse torch.Size([1024, 256])
wsd_lstm.bias_ih_l1_reverse torch.Size([1024])
wsd_lstm.bias_hh_l1_reverse torch.Size([1024])
output_layers.spring.weight torch.Size([6, 30

In [9]:
# forward propagation on the sentences and target word indices of the train split
# pipeline, per the parameter shapes and tensor sizes printed in this notebook:
# ELMo (3 layers x 1024) -> dimension reduction (256) -> bi-LSTM (512)
# -> fine-tuning MLP (300) -> per-lemma output layer (e.g. 6 senses for 'spring')
model.forward(train_sentences, train_word_index)


Original ELMo embeddings size: torch.Size([20, 3, 48, 1024]), mask: torch.Size([20, 48])
Embedding size after dimension reduction: torch.Size([48, 20, 256]), mask: torch.Size([48, 20, 256])

Embedding size after bi-LSTM: torch.Size([48, 20, 512])

Word lemma: spring
Embedding vector (distributions over all its senses): tensor([0.5042, 0.5068, 0.5075, 0.4990, 0.4983, 0.4977],
       grad_fn=<SigmoidBackward>)
All its senses: {'spring.n.04', 'spring.n.02', 'leap.n.01', 'give.n.01', 'spring.n.01', 'spring.n.03'}

Word lemma: spring
Embedding vector (distributions over all its senses): tensor([0.5042, 0.5068, 0.5075, 0.4990, 0.4983, 0.4977],
       grad_fn=<SigmoidBackward>)
All its senses: {'spring.n.04', 'spring.n.02', 'leap.n.01', 'give.n.01', 'spring.n.01', 'spring.n.03'}

Word lemma: spring
Embedding vector (distributions over all its senses): tensor([0.5042, 0.5068, 0.5075, 0.4990, 0.4983, 0.4977],
       grad_fn=<SigmoidBackward>)
All its senses: {'spring.n.04', 'spring.n.02', 'lea

[tensor([0.5042, 0.5068, 0.5075, 0.4990, 0.4983, 0.4977],
        grad_fn=<SigmoidBackward>),
 tensor([0.5042, 0.5068, 0.5075, 0.4990, 0.4983, 0.4977],
        grad_fn=<SigmoidBackward>),
 tensor([0.5042, 0.5068, 0.5075, 0.4990, 0.4983, 0.4977],
        grad_fn=<SigmoidBackward>),
 tensor([0.5042, 0.5068, 0.5075, 0.4990, 0.4983, 0.4977],
        grad_fn=<SigmoidBackward>),
 tensor([0.5042, 0.5068, 0.5075, 0.4990, 0.4983, 0.4977],
        grad_fn=<SigmoidBackward>),
 tensor([0.5042, 0.5068, 0.5075, 0.4990, 0.4983, 0.4977],
        grad_fn=<SigmoidBackward>),
 tensor([0.5042, 0.5068, 0.5075, 0.4990, 0.4983, 0.4977],
        grad_fn=<SigmoidBackward>),
 tensor([0.5042, 0.5068, 0.5075, 0.4990, 0.4983, 0.4977],
        grad_fn=<SigmoidBackward>),
 tensor([0.5042, 0.5068, 0.5075, 0.4990, 0.4983, 0.4977],
        grad_fn=<SigmoidBackward>),
 tensor([0.5042, 0.5068, 0.5075, 0.4990, 0.4983, 0.4977],
        grad_fn=<SigmoidBackward>),
 tensor([0.5042, 0.5068, 0.5075, 0.4990, 0.4983, 0.4977],
  