# Word and POS tags embeddings


## Description
In this notebook, we are trying to parse $*.conllu$ files and extract the word embeddings, as well as the POS tags emeddings.

## Work to be done:
1. Learn how to parse $*.conllu$ files
2. Find a way to get all words embeddings.
3. Find a way to get all POS tags embeddings.

### 1.1 Read $*.conllu$ files

#### Word class:

In [12]:
class Word:
    """Word class that maps all details for a word from a *.conllu file. """
    
    def __init__(self, ID, FORM, LEMMA, UPOSTAG, XPOSTAG, FEATS, HEAD, DEPREL, DEPS, MISC):
        self.ID = ID           # Word index, integer starting at 1 for each new sentence; may be a range for multiword tokens; may be a decimal number for empty nodes.
        self.FORM = FORM       # Word form or punctuation symbol.
        self.LEMMA  = LEMMA    # Lemma or stem of word form.
        self.UPOSTAG = UPOSTAG # Universal part-of-speech tag.
        self.XPOSTAG = XPOSTAG # Language-specific part-of-speech tag; underscore if not available.
        self.FEATS = FEATS     # List of morphological features from the universal feature inventory or from a defined language-specific extension; underscore if not available.
        self.HEAD = HEAD       # Head of the current word, which is either a value of ID or zero (0).
        self.DEPREL = DEPREL   # Universal dependency relation to the HEAD (root iff HEAD = 0) or a defined language-specific subtype of one.
        self.DEPS = DEPS       # Enhanced dependency graph in the form of a list of head-deprel pairs.
        self.MISC = MISC       # Any other annotation.
    
    def __str__(self):
        string = ''
        
        string += self.ID      if self.ID      != None else '_'
        string += " "
        string += self.FORM    if self.FORM    != None else '_'
        string += " "
        string += self.LEMMA   if self.LEMMA   != None else '_'
        string += " "
        string += self.UPOSTAG if self.UPOSTAG != None else '_'
        string += " "
        string += self.XPOSTAG if self.XPOSTAG != None else '_'
        string += " "
        string += self.FEATS   if self.FEATS   != None else '_'
        string += " "
        string += self.HEAD    if self.HEAD    != None else '_'
        string += " "
        string += self.DEPREL  if self.DEPREL  != None else '_'
        string += " "
        string += self.DEPS    if self.DEPS    != None else '_'
        string += " "
        string += self.MISC    if self.MISC    != None else '_'
        
        return string
        
    @staticmethod
    def from_line(line):
        tokens = line.split()
        
        assert len(tokens) == 10
        
        ID =      tokens[0] if tokens[0] != '_' else None
        FORM =    tokens[1] if tokens[1] != '_' else None
        LEMMA =   tokens[2] if tokens[2] != '_' else None
        UPOSTAG = tokens[3] if tokens[3] != '_' else None
        XPOSTAG = tokens[4] if tokens[4] != '_' else None
        FEATS =   tokens[5] if tokens[5] != '_' else None
        HEAD =    tokens[6] if tokens[6] != '_' else None
        DEPREL=   tokens[7] if tokens[7] != '_' else None
        DEPS =    tokens[8] if tokens[8] != '_' else None
        MISC =    tokens[9] if tokens[9] != '_' else None
        
        return Word(ID, FORM, LEMMA, UPOSTAG, XPOSTAG, FEATS, HEAD, DEPREL, DEPS, MISC)
        

#### Test Word class:

In [13]:
test_text = "4	proves	prove	VERB	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	0	root	0:root	_"
test_word = Word.from_line(test_text)
print(test_word)
print(test_word.FORM)

4 proves prove VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _
proves


#### Sentence class:

In [14]:
class Sentence:
    """Sentence class that maps all details for a sentence from a *.conllu file. """
    
    def __init__(self,newdoc_id, send_id, text, words):
        self.newdoc_id = newdoc_id
        self.send_id = send_id
        self.text = text
        self.words = words
        
    def __str__(self):
        string = ''
        if self.newdoc_id != None:
            string += "# newdoc id = "
            string += str(self.newdoc_id)
            string += "\n"
        string += "# send_id = "
        string += str(self.send_id)
        string += "\n"
        string += "# text = "
        string += str(self.text)
        string += "\n"
        sentence_length = len(self.words)
        for index, word in enumerate(self.words):
            string += str(word)
            if index < sentence_length - 1:
                string += "\n"
        return string
    
        
    @staticmethod
    def from_lines(lines):
        newdoc_id=''
        send_id = ''
        text=''
        words = []

        
        for line in lines:
            if line.startswith("#"): #misc properties
                prop = line.split("=")[1]
                if line.startswith("newdoc", 2):
                    newdoc_id = prop
                    continue
                if line.startswith("sent_id", 2):
                    send_id = prop
                    continue
                if line.startswith("text", 2):
                    text = prop
                    continue
                continue
            else:# words
                if line.split()[0].isdigit(): #index is an integer
                    words.append(Word.from_line(line))

        return Sentence(newdoc_id, send_id, text, words)

#### Test Sentence class:

In [15]:
lines = ["# sent_id = weblog-blogspot.com_gettingpolitical_20030906235000_ENG_20030906_235000-0003",
"# text = Today's incident proves that Sharon has lost his patience and his hope in peace.",
"1	Today	today	NOUN	NN	Number=Sing	3	nmod:poss	3:nmod:poss	SpaceAfter=No",
"2	's	's	PART	POS	_	1	case	1:case	_",
"3	incident	incident	NOUN	NN	Number=Sing	4	nsubj	4:nsubj	_",
"4	proves	prove	VERB	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	0	root	0:root	_",
"5	that	that	SCONJ	IN	_	8	mark	8:mark	_",
"6	Sharon	Sharon	PROPN	NNP	Number=Sing	8	nsubj	8:nsubj	_",
"7	has	have	AUX	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	8	aux	8:aux	_",
"8	lost	lose	VERB	VBN	Tense=Past|VerbForm=Part	4	ccomp	4:ccomp	_",
"9	his	he	PRON	PRP$	Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs	10	nmod:poss	10:nmod:poss	_",
"10	patience	patience	NOUN	NN	Number=Sing	8	obj	8:obj	_",
"11	and	and	CCONJ	CC	_	13	cc	13:cc	_",
"12	his	he	PRON	PRP$	Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs	13	nmod:poss	13:nmod:poss	_",
"13	hope	hope	NOUN	NN	Number=Sing	10	conj	10:conj	_",
"14	in	in	ADP	IN	_	15	case	15:case	_",
"15	peace	peace	NOUN	NN	Number=Sing	13	nmod	13:nmod	SpaceAfter=No",
"16	.	.	PUNCT	.	_	4	punct	4:punct	_"]
sentence = Sentence.from_lines(lines)
print(str(sentence), "\n")
print(sentence.text, "\n")
print(sentence.words[0], "\n")

# newdoc id = 
# send_id =  weblog-blogspot.com_gettingpolitical_20030906235000_ENG_20030906_235000-0003
# text =  Today's incident proves that Sharon has lost his patience and his hope in peace.
1 Today today NOUN NN Number=Sing 3 nmod:poss 3:nmod:poss SpaceAfter=No
2 's 's PART POS _ 1 case 1:case _
3 incident incident NOUN NN Number=Sing 4 nsubj 4:nsubj _
4 proves prove VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root 0:root _
5 that that SCONJ IN _ 8 mark 8:mark _
6 Sharon Sharon PROPN NNP Number=Sing 8 nsubj 8:nsubj _
7 has have AUX VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 8 aux 8:aux _
8 lost lose VERB VBN Tense=Past|VerbForm=Part 4 ccomp 4:ccomp _
9 his he PRON PRP$ Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs 10 nmod:poss 10:nmod:poss _
10 patience patience NOUN NN Number=Sing 8 obj 8:obj _
11 and and CCONJ CC _ 13 cc 13:cc _
12 his he PRON PRP$ Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs 13 nmod:poss 13:nmod:poss _
13 

In [16]:
en_path = "Annotated training data/UD_English-master/"
en_dev_filename = "en-ud-dev.conllu"
en_test_filename = "en-ud-test.conllu"
en_train_filename = "en-ud-train.conllu"

ro_path = "Annotated training data/UD_Romanian-dev/"
ro_dev_filename = "ro-ud-dev.conllu"
ro_test_filename = "ro-ud-test.conllu"
ro_train_filename = "ro-ud-train.conllu"

def read_sentences(path, filename):
    sentences = []
    sentence_lines = []
    file = open(path + filename, 'r')
    for line in file:
        if line == '\n':
            sentences.append(Sentence.from_lines(sentence_lines))
            sentence_lines = []
        else:
            sentence_lines.append(line)
            
    return sentences


en_train_sentences = read_sentences(en_path,en_train_filename)
en_dev_sentences = read_sentences(en_path,en_dev_filename)
en_test_sentences = read_sentences(en_path,en_test_filename)

ro_train_sentences = read_sentences(ro_path,ro_train_filename)
ro_dev_sentences = read_sentences(ro_path,ro_dev_filename)
ro_test_sentences = read_sentences(ro_path,ro_test_filename)

assert len(en_dev_sentences) == 2002# there are 2002 sentences in the english development dataset.


### 1.2 Write $*.conllu$ files

In [71]:
def write_sentences(sentences, path, filename):
    with open(path + filename, 'w+') as file:
        for sentence in sentences:
            file.write(str(sentence) + "\n")


### 2. Extract words/POS tags and then extract embeddings

#### Imports

In [17]:
import numpy as np
import pickle
from collections import defaultdict, Counter
from random import random
import matplotlib.pyplot as plt
from gensim.models import Word2Vec

In [43]:
def get_Gensim_sentences(sentences):
    """
    This function receives a list of Sencences and return 
    the word and POS sentences in the format Gensim needs, 
    as well as Counters for both words and POS tags
    Format: [['I', 'like', 'custard'],...]
    """
    gensim_word_sentences = []
    word_counts = Counter()
    gensim_POS_sentences = []
    POS_counts = Counter()
    label_counts = Counter()
    for sentence in sentences:
        gensim_word_sentence = []
        gensim_POS_sentence = []
        for word in sentence.words:
            gensim_word_sentence.append(word.FORM)
            word_counts[word.FORM] += 1
            gensim_POS_sentence.append(word.UPOSTAG)
            POS_counts[word.UPOSTAG] += 1
            label_counts[word.DEPREL.split(":")[0]] += 1      # some labels comprize multimple words(i.e: "nsubj:poss")
        gensim_word_sentences.append(gensim_word_sentence)
        gensim_POS_sentences.append(gensim_POS_sentence)
    return gensim_word_sentences, word_counts, gensim_POS_sentences, POS_counts, label_counts

In [44]:
gensim_word_sentences_train, word_counts_train, gensim_POS_sentences_train, POS_counts_train, label_counts_train = get_Gensim_sentences(en_test_sentences)
gensim_word_sentences_dev, word_counts_dev, gensim_POS_sentences_dev, POS_counts_dev, label_counts_dev = get_Gensim_sentences(en_dev_sentences)
gensim_word_sentences_test, word_counts_test, gensim_POS_sentences_test, POS_counts_test, label_counts_test = get_Gensim_sentences(en_test_sentences)


w2i = defaultdict(lambda: len(w2i))
i2w = dict()
i2w[w2i["<UNK>"]] = "<UNK>" # word with index 0 are the words that are unknown.
for word in word_counts_train.keys():
    if word_counts_train[word] > 1:
        i2w[w2i[word]] = word # trick
    else:
        w2i[word] = 0  # all unknown words have index 0
word_vocabulary = word_counts_train.keys()


        
t2i = defaultdict(lambda: len(t2i))
i2t = dict()
for tag in POS_counts_train.keys():
    i2t[t2i[tag]] = tag # trick
pos_tag_vocabulary = POS_counts_train.keys() 


        
l2i = defaultdict(lambda: len(l2i))
i2l = dict()
for label in label_counts_train.keys():
        i2l[l2i[label]] = label # trick





### a. Gensim Word2Vector trained on our data:

In [48]:
gensim_word_model = Word2Vec(gensim_word_sentences_train, size=100, window=5, min_count=1, workers=4)
gensim_POS_model = Word2Vec(gensim_POS_sentences_train, size=100, window=3, min_count=1, workers=4)

In [49]:
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.palettes import d3
from bokeh.io import output_notebook
output_notebook()

def emb_scatter(data, names, N=20, perplexity=30.0):
    """
    Function for plotting embeddings and words using TSNE.
    TSNE finds a way to plot multidimensional data to a 
    bidimensional plane. It assures that data close in the 
    multidimensionalspace will also be close in budimensional 
    place, but not the other way around.
    """
    ## try to find some clusters ##
    print("finding clusters")
    kmeans = KMeans(n_clusters=N)
    kmeans.fit(data)
    klabels = kmeans.labels_

    ## get a tsne fit ##
    print("fitting tsne")
    tsne = TSNE(n_components=2, perplexity=perplexity)
    emb_tsne = tsne.fit_transform(data)
    
    ## plot the tsne of the embeddings with bokeh ##
    # source: https://github.com/oxford-cs-deepnlp-2017/practical-1
    p = figure(tools="pan,wheel_zoom,reset,save",
               toolbar_location="above",
               title="T-SNE for most common words")

    # set colormap as a list
    colormap = d3['Category20'][N]
    colors = [colormap[i] for i in klabels]

    source = ColumnDataSource(data=dict(x1=emb_tsne[:,0],
                                        x2=emb_tsne[:,1],
                                        names=names,
                                        colors=colors))

    p.scatter(x="x1", y="x2", size=8, source=source, color='colors')

    labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                      text_font_size="8pt", text_color="#555555",
                      source=source, text_align='center')
    p.add_layout(labels)

    show(p)

In [50]:
top_words = [k for k,v in word_counts_train.most_common(1000)]
top_word_vecs = gensim_word_model[top_words]
emb_scatter(top_word_vecs, top_words, N=20)

finding clusters
fitting tsne


In [51]:
top_POS = [k for k,v in POS_counts_train.most_common(1000)]
top_POS_vecs = gensim_POS_model[top_POS]
emb_scatter(top_POS_vecs, top_POS, N=17)

finding clusters
fitting tsne


### b. Gensim Word2Vector trained on Wikipedia data:

In [52]:
# TODO: implement or delete this section.

### c. GloVe word embeddings from  spacy library(300d):

In [53]:
import spacy
glove = spacy.load('en')

spacy_word_model = {}
for word in word_vocabulary:
    spacy_word_model[word] = glove(word).vector

spacy_POS_model = {}
for POS_tag in pos_tag_vocabulary:
    spacy_POS_model[POS_tag] = glove(POS_tag).vector

In [54]:
most_common_words = word_counts_train.most_common(1000)
most_common_words_vectors = []
for word, count in most_common_words:
    most_common_words_vectors.append(spacy_word_model[word])

emb_scatter(most_common_words_vectors, [word for word, count in most_common_words], N=20)

finding clusters
fitting tsne


In [55]:
pos_vectors = []
for pos_tag in pos_tag_vocabulary:
    pos_vectors.append(spacy_POS_model[pos_tag])
emb_scatter(pos_vectors, list(pos_tag_vocabulary), N=17)

finding clusters
fitting tsne


### d. GloVe word embeddings from pre-trained word vectors(50d/100d):

In [58]:
# you need to download the file(s) from https://github.com/stanfordnlp/GloVe
file = open("GloveEmbeddings/glove.6B.50d.txt")
pre_trained_tokens = []
for line in file:
    pre_trained_tokens.append(line.split()[0])

# ..but, aprox 800 out of 8800 are not found in the 
unknown_words = []
for word in w2i.keys():
    if w2i[word] != 0 and word.lower() not in pre_trained_tokens:
        unknown_words.append((word,word_counts_train[word]))
unknown_words

[("80's", 2),
 ('Usamah', 2),
 ('212-902-3724', 2),
 ('212-428-1181', 2),
 ('hilary.ackermann@gs.com', 2),
 ('Ecogas', 3),
 ('Maffett', 2),
 ('IFERC', 3),
 ('Perlingiere', 5),
 ('Elbertson', 5),
 ('EB3326', 3),
 ('853-7906', 3),
 ('646-2600', 3),
 ('janette.elbertson@enron.com', 3),
 ('09/20/2000', 2),
 ('09/08/2000', 2),
 ('420,588', 2),
 ('!!!!!!', 4),
 ('*******************', 2),
 ('36349', 3),
 ('Haedicke', 2),
 ('02/28/2001', 2),
 ('blacklined', 3),
 ('EPMI', 2),
 ('.?', 6),
 ('GPSA', 3),
 ('Guaranty.doc', 4),
 ('CDWR', 2),
 ('Titman', 3),
 ('titman@mail.utexas.edu', 2),
 ('01/24/2001', 5),
 ('duffie@Stanford.EDU', 2),
 ('Arfsten', 2),
 ("id's", 2),
 ('01-Feb-02', 11),
 ('6,363,217', 2),
 ('1,993,045', 2),
 ('alt.animals', 2),
 ('alt.animals.cat', 3),
 ('..', 12),
 ('^^', 2),
 ('www.southbhamcats.org.uk', 2),
 ('n3td3v', 2),
 ('=-----', 2),
 ('732-657-3416', 3),
 ("it's", 4),
 ('Yahoo!', 2),
 ('(:', 2),
 ('hlep', 2),
 ('t2i', 2),
 ('ZebraKlub', 2),
 ('clownloach', 2),
 ('.!', 2),


### e. GloVe word embeddings trained from our corpus:

In [68]:
# Copy the "text8"  into the GloVe folder and run "./demo.sh". 
# Copy the "vector.txt" back to the notebook folder and run
# the second part of this cell.

# with open('text8', 'w+') as file:
#     for sentence in gensim_word_sentences_train:
#         for word in sentence:
#             if w2i[word] == 0:
#                 file.write("<UNK> ")
#             else:
#                 file.write(word + " ")
    

trained_glove_word_model = {}

with open('vectors.txt', 'r') as gloVe_corpus_file:
    for line in gloVe_corpus_file:
        l = line.split()
        trained_glove_word_model[l.pop(0)] = l
        
        
most_common_words = word_counts_train.most_common(1000)
trained_word_vectors = []
most_common_words = [ word for word, count in most_common_words]
for word in most_common_words:
    trained_word_vectors.append(trained_glove_word_model[i2w[w2i[word]]])
emb_scatter(trained_word_vectors, most_common_words, N=20)

finding clusters
fitting tsne
