#### The exercise below uses two off-the-shelf NLP models developed independently by Google and Facebook. Both models are trained to learn to read interest rate decisions by the Reserve Bank of Australia. Both models are then tested to see how they do on simple word association tasks i.e  has the algorithm learnt to 'read' the vocabulary of the Reserve Bank and has it grouped vocabulary into the correct contexts. 

#### Beyond the scope of this exercise is the use of the resulting  word embedding matrix for downstream Regression or Classification tasks.  As a side note TFIDF models are an alternate route for basic sentiment analysis tasks especially where sentiment is explicit in the document. Personal preference is to use variants of RNN's  in tailored NLP tasks and where possible keep the net as shallow as possible depending on the data and task.


#### Aug/2018

In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy.en import English
from spacy.lemmatizer import Lemmatizer
import gensim
from gensim.models import word2vec
from gensim.models.keyedvectors import KeyedVectors
from gensim.models import FastText 
import matplotlib.pyplot as plt
import sklearn.manifold
from __future__ import absolute_import, division, print_function
import math
from sklearn import metrics 

In [2]:
#Use SpaCy to parse data. Leaving stopwords in, just Lemmatizing. Not Stemming.
  


nlp = spacy.load('en')
parser = English() 


corpus_list_train = []
import codecs
with codecs.open('..RBA_2018.txt','r',encoding='utf8') as f:
     for line in f.readlines():
 
        article= []
         
        corpus_test  = nlp(line) 
        for w in corpus_test:
            if not w.is_punct:
                 article.append(w.lemma_)
 
        corpus_list_train.append(article)
 
 
X_train = corpus_list_train 

In [3]:
#Word2vec. Model designed by the Google Brain Team(Mikolov et al) and released around 2013/14. 
#Models developed in Python by the team at Gensim.
 
size = 200
window_size = 5  
epochs = 50
min_count = 0
workers = 4

 
model = word2vec.Word2Vec(X_train, alpha=0.015,sg=1,window=window_size,size=size,\
                          min_count=min_count,workers=workers,iter=epochs,
                          sample=0.01,batch_words=1,negative=5)

model.build_vocab(sentences=X_train,update=True)
model.train(sentences=X_train,epochs=50,total_examples=model.corpus_count)

269614

In [4]:
#Fasttext model. Mikolov et al. Mikolov moved to the Facebook AI team and model released
#a couple of years ago. Model moves beyond Word2Vec and breaks words into sub component 
#morphological parts which also strives to makes it easier to use on multiple languages.

model_ft = FastText(X_train,sg=0, hs=0, size=200, alpha=0.025, 
                     window=5, min_count=0, max_vocab_size=None, word_ngrams=1,
                     sample=0.001, seed=1, workers=4, min_alpha=0.0001, negative=5, 
                     cbow_mean=1, iter=5, null_word=0,
                     min_n=2, max_n=8, sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=300) 

model_ft.build_vocab(sentences=X_train,update=True)
model_ft.train(sentences=X_train,epochs=50,total_examples=model_ft.corpus_count)

In [5]:
#Word association using Word2Vec.
#Checking what the model believes are the ten most contextually
#similar words to the word 'leave'.
 
model.wv.most_similar(('leave'),topn=10)

[(u'cash', 0.9738770723342896),
 (u'decide', 0.9602638483047485),
 (u'today', 0.8722482919692993),
 (u'board', 0.8350348472595215),
 (u'1.50', 0.7566527724266052),
 (u'unchanged', 0.7442362308502197),
 (u'stance', 0.7228062152862549),
 (u'meeting', 0.7142150402069092),
 (u'hold', 0.7089430689811707),
 (u'which', 0.6934565305709839)]

In [6]:
#Word association using FastText.
#What does the model believe are the ten most contextually similar words/terms to '1.5'
#Current Cash rate set by the Reserve Bank is 1.5%.


model_ft.wv.most_similar(('1.50'),topn=10)

[(u'cash', 0.9998171925544739),
 (u'unchanged', 0.999815046787262),
 (u'board', 0.9998142123222351),
 (u'cent', 0.9998129606246948),
 (u'capacity', 0.999812126159668),
 (u'boost', 0.9998111128807068),
 (u'leave', 0.999811053276062),
 (u'bank', 0.9998108744621277),
 (u'number', 0.9998099207878113),
 (u'policy', 0.9998087882995605)]

In [7]:
#Word embedding matrix and unique Vocabulary index table from word2vec and fasttext models
#for downstream regression or classification algorithms.

print(model.vector_size, model.wv )
print(model_ft.vector_size, model_ft.wv )
w2v = dict(zip(model.wv.index2word, model.wv.syn0))
ft = dict(zip(model_ft.wv.index2word, model_ft.wv.syn0))

200 <gensim.models.keyedvectors.EuclideanKeyedVectors object at 0x118871250>
200 <gensim.models.wrappers.fasttext.FastTextKeyedVectors object at 0x12481a190>
