In [2]:
import nltk
from nltk.corpus import reuters
from nltk import ConditionalFreqDist, FreqDist
from itertools import groupby, chain
from operator import itemgetter
import pandas as pd
import numpy as np
import re

# Natural Language Processing - Word representations

## Words, what are they?

<img src="../img/representation.png" width=50%/>

| **Figure 1**: Word representations|
|:-----------------:|

### Reuters news corpus

In [2]:
print("Number of documents: {}".format(len(reuters.fileids())))
print("Number of words: {}".format(len(set([word.lower() 
                                            for doc in reuters.fileids() 
                                            for word in reuters.words(doc)]))))
print("Training and test size split")
pd.DataFrame({'ids': map(lambda s: s.split("/")[0], reuters.fileids())}).groupby(by='ids').size()

Number of documents: 10788
Number of words: 31078
Training and test size split


ids
test        3019
training    7769
dtype: int64

In [3]:
print("With different news categories: {}".format(len(reuters.categories())))
reuters.categories()[:10]

With different news categories: 90


['acq',
 'alum',
 'barley',
 'bop',
 'carcass',
 'castor-oil',
 'cocoa',
 'coconut',
 'coconut-oil',
 'coffee']

#### Categories are overlapping 

In [8]:
sorted([(cat, len(reuters.fileids(cat))) for cat in reuters.categories()],
       key=itemgetter(1))[:10]

[('castor-oil', 2),
 ('groundnut-oil', 2),
 ('lin-oil', 2),
 ('rye', 2),
 ('sun-meal', 2),
 ('copra-cake', 3),
 ('cotton-oil', 3),
 ('dfl', 3),
 ('nkr', 3),
 ('palladium', 3)]

In [14]:
reuters.raw('test/14832')

"THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n  Thailand's trade deficit widened to 4.5\n  billion baht in the first quarter of 1987 from 2.1 billion a\n  year ago, the Business Economics Department said.\n      It said Janunary/March imports rose to 65.1 billion baht\n  from 58.7 billion. Thailand's improved business climate this\n  year resulted in a 27 pct increase in imports of raw materials\n  and semi-finished products.\n      The country's oil import bill, however, fell 23 pct in the\n  first quarter due to lower oil prices.\n      The department said first quarter exports expanded to 60.6\n  billion baht from 56.6 billion.\n      Export growth was smaller than expected due to lower\n  earnings from many key commodities including rice whose\n  earnings declined 18 pct, maize 66 pct, sugar 45 pct, tin 26\n  pct and canned pineapples seven pct.\n      Products registering high export growth were jewellery up\n  64 pct, clothing 57 pct and rubber 35 pct.\n  \n\n"

#### Create a data structure that maps fileids to their respective categories

In [6]:
sort_ids_cat = sorted([(fileid, cat) 
                       for cat in reuters.categories()
                       for fileid in reuters.fileids(cat)], 
                      key=itemgetter(0))

class_map = dict((k, list(map(itemgetter(1), v))) 
                 for k, v in groupby(sort_ids_cat, 
                                     key=itemgetter(0)))
list(class_map.items())[:10]

[('test/14826', ['trade']),
 ('test/14828', ['grain']),
 ('test/14829', ['crude', 'nat-gas']),
 ('test/14832', ['corn', 'grain', 'rice', 'rubber', 'sugar', 'tin', 'trade']),
 ('test/14833', ['palm-oil', 'veg-oil']),
 ('test/14839', ['ship']),
 ('test/14840', ['coffee', 'lumber', 'palm-oil', 'rubber', 'veg-oil']),
 ('test/14841', ['grain', 'wheat']),
 ('test/14842', ['gold']),
 ('test/14843', ['acq'])]

---
## Bag of Words representation (BoW)

In [7]:
max_train_docs = 500
max_test_docs = 100
train_doc_ids = [doc_id for doc_id in reuters.fileids() 
                 if doc_id.startswith("training")][:max_train_docs]
#vocab_dist = FreqDist(word.lower()
#                      for doc_id in train_doc_ids
#                      for word in reuters.words(doc_id))
#prog = re.compile('[a-z]+')
#most_freq_words = [tpl[0] for tpl in vocab_dist.most_common() if prog.match(tpl[0])]

def bow_transform(doc_ids):
    cfd = ConditionalFreqDist((doc_id, word.lower())
                              for doc_id in doc_ids
                              for word in reuters.words(doc_id))
    return pd.DataFrame(cfd).fillna(0).T

bow = bow_transform(train_doc_ids)
bow

Unnamed: 0,bahia,cocoa,review,showers,continued,throughout,the,week,in,zone,...,sticks,hostel,zincor,kms,johannesburg,quelled,minutes,police,arrived,fears
training/1,5.0,7.0,2.0,1.0,1.0,1.0,18.0,2.0,6.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
training/10,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
training/100,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
training/1000,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
training/10000,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
training/1077,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,2.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
training/10770,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
training/10771,0.0,0.0,0.0,0.0,0.0,0.0,55.0,1.0,10.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
training/10773,0.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Model creation and validation

* How do we assign a class label to a document?
   - Mean vector
   - Supervised classification algorithm


* How do we meassure the quality of our model?
   - For every class
   - For a set of classes
   

* What do we do with words we have never seen before?


---
## Term Frequency - Inverse Document Frequency (TF-IDF)

\\[\quad\\]

* Intuition:
> A high word frequency is not necessarily relevant (see last session). But if a word has a high frequency in one document relative to its frequency in all the other documents, we can assume that this word has a high relevance for this document.

\\[\quad\\]

* Term frequency:
> The frequency of a token normalized by the length of the document.
\\[
\text{tf}(t, d) = \frac{f_{t, d}}{N_{d}},
\\]
where \\(t\\) is a token in the document \\(d\\) and \\(N_{d}\\) is the number of tokens in the document.

\\[\quad\\]

* Inverse document frequency:
> A measure of how much information a word carries in the corpus.
\\[
\text{idf}(t, D) = \log \left(\frac{N_D}{|\{d \in D : t \in d\}|}\right),
\\]
where \\(N_D\\) is the number of documents in the corpus and \\(|\{d \in D : t \in d\}|\\) is the number of documents in which the token \\(t\\) occures.

\\[\quad\\]

* Term frequency - inverse document frequency:
> \\[
\text{tf-idf}(t, d, D) = \text{tf}(t, d) \cdot \text{idf}(t, D)
\\]
It is the BoW token frequency weighted by the occurence of the token in the corpus. As the frequency of the token across all documents increases the idf approaches 0, which controlls the influence of the tf.

\\[\quad\\]

#### Term frequency

In [9]:
def tf_fn(doc):
    N = doc.sum()
    return doc.apply(lambda t: t/N)

bow.apply(tf_fn, axis=1)

Unnamed: 0,bahia,cocoa,review,showers,continued,throughout,the,week,in,zone,...,sticks,hostel,zincor,kms,johannesburg,quelled,minutes,police,arrived,fears
training/1,0.007899,0.011058,0.00316,0.00158,0.00158,0.00158,0.028436,0.003160,0.009479,0.00158,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
training/10,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.057915,0.000000,0.000000,0.00000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
training/100,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.016807,0.000000,0.058824,0.00000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
training/1000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.051613,0.000000,0.000000,0.00000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
training/10000,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.060870,0.000000,0.008696,0.00000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
training/1077,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.055556,0.000000,0.015873,0.00000,...,0.007937,0.007937,0.007937,0.007937,0.007937,0.007937,0.007937,0.007937,0.007937,0.000000
training/10770,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.067308,0.000000,0.019231,0.00000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.009615
training/10771,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.064103,0.001166,0.011655,0.00000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
training/10773,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.066038,0.000000,0.018868,0.00000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


#### Inverse document frequency

In [10]:
N_D = bow.shape[0] # Number of documents

def idf_fn(term_col):
    return np.log( N_D / sum(1 if t > 0 else 0 for t in term_col))

bow.apply(idf_fn)

bahia        6.214608
cocoa        3.506558
review       4.828314
showers      5.521461
continued    3.729701
               ...   
quelled      6.214608
minutes      6.214608
police       6.214608
arrived      6.214608
fears        6.214608
Length: 7071, dtype: float64

#### tf-idf

In [11]:
idf = bow.apply(idf_fn)

def tf_idf_fn(df):
    tf = df.apply(tf_fn, axis=1)
    return tf.apply(lambda d: d * idf.T, axis=1)

tf_idf_fn(bow)

Unnamed: 0,bahia,cocoa,review,showers,continued,throughout,the,week,in,zone,...,sticks,hostel,zincor,kms,johannesburg,quelled,minutes,police,arrived,fears
training/1,0.049089,0.038777,0.015255,0.008723,0.005892,0.008723,0.010305,0.007091,0.004054,0.007275,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
training/10,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.020989,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
training/100,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.006091,0.000000,0.025159,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
training/1000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.018705,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
training/10000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.022059,0.000000,0.003719,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
training/1077,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.020134,0.000000,0.006789,0.000000,...,0.049322,0.049322,0.049322,0.049322,0.049322,0.049322,0.049322,0.049322,0.049322,0.000000
training/10770,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.024393,0.000000,0.008225,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.059756
training/10771,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.023231,0.002616,0.004985,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
training/10773,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.023932,0.000000,0.008070,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


---
## Word embedding - word2vec

### What is a word embedding?

We want to represent words as vectors in a way, that the semantic relationships between words is encoded into their vector representation. 

\\[\quad\\]

<img src="../img/vec_space.png" width=40%/>

| **Figure 2**: Vector representation|
|:-----------------:|

\\[\quad\\]

The most famous example of word embeddings was published by [Mikolov et.al 2013-A]
> "[...] This allows vector-oriented reasoning based on the offsets between words. For example, the male/female relationship is automatically learned, and with the induced vector representations, “King - Man + Woman” results in a vector very close to “Queen.”"

[Mikolov et.al 2013-A] [Linguistic Regularities in Continuous Space Word Representations - Tomas Mikolov, Wen-tau Yih, Geoffrey Zweig](https://www.aclweb.org/anthology/N13-1090)

### The skip-gram model [Mikolov et.al 2013-B]

The skip-gram model is a simple artificial neural network model with one hidden layer.
The idea is that we train the network to do a predictive task, but we are not actually interessted in the network and its capabilties itself.
We are interessted in the learnt weights of the network.
These weights are the word embeddings we are looking for.

The predictive task the network has to do is the following.
Given a word in a sentence predict the likelihood for its surrounding words.
E.g. given the word 'united' the likelihood that the next word is 'states' is higher than the likelihood that the next word is 'potatoes'.

\\[\quad\\]

<img src="../img/training_data.png" width=40%/>

| **Figure 3**: [McCormick 2016]|
|:-----------------:|

\\[\quad\\]

Again, we have to think about the representation of words.
In this case we use a so called one-hot encoding, i.e. the dimensionality of the input vector is equal to the vocabulary size.
The input vector is all zeros except for a single one representing the word.

\\[\quad\\]

<img src="../img/skip_gram_net_arch.png" width=40%/>

| **Figure 4**: [McCormick 2016]|
|:-----------------:|

\\[\quad\\]

The hidden layer uses no activation function, but the output layer uses the softmax function.

\\[\quad\\]

\\[
f(x) = \frac{e^{x^Tw}}{\sum_{k=1}^K e^{x^Tw_k}}
\\]

\\[\quad\\]

The one-hot encoded word vector basically selects a single row from the hidden layer weight matrix.

<img src="../img/matrix_mult_w_one_hot.png" width=40%/>

| **Figure 5**: [McCormick 2016]|
|:-----------------:|

This single row is fed into the output softmax layer together with the output weight vector.

<img src="../img/output_weights_function.png" width=40%/>

| **Figure 6**: [McCormick 2016]|
|:-----------------:|

This model does not know anything about the order of words.
For the model ('New', 'York') is the same as ('York', 'New')

### Subsampling Frequent Words

The word 'the' appears in the context of every word.
Therefore it contains little information (see last session).
Subsampling remove frequent words as training examples and as context words.

### Negative Sampling

When training a neural network with back propagation usually all the weight of the network are being updated.
Our corpus likely has a vocabulary of more than 100k and possibly contains serveral million words, training becomes infeasible.
But the training examples have a particular structure, namely they are one-hot encoded.
Negative sampling exploits this structure and only updates the weights of the word which has the one and X additional words that are zero.

[McCormick 2016] [Word2Vec Tutorial - The Skip-Gram Model - Chris McCormick](http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/)

[Mikolov et.al 2013-B] [Distributed Representations of Words and Phrasesand their Compositionality - Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, Jeffrey Dean](https://papers.nips.cc/paper/5021-distributed-representations-of-words-and-phrases-and-their-compositionality.pdf)

In [3]:
text = [['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.'],
        ['He', 'laid', 'down', 'on', 'the', 'grass', '.']]

corpus = [[word.lower() for word in sent] for sent in text]
vocabulary = list(set(chain(*corpus)))

win_size = 2

def create_pairs_from_sent(sent, win_size):
    rlt = []
    for n in range(len(sent)):
        context = sent[:n][-win_size:] + sent[n+1:][:win_size]
        rlt.append(zip([sent[n]]*len(context), context))
    return list(chain(*rlt))

def one_hot_builder(vocabulary):
    vocab_size = len(vocabulary)
    def one_hot_encoder(word):
        rlt = np.zeros(vocab_size)
        rlt[vocabulary.index(word)] = 1
        return rlt
    def one_hot_decoder(word):
        return vocabulary[np.argwhere(np.equal(word, 1))[0][0]]
    return one_hot_encoder, one_hot_decoder

one_hot_encoder, one_hot_decoder = one_hot_builder(vocabulary)

train_pairs = list(chain(*map(lambda s: create_pairs_from_sent(s, win_size), 
                              corpus)))

train_set = list(map(lambda tpl: (one_hot_encoder(tpl[0]), 
                                  one_hot_encoder(tpl[1])), 
                     train_pairs))
train_set[:3]

[(array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]),
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.])),
 (array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]),
  array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.])),
 (array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]),
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]))]

In [1]:
# Continued in Sesssion 3

# Introductory Material

https://www.kaggle.com/learn/overview