In [1]:
import datetime
# Capture the wall-clock time once, when this cell runs; `now` is formatted
# and printed further down so the rendered notebook shows its run date.
now = datetime.datetime.now()

# Embeddings

## Summer School JGU Mainz — Advanced Methods in Behavioral Economics, 2021

### Carina I. Hausladen

In [2]:
# Show the run date (ISO-style year-month-day) of the timestamp captured above.
print(f"{now:%Y-%m-%d}")

2021-09-25


# Introduction

* Tom Lin, 2019: [Blog](https://towardsdatascience.com/nlp-performance-of-different-word-embeddings-on-text-classification-de648c6262b), [github](https://github.com/TomLin/Playground/blob/master/04-Model-Comparison-Word2vec-Doc2vec-TfIdfWeighted.ipynb)
* [pretrained embeddings](https://deepset.ai/german-word-embeddings)
    * The deepset files need to be manually reformatted, according to the [gensim documentation](https://radimrehurek.com/gensim/scripts/glove2word2vec.html).
    * **Please make sure that you have downloaded the pretrained embeddings (.magnitude files) according to the README.md and placed them in the data folder.**

- In the following, we will learn about various embedding methods.
- We will use logistic regression in order to compare classification performance across embeddings.

In [3]:
import multiprocessing
import warnings
warnings.simplefilter("ignore")

import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from imblearn.over_sampling import RandomOverSampler
from nltk import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

try:
    from pymagnitude import Magnitude
except ModuleNotFoundError:  # workaround for weird bug
    from pymagnitude import Magnitude

from utils.UtilWordEmbedding import DocModel, MeanEmbeddingVectorizer, TfidfEmbeddingVectorizer
from utils.setup import prepare_docs
from utils.strt_grp_sffl_splt import str_grp_splt
from utils.utility import run_log_reg

In [4]:
# Load the simulated chat data and let the project helper derive the prepared
# frame (`df_prep`) and the document container (`all_docs`).
df = pd.read_csv('data/chat_hours_simulated.csv')
df_prep, all_docs = prepare_docs(
    df,
    y="honestmean",
    X="Chat_subject",
    dv="player.hours_stated",
)

# Replace empty documents with the placeholder token "kein_chat"
# (German: "no chat"), presumably so that later vectorizers and embedding
# lookups never receive an empty document -- TODO confirm.
all_docs.new_docs = [
    doc if len(doc) != 0 else "kein_chat"
    for doc in all_docs.new_docs
]

In [5]:
# Random oversampler reused by every model cell below: it resamples the
# minority class of the *training* split; the seed is fixed for repeatability.
ros = RandomOverSampler(random_state=42, sampling_strategy='minority')

# Class balance of the target. This is deliberately the LAST expression of the
# cell: Jupyter only displays the value of a cell's final expression, so when
# this line came first its result was computed and silently discarded.
df["honestmean"].value_counts()  # 1: honest responses

# Bag of Words

In [6]:
# Bag-of-words baseline: raw term counts per document. Lowercasing is disabled
# and the preprocessor is the identity, so documents are counted as provided.
bow = CountVectorizer(input='content', lowercase=False, preprocessor=lambda x: x)
X_bow = bow.fit_transform(all_docs.new_docs)

# Split row indices into train/test with the project helper. Going by its name
# and arguments this is presumably a stratified, group-wise split on "group"
# -- TODO confirm against utils.strt_grp_sffl_splt.
train_idx, test_idx = str_grp_splt(df_prep,
                                   grp_col_name="group",
                                   y_col_name="honestmean",
                                   train_share=0.8)
train_X = X_bow[train_idx]
test_X = X_bow[test_idx]
train_y = df_prep["honestmean"][train_idx]
test_y = df_prep["honestmean"][test_idx]

# Oversample the training split only; the test split keeps its original balance.
train_X, train_y = ros.fit_resample(train_X, train_y)

# Report the class balance explicitly. As bare expressions in the middle of a
# cell these value_counts() calls were evaluated and thrown away, so the check
# they were meant to provide was never visible.
print("train class counts (after oversampling):", train_y.value_counts().to_dict())
print("test class counts:", test_y.value_counts().to_dict())

# run_log_reg returns three items here; the first (m_bow) holds the metrics
# that are tabulated at the end of the notebook.
m_bow, model, ma = run_log_reg(train_X, test_X, train_y, test_y)

F1: 0.802 | Pr: 0.687 | Re: 0.987 | AUC: 0.557 | Accuracy: 0.679 



# Tf–idf Term Weighting

In [7]:
# TF-IDF baseline: same vectorizer settings as the bag-of-words cell
# (no lowercasing, identity preprocessor), but with tf-idf weighting.
tfidf = TfidfVectorizer(input='content', lowercase=False, preprocessor=lambda x: x)
X_tfidf = tfidf.fit_transform(all_docs.new_docs)

# Train/test row indices from the project split helper (80 % train share).
split_opts = dict(grp_col_name="group", y_col_name="honestmean", train_share=0.8)
train_idx, test_idx = str_grp_splt(df_prep, **split_opts)

y_all = df_prep["honestmean"]
train_X, test_X = X_tfidf[train_idx], X_tfidf[test_idx]
train_y, test_y = y_all[train_idx], y_all[test_idx]

# Only the training split is oversampled.
train_X, train_y = ros.fit_resample(train_X, train_y)

m_tfidf = run_log_reg(train_X, test_X, train_y, test_y)


F1: 0.797 | Pr: 0.673 | Re: 1.000 | AUC: 0.561 | Accuracy: 0.664 



# Word2Vec
[Mikolov et al. 2013](https://arxiv.org/abs/1301.3781), [Vatsal](https://towardsdatascience.com/word2vec-explained-49c52b4ccb71) <br>

* Intuition
    * Word2Vec groups vectors of similar words. 
    * It estimates a word's meaning based on its occurrences in the text. 
    * These estimates yield word associations with other words in the corpus.
    * For example: 
    ```
    King    -    Man    +    Woman    =    Queen
    [5,3]   -    [2,1]  +    [3, 2]   =    [6,4]  
    ```

* How it works
    * Each word is transformed into a numerical vector representation.
    * This vector is learned via a neural network. 
    * The vectors try to capture various characteristics of that word, e.g. semantic relationship, context, etc. 

        ```
        this = [1, 0, 0, 0, 0, 0, ... 0]
        is    = [0, 1, 0, 0, 0, 0, ... 0]
        fun = [0, 0, 1, 0, 0, 0, ... 0]
        ```

* Implementation
    * We implement Word2Vec via gensim's [Word2vec class](https://radimrehurek.com/gensim/models/word2vec.html).
    * Variations
        * We can either train our own embeddings or use pretrained embeddings.
        * Both types can be implemented via bow or tf-idf.

## own, bow

In [8]:
# Train Word2Vec on this corpus: 70-dimensional vectors, and min_count=1 keeps
# every word, however rare.
# NOTE(review): no seed is passed, so the learned vectors (and the scores
# below) may vary between runs -- confirm whether that is intended.
w2v_own = Word2Vec(all_docs.doc_words, vector_size=70, min_count=1)

# Document vector = mean of its word embeddings
# (averages all word embeddings occurring in the model).
mean_vec_tr = MeanEmbeddingVectorizer(w2v_own)
doc_vec = mean_vec_tr.transform(all_docs.doc_words)

# Train/test row indices from the project split helper (80 % train share).
split_opts = dict(grp_col_name="group", y_col_name="honestmean", train_share=0.8)
train_idx, test_idx = str_grp_splt(df_prep, **split_opts)

y_all = df_prep["honestmean"]
train_X, test_X = doc_vec[train_idx], doc_vec[test_idx]
train_y, test_y = y_all[train_idx], y_all[test_idx]

# Only the training split is oversampled.
train_X, train_y = ros.fit_resample(train_X, train_y)

m_w2v_own_smpl = run_log_reg(train_X, test_X, train_y, test_y)




F1: 0.805 | Pr: 0.685 | Re: 1.000 | AUC: 0.525 | Accuracy: 0.681 



## own, tf-idf


In [9]:
# Same self-trained Word2Vec model as above, but the project vectorizer is
# fitted on the corpus first (presumably to learn tf-idf weights for the
# weighted average -- TODO confirm in utils.UtilWordEmbedding).
tfidf_vec_tr = TfidfEmbeddingVectorizer(w2v_own)
tfidf_vec_tr.fit(all_docs.doc_words)
tfidf_doc_vec = tfidf_vec_tr.transform(all_docs.doc_words)

# Train/test row indices from the project split helper (80 % train share).
split_opts = dict(grp_col_name="group", y_col_name="honestmean", train_share=0.8)
train_idx, test_idx = str_grp_splt(df_prep, **split_opts)

y_all = df_prep["honestmean"]
train_X, test_X = tfidf_doc_vec[train_idx], tfidf_doc_vec[test_idx]
train_y, test_y = y_all[train_idx], y_all[test_idx]

# Only the training split is oversampled.
train_X, train_y = ros.fit_resample(train_X, train_y)

m_w2v_own_tfidf = run_log_reg(train_X, test_X, train_y, test_y)




F1: 0.795 | Pr: 0.670 | Re: 1.000 | AUC: 0.458 | Accuracy: 0.660 



## pre, bow

In [10]:
# The pretrained vectors must be converted to the .magnitude format first,
# e.g. from a shell:
# python -m pymagnitude.converter -i 'analysis/data/vectors_w2v.txt' -o 'analysis/data/w2v.magnitude'
# NOTE(review): the example command above writes 'w2v.magnitude', whereas the
# file opened below is 'data/w2v_vec.magnitude' -- confirm which name/path the
# README prescribes.
w2v = Magnitude('data/w2v_vec.magnitude')

def avg_embdngs(documents, embedings, num_trials=10):
    """Embed each document as the unweighted mean of its token vectors.

    Parameters
    ----------
    documents : iterable of str
        Raw documents; each one is tokenised with ``word_tokenize``.
    embedings : object
        Embedding store whose ``query(list_of_tokens)`` returns one vector per
        token (e.g. a ``Magnitude`` instance).
    num_trials : int, optional
        Unused; kept only so existing calls remain valid.

    Returns
    -------
    numpy.ndarray
        One row per input document, in input order.

    Notes
    -----
    A document whose lookup fails is reported on stdout and represented by an
    all-zero vector. Previously such documents were dropped, which shifted all
    following rows and broke the positional alignment with the label column
    and the train/test indices. If no document at all can be embedded the
    vector width is unknown and the result has shape ``(len(documents), 0)``.
    """
    vectors = []
    failed_rows = []
    for row, title in enumerate(tqdm(documents)):
        try:
            emb = np.average(embedings.query(word_tokenize(title)), axis=0)
        except Exception as err:  # not a bare except: Ctrl-C must still interrupt
            print(f"Failed ({type(err).__name__}: {err})")
            print(title)
            failed_rows.append(row)
            emb = None  # placeholder, filled in once the vector width is known
        vectors.append(emb)

    if failed_rows:
        # Borrow shape and dtype from any successfully embedded document.
        template = next((vec for vec in vectors if vec is not None), None)
        filler = np.zeros(0) if template is None else np.zeros_like(template)
        for row in failed_rows:
            vectors[row] = filler

    return np.array(vectors)


# Make sure no document is empty before tokenising: empty entries become the
# placeholder token "kein_chat".
all_docs.new_docs = [
    doc if len(doc) != 0 else "kein_chat"
    for doc in all_docs.new_docs
]

# Mean of the pretrained word2vec vectors per document.
X_w2v_pre = avg_embdngs(all_docs.new_docs, w2v)

# Train/test row indices from the project split helper (80 % train share).
split_opts = dict(grp_col_name="group", y_col_name="honestmean", train_share=0.8)
train_idx, test_idx = str_grp_splt(df_prep, **split_opts)

y_all = df_prep["honestmean"]
train_X, test_X = X_w2v_pre[train_idx], X_w2v_pre[test_idx]
train_y, test_y = y_all[train_idx], y_all[test_idx]

# Only the training split is oversampled.
train_X, train_y = ros.fit_resample(train_X, train_y)

m_w2v_pre_smpl = run_log_reg(train_X, test_X, train_y, test_y)


100%|████████████████████████████████████████████████████████████████████████████████████████████████| 351/351 [00:19<00:00, 18.27it/s]


F1: 0.803 | Pr: 0.685 | Re: 0.992 | AUC: 0.531 | Accuracy: 0.679 



## pre, idf


In [11]:
# Fit a tf-idf model on the corpus solely to obtain per-term idf weights.
tfidf = TfidfVectorizer(input='content', lowercase=False, preprocessor=lambda x: x)
tfidf.fit(all_docs.new_docs)

# term -> idf weight. Built from `vocabulary_` (term -> column index) rather
# than `get_feature_names()`, which was deprecated in scikit-learn 1.0 and
# removed in 1.2; `vocabulary_` and `idf_` exist in every version, and the
# resulting mapping is the same.
idf_dict = {term: tfidf.idf_[col] for term, col in tfidf.vocabulary_.items()}


def tfidf_embdngs(documents, embedings, idf_weights=None):
    """Embed each document as the idf-weighted mean of its token vectors.

    Parameters
    ----------
    documents : iterable of str
        Raw documents; each one is tokenised with ``word_tokenize``.
    embedings : object
        Embedding store whose ``query(list_of_tokens)`` returns one vector per
        token (e.g. a ``Magnitude`` instance).
    idf_weights : mapping of str to float, optional
        Token -> idf weight. When omitted, the module-level ``idf_dict`` is
        used, looked up at call time so that a re-fitted ``idf_dict`` from a
        later cell is picked up. Tokens missing from the mapping get weight 1.

    Returns
    -------
    numpy.ndarray
        One row per input document, in input order.
    """
    idf_lookup = idf_dict if idf_weights is None else idf_weights
    vectors = []
    for title in tqdm(documents):
        # Tokenise once and reuse the tokens for both the vector lookup and
        # the weights, so the two sequences are guaranteed to line up.
        tokens = word_tokenize(title)
        w2v_vectors = embedings.query(tokens)
        weights = [idf_lookup.get(word, 1) for word in tokens]
        vectors.append(np.average(w2v_vectors, axis=0, weights=weights))
    return np.array(vectors)


# idf-weighted mean of the pretrained word2vec vectors per document.
X_tfidf_w2v_pre = tfidf_embdngs(all_docs.new_docs, w2v)

# Train/test row indices from the project split helper (80 % train share).
split_opts = dict(grp_col_name="group", y_col_name="honestmean", train_share=0.8)
train_idx, test_idx = str_grp_splt(df_prep, **split_opts)

y_all = df_prep["honestmean"]
train_X, test_X = X_tfidf_w2v_pre[train_idx], X_tfidf_w2v_pre[test_idx]
train_y, test_y = y_all[train_idx], y_all[test_idx]

# Only the training split is oversampled.
train_X, train_y = ros.fit_resample(train_X, train_y)

m_w2v_pre_tfidf = run_log_reg(train_X, test_X, train_y, test_y)


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 351/351 [00:00<00:00, 2013.13it/s]


F1: 0.808 | Pr: 0.691 | Re: 0.995 | AUC: 0.529 | Accuracy: 0.688 



# Global Vectors (GloVe)
[nlp Stanford](https://nlp.stanford.edu/projects/glove/),
[Pennington et al. (2014)](https://aclanthology.org/D14-1162.pdf),
[Thushan Ganegedara](https://towardsdatascience.com/light-on-math-ml-intuitive-guide-to-understanding-glove-embeddings-b13b4f19c010) <br>

- GloVe is an unsupervised learning algorithm for obtaining vector representations for words. 
- Training is performed on aggregated global word-word co-occurrence statistics.
- The resulting representations showcase interesting linear substructures of the word vector space.

- Example
    - Co-occurrence probabilities for the target words ice and steam.
    - In the ratio, noise from non-discriminative words like water and fashion cancels out.
    - Large values (> 1) correlate well with properties specific to ice.
    - Small values (< 1) correlate well with properties specific to steam.

| Probability and Ratio | k = solid | k = gas | k = water | k = fashion |
| --------------------- | --------- | ------- | --------- | ----------- | 
| P(k\|ice)| $1.9 \times 10^{-4}$ | $6.6 \times 10^{-5}$ | $3.0 \times 10^{-3}$ | $1.7 \times 10^{-5}$ |
| P(k\|steam)| $2.2 \times 10^{-5}$ | $7.8 \times 10^{-4}$ | $2.2 \times 10^{-3}$ | $1.8 \times 10^{-5}$ |
| P(k\|ice)/P(k\|steam)| $8.9$ | $8.5 \times 10^{-2}$ | $1.36$ | $0.96$ |

- Word2Vec vs. GloVe
    - Word2Vec 
        - Relies only on local information of language. 
        - Local: the semantics learnt for a given word is only affected by the surrounding words.
    - GloVe
       - Incorporates global and local statistics (word co-occurrence) to obtain word vectors. 

## glove, pre

In [12]:
# Open the pretrained GloVe vectors (.magnitude file in the data folder) and
# embed every document as the mean of its token vectors.
glove = Magnitude('data/glove_vec.magnitude')
X_glove_pre = avg_embdngs(all_docs.new_docs, glove)

# Train/test row indices from the project split helper (80 % train share).
split_opts = dict(grp_col_name="group", y_col_name="honestmean", train_share=0.8)
train_idx, test_idx = str_grp_splt(df_prep, **split_opts)

y_all = df_prep["honestmean"]
train_X, test_X = X_glove_pre[train_idx], X_glove_pre[test_idx]
train_y, test_y = y_all[train_idx], y_all[test_idx]

# Only the training split is oversampled.
train_X, train_y = ros.fit_resample(train_X, train_y)

m_glove_pre_smpl = run_log_reg(train_X, test_X, train_y, test_y)


100%|████████████████████████████████████████████████████████████████████████████████████████████████| 351/351 [00:12<00:00, 28.22it/s]


F1: 0.803 | Pr: 0.685 | Re: 0.994 | AUC: 0.520 | Accuracy: 0.679 



## pre, idf


In [13]:
# Re-fit the tf-idf model so this cell does not depend on an earlier cell
# having populated `idf_dict`.
tfidf = TfidfVectorizer(input='content', lowercase=False, preprocessor=lambda x: x)
tfidf.fit(all_docs.new_docs)

# term -> idf weight. Built from `vocabulary_` (term -> column index) rather
# than `get_feature_names()`, which was deprecated in scikit-learn 1.0 and
# removed in 1.2; `vocabulary_` and `idf_` exist in every version, and the
# resulting mapping is the same.
idf_dict = {term: tfidf.idf_[col] for term, col in tfidf.vocabulary_.items()}
# idf-weighted mean of the pretrained GloVe vectors per document.
X_tfidf_glove_pre = tfidf_embdngs(all_docs.new_docs, glove)

# Train/test row indices from the project split helper (80 % train share).
split_opts = dict(grp_col_name="group", y_col_name="honestmean", train_share=0.8)
train_idx, test_idx = str_grp_splt(df_prep, **split_opts)

y_all = df_prep["honestmean"]
train_X, test_X = X_tfidf_glove_pre[train_idx], X_tfidf_glove_pre[test_idx]
train_y, test_y = y_all[train_idx], y_all[test_idx]

# Only the training split is oversampled.
train_X, train_y = ros.fit_resample(train_X, train_y)

m_glove_pre_tfidf = run_log_reg(train_X, test_X, train_y, test_y)


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 351/351 [00:00<00:00, 2099.07it/s]


F1: 0.802 | Pr: 0.684 | Re: 0.994 | AUC: 0.554 | Accuracy: 0.677 



# fastText
[Joulin et al. (2016)](https://arxiv.org/abs/1607.01759),
[fasttext](https://fasttext.cc),
[Nishan Subedi](https://towardsdatascience.com/fasttext-under-the-hood-11efc57b2b3)<br>

* Strengths:
    * FastText achieves high performance for word representations and sentence classification.
    * It performs especially well on rare words by making use of character-level information.



* Bag of character n-grams

    * Each word is represented as a bag of character n-grams in addition to the word itself. 
     * For example: word $matter$; $n = 3$; fastText representation:  `<ma, mat, att, tte, ter, er>`. 
     * `<` and `>` are added as boundary symbols to distinguish the ngram of a word from a word itself.

* How it works
    * The model is a bag of words model. 
    * Aside from the sliding window of n-gram selection, no internal structure of a word is taken into account.
    * As long as the characters fall under the window, the order of the character n-grams does not matter. 
    * During the model update, fastText learns weights for each of the n-grams as well as the entire word token.

Load pretrained embeddings
- We implemented fastText using logistic regression and sentence embeddings.
- The fasttext API requires C++ Build Tools (Windows) or xtools (macOS).
- These are quite big to install. Therefore, we load pretrained embeddings that were generated in another script.

In [14]:
# Load the fastText artefact that was produced by a separate script.
# NOTE(review): despite the file name ("..._embeddings.npy"), this array is
# later inserted as one row of the results table next to the metric tuples of
# the other models, so it presumably holds the five fastText scores
# (f1, pr, re, AUC, acc) rather than embedding vectors -- TODO confirm.
m_ft_own = np.load("data/fasttext_embeddings.npy")

# Doc2Vec
[Le and Mikolov (2014)](http://proceedings.mlr.press/v32/le14.html), 
[Edward Ma](https://towardsdatascience.com/understand-how-to-transfer-your-paragraph-to-vector-by-doc2vec-1e225ccf102) <br>

* Intuition
    * Instead of averaging word embeddings over the sequence, we can train paragraph vectors directly. 
    * We use document embeddings and distributed memory (PV-DM) instead of distributed bag of words. 
    * Randomly samples adjacent words from a paragraph. 
    * Predicts a center word from the sampled set by taking the context words and a paragraph id as input.

* How it works
    * Doc2Vec is based on Word2Vec: It learns the document representation in an unsupervised manner.
    * Input and output 
        * The length of the input text (i.e. the number of words) per document can vary.
        * The output is a vector of fixed length.
    * Paragraph and word vectors
        * Each paragraph vector is unique to its document. 
        * Word vectors are shared among all documents.
    * Training: Word vectors are trained; the paragraph vectors are thrown away afterwards. 
    * Prediction: The paragraph vector will be initialized randomly and computed by word vectors.

## PV-DM
Distributed Memory Model of Paragraph Vectors (PV-DM)
* Both paragraph vectors and word vectors are initialized randomly. 
* Every paragraph vector is assigned to a single document while word vectors are shared among all documents. 
* Paragraph and word vectors are combined by either averaging or concatenating them.
* Both are passed to stochastic gradient descent.
* The gradient is obtained via back propagation.

In [15]:
# Settings handed to the project's DocModel wrapper (presumably forwarded to
# gensim's Doc2Vec -- TODO confirm in utils.UtilWordEmbedding).
# alpha and min_alpha are equal, i.e. no learning-rate range is configured.
workers = multiprocessing.cpu_count()
dm_args = dict(
    dm=1, dm_mean=1, vector_size=100, window=5, negative=5, hs=0, min_count=2,
    sample=0, workers=workers, alpha=0.025, min_alpha=0.025, epochs=100,
    comment='alpha=0.025',
)
dm = DocModel(docs=all_docs.tagdocs, **dm_args)
dm.custom_train()

# Collect the learned document vectors by integer position into a frame
# (one row per document).
dm_doc_vec_ls = [dm.model.dv[i] for i in range(len(dm.model.dv))]
dm_doc_vec = pd.DataFrame(dm_doc_vec_ls)

# Train/test row indices from the project split helper (80 % train share).
split_opts = dict(grp_col_name="group", y_col_name="honestmean", train_share=0.8)
train_idx, test_idx = str_grp_splt(df_prep, **split_opts)

y_all = df_prep["honestmean"]
train_X, test_X = dm_doc_vec.loc[train_idx], dm_doc_vec.loc[test_idx]
train_y, test_y = y_all[train_idx], y_all[test_idx]

# Only the training split is oversampled.
train_X, train_y = ros.fit_resample(train_X, train_y)

m_dm_doc_vec = run_log_reg(train_X, test_X, train_y, test_y)

F1: 0.808 | Pr: 0.695 | Re: 0.989 | AUC: 0.532 | Accuracy: 0.690 



# Plot Results


In [16]:
# Row label -> metric sequence for every model, in the order they were fitted.
# m_bow was unpacked from run_log_reg's return value, whereas the other m_*
# variables still hold the full return value, hence the [0]; m_ft_own is the
# array loaded from disk.
metric_rows = {
    'bow': m_bow,
    'tfidf': m_tfidf[0],
    'w2v (own, smpl)': m_w2v_own_smpl[0],
    'w2v (own, tfidf)': m_w2v_own_tfidf[0],
    'w2v (pre, smpl)': m_w2v_pre_smpl[0],
    'w2v (pre, tfidf)': m_w2v_pre_tfidf[0],
    'glove (pre, smpl)': m_glove_pre_smpl[0],
    'glove (pre, tfidf)': m_glove_pre_tfidf[0],
    'ft (own, smpl)': m_ft_own,
    'dm_doc_vec': m_dm_doc_vec[0],
}
lst = [list(scores) for scores in metric_rows.values()]

# One row per model, labelled directly at construction, best F1 first.
df_results = pd.DataFrame(lst,
                          index=list(metric_rows),
                          columns=['f1', 'pr', 're', 'AUC', 'acc'],
                          dtype=float)
df_results = df_results.sort_values(by='f1', ascending=False)
df_results

Unnamed: 0,f1,pr,re,AUC,acc
dm_doc_vec,0.808347,0.695347,0.988889,0.531874,0.690426
"w2v (pre, tfidf)",0.80811,0.691282,0.995238,0.529391,0.688298
"w2v (own, smpl)",0.805195,0.684783,1.0,0.525294,0.680851
"glove (pre, smpl)",0.803108,0.684908,0.993651,0.519867,0.678723
"w2v (pre, smpl)",0.802874,0.685397,0.992063,0.531234,0.678723
bow,0.80212,0.686807,0.987302,0.55658,0.678723
"glove (pre, tfidf)",0.802088,0.683522,0.993651,0.553815,0.676596
tfidf,0.796923,0.673095,1.0,0.560778,0.66383
"w2v (own, tfidf)",0.794872,0.670213,1.0,0.457655,0.659574
"ft (own, smpl)",0.769231,0.650794,0.97619,0.678571,0.675676
