A notebook to evaluate different kinds of embeddings

In [17]:
import pandas as pd
import helpers
import io
import numpy as np
from gensim.models import Word2Vec
from gensim.models import FastText
from scipy import spatial

In [9]:
print("Load data...")
DATA_PATH = "./data/telegraph_articles_de.csv"
SPACY_LANGUAGE = "de_core_news_sm"
data_de = pd.read_csv(DATA_PATH, index_col="Unnamed: 0") 
print("Create corpus for model...")
raw_docs, _ = helpers.create_corpus(data_de, SPACY_LANGUAGE)
print("Done !")

Load data...
Create corpus for model...
Runtime: 0.06 seconds || Completed: 1 of 12420
Runtime: 82.01 seconds || Completed: 501 of 12420
Runtime: 151.57 seconds || Completed: 1001 of 12420
Runtime: 215.81 seconds || Completed: 1501 of 12420
Runtime: 264.82 seconds || Completed: 2001 of 12420
Runtime: 384.04 seconds || Completed: 2501 of 12420
Runtime: 482.14 seconds || Completed: 3001 of 12420
Runtime: 558.92 seconds || Completed: 3501 of 12420
Runtime: 647.27 seconds || Completed: 4001 of 12420
Runtime: 715.03 seconds || Completed: 4501 of 12420
Runtime: 788.01 seconds || Completed: 5001 of 12420
Runtime: 849.41 seconds || Completed: 5501 of 12420
Runtime: 918.71 seconds || Completed: 6001 of 12420
Runtime: 1011.83 seconds || Completed: 6501 of 12420
Runtime: 1098.10 seconds || Completed: 7001 of 12420
Runtime: 1170.24 seconds || Completed: 7501 of 12420
Runtime: 1252.58 seconds || Completed: 8001 of 12420
Runtime: 1312.53 seconds || Completed: 8501 of 12420
Runtime: 1371.87 seconds |

### 1. Different word embeddings to test

In [18]:
# Word 2 Vec skipgram
w2v_sg_model = Word2Vec(sentences=raw_docs,
                       vector_size=100
                       , window=5
                       , min_count=1
                       , workers=4
                       , sg=1 #skipgram
                       , negative=5
                       , seed = 0)

In [19]:
# Word 2 Vec cbow
w2v_cbow_model = Word2Vec(sentences=raw_docs,
                       vector_size=100
                       , window=5
                       , min_count=1
                       , workers=4
                       , sg=0 #cbow
                       , negative=5
                       , seed = 0) #use of negative sampling

In [21]:
# Fasttext
ftt_model = FastText(vector_size=100, 
                     window=5, 
                     min_count=1, 
                     sentences=raw_docs, 
                     epochs=20)

### 2. The topics to test

In [30]:
topics_df = pd.read_csv("../data/topics_df.csv")
test_topics = topics_df[(topics_df["country"]=="de") & (topics_df["keyword"]=="telegraph")]["topic"]
to_list = lambda l : l.replace("[", "").replace("]", "").replace("\'", "").split(", ")
test_topics = [to_list(row) for row in list(test_topics)]

### 3. The score measure

In [32]:
# Compactness score of one word
# https://aclanthology.org/W16-2508.pdf
# Calculating by averaging all the pairwise semantic similarities of the words in the topic without this word
def compactness(w, topic, model):
    pairwise_sim = [1 - spatial.distance.cosine(model.wv[w],model.wv[w_bis]) for w_bis in topic if not w == w_bis]
    return sum(pairwise_sim)/(len(pairwise_sim))

In [37]:
# Compute average compactness score for model
def average_compactness(model, topics_list):
    compactnesses = [compactness(w, t, model) for t in topics_list for w in t ]
    return sum(compactnesses)/len(compactnesses)

In [70]:
# Outlier Position Percentage, same paper
def mOPP(model, topics_list):
    num = 0
    denom = 0
    for i, t in enumerate(topics_list):
        c = [compactness(w, t, model) for w in t]
        mean_for_group = sum(c)/len(c)
        for j, t_bis in enumerate(topics_list):
            if not i == j:
                for w in t_bis:
                    denom += 1
                    if compactness(w, t, model) > mean_for_group: # good
                        num += 1
    return num/denom

In [71]:
def custom_score(model, topics_list):
    # compactness of word in its topic / in the other topics, for every word, mean
    # the smaller the better (in its topic must be low, in other topics must be high)
    ratios = []
    for i, t in enumerate(topics_list):
        for w in t:
            cw = compactness(w, t, model)
            for j, t_bis in enumerate(topics_list):
                if not i == j:
                    cw_bis = compactness(w, t_bis, model)
                    ratios = [cw/cw_bis] if not len(ratios) else ratios + [cw_bis]
    return sum(ratios)/len(ratios)

### 4. Model Comparison

In [72]:
print("Average Compactness")
print("  Word2Vec CBOW model : ", average_compactness(w2v_cbow_model, test_topics))
print("  Word2Vec Skipgram model : ", average_compactness(w2v_sg_model, test_topics))
print("  Fasttext model : ", average_compactness(ftt_model, test_topics))
print("OPP")
print("  Word2Vec CBOW model : ", mOPP(w2v_cbow_model, test_topics))
print("  Word2Vec Skipgram model : ", mOPP(w2v_sg_model, test_topics))
print("  Fasttext model : ", mOPP(ftt_model, test_topics))
print("Custom")
print("  Word2Vec CBOW model : ", custom_score(w2v_cbow_model, test_topics))
print("  Word2Vec Skipgram model : ", custom_score(w2v_sg_model, test_topics))
print("  Fasttext model : ", custom_score(ftt_model, test_topics))

Average Compactness
  Word2Vec CBOW model :  0.7200669950644879
  Word2Vec Skipgram model :  0.5144439304068062
  Fasttext model :  0.2043688678386321
OPP
  Word2Vec CBOW model :  0.13917867764813543
  Word2Vec Skipgram model :  0.13070235644017386
  Fasttext model :  0.10911690688629604
Custom
  Word2Vec CBOW model :  0.5461314424042428
  Word2Vec Skipgram model :  0.3528988816537987
  Fasttext model :  0.06661156495706871
