In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy import spatial

from sklearn.decomposition import PCA

from ast import literal_eval

import dateparser
from time import time
from tqdm import tqdm

In [None]:
%%time
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim import models, corpora

# Embed the words in a word-space using Word2Vec

## Naive approach
To get familiar with the technique and get a first glimpse of the possible outcomes, let's train a word2vec embedding on the whole dataset.
Note that, as configured below (no fixed seed, several worker threads), the model is not deterministic: training it twice will not give exactly the same results.

In [None]:
# Load the dataset into a pandas DataFrame; the 'Date' column is parsed
# with dateparser.parse.
# NOTE(review): the `date_parser` argument of read_csv is deprecated in recent
# pandas releases — confirm that the pandas version in use still accepts it.
articles_cor = pd.read_csv("./all_articles.csv", parse_dates=['Date'], date_parser=dateparser.parse)
# Display 5 random rows (no random_state is set, so the sample changes on every run)
articles_cor.sample(5)

In [None]:
# Turn the string-encoded token sequences read from the CSV back into Python objects.
# Values that are no longer strings (i.e. already parsed) are left untouched, so
# re-running this cell is a no-op instead of raising inside literal_eval.
articles_cor["tokens_cor"] = articles_cor["tokens_cor"].apply(
    lambda tokens: literal_eval(tokens) if isinstance(tokens, str) else tokens
)

In [None]:
%%time
model = Word2Vec(sentences=articles_cor.tokens_cor.values
                       , vector_size=100
                       , window=5
                       , min_count=1
                       , workers=4
                       , sg=1 #skipgram
                       , negative=5 #use of negative sampling
                      )

Now that the model is built, we can check which words are closest to "écologie" in this wordspace.

In [None]:
# 20 words most similar to 'écologie' in the full-corpus model, with their similarity scores
model.wv.most_similar('écologie', topn=20)

In [None]:
# Explore other terms to get some insight and to sanity-check the embedding
model.wv.most_similar('noé', topn=20) # :'(

In each of these lists, the first value of a pair is a word close to the query in the wordspace, and the second value is the cosine similarity between the two terms. The following code also lets us play with words to see how "close" or "far" two of them are in the built space.

In [None]:
def cosine_sim(word_vec1, word_vec2, wv=None):
    """Compute the cosine similarity between two words or vectors of the wordspace.

    Parameters
    ----------
    word_vec1, word_vec2 : str or array-like
        Either a word (converted to its vector through ``wv``) or the
        embedding vector itself.
    wv : mapping from word to vector, optional
        Object indexed with ``wv[word]`` to resolve string arguments, e.g.
        ``model_rise.wv``. When omitted, the notebook-level ``model.wv`` is
        used, which is the historical behaviour of this function.

    Returns
    -------
    float
        ``1 - scipy.spatial.distance.cosine(v1, v2)``.
    """
    def _to_vector(item):
        # Only strings need a lookup; the global `model` is touched lazily so
        # that vector-only calls work even before `model` has been trained.
        if isinstance(item, str):
            keyed_vectors = model.wv if wv is None else wv
            return keyed_vectors[item]
        return item

    return 1 - spatial.distance.cosine(_to_vector(word_vec1), _to_vector(word_vec2))

In [None]:
vector_écologie = model.wv['écologie']  # numpy vector of the word
# NOTE: despite its name, this variable holds the vector of 'noé'
vector_leitmotiv = model.wv['noé']
# cosine_sim returns a similarity (1 - cosine distance), so the output is labelled as such
print("cosine_sim(écologie, noé) = {}".format(cosine_sim(vector_écologie, vector_leitmotiv)))

## Build different models for different epochs 

To check whether the embeddings differ between time periods, let's split the data into three parts: prior to 1990, between 1990 and 2000, and after 2000.

In [None]:
# Year of every article, computed once (a missing date yields NaN, which fails
# every comparison below and therefore falls into no period).
article_years = [date.year for date in pd.to_datetime(articles_cor.Date)]

# Three contiguous periods. The bounds at 1990 and 2000 are inclusive on the
# "peak" side so that articles from those two years are no longer dropped.
mask_rise = [1970 < year < 1990 for year in article_years]
mask_peak = [1990 <= year <= 2000 for year in article_years]
mask_stable = [year > 2000 for year in article_years]

df_rise = articles_cor[mask_rise]
df_peak = articles_cor[mask_peak]
df_stable = articles_cor[mask_stable]

# Number of articles per period
print(len(df_rise), len(df_peak), len(df_stable))

In [None]:
%%time 

model_rise = Word2Vec(sentences=df_rise.tokens_cor.values
                 , vector_size=300
                 , window=5
                 , min_count=15
                 , workers=4
                 , sg=1 #skipgram
                 , negative=6 #use of negative sampling
                )

model_peak = Word2Vec(sentences=df_peak.tokens_cor.values
                 , vector_size=300
                 , window=5
                 , min_count=15
                 , workers=4
                 , sg=1 #skipgram
                 , negative=6 #use of negative sampling
                )

model_stable = Word2Vec(sentences=df_stable.tokens_cor.values
                 , vector_size=300
                 , window=5
                 , min_count=15
                 , workers=4
                 , sg=1 #skipgram
                 , negative=6 #use of negative sampling
                )

In [None]:
# Persist each period model to disk under its own file name
for period_model, model_path in (
    (model_rise, 'model_rise_cor_300'),
    (model_stable, 'model_stable_cor_300'),
    (model_peak, 'model_peak_cor_300'),
):
    period_model.save(model_path)

In [None]:
# 20 words most similar to 'écologie' in the model trained on df_rise
model_rise.wv.most_similar('écologie', topn=20)

In [None]:
# 20 words most similar to 'écologie' in the model trained on df_peak
model_peak.wv.most_similar('écologie', topn=20)

In [None]:
# 20 words most similar to 'écologie' in the model trained on df_stable
model_stable.wv.most_similar('écologie', topn=20)

In [None]:
# Embedding of each word of interest in each period model
vector_écologie_rise = model_rise.wv['écologie']
vector_écologie_peak = model_peak.wv['écologie']
vector_écologie_stable = model_stable.wv['écologie']

vector_science_rise = model_rise.wv['science']
vector_politique_rise = model_rise.wv['politique']

vector_science_peak = model_peak.wv['science']
vector_politique_peak = model_peak.wv['politique']

vector_science_stable = model_stable.wv['science']
vector_politique_stable = model_stable.wv['politique']

# One row per period: (label, écologie, science, politique).
# The first label is 1970-1990 because that is the year range selected by mask_rise.
period_vectors = [
    ("1970-1990", vector_écologie_rise, vector_science_rise, vector_politique_rise),
    ("1990-2000", vector_écologie_peak, vector_science_peak, vector_politique_peak),
    ("2000-...", vector_écologie_stable, vector_science_stable, vector_politique_stable),
]

# The printed values are cosine similarities (not distances), within each period's own model
for period_label, vec_écologie, vec_science, vec_politique in period_vectors:
    print(
        "{0}: cosine_sim(écologie, science)= {1:.3f} | cosine_sim(écologie, politique)= {2:.3f}".format(
            period_label,
            cosine_sim(vec_écologie, vec_science),
            cosine_sim(vec_écologie, vec_politique),
        )
    )