In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from os import listdir
from scipy import spatial

from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer


from collections import Counter

import dateparser
from time import time
from tqdm import tqdm

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import string

from spellchecker import SpellChecker

from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim import models, corpora



# Initialisation

## Load the data
First, let's load the data into a DataFrame, remove duplicates if there are any and drop rows with articles not recognized as string.

In [3]:
%%time 
# check how long does it take to run the cell

DATA_PATH = "../Data/"

articles = pd.DataFrame()
# read the csvs file by file
for f in listdir(DATA_PATH):
    curr_df = pd.read_csv(DATA_PATH+f
                          , usecols = ['Article Title', 'Journal', 'Date', 'Url', 'Text']
                          , parse_dates=['Date']
                          , date_parser=dateparser.parse
                         )
    # add the word used to retrieve the article
    curr_df['word'] = f.split('.')[0].split('_')[1]
    articles = articles.append(curr_df)
    print(f, ' : ', len(curr_df)
          , ' | # NaNs : ', np.count_nonzero(curr_df.isnull().values.ravel()))

articles_écologiquement.csv  :  815  | # NaNs :  3
articles_écologismes.csv  :  2  | # NaNs :  0
articles_écologistes.csv  :  11000  | # NaNs :  22
articles_écologie.csv  :  2810  | # NaNs :  5
articles_écologies.csv  :  8  | # NaNs :  0
articles_écologisme.csv  :  43  | # NaNs :  0
articles_écologiste.csv  :  7100  | # NaNs :  7
CPU times: user 38 s, sys: 2.42 s, total: 40.4 s
Wall time: 40.4 s


In [4]:
# check that all articles are different 
if not articles.Url.is_unique:
    print("Removing duplicates")
    articles.drop_duplicates(subset="Url"
                             , keep='first'
                             , inplace=True
                             , ignore_index=False
                            )
else:
    print("No duplicates in the DF.")

No duplicates in the DF.


In [5]:
string_mask = [type(text)==str for text in articles["Text"]]
print("There are {0} articles not recognized as strings."\
      .format(len(string_mask)-np.sum(string_mask)))
# remove articles that aren't string
articles = articles[string_mask]

There are 18 articles not recognized as strings.


In [6]:
articles.reset_index(inplace=True)
articles.sample(3)

Unnamed: 0,index,Article Title,Journal,Date,Url,Text,word
9727,8920,pnr.iR RI nr.n,La Liberté,2005-11-14,https://www.e-newspaperarchives.ch/?a=d&d=LLE2...,"pnr . iR RI nr . n dats de gauche , les deux...",écologistes
8027,7220,Sept organisations critiquent le manque ...,La Liberté,1994-02-19,https://www.e-newspaperarchives.ch/?a=d&d=LLE1...,Sept organisations critiquent le manque de vo...,écologistes
10003,9196,Puisse Mitterrand sauvegarder la liberté...,La Gazette,1981-05-28,https://www.e-newspaperarchives.ch/?a=d&d=GDM1...,Puisse Mitterrand sauvegarder la liberté ! U...,écologistes


## Data concierge

Now let's do some basic textual data cleaning by removing:
- stopwords from nltk's french list, augmented manually
- punctuation
- numerical values
- isolated letters

To do so, we need to tokenize the textual strings first.

In [7]:
STOP_FR = stopwords.words("french")
STOP_FR += ["comme", "tout", "aussi", "sans", "si", "selon"]
STOP_FR += ["de", "du", "des"] # added bc of tfidf on epochs

PUNCTUATION = [punc for punc in string.punctuation]
PUNCTUATION += ["›", "__", "’", "‘", "...", "„", "¦"]

In [8]:
def tokenize_list_articles(list_articles):
    """ tokenize iterable object and remove some pre-defined values """
    tokenized_articles = [word_tokenize(article) for article in list_articles]
    tokenized_articles = [[token.lower() for token in article 
                           if not (token.lower() in STOP_FR+PUNCTUATION
                           or token.isnumeric()
                           or len(token) < 2)
                          ]
                         for article in tokenized_articles]
    return tokenized_articles

In [9]:
%%time 
texts_articles = list(articles["Text"].values)
tokenized_articles = tokenize_list_articles(texts_articles)
# add a column in the DF:
articles["tokens"] = tokenized_articles

CPU times: user 1min 31s, sys: 697 ms, total: 1min 32s
Wall time: 1min 32s


We can then quickly check the proportion of tokens collected that exist in the french vocabulary thanks to the `spellchecker` package.

In [10]:
# create a 1D list containing all the tokes
all_tokens = [token 
              for article in tokenized_articles
              for token in article
             ]
print("Number of tokens in the dataset: {0:.3g}.".format(len(all_tokens)))

Number of tokens in the dataset: 6.17e+06.


In [11]:
french_checker = SpellChecker(language='fr')  # use the french Dictionary

misspelled_tokens = french_checker.unknown(all_tokens)
print("{0:.3g}% of the tokens are considered misspelled.".format(100*len(misspelled_tokens)/len(all_tokens)))

3.58% of the tokens are considered misspelled.


We can get an example of such misspelled words in a random article:

In [12]:
rand_index = np.random.randint(0, len(tokenized_articles))

rand_tokens = tokenized_articles[rand_index]

misspelled_rand = french_checker.unknown(rand_tokens)

print("In article {0}; {1} out of {2} words are considered incorrect. Those are: {3}"\
      .format(rand_index, len(misspelled_rand), len(rand_tokens), misspelled_rand))

In article 9203; 19 out of 324 words are considered incorrect. Those are: {'arth-goldau', 'quenr', 'raccordements', 'lfa', 'lotschberg', 'week-end', 'lallégement', 'romande', 'lécologie', 'votations', 'letat', 'lugano', 'gothard', 'nlfa', 'lintérieur', 'sz', 'sextupleront', 'redimensionné', 'neri'}


# Embed the words in a word-space using Word2Vec

## Naive approach
To familiarize with the techniques and get a first glimpse on the possible outcomes, let's perform the word2vec embedding on the whole dataset.
Note that, as defined below, the model is not deterministic, meaning that running it twice won't provide the same results.

In [13]:
%%time
model = Word2Vec(sentences=tokenized_articles
                       , vector_size=100
                       , window=5
                       , min_count=1
                       , workers=4
                       , sg=1 #skipgram
                       , negative=5 #use of negative sampling
                      )

CPU times: user 4min 49s, sys: 482 ms, total: 4min 49s
Wall time: 1min 16s


Now that the model is built, we can check what words are the closest to "écologie" in this wordspace. 

In [14]:
model.wv.most_similar('écologie', topn=20)

[('économie', 0.7515649795532227),
 ('ergonomie', 0.7334681153297424),
 ('rimer', 0.7291607856750488),
 ('ecologie', 0.7246688604354858),
 ('lécologie', 0.7239876985549927),
 ('réconcilier', 0.7174688577651978),
 ('larchitecture', 0.7072470188140869),
 ('spiritualité', 0.706907331943512),
 ('conjugue', 0.6987759470939636),
 ('féminisme', 0.6977463960647583),
 ('concilier', 0.695932924747467),
 ('appliquée', 0.6914093494415283),
 ('conflictuel', 0.6908039450645447),
 ('indissociables', 0.6903030276298523),
 ('consumérisme', 0.6873682737350464),
 ('concilie', 0.6866443753242493),
 ('antagonisme', 0.6827805638313293),
 ('equilibre', 0.6824730634689331),
 ('authenticité', 0.6817585229873657),
 ('réconcilie', 0.6809834837913513)]

In [15]:
# explore through different terms to get some insights and check that it makes sense
model.wv.most_similar('noé', topn=20) # :'(

[('embarqué', 0.8589066863059998),
 ('saoudienne', 0.8546707034111023),
 ('pereira', 0.8409197330474854),
 ('nemo', 0.8409045934677124),
 ('niger', 0.8390101194381714),
 ('pâtisserie', 0.8367184996604919),
 ('auschwitz', 0.8365958333015442),
 ('parrain', 0.8351282477378845),
 ('princesse', 0.8347744941711426),
 ('bete', 0.8336206078529358),
 ('ressuscite', 0.8324520587921143),
 ('arche', 0.8318435549736023),
 ('mickey', 0.8315181732177734),
 ('danseuse', 0.831042468547821),
 ('navy', 0.8307561874389648),
 ('nomade', 0.8305745124816895),
 ('traînée', 0.8302186131477356),
 ('bidonville', 0.8297882080078125),
 ('navigateur', 0.8297860622406006),
 ('constantinople', 0.8295069336891174)]

It can be verified that in these lists of words, the first value is a close word in the wordspace and the second value is the cosine similarity between those two terms. The following code allows also to play with the words to see how "close" or "far" two different words are in the built space.

In [16]:
def cosine_sim(word_vec1, word_vec2):
    """ Compute the cosine similarity between two vectors in the wordspace """
    # if the string is provided, convert into vector thanks to the model
    if type(word_vec1)==str:
        word_vec1 = model.wv[word_vec1]
    if type(word_vec2)==str:
        word_vec2 = model.wv[word_vec2]
        
    return 1 - spatial.distance.cosine(word_vec1, word_vec2)

In [17]:
vector_écologie = model.wv['écologie']  # get numpy vector of a word
vector_leitmotiv = model.wv['leitmotiv']
print("cosine_dist(écologie, leitmotiv) = {}".format(cosine_sim(vector_écologie, vector_leitmotiv)))

cosine_dist(écologie, leitmotiv) = 0.667470395565033


## Add minimum count

Still naive, but a bit less this time, let's build a model considering only words appearing a minimum of $N$ times.

In [18]:
%%time
N_min = 5

model_min5 = Word2Vec(sentences=tokenized_articles
                 , vector_size=100
                 , window=5
                 , min_count=N_min #minimum number of occurences of a word
                 , workers=4
                 , sg=1 #skipgram
                 , negative=5 #use of negative sampling
                )

CPU times: user 3min 44s, sys: 1.12 s, total: 3min 45s
Wall time: 58.3 s


In [19]:
model_min5.wv.most_similar('écologie', topn=20)

[('économie', 0.7370924353599548),
 ('ecologie', 0.6811469793319702),
 ('ergonomie', 0.6783067584037781),
 ('rimer', 0.6582304239273071),
 ('réconcilier', 0.6573576927185059),
 ('spiritualité', 0.6463772654533386),
 ('conjugue', 0.6397854089736938),
 ('larchitecture', 0.6364557147026062),
 ('lécologie', 0.6356692910194397),
 ('florissante', 0.6310573816299438),
 ('dharmonie', 0.6273236274719238),
 ("l'écologie", 0.6226067543029785),
 ('conflictuel', 0.6217504143714905),
 ('logie', 0.62142413854599),
 ('concilier', 0.6203720569610596),
 ('equilibre', 0.6182593107223511),
 ('notions', 0.6142987608909607),
 ('indissociables', 0.6090261340141296),
 ('authenticité', 0.6062590479850769),
 ('economie', 0.6061153411865234)]

In [20]:
model.wv.most_similar('suisse', topn=20)

[('suissi', 0.7969109416007996),
 ('helvétique', 0.7586506009101868),
 ('romande', 0.7375638484954834),
 ('suis-', 0.72492915391922),
 ('ofel', 0.7181031703948975),
 ('usj', 0.711347758769989),
 ('suisses', 0.7095322012901306),
 ('asb', 0.7086243033409119),
 ('ucap', 0.703929603099823),
 ('lassociatior', 0.7009690999984741),
 ('feps', 0.6998518705368042),
 ('bos', 0.6981223821640015),
 ('cassure', 0.6974745392799377),
 ('radiodiffusion', 0.6974092721939087),
 ('horlogerie', 0.6956583857536316),
 ('sit', 0.6931045055389404),
 ('lasso', 0.6919788122177124),
 ('encourageait', 0.6905348896980286),
 ('rohner', 0.6900718808174133),
 ('horlogère', 0.6896616816520691)]

## Build different models for different epochs 

To check if we can see some differences between the different time periods, let's split the data in 3 parts: prior to 1990, between 1990 and 2000 and after 2000.

In [21]:
mask_rise = [(date.year < 1990 and date.year > 1970) for date in pd.to_datetime(articles.Date)]
mask_peak = [(date.year > 1990 and date.year < 2000) for date in pd.to_datetime(articles.Date)]
mask_stable = [(date.year > 2000) for date in pd.to_datetime(articles.Date)]

df_rise = articles[mask_rise]
df_peak = articles[mask_peak]
df_stable = articles[mask_stable]

print(len(df_rise), len(df_peak), len(df_stable))

6860 6947 6417


In [22]:
%%time 

model_rise = Word2Vec(sentences=df_rise.tokens.values
                 , vector_size=100
                 , window=5
                 , min_count=1
                 , workers=4
                 , sg=1 #skipgram
                 , negative=5 #use of negative sampling
                )

model_peak = Word2Vec(sentences=df_peak.tokens.values
                 , vector_size=100
                 , window=5
                 , min_count=1
                 , workers=4
                 , sg=1 #skipgram
                 , negative=5 #use of negative sampling
                )

model_stable = Word2Vec(sentences=df_stable.tokens.values
                 , vector_size=100
                 , window=5
                 , min_count=1
                 , workers=4
                 , sg=1 #skipgram
                 , negative=5 #use of negative sampling
                )

CPU times: user 4min 14s, sys: 795 ms, total: 4min 15s
Wall time: 1min 9s


In [23]:
model_rise.wv.most_similar('écologie', topn=20)

[('economie', 0.7366136908531189),
 ('ecologie', 0.7022604942321777),
 ('lécologie', 0.7017691135406494),
 ('féminisme', 0.6722659468650818),
 ('économie', 0.6662540435791016),
 ('science', 0.6643683314323425),
 ('sexuelle', 0.6618438959121704),
 ('biologie', 0.6524724364280701),
 ('solidarité', 0.6521098017692566),
 ('psychologie', 0.6428419947624207),
 ('éco', 0.6400136947631836),
 ('technocratie', 0.639086127281189),
 ('support', 0.6363763213157654),
 ('piliers', 0.6354470252990723),
 ('antagonisme', 0.6345431208610535),
 ('notions', 0.6318321228027344),
 ('léthique', 0.6314420104026794),
 ('réconcilier', 0.6301258206367493),
 ('éducation', 0.629490077495575),
 ('abordés', 0.628057062625885)]

In [24]:
model_peak.wv.most_similar('écologie', topn=20)

[('économie', 0.7324155569076538),
 ('authenticité', 0.6863291263580322),
 ('culture', 0.6704477667808533),
 ('concilier', 0.6586199998855591),
 ('modernité', 0.645703136920929),
 ('economie', 0.6381278038024902),
 ('equilibre', 0.6337856650352478),
 ('ecologie', 0.6258373260498047),
 ('lécologie', 0.6194453239440918),
 ('adéquation', 0.6145240068435669),
 ('comprise', 0.6121807098388672),
 ('rimer', 0.6091871857643127),
 ('conjuguer', 0.6075676083564758),
 ('sobriété', 0.6007258892059326),
 ('aérodynamisme', 0.6000484228134155),
 ('tendues', 0.597001314163208),
 ('réconcilier', 0.5963481068611145),
 ('parlez', 0.5935744047164917),
 ('et/ou', 0.5887662768363953),
 ('didées', 0.5880550742149353)]

In [25]:
model_stable.wv.most_similar('écologie', topn=20)

[('économie', 0.7390050888061523),
 ('architecture', 0.6903532147407532),
 ('ergonomie', 0.6667472720146179),
 ('durabilité', 0.664372980594635),
 ('spiritualité', 0.6639426946640015),
 ('science', 0.6580581068992615),
 ('antagonistes', 0.6546207070350647),
 ('lunité', 0.6513896584510803),
 ('réconcilier', 0.6509630680084229),
 ('concilier', 0.648421585559845),
 ('ecologie', 0.6478885412216187),
 ('logie', 0.6476572155952454),
 ('décologie', 0.6454635262489319),
 ('évolution', 0.6450504660606384),
 ('alliant', 0.6390377879142761),
 ('concilie', 0.6353011131286621),
 ('mêle', 0.6349608302116394),
 ('psychologie', 0.6344118714332581),
 ('allie', 0.6329975128173828),
 ('conflictuel', 0.6321721076965332)]

In [26]:
vector_écologie_rise = model_rise.wv['écologie']
vector_écologie_peak = model_peak.wv['écologie']
vector_écologie_stable = model_stable.wv['écologie']

vector_science_rise = model_rise.wv['science']
vector_politique_rise = model_rise.wv['politique']

vector_science_peak = model_peak.wv['science']
vector_politique_peak = model_peak.wv['politique']

vector_science_stable = model_stable.wv['science']
vector_politique_stable = model_stable.wv['politique']

print("1970-1980: cosinedist(écologie, science)= {0:.3f} | cosinedist(écologie, politique)= {1:.3f}"\
     .format(cosine_sim(vector_écologie_rise, vector_science_rise)
             , cosine_sim(vector_écologie_rise, vector_politique_rise)
            )
     )

print("1990-2000: cosinedist(écologie, science)= {0:.3f} | cosinedist(écologie, politique)= {1:.3f}"\
     .format(cosine_sim(vector_écologie_peak, vector_science_peak)
             , cosine_sim(vector_écologie_peak, vector_politique_peak)
            )
     )

print("2000-...: cosinedist(écologie, science)= {0:.3f} | cosinedist(écologie, politique)= {1:.3f}"\
     .format(cosine_sim(vector_écologie_stable, vector_science_stable)
             , cosine_sim(vector_écologie_stable, vector_politique_stable)
            )
     )

1970-1980: cosinedist(écologie, science)= 0.664 | cosinedist(écologie, politique)= 0.416
1990-2000: cosinedist(écologie, science)= 0.539 | cosinedist(écologie, politique)= 0.348
2000-...: cosinedist(écologie, science)= 0.658 | cosinedist(écologie, politique)= 0.283


# LDA

Find topics on the whole dataset.

In [27]:
dictionary_LDA = corpora.Dictionary(tokenized_articles)
dictionary_LDA.filter_extremes(no_below=N_min)

corpus = [dictionary_LDA.doc2bow(article) for article in tokenized_articles]

In [28]:
num_topics = 6
%time lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                  id2word=dictionary_LDA, \
                                  passes=4, alpha=[0.01]*num_topics, \
                                  eta=[0.01]*len(dictionary_LDA.keys()))

CPU times: user 54.1 s, sys: 8.5 s, total: 1min 2s
Wall time: 52.8 s


In [29]:
for i,topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=10):
    print(str(i)+": "+ topic)
    print()

0: 0.013*"suisse" + 0.010*"conseil" + 0.008*"fédéral" + 0.006*"contre" + 0.005*"loi" + 0.005*"non" + 0.004*"politique" + 0.004*"national" + 0.004*"être" + 0.004*"initiative"

1: 0.008*"hier" + 0.006*"contre" + 0.005*"deux" + 0.005*"suisse" + 0.005*"ats" + 0.004*"écologiste" + 0.004*"après" + 0.004*"ans" + 0.004*"pays" + 0.004*"gouvernement"

2: 0.008*"projet" + 0.005*"valais" + 0.005*"wwf" + 0.005*"canton" + 0.004*"protection" + 0.004*"commune" + 0.004*"deux" + 0.004*"francs" + 0.003*"entre" + 0.003*"fait"

3: 0.018*"parti" + 0.012*"conseil" + 0.009*"gauche" + 0.009*"deux" + 0.007*"socialiste" + 0.007*"verts" + 0.006*"partis" + 0.006*"droite" + 0.006*"pdc" + 0.006*"voix"

4: 0.004*"suisse" + 0.004*"être" + 0.003*"environnement" + 0.003*"moins" + 0.003*"production" + 0.003*"eau" + 0.003*"déchets" + 0.003*"produits" + 0.003*"encore" + 0.003*"pays"

5: 0.006*"bien" + 0.004*"fait" + 0.004*"monde" + 0.004*"faire" + 0.004*"tous" + 0.004*"où" + 0.003*"vie" + 0.003*"ans" + 0.003*"très" + 0.003

# TF-IDF

In [30]:
vectorizer = TfidfVectorizer()
X_TFIDF = vectorizer.fit_transform(articles["Text"].values)

X_TFIDF.shape #len(articles = 21760)

(21760, 244146)

In [31]:
print(X_TFIDF.argmax(axis=0))
print(X_TFIDF.argmax(axis=1))

[[20452  1266 18915 ... 14494 14091  5564]]
[[ 38049]
 [147481]
 [121550]
 ...
 [174336]
 [ 54724]
 [ 54724]]


In [32]:
vectorizer.get_feature_names()[38049]

'chauffer'

In [33]:
articles["Text"].values[0]

' Nous tenons à votre disposition non seulement le système de chauffage LOW NOx le plus respectueux de lenvironnement , mais encore notre brochure gratuite Chauffer  écologiquement  . Cet opuscule vous informera en détail au sujet de lOrdonnance sur la protection de l air 92 ( OPair 92 ) . Il suffit d envoyer le coupon ci-dessous . Sic Nom , prénom : Ruen : ^^^^ NPA / Lieu : ^^^ ¦ ^ m ^ m Prièred adressercecouponàELCOSystèmesd energieSA , Chauffer ^ ¦¦ B _____________^___ B ^¦____ F  écologiquement  , Maison Rouge 28 , 3960 Sierre . chauffage écolologique '

In [34]:
vectorizer.get_feature_names()[0]

'00'

In [35]:
articles["Text"].values[20452]

' ¦ EU  RADIO CHABLAIS  M 23 M 1 ESSSSM 8 . 00 Journal canadien 14752769 8 . 45 6 . 45 Télétubbies 44313721 7 . 15 NulSilence ça pousse 62836382 9 . 05 Zig le part ailleurs 37084856 8 . 30 GoutZag café 24096634 10 . 15 Charmants tes deau sur pierres brûlantes voisins 68471837 12 . 05 100 % Ques- 43262585 10 . 00 Quand les éléphants tions 22601914 13 . 05 Mise au point meurent 95155634 10 . 55 La Taule 85808634 14 . 30 Charmants voisins 59939214 12 . 40 Nulle part ailleurs 24528127 16 . 00 Le Journal 39246276 80706382 13 . 45 Pur et dur 22925382 16 . 30 Mediterraneo 17092566 17 . 05 15 . 20 La fidélité 64073160 18 . 00 Pyramide 79381769 17 . 30 Questions Dieu , le diable et Bob 67121818 pour un champion 17096382 18 . 15 18 . 25 Nulle part ailleurs cinéma Charmants voisins 72627740 20 . 00 94512011 19 . 00 Nulle part ailleurs Journal suisse 89378011 21 . 05 Le 21474585 20 . 35 Mickey les yeux point 97324479 22 . 00 Le journal bleus 48432450 22 . 15 Lérotisme vu 31889924 22 . 15 Cible émo

In [36]:
text_rise = ' '.join(df_rise.Text.values)
text_peak = ' '.join(df_peak.Text.values)
text_stable = ' '.join(df_stable.Text.values)

docs = [text_rise, text_peak, text_stable]

In [37]:
vectorizer_epochs = TfidfVectorizer()
X_TFIDF_epochs = vectorizer_epochs.fit_transform(docs)

X_TFIDF_epochs.shape #len(articles = 21760)

(3, 231772)

In [38]:
print(X_TFIDF_epochs.argmax(axis=1))

[[51752]
 [51752]
 [51752]]


In [39]:
vectorizer_epochs.get_feature_names()[51752]

'de'

# PCA Trial

In [42]:
X_TFIDF

<21760x244146 sparse matrix of type '<class 'numpy.float64'>'
	with 5805849 stored elements in Compressed Sparse Row format>

# DRAFT

In [40]:
STOOOP_RUNNIG

NameError: name 'STOOOP_RUNNIG' is not defined

In [None]:
model_pre_80.wv.most_similar('écologie', topn=20)

In [None]:
model_post_92.wv.most_similar('écologie', topn=20)

### Try to align models from different epochs

In [None]:
from functools import reduce

# Code originally ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>.
# https://gist.github.com/tangert/106822a0f56f8308db3f1d77be2c7942
def align_gensim_models(models, words=None):
    """
    Returns the aligned/intersected models from a list of gensim word2vec models.
    Generalized from original two-way intersection as seen above.
    
    Also updated to work with the most recent version of gensim
    Requires reduce from functools
    
    In order to run this, make sure you run 'model.init_sims()' for each model before you input them for alignment.
    
    ##############################################
    ORIGINAL DESCRIPTION
    ##############################################
    
    Only the shared vocabulary between them is kept.
    If 'words' is set (as list or set), then the vocabulary is intersected with this list as well.
    Indices are re-organized from 0..N in order of descending frequency (=sum of counts from both m1 and m2).
    These indices correspond to the new syn0 and syn0norm objects in both gensim models:
        -- so that Row 0 of m1.syn0 will be for the same word as Row 0 of m2.syn0
        -- you can find the index of any word on the .index2word list: model.index2word.index(word) => 2
    The .vocab dictionary is also updated for each model, preserving the count but updating the index.
    """

    # Get the vocab for each model
    vocabs = [set(m.wv.key_to_index.keys()) for m in models]

    # Find the common vocabulary
    common_vocab = reduce((lambda vocab1,vocab2: vocab1&vocab2), vocabs)
    if words: common_vocab&=set(words)

    # If no alignment necessary because vocab is identical...
    
    # This was generalized from:
    # if not vocab_m1-common_vocab and not vocab_m2-common_vocab and not vocab_m3-common_vocab:
    #   return (m1,m2,m3)
    if all(not vocab-common_vocab for vocab in vocabs):
        print("All identical!")
        return models
        
    # Otherwise sort by frequency (summed for both)
    common_vocab = list(common_vocab)
    #common_vocab.sort(key=lambda w: sum([m.wv.key_to_index[w] for m in models]),reverse=True)
    
    # Then for each model...
    for m in models:
        
        # Replace old vectors_norm array with new one (with common vocab)
        indices = [m.wv.key_to_index[w] for w in common_vocab]
                
        old_arr = m.wv.get_normed_vectors()
                
        new_arr = np.array([old_arr[index] for index in indices])
        m.wv.vectors_norm = m.wv.syn0 = new_arr

        # Replace old vocab dictionary with new one (with common vocab)
        # and old index2word with new one
        m.wv.index2word = common_vocab
        old_vocab = m.wv.key_to_index
        new_vocab = {}
        for new_index,word in enumerate(common_vocab):
            old_vocab_obj=old_vocab[word]
            new_vocab[word] = Word2Vec.build_vocab(index=new_index, count=old_vocab_obj)
        m.wv.vocab = new_vocab

    return models

In [None]:
align_gensim_models([model_pre_80, model_post_92], words=None)

In [None]:
model_pre_80.wv.key_to_index