# **Imports**

In [1]:
#from engine import *
import pickle

import os
os.chdir("../..")
os.chdir(r"src")

import pandas as pd
import matplotlib.pyplot as plt
from GloVe.weights import *
#from Axes.models import *
from Axes.projection_functions import *
from Axes.axes_definition import *

import warnings
warnings.filterwarnings("ignore")

os.chdir("../")

# **Data formating**

In [3]:
i = 7 # Donc en 2017
df = standard_opening(
            "data/FinalDataframes/FilteredFinalDataFrame_201" + str(i) + ".csv", True
        )
df = df[df["source"] == "par"]

df_cos = pd.read_csv('data/current_dataframes/df')
df_cos = df_cos[df_cos['year'] == 2010+i]
df_cos = df_cos[df_cos["source"] == "par"].reset_index()

df['cos axe'] = list(df_cos['cos axe 1'])
df = df.loc[df['party'].isin(['Lab', 'Con'])]


In [4]:
os.chdir(r'notebooks/embeddings polarization/')

# **Polarized corpus selection with GloVe embeddings**

In [5]:
def get_quantiles(data, percentiles):
    """
    Get quantiles from a distribution.
    
    Parameters:
        data (array-like): The data.
        percentiles (array-like): The percentiles to compute (0-100).
    
    Returns:
        quantiles (array): The values at the specified percentiles.
    """
    return np.percentile(data, percentiles)

percentiles = [25, 75]
quantiles = get_quantiles(df['cos axe'], percentiles)

df = df.loc[(df['cos axe'] < quantiles[0]) | (df['cos axe'] > quantiles[1])]
#df = df.loc[(df['cos axe'] < quantiles[0])]

df.to_csv('df_news.csv')

data = df['text'].tolist()

# **LDA**

In [6]:
def create_dict_corpus(data_words):
    import gensim.corpora as corpora
    
    # Create Dictionary
    id2word = corpora.Dictionary(data_words)

    # Create Corpus
    texts = data_words

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]

    return corpus, id2word

In [7]:
def preprocessing_lda(data):
    
    corpus, id2word = create_dict_corpus(data)

    return data, corpus, id2word

In [8]:
texts_processed_lda, corpus_lda, id2word_lda = preprocessing_lda(data)

In [9]:
pickle.dump(texts_processed_lda, open('texts_processed_lda.pkl', 'wb'))
pickle.dump((corpus_lda, id2word_lda), open('corpus_lda.pkl', 'wb'))

In [10]:
texts = pickle.load(open('texts_processed_lda.pkl', 'rb'))
corpus, id2word = pickle.load(open('corpus_lda.pkl', 'rb'))

In [11]:
def train_lda_model(corpus, id2word, texts, model_type='lda', start=10, limit=50, step=3):
    import os
    import gensim
    import numpy as np
    from gensim.models import CoherenceModel
    
    if model_type == 'mallet' and not os.path.exists('mallet-2.0.8.zip'):
        os.system('wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip')
        os.system('unzip mallet-2.0.8.zip')
    mallet_path = 'mallet-2.0.8/bin/mallet'
    
    def lda_model_func(corpus, id2word, num_topics):
        print(f'Training model with {num_topics} topics')
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=42,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
        coherence_model = CoherenceModel(model=model, texts=texts, dictionary=id2word, coherence='c_v')
        coh_v = coherence_model.get_coherence()
        print(f'num of topic: {num_topics}/{limit}, coherence value: {coh_v}')
        return model, coh_v
    
    
    def mallet_model_func(mallet_path, corpus, num_topics, id2word):
        print(f'Training model with {num_topics} topics')
        model = gensim.models.wrappers.LdaMallet(mallet_path, 
                                                     corpus=corpus, 
                                                     num_topics=num_topics, 
                                                     id2word=id2word, 
                                                     iterations=50,
                                                     workers=1,
                                                     random_seed=42)
        coherence_model = CoherenceModel(model=model, texts=texts, dictionary=id2word, coherence='c_v')
        coh_v = coherence_model.get_coherence()
        print(f'num of topic: {num_topics}/{limit}, coherence value: {coh_v}')
        return model, coh_v
    
    results = []
    for num_topics in range(start, limit+1, step):
        if model_type == 'lda':
            thread = lda_model_func(corpus, id2word, num_topics)
        else:
            thread = mallet_model_func(mallet_path, corpus, num_topics, id2word)
        results.append(thread)
    
    coherence_values_dict = {}
    coherence_values = []
    model_dict = {}
    
    for i in range(len(results)):
        model, coh_v = results[i]
        model_dict[start+i] = model
        coherence_values_dict[start+i] = coh_v
        coherence_values.append(coh_v)
    
    import matplotlib.pyplot as plt
    plt.plot(range(start, limit+1, step), coherence_values, marker='o')
    plt.xlabel('n_topics')
    plt.ylabel('coherence score')
    plt.savefig(f'{model_type}_coherence_values_topics.png')
    plt.close()
    
    max_idx = np.argmax(coherence_values)
    
    return model_dict, coherence_values_dict, max_idx+start

In [12]:
lda_models, coh_values, max_idx = train_lda_model(corpus, id2word, texts, model_type='lda', start=31, limit=31, step=1)

Training model with 31 topics
num of topic: 31/31, coherence value: 0.41768458219973087


In [13]:
df_ntopics_coh = pd.DataFrame(list(coh_values.items()), columns=['n_topics', 'coh_val'])
df_ntopics_coh.sort_values(by='coh_val', ascending=False)

Unnamed: 0,n_topics,coh_val
0,31,0.417685


In [14]:
# choose the model with K=?
K = 31
lda_model = lda_models[K]

In [15]:
# show the top-10 topic keywords for each topic and their corresponding probs
lda_models[K].show_topics(num_topics=-1, formatted=False)

[(0,
  [('peopl', 0.07057808),
   ('mani', 0.035128433),
   ('time', 0.029461054),
   ('know', 0.02717885),
   ('want', 0.025428988),
   ('look', 0.023120673),
   ('come', 0.019300655),
   ('good', 0.017782414),
   ('like', 0.017775413),
   ('thing', 0.017262198)]),
 (1,
  [('resid', 0.4169618),
   ('town', 0.24864845),
   ('scrap', 0.04617897),
   ('academi', 0.029542683),
   ('rubbish', 0.017725876),
   ('broker', 0.007829354),
   ('yemeni', 0.007598497),
   ('ceasefir', 0.001276323),
   ('constitu', 6.4554035e-05),
   ('mileag', 1.4859683e-05)]),
 (2,
  [('budget', 0.08824636),
   ('labour', 0.07812836),
   ('data', 0.0564751),
   ('public', 0.047576945),
   ('employ', 0.046441298),
   ('worker', 0.040704567),
   ('spend', 0.03797215),
   ('custom', 0.035052318),
   ('rise', 0.032739244),
   ('claim', 0.03033037)]),
 (3,
  [('polic', 0.17411944),
   ('victim', 0.13803579),
   ('justic', 0.13783965),
   ('crime', 0.077537864),
   ('crimin', 0.066206194),
   ('case', 0.04848691),
   (

In [16]:
# index each topic
topics = {x:y for x,y in lda_model.show_topics(num_topics=-1, num_words=10, formatted=False)}
df_topic = []
for j in range(len(topics)):
    df_topic.append([j, [each[0] for each in topics[j]]])

In [17]:
sources = df['party'].values.tolist()

In [18]:
# rank topics by the mass of probabilities
def rank_topics(lda_model, corpus):
    idx_prob = [[i, 0] for i in range(lda_model.num_topics)]
    for idx_doc, rows in enumerate(lda_model[corpus]):
        for j, (idx_topic, prob) in enumerate(rows[0]):
            idx_prob[idx_topic][1] += prob
    idx_prob.sort(key=lambda x:x[1], reverse=True)
    return idx_prob

In [19]:
topic_ranks = rank_topics(lda_model, corpus)
topic_ranks

[[15, 1547.008503060788],
 [5, 1461.5668585933745],
 [0, 1287.3503456730396],
 [21, 1270.911610128358],
 [29, 782.6186088277027],
 [16, 709.3686178131029],
 [18, 250.10670069698244],
 [10, 212.9987901384011],
 [2, 172.21862554736435],
 [11, 155.65554065816104],
 [26, 153.20812607277185],
 [20, 102.81011934578419],
 [7, 74.15158522687852],
 [13, 71.84708705823869],
 [30, 68.52157135959715],
 [28, 59.578360565938056],
 [25, 56.29575486574322],
 [6, 55.29404758941382],
 [19, 47.631963931024075],
 [3, 43.10020915605128],
 [17, 35.0522120250389],
 [12, 29.196615272201598],
 [9, 28.45920663420111],
 [23, 27.99658631812781],
 [14, 21.636693441309035],
 [8, 19.50822769012302],
 [22, 19.38922882452607],
 [27, 18.50985781941563],
 [24, 9.564348483458161],
 [1, 6.775415416806936],
 [4, 5.037258157506585]]

In [20]:
# create a dataframe for the topics, including the index, the keywords, and the mass probility
for j in range(K):
    df_topic[j].append(topic_ranks[j][1])
df_topic = pd.DataFrame(df_topic, columns=['topic_idx', 'topic_stems', 'probs'])
df_topic

Unnamed: 0,topic_idx,topic_stems,probs
0,0,"[peopl, mani, time, know, want, look, come, go...",1547.008503
1,1,"[resid, town, scrap, academi, rubbish, broker,...",1461.566859
2,2,"[budget, labour, data, public, employ, worker,...",1287.350346
3,3,"[polic, victim, justic, crime, crimin, case, o...",1270.91161
4,4,"[unfair, civil, servant, terrorist, correctli,...",782.618609
5,5,"[would, right, make, point, deal, chang, give,...",709.368618
6,6,"[agreement, trade, compani, market, intern, si...",250.106701
7,7,"[power, legisl, devolv, administr, provis, pri...",212.99879
8,8,"[food, human, transit, biggest, deleg, water, ...",172.218626
9,9,"[prison, post, offic, justic, convict, ministr...",155.655541


In [21]:
# figure out the the probability that each document is associated with each topic
# only keep those pairs with prob>=threshold (0.15 here)
def get_doc2topics(ldamodel, corpus, sources,threshold=0.15):
    data = []
    for idx_doc, rows in enumerate(ldamodel[corpus]):
        for j, (idx_topic, prob) in enumerate(rows[0]):
            if prob < 0.15:
                continue
            data.append([idx_doc, idx_topic, prob, sources[idx_doc]])
    df = pd.DataFrame(data, columns=['idx_doc', 'idx_topic', 'prob', 'source'])
    return df

df_doc_topic_all = get_doc2topics(lda_model, corpus, sources, 1)
df_doc_topic = get_doc2topics(lda_model, corpus, sources)
df_doc_topic.head()

Unnamed: 0,idx_doc,idx_topic,prob,source
0,0,0,0.154041,Con
1,0,5,0.156145,Con
2,0,15,0.177846,Con
3,1,2,0.243466,Lab
4,2,15,0.151203,Con


In [22]:
df_doc_topic.shape[0], df_doc_topic['idx_doc'].unique().shape

(16844, (9148,))

In [23]:
from collections import Counter
# count the # documents associated with each topic
Counter(df_doc_topic['idx_topic'].values.tolist())

Counter({15: 5128,
         5: 4496,
         21: 3052,
         0: 3044,
         29: 715,
         16: 214,
         18: 113,
         13: 21,
         26: 12,
         9: 10,
         2: 5,
         3: 5,
         20: 5,
         6: 4,
         7: 4,
         12: 3,
         17: 3,
         11: 3,
         14: 2,
         10: 1,
         30: 1,
         24: 1,
         19: 1,
         27: 1})

In [24]:
# create a validation set for finetuning the language model
# articles that is not assigned to any topic (prob < 0.15) are in this set
df.insert(0, 'idx', np.arange(df.shape[0]))
idxes_doc_val = set(df['idx'].unique().tolist()) - set(df_doc_topic['idx_doc'].unique().tolist())

In [25]:
# save the data
df_ntopics_coh.to_excel('coh_values.xlsx', index=False)
df_doc_topic.to_csv('df_doc_topic.csv', index=False)
df_topic.to_csv('df_topics.csv', index=False)
pickle.dump(topic_ranks, open('topic_ranks.pkl', 'wb'))
pickle.dump(lda_model, open('lda_model.pkl', 'wb'))
pickle.dump((lda_models, coh_values, max_idx), open('lda_models.pkl', 'wb'))
pickle.dump(idxes_doc_val, open('idxes_val.pkl', 'wb'))
# pickle.dump(idxes_doc_val2, open('idxes_val2.pkl', 'wb'))
pickle.dump(lda_model.show_topics(num_topics=-1, num_words=10, formatted=False), 
            open('topics.pkl', 'wb'))
#pickle.dump(topic_labels, open('topic_labels.pkl', 'wb'))

# **Doings partisanship learning directly with GloVe**

- Plutôt ré-entrainer GloVe directement sur les speeches polarisés sélectionnés. Refaire les poids. 
- Puis faire la technique de masque de l'articles et calculer les embeddings des topics.

## **Training partisanship learning**

In [11]:
df

Unnamed: 0_level_0,index,year,Speaker,party,text,source,keywords,agenda,url,cos axe
level_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
30079,513548.0,2017.0,Damian Green,Con,"[complet, agre, alreadi, mention, enterpris, a...",par,[jobs],[],,0.069410
30080,513561.0,2017.0,Debbie Abrahams,Lab,"[happi, everyon, resolut, foundat, data, show,...",par,[],[],,0.075862
30081,513567.0,2017.0,Amanda Milling,Con,"[also, wish, happi, thank, answer, famili, res...",par,[],[],,-0.409124
30082,513572.0,2017.0,Penny Mordaunt,Con,"[consciou, need, children, peopl, particular, ...",par,[teams],[],,-0.285080
30087,513602.0,2017.0,Ian Blackford,SNP,"[happi, everyon, particularli, waspi, women, b...",par,[],[],,0.062566
...,...,...,...,...,...,...,...,...,...,...
8218,584519.0,2017.0,George Howarth,Lab,"[right, absolut, right, anoth, complic, factor...",par,[],[],,0.038827
8220,584536.0,2017.0,Ruth Cadbury,Lab,"[concur, comment, absolut, right, mere, word, ...",par,[jobs],[],,0.139995
8227,584554.0,2017.0,John Healey,Lab,"[certainli, appli, case, good, point, remain, ...",par,"[cook, meta, nest, brin]",[],,-0.296102
8229,584557.0,2017.0,Alok Sharma,Con,"[come, talk, work, commiss, also, heard, consu...",par,[],[],,-0.283878


In [7]:
os.chdir('../..')

In [48]:
from mittens import GloVe 
from scipy import sparse
import json
import ast
from mittens import Mittens

from src.GloVe.coocc_functs import *
from src.GloVe.training_functs import *

In [49]:
l = []
for sentence in df['text']:
    l+=sentence

vocab = list(set(l))
word2idx = {v: i for i, v in enumerate(vocab)}

In [50]:
items = [(j,t) for j,t in enumerate(df['text'])]

coocc = inter_coocc(items, word2idx)

dans inter_coocc
['also', 'wish', 'happi', 'thank', 'answer', 'famili', 'resourc', 'survey', 'publish', 'last', 'show', 'nearli', 'disabl', 'childrena', 'increas', 'past', 'outlin', 'measur', 'implement', 'take', 'account', 'increas', 'children', 'access', 'support', 'specialist', 'equip', 'requir']
0


In [55]:
def glove2dict(glove_filename):
    ''' transforms a txt file of embeddings into a dictionary
    Parameters:
    -----------
    glove_filename : embeddings txt file
    '''
    with open(glove_filename) as f:
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        words = []
        mats = []
        for line in reader :
            if len(clean(line[0], gram='unigram'))>0:
                words.append(clean(line[0], gram='unigram')[0])
                mats.append(np.array(list(map(float, line[1:]))))
    embed = {words[i]: mats[i] for i in range(len(words))}
    return embed

In [None]:
original_embedding = glove2dict('data/embeddings/embeddings_201'+str(i)+'.txt')
coocc  = coocc.toarray()

In [57]:
mittens_model = Mittens(n=50, max_iter=40)
    
new_embeddings = mittens_model.fit(
    coocc,
    vocab=vocab,
    initial_embedding_dict= original_embedding)
    
a = np.array(vocab)
b = new_embeddings
c = np.column_stack((a, b))

Iteration 40: error 88480.24747

In [58]:
pd.DataFrame(c).to_csv('notebooks/embeddings polarization/re_trained_embeddings.csv', index=False)

In [59]:
c = np.array(pd.read_csv('notebooks/embeddings polarization/re_trained_embeddings.csv'))

## **LDA filter the new embeddings and compute polarization**

In [60]:
df

Unnamed: 0_level_0,idx,index,year,Speaker,party,text,source,keywords,agenda,url,cos axe
level_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
30081,0,513567.0,2017.0,Amanda Milling,Con,"[also, wish, happi, thank, answer, famili, res...",par,[],[],,-0.409124
30082,1,513572.0,2017.0,Penny Mordaunt,Con,"[consciou, need, children, peopl, particular, ...",par,[teams],[],,-0.285080
30090,2,513634.0,2017.0,Damian Green,Con,"[agre, issu, precis, matthew, review, investig...",par,[],[],,-0.249888
30092,3,513644.0,2017.0,Penny Mordaunt,Con,"[invest, signific, resourc, includ, increas, c...",par,[],[],,-0.292079
30093,4,513647.0,2017.0,Luciana Berger,Lab,"[five, forward, view, mental, health, publish,...",par,[],[],,-0.432286
...,...,...,...,...,...,...,...,...,...,...,...
8212,5557,584505.0,2017.0,Jim Fitzpatrick,Lab,"[finger, point, look, reassur, leasehold, cove...",par,[page],[],,-0.256374
8214,5558,584512.0,2017.0,William Wragg,Con,"[surpris, agenc, charg, ground, rent, sale, wo...",par,[excel],[],,-0.246312
8227,5559,584554.0,2017.0,John Healey,Lab,"[certainli, appli, case, good, point, remain, ...",par,"[cook, meta, nest, brin]",[],,-0.296102
8229,5560,584557.0,2017.0,Alok Sharma,Con,"[come, talk, work, commiss, also, heard, consu...",par,[],[],,-0.283878


In [61]:
number_of_keywords_in_topic = 50
number_of_topics = 5
number_of_documents_for_one_side = 100

In [62]:
polarized_embeddings = dict([(word[0], embedding) for word, embedding in zip(c[:, :1], c[:, 1:])], dtypes='float')

In [63]:
df_doc_topic_left = df_doc_topic[df_doc_topic['source'] == 'Lab']
df_doc_topic_right = df_doc_topic[df_doc_topic['source'] == 'Con']

In [64]:
topic_id = 21

working_df_left = df_doc_topic_left[df_doc_topic_left['idx_topic'] == topic_id]
working_df_right = df_doc_topic_right[df_doc_topic_right['idx_topic'] == topic_id]

working_df_right = working_df_right.sort_values('prob', ascending=False)
working_df_left = working_df_left.sort_values('prob', ascending=False)

working_df_right = working_df_right.head(number_of_documents_for_one_side)
working_df_left = working_df_left.head(number_of_documents_for_one_side)

working_df = pd.concat([working_df_left, working_df_right])
df = df.rename(columns={"idx": "idx_doc"})
merged_df = pd.merge(working_df, df, on=['idx_doc'])[['idx_doc', 'idx_topic', 'prob', 'source_x', 'text']]
merged_df.set_index('idx_doc', inplace=True)

In [65]:
new_weights_dict = dict(lda_models[K].show_topics(num_topics=-1, num_words=20, formatted=False)[topic_id][1][:number_of_keywords_in_topic])

In [66]:
def filter_keywords_from_topic_and_compute_weighted_embeddings(topic_id, number_of_keywords_in_topic, text:list, polarized_embeddings):

    list_of_keywords = [a for a,b in lda_models[K].show_topics(num_topics=-1, num_words=20, formatted=False)[topic_id][1][:number_of_keywords_in_topic]]
    filtered_text = [word for word in text if word in list_of_keywords]

    new_weights_dict = dict(lda_models[K].show_topics(num_topics=-1, num_words=20, formatted=False)[topic_id][1][:number_of_keywords_in_topic])

    embed = np.zeros(50, dtype='float')

    for word in filtered_text:

        a = np.array(new_weights_dict[word])
        b = np.array(polarized_embeddings[word])

        embed = embed + a*b

    return embed

In [67]:
merged_df['weighted_topic_document_embeddings_1'] = merged_df['text'].apply(lambda x: filter_keywords_from_topic_and_compute_weighted_embeddings(topic_id, number_of_keywords_in_topic, x, polarized_embeddings))

In [68]:
merged_df['weighted_topic_document_embeddings_2'] = np.multiply(np.array(merged_df['prob']), np.array(merged_df['weighted_topic_document_embeddings_1']))

In [51]:
#final_df_left, final_df_right = merged_df[merged_df['source_x'] == 'Lab'], merged_df[merged_df['source_x'] == 'Con']

In [69]:
def document_probability_per_topic(lda_model, corpus):
    # Initialize a structure to hold sums of document probabilities per topic
    topic_doc_contribution = {i: {} for i in range(lda_model.num_topics)}
    
    # Aggregate probabilities of topics within each document
    for doc_id, doc_bow in enumerate(corpus):
        doc_topics = lda_model.get_document_topics(doc_bow, minimum_probability=0)
        for topic_id, prob in doc_topics:
            # Initialize the nested dictionary if the doc_id doesn't exist for the topic_id
            if doc_id not in topic_doc_contribution[topic_id]:
                topic_doc_contribution[topic_id][doc_id] = 0
            topic_doc_contribution[topic_id][doc_id] += prob
    
    # Normalize these sums to estimate P(d|t) for each topic
    for topic_id in topic_doc_contribution:
        total_contribution = sum(topic_doc_contribution[topic_id].values())
        for doc_id in topic_doc_contribution[topic_id]:
            topic_doc_contribution[topic_id][doc_id] /= total_contribution
    
    return topic_doc_contribution

P(d/t) = P(t/d)*P(d)/P(t)

In [None]:
document_probability_per_topic = document_probability_per_topic(lda_model, corpus)
merged_df['final_weights'] = np.multiply(np.array([document_probability_per_topic[topic_id][idx] for idx in list(merged_df.index)]), merged_df['weighted_topic_document_embeddings_2'])
merged_df = merged_df[['source_x', 'final_weights']]
final_df = merged_df.groupby(by='source_x').mean()
first_embedding, second_embedding = list(final_df['final_weights']) 
cosine = np.dot(first_embedding, second_embedding.T) / (norm(first_embedding) * norm(second_embedding))

In [79]:
cosine

0.9996148371649755

# **BERT Preprocessing**

- Start from GloVe embeddings
- Select polarized speeches close to poles with distribution and percentiles
- Essayer d'optimiser la LDA avec ce choix de percentiles
- This is our two parties corpuses, train BERT to predict right party

Create a mask for each document. This mask indicates how much each token of the document will contribute to the document-contextualized topic embedding (the weights). The length of the mask is equal to the # of tokens in the dcoument (tokenized by BERT). Each element in the mask indicates the weight of the token.

In [None]:
os.chdir(r'notebooks/embeddings polarization/')

In [26]:
os.getcwd()

'/Users/alexandrequeant/Desktop/Travail TSE/notebooks/embeddings polarization'

In [27]:
text_processed_bert = data
pickle.dump(text_processed_bert, open('texts_processed_bert.pkl', 'wb'))

In [28]:
lda_model = pickle.load(open('lda_model.pkl', 'rb'))
df_doc_topic = pd.read_csv('df_doc_topic.csv')

In [29]:
# tokenize the documents by BERT
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
texts_bert = pickle.load(open('texts_processed_bert.pkl', 'rb'))
text_encodings = tokenizer(pd.Series(texts_bert).apply(lambda x: ' '.join(x)).tolist(), padding=True, 
                           truncation=True)['input_ids']
text_encodings = pd.Series(text_encodings)

In [30]:
def search_in_list(list1, list2):
    '''
    search the indices of the topic keywords in the tokenized document -- a list of tokens
    '''
    idxes = []
    for i in range(len(list2)):
        if list1[0] == list2[i]:
            if (i + len(list1) <= len(list2)) and list2[i:i+len(list1)] == list1:
                idxes += list(range(i, i+len(list1)))
    return idxes

topic_masks = np.zeros((text_encodings.shape[0], 512))  # (n_docs, 512)
for topic in lda_model.show_topics(num_topics=-1, num_words=50, formatted=False):
    idx = topic[0]
    topic_stems = [each[0] for each in topic[1]]
    stem_probs = [each[1] for each in topic[1]]
    stem_probs = np.array(stem_probs)
    stem_probs /= stem_probs.sum()  # normalize the weights of the top-n keywords
    stem_encodings = tokenizer(topic_stems, truncation=True)['input_ids'] # encode topic keywords using BERT
    doc_idxes = df_doc_topic[df_doc_topic['idx_topic'] == idx]['idx_doc'].to_list() # find the documents associated with the topic
    doc_encodings = text_encodings[doc_idxes]
    for doc_idx in doc_idxes:
        doc_encoding = doc_encodings[doc_idx]
        topic_mask = topic_masks[doc_idx]
        for stem_input_ids, stem_prob in zip(stem_encodings, stem_probs):
            idxes = search_in_list(stem_input_ids[1:-1], doc_encoding) # search each keyword in the document
            if idxes:
                # if found multiple occurrences of the keywords, 
                # then the weight of each occurrence will be devalued
                topic_mask[idxes] += stem_prob / len(idxes) 
        if topic_mask.mean() > 0:
            topic_mask /= topic_mask.sum()   # normalize the mask

In [31]:
pickle.dump(topic_masks.tolist(), open('topic_masks.pkl', 'wb'))

In [17]:
topic_masks

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.29404683, 0.01292744, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.02623017, 0.        , ..., 0.        , 0.        ,
        0.        ]])