In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.metrics import pairwise_distances

import nltk
import string

df = pd.read_pickle("./2019_cleaned.pkl")

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
stop_words = ENGLISH_STOP_WORDS.union(['neruips','neurips2019'])
data = df['cleaned_tweet']

tf_idf_vectorizor = TfidfVectorizer(stop_words = stop_words, max_features = 20000)
tf_idf = tf_idf_vectorizor.fit_transform(data)
tf_idf_norm = normalize(tf_idf)
tf_idf_array = tf_idf_norm.toarray()

In [22]:
df.head(3)

Unnamed: 0,date,screen_name,tweet,tweet_id,lang,cleaned_tweet
1,2020-10-04 01:16:24,shinyML,rl for chip design 🙌 everyone’s fave game #neu...,1312562178143248384,en,rl for chip design everyone’s fave game
2,2020-06-14 03:01:00,KirkDBorne,Analysis of #NeurIPS2019 papers by themes: htt...,1272001061054799872,en,analysis of papers by themes —————————— ...
3,2020-06-30 14:14:21,arXiv__ml,RT @Xingyu2017: How should we combine multiple...,1277968719969259520,en,rt how should we combine multiple auxiliary t...


## A Topic Modeling Class, courtesty of Anterra:

In [26]:
from gensim import corpora, models, similarities, matutils
from sklearn.decomposition import TruncatedSVD, NMF

class topic_model():

    def __init__(self, data, vectorizer, model, num_topics):
        self.data = data
        self.vectorizer = vectorizer
        self.model = model
        self.num_topics = num_topics 


    def vectorize(self):
        if self.vectorizer == "cv":
            if self.model == "corex":
                vec = CountVectorizer(stop_words=stop_words, ngram_range=(1, 3), binary=True)
            else:
                vec = CountVectorizer(stop_words=stop_words, ngram_range=(1, 3))
        elif self.vectorizer == "tfidf":
            vec = TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 3))
        else: print("Please define a valid vectorizer.")

        self.doc_word = vec.fit_transform(self.data)
        self.feature_names = vec.get_feature_names()
        self.words = list(np.asarray(vec.get_feature_names()))
        self.corpus = matutils.Sparse2Corpus(self.doc_word.transpose())
        self.id2word = dict((v, k) for k, v in vec.vocabulary_.items())
    

    def display_topics(self):
        if self.model == "lda":
            model = models.LdaModel(corpus=self.corpus, num_topics=self.num_topics, id2word=self.id2word, passes=5, random_state=42)
            topics = model.print_topics()
            print(topics)

        elif self.model == "corex":
            model = corextopic.Corex(n_hidden=self.num_topics, words=self.words, seed=1, max_iter=200)
            model.fit(self.doc_word, words=self.words, docs=self.data)
            topics = model.get_topics()
            print(topics)
            for n,topic in enumerate(topics):
                topic_words,_ = zip(*topic)
            print('{}: '.format(n) + ','.join(topic_words))

        else:
            if self.model == "lsa":
                model = TruncatedSVD(self.num_topics)
            elif self.model == "nmf":
                model = NMF(self.num_topics)
            else: print("Please define a valid model.")

            doc_topic = model.fit_transform(self.doc_word)
            for ix, topic in enumerate(model.components_):
                print("\nTopic ", ix)
                print(", ".join([self.feature_names[i] for i in topic.argsort()[:-21:-1]]))

### Let's begin with LSA, using a pipeline:

In [17]:
def pipeline(data, vectorizer, model, num_topics):
    x = topic_model(data, vectorizer, model, num_topics)
    x.vectorize()
    x.display_topics()

In [18]:
corpus = df["cleaned_tweet"]

In [30]:
pipeline(corpus, "cv", "lsa", 10)


Topic  0
learning, workshop, poster, deep, work, talk, machine, machine learning, amp, deep learning, paper, come, ai, today, presenting, ml, great, research, hall, session

Topic  1
poster, work, come, session, hall, east, today, poster session, exhibition, exhibition hall, east exhibition hall, east exhibition, paper, presenting, workshop, amp, ml, pm, check, ai

Topic  2
workshop, amp, ml, ai, talk, great, good, work, social, research, neuripsconf, talks, systems, fairness, thanks, panel, social good, climate, join, health

Topic  3
machine, machine learning, dibr, researchers, learn, using, week, framework, object, presenting, called, savage, nvidia, rendering, images, framework called, called dibr produces, called dibr, rendering framework, rendering framework called

Topic  4
paper, amp, ai, research, neuripsconf, work, deep, deep learning, new, researchers, neural, talk, best, models, yoshua, bengio, conference, learn, need, team

Topic  5
amp, savage, talk, engineering, design

In [23]:
import spacy
nlp = spacy.load('en_core_web_sm', disable=["parser", "ner"])
spacy_lemmatize = lambda x: " ".join([token.lemma_ for token in nlp(x)])
df['spacy_lemma'] = df['cleaned_tweet'].apply(spacy_lemmatize)

In [24]:
df.head(3)

Unnamed: 0,date,screen_name,tweet,tweet_id,lang,cleaned_tweet,spacy_lemma
1,2020-10-04 01:16:24,shinyML,rl for chip design 🙌 everyone’s fave game #neu...,1312562178143248384,en,rl for chip design everyone’s fave game,rl for chip design everyone ’s fave game
2,2020-06-14 03:01:00,KirkDBorne,Analysis of #NeurIPS2019 papers by themes: htt...,1272001061054799872,en,analysis of papers by themes —————————— ...,analysis of paper by theme — — — — — — —...
3,2020-06-30 14:14:21,arXiv__ml,RT @Xingyu2017: How should we combine multiple...,1277968719969259520,en,rt how should we combine multiple auxiliary t...,rt how should -PRON- combine multiple auxili...


Hmm, no improvement for some reason.

In [31]:
pipeline(corpus, "tfidf", "lsa", 10)


Topic  0
clinical trials, rigorous clinical trials, rigorous clinical, trials, rigorous, clinical, stakes big, researchers rigorous, big need best, stakes, need best, need best researchers, big need, researchers rigorous clinical, stakes big need, best researchers rigorous, best researchers, big, best, need

Topic  1
learning, deep, deep learning, neurips, reflections neurips, reflections, workshop, poster, yoshua, machine, bengio, yoshua bengio, machine learning, talk, learning deep, learning deep learning, deep learning deep, work, ai, paper

Topic  2
reflections neurips, reflections, neurips, papers themes, analysis papers themes, themes, analysis papers, neurips analysis papers, neurips analysis, just published reflections, published reflections neurips, published reflections, just published, papers, neurips neurips, bigram poem inspired, poem, poem inspired, bigram, bigram poem

Topic  3
dibr, machine, framework, called dibr produces, rendering framework called, produces, renderi

### Back to NMF, now with pipelines:

In [32]:
pipeline(corpus, "cv", "nmf", 10)


Topic  0
learning, deep, deep learning, machine learning, machine, bayesian, yoshua, bengio, yoshua bengio, learning workshop, reinforcement, reinforcement learning, learning deep, learning deep learning, deep learning deep, bayesian deep, bayesian deep learning, data, graph, talk

Topic  1
poster, hall, come, session, east, exhibition, exhibition hall, poster session, today, east exhibition hall, east exhibition, pm, bc, hall bc, dec, exhibition hall bc, check, west, presenting, come poster

Topic  2
workshop, ml, west, learning workshop, presenting, rl, panel, talks, climate, tomorrow, change, today, come, climate change, workshop today, great, rl workshop, saturday, check, graph

Topic  3
dibr, machine, machine learning, learn, researchers, using, framework, called, week, object, rendering, framework called dibr, framework called, called dibr produces, produces, rendering framework, dibr produces, rendering framework called, called dibr, nvidia

Topic  4
talk, ai, great, neuripscon

### LDA:

In [33]:
pipeline(corpus, "cv", "lda", 10)

[(0, '0.007*"talk" + 0.007*"poster" + 0.006*"learning" + 0.006*"great" + 0.005*"session" + 0.005*"ml" + 0.004*"workshop" + 0.004*"work" + 0.004*"vancouver" + 0.004*"poster session"'), (1, '0.003*"upcoming" + 0.003*"fairness" + 0.002*"representation learning" + 0.002*"representation" + 0.002*"learning" + 0.001*"learning fairness" + 0.001*"individual fairness" + 0.001*"stage" + 0.001*"heading" + 0.001*"graph representation learning"'), (2, '0.005*"celestekidd" + 0.002*"pm east" + 0.002*"rest" + 0.002*"talk celestekidd" + 0.002*"ai" + 0.002*"thing" + 0.002*"talk" + 0.001*"platform" + 0.001*"opportunities" + 0.001*"social"'), (3, '0.005*"ai" + 0.005*"papers" + 0.003*"neurips" + 0.002*"researchers" + 0.002*"amp" + 0.002*"research" + 0.002*"work" + 0.002*"world" + 0.002*"conference" + 0.002*"year"'), (4, '0.003*"hi" + 0.002*"amp" + 0.002*"say" + 0.002*"say hi" + 0.002*"lunch" + 0.002*"come say" + 0.002*"certainty" + 0.002*"neuripsconf" + 0.002*"come say hi" + 0.002*"learning"'), (5, '0.009*"

In [34]:
pipeline(corpus, "tfidf", "lda", 10)

[(0, '0.003*"learning" + 0.003*"poster" + 0.002*"great" + 0.002*"work" + 0.002*"talk" + 0.002*"workshop" + 0.002*"come" + 0.002*"session" + 0.001*"amp" + 0.001*"tutorial"'), (1, '0.001*"representation" + 0.001*"representation learning" + 0.001*"fairness" + 0.000*"reading" + 0.000*"learning fairness" + 0.000*"app" + 0.000*"distributions" + 0.000*"et" + 0.000*"et al" + 0.000*"al"'), (2, '0.000*"graphs" + 0.000*"opportunities" + 0.000*"evaluation" + 0.000*"aguera" + 0.000*"blaise" + 0.000*"youd" + 0.000*"continual" + 0.000*"blaise aguera" + 0.000*"continual learning" + 0.000*"identify"'), (3, '0.001*"celestekidd" + 0.001*"dec" + 0.001*"visit" + 0.001*"pm" + 0.001*"humans" + 0.001*"beliefs" + 0.001*"live" + 0.001*"started" + 0.001*"group" + 0.001*"neurips"'), (4, '0.000*"getting" + 0.000*"ibm" + 0.000*"talks" + 0.000*"available online" + 0.000*"talks available online" + 0.000*"talks available" + 0.000*"outstanding" + 0.000*"available" + 0.000*"hours" + 0.000*"talks slides"'), (5, '0.001*"l

### CorEx

In [37]:
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from corextopic import corextopic, vis_topic

pipeline(corpus, "cv", "corex", 10)

[[('fair ml health', 0.006440596534509281), ('using machine', 0.00608331756711483), ('using machine learning', 0.00608331756711483), ('simulation', 0.005783741390629775), ('machine learning techniques', 0.005783741390629775), ('timkraska', 0.005783741390629775), ('learning techniques', 0.005783741390629775), ('computational biology', 0.005127377279210976), ('machine learning', 0.0050054478712125185), ('im presenting', 0.0044715032050290735)], [('trials', 0.02365389713345164), ('clinical trials', 0.022985079369086732), ('rigorous clinical', 0.022985079369086732), ('rigorous clinical trials', 0.022985079369086732), ('rigorous', 0.02228461352668251), ('clinical', 0.016418915336772356), ('best researchers', 0.0156619049782197), ('need best researchers', 0.01499920870583553), ('need best', 0.01499920870583553), ('big need', 0.01499920870583553)], [('learning creativity', 0.007716184281571741), ('machine learning creativity', 0.007716184281571741), ('creativity design', 0.00667642649249881),

### Back to LDA, for use with PyLDAVis:

In [41]:
def vectorize(data, min_df):
    vec = CountVectorizer(stop_words=stop_words, ngram_range=(1, 3), min_df=min_df)
    doc_word = vec.fit_transform(data)
    feature_names = vec.get_feature_names()
    id2word = dict((v, k) for k, v in vec.vocabulary_.items())
    
    return doc_word, feature_names, id2word

In [42]:
doc_word, feature_names, id2word = vectorize(corpus, 0.01)

In [43]:
def lda(doc_word, feature_names, id2word, num_topics):
    sparse_corpus = matutils.Sparse2Corpus(doc_word.transpose())
    model = models.LdaModel(corpus=sparse_corpus, num_topics=num_topics, id2word=id2word, passes=5, random_state=42)
    topics = model.print_topics()
    for n, topic in topics:
        print("\nTopic ", n)
        print(topic)

In [44]:
lda(doc_word, feature_names, id2word, 10)


Topic  0
0.085*"research" + 0.070*"people" + 0.057*"im" + 0.055*"year" + 0.051*"work" + 0.048*"check" + 0.044*"interesting" + 0.035*"rl" + 0.031*"conference" + 0.030*"day"

Topic  1
0.107*"machine" + 0.097*"learning" + 0.094*"machine learning" + 0.052*"looking" + 0.041*"forward" + 0.040*"good" + 0.039*"social" + 0.039*"pm" + 0.036*"workshop" + 0.031*"looking forward"

Topic  2
0.114*"session" + 0.108*"poster" + 0.088*"just" + 0.064*"poster session" + 0.059*"time" + 0.058*"best" + 0.041*"live" + 0.035*"need" + 0.029*"world" + 0.026*"clinical"

Topic  3
0.126*"ai" + 0.092*"talk" + 0.073*"models" + 0.046*"researchers" + 0.041*"today" + 0.041*"booth" + 0.036*"nice" + 0.034*"big" + 0.033*"lot" + 0.031*"stop"

Topic  4
0.072*"hall" + 0.068*"neural" + 0.062*"poster" + 0.062*"paper" + 0.060*"exhibition" + 0.059*"exhibition hall" + 0.054*"come" + 0.049*"today" + 0.047*"east exhibition" + 0.047*"east exhibition hall"

Topic  5
0.094*"celestekidd" + 0.071*"tutorial" + 0.054*"amazing" + 0.051*"va

In [45]:
print(feature_names)

['ai', 'algorithms', 'amazing', 'amp', 'approach', 'artificial', 'available', 'award', 'awesome', 'based', 'bayesian', 'bc', 'bengio', 'best', 'better', 'big', 'blackinai', 'blaiseaguera', 'booth', 'celestekidd', 'challenge', 'challenges', 'change', 'chat', 'check', 'climate', 'clinical', 'code', 'come', 'community', 'conference', 'cool', 'data', 'day', 'dec', 'deep', 'deep learning', 'design', 'different', 'discussion', 'dont', 'east', 'east exhibition', 'east exhibition hall', 'ethics', 'event', 'excited', 'exciting', 'exhibition', 'exhibition hall', 'fairness', 'forward', 'framework', 'future', 'giving', 'going', 'good', 'graph', 'great', 'hall', 'health', 'hear', 'help', 'human', 'ideas', 'im', 'important', 'intelligence', 'interested', 'interesting', 'join', 'just', 'keynote', 'know', 'language', 'learn', 'learning', 'learning workshop', 'like', 'live', 'look', 'looking', 'looking forward', 'lot', 'machine', 'machine learning', 'make', 'making', 'meet', 'meeting', 'methods', 'ml',

OK, looking good. Let's jump to visualization.

## Visualization with pyLDAvis

In [49]:
from gensim.corpora.dictionary import Dictionary

def vectorize_vis(data, min_df):
    vec = CountVectorizer(stop_words=stop_words, ngram_range=(1, 3), min_df=min_df)
    doc_word = vec.fit_transform(data)
    feature_names = vec.get_feature_names()
    id2word = dict((v, k) for k, v in vec.vocabulary_.items())
    dictionary = Dictionary([feature_names])
    
    return doc_word, feature_names, id2word, dictionary

  and should_run_async(code)


In [50]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

def lda_vis(doc_word, feature_names, id2word, dictionary, num_topics):
    corpus = matutils.Sparse2Corpus(doc_word.transpose())
    model = models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=id2word, passes=5, random_state=42)

    topics = model.print_topics()
    for n, topic in topics:
        print("\nTopic ", n)
        print(topic)

    vis_data = pyLDAvis.gensim.prepare(model, corpus, dictionary)
    return vis_data

  and should_run_async(code)


In [51]:
doc_word, feature_names, id2word, dictionary = vectorize_vis(corpus, 0.002)
vis_data = lda_vis(doc_word, feature_names, id2word, dictionary, 10)

  and should_run_async(code)



Topic  0
0.024*"session" + 0.021*"bengio" + 0.019*"neurips" + 0.018*"poster" + 0.017*"research" + 0.017*"papers" + 0.015*"yoshua" + 0.015*"yoshua bengio" + 0.014*"great" + 0.012*"neural"

Topic  1
0.078*"poster" + 0.034*"come" + 0.033*"hall" + 0.031*"east" + 0.024*"today" + 0.023*"presenting" + 0.021*"exhibition" + 0.021*"exhibition hall" + 0.018*"east exhibition" + 0.018*"east exhibition hall"

Topic  2
0.045*"learning" + 0.033*"machine" + 0.031*"machine learning" + 0.028*"workshop" + 0.022*"ai" + 0.022*"amp" + 0.011*"booth" + 0.009*"upcoming" + 0.009*"climate" + 0.008*"computational"

Topic  3
0.026*"papers" + 0.022*"just" + 0.014*"things" + 0.013*"pm east" + 0.013*"help" + 0.013*"research" + 0.012*"people" + 0.012*"bias" + 0.012*"language" + 0.011*"love"

Topic  4
0.041*"celestekidd" + 0.026*"talk" + 0.021*"know" + 0.019*"neuripsconf" + 0.015*"people" + 0.013*"humans" + 0.013*"thank" + 0.013*"beliefs" + 0.011*"form" + 0.011*"vancouver"

Topic  5
0.075*"learning" + 0.039*"deep" + 0.

In [52]:
pyLDAvis.display(vis_data)

  and should_run_async(code)


In [53]:
pyLDAvis.save_html(vis_data, "pldavis1.html")

  and should_run_async(code)


In [70]:
vis_data = lda_vis(doc_word, feature_names, id2word, dictionary, 8)

  and should_run_async(code)



Topic  0
0.032*"session" + 0.031*"poster" + 0.023*"work" + 0.017*"papers" + 0.017*"great" + 0.017*"poster session" + 0.015*"ml" + 0.012*"neurips" + 0.011*"interesting" + 0.010*"bengio"

Topic  1
0.050*"poster" + 0.030*"come" + 0.030*"hall" + 0.027*"east" + 0.022*"presenting" + 0.022*"today" + 0.021*"workshop" + 0.019*"work" + 0.019*"exhibition" + 0.019*"exhibition hall"

Topic  2
0.034*"ai" + 0.030*"learning" + 0.021*"paper" + 0.021*"amp" + 0.020*"new" + 0.012*"neuripsconf" + 0.012*"research" + 0.012*"data" + 0.012*"machine" + 0.010*"machine learning"

Topic  3
0.022*"just" + 0.019*"people" + 0.018*"time" + 0.017*"research" + 0.016*"im" + 0.016*"conference" + 0.014*"good" + 0.013*"models" + 0.012*"year" + 0.010*"paper"

Topic  4
0.021*"vancouver" + 0.016*"ai" + 0.014*"talk" + 0.014*"dec" + 0.014*"chat" + 0.013*"celestekidd" + 0.012*"know" + 0.010*"booth" + 0.009*"ml" + 0.009*"workshop"

Topic  5
0.063*"learning" + 0.043*"talk" + 0.037*"deep" + 0.027*"deep learning" + 0.022*"workshop" 

In [71]:
pyLDAvis.display(vis_data)

  and should_run_async(code)


In [72]:
pyLDAvis.save_html(vis_data, "pldavis2.html")

  and should_run_async(code)


In [58]:
def lda_dt(doc_word, feature_names, id2word, dictionary, num_topics):
    corpus = matutils.Sparse2Corpus(doc_word.transpose())
    model = models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=id2word, passes=5, random_state=42)
    topics = model.print_topics()
    for n, topic in topics:
        print("\nTopic ", n)
        print(topic)

    return model, corpus

  and should_run_async(code)


In [59]:
lda_model, corpus = lda_dt(doc_word, feature_names, id2word, dictionary, 10)

  and should_run_async(code)



Topic  0
0.024*"session" + 0.021*"bengio" + 0.019*"neurips" + 0.018*"poster" + 0.017*"research" + 0.017*"papers" + 0.015*"yoshua" + 0.015*"yoshua bengio" + 0.014*"great" + 0.012*"neural"

Topic  1
0.078*"poster" + 0.034*"come" + 0.033*"hall" + 0.031*"east" + 0.024*"today" + 0.023*"presenting" + 0.021*"exhibition" + 0.021*"exhibition hall" + 0.018*"east exhibition" + 0.018*"east exhibition hall"

Topic  2
0.045*"learning" + 0.033*"machine" + 0.031*"machine learning" + 0.028*"workshop" + 0.022*"ai" + 0.022*"amp" + 0.011*"booth" + 0.009*"upcoming" + 0.009*"climate" + 0.008*"computational"

Topic  3
0.026*"papers" + 0.022*"just" + 0.014*"things" + 0.013*"pm east" + 0.013*"help" + 0.013*"research" + 0.012*"people" + 0.012*"bias" + 0.012*"language" + 0.011*"love"

Topic  4
0.041*"celestekidd" + 0.026*"talk" + 0.021*"know" + 0.019*"neuripsconf" + 0.015*"people" + 0.013*"humans" + 0.013*"thank" + 0.013*"beliefs" + 0.011*"form" + 0.011*"vancouver"

Topic  5
0.075*"learning" + 0.039*"deep" + 0.

In [60]:
all_topic_words = []
for topic_id in range(lda_model.num_topics):
    topk = lda_model.show_topic(topic_id, 30)
    topk_words = [ w for w, _ in topk ]
    all_topic_words.append(topk_words)
    
    print('{}: {}'.format(topic_id, ' '.join(topk_words)))

0: session bengio neurips poster research papers yoshua yoshua bengio great neural google latest sessions booth graphs want poster session deep nets msftresearch work excited rl coauthored network week neuripsconf apple analysis poster sessions
1: poster come hall east today presenting exhibition exhibition hall east exhibition hall east exhibition session work pm poster session dec networks check hall bc workshop paper neural hi neuripsconf wimlworkshop morning present say neural networks research learn
2: learning machine machine learning workshop ai amp booth upcoming climate computational want miss competition thursday change challenge neuripsconf panel talks stop happy ml talk team today experience imitation climate change west join
3: papers just things pm east help research people bias language love time making seeing conference like message mean hiring look lab healthcare demo says able natural ibm world main year read
4: celestekidd talk know neuripsconf people humans thank be

  and should_run_async(code)


In [64]:
lda_corpus = lda_model[corpus]
lda_corpus
lda_docs = [doc for doc in lda_corpus]

  and should_run_async(code)


In [65]:
def topic_weights_to_matrix(topic_weights, doc_ids=None, topic_ids=None):
    topic_dicts = [dict(doc) for doc in topic_weights]
    doc_to_top_mat = pd.DataFrame(topic_dicts).fillna(0)
    if doc_ids:
        doc_to_top_mat.rename(index=lambda ind: doc_ids[ind], inplace=True)
    if topic_ids:
        doc_to_top_mat.rename(columns=lambda ind: topic_ids[ind], inplace=True)
    return doc_to_top_mat

  and should_run_async(code)


In [66]:
doc_topic_matrix = topic_weights_to_matrix(lda_docs)
doc_topic_matrix

  and should_run_async(code)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.774971,0.025002,0.025006,0.025001,0.025002,0.025004,0.025001,0.025005,0.025003,0.025004
1,0.774985,0.025001,0.025001,0.025006,0.025001,0.025002,0.025001,0.025001,0.025001,0.025001
2,0.000000,0.000000,0.324852,0.000000,0.000000,0.000000,0.000000,0.227959,0.400507,0.000000
3,0.010004,0.574421,0.010001,0.010001,0.010001,0.345564,0.010003,0.010002,0.010001,0.010001
4,0.016674,0.176679,0.235965,0.178296,0.016673,0.016671,0.309034,0.016668,0.016671,0.016670
...,...,...,...,...,...,...,...,...,...,...
4713,0.000000,0.000000,0.000000,0.178942,0.000000,0.000000,0.661041,0.000000,0.000000,0.106159
4714,0.010002,0.010001,0.517696,0.200365,0.010001,0.010003,0.010001,0.211929,0.010001,0.010001
4715,0.025002,0.025002,0.025005,0.025001,0.025001,0.025003,0.025001,0.774978,0.025006,0.025002
4716,0.073398,0.127272,0.094635,0.000000,0.000000,0.000000,0.667181,0.000000,0.000000,0.000000


In [69]:
doc_topic_matrix.to_pickle("./dtm_2019")

  and should_run_async(code)


In [67]:
doc_topic = doc_topic_matrix.values
doc_topic

  and should_run_async(code)


array([[0.77497101, 0.02500193, 0.02500628, ..., 0.02500511, 0.0250029 ,
        0.02500433],
       [0.77498472, 0.02500105, 0.02500118, ..., 0.02500105, 0.02500129,
        0.02500105],
       [0.        , 0.        , 0.3248522 , ..., 0.2279595 , 0.400507  ,
        0.        ],
       ...,
       [0.02500238, 0.02500158, 0.02500535, ..., 0.77497768, 0.02500601,
        0.02500162],
       [0.07339827, 0.12727159, 0.09463549, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.15896943,
        0.51714253]])