# Process

In [6]:
file = open('RLEM_subs.txt').read()

In [17]:
## num words
words = file.split()
len(words)

35241

In [26]:
num_words_per_doc = 100
docs = list(map(lambda x: ' '.join(words[x: x + num_words_per_doc]), range(0, len(words), num_words_per_doc)))

In [22]:
## num docs
len(docs)

353

# Run LDA

In [30]:
import pandas as pd 
from gensim.sklearn_api import LdaTransformer
from gensim.corpora import Dictionary
from sklearn.base import BaseEstimator, MetaEstimatorMixin
import re
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict

class CV2BOW(BaseEstimator, MetaEstimatorMixin):
    """Transform a corpus into Bag-of-Word representation."""
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        t = X.tocoo()

        # (token_id, token_count)
        docs = defaultdict(list)
        for d, r, c in list(zip(t.data, t.row, t.col)):
            docs[r].append((c, d))
        docs = list(map(lambda x: docs[x], range(len(docs))))
        return docs

pipeline__lda = Pipeline([
    ('cv', CountVectorizer(min_df=.01, max_df=0.5, stop_words='english')),
    ('doc2bow', CV2BOW()),
    ('lda', LdaTransformer(num_topics=10, iterations=50)),
])

In [31]:
docs_s = pd.Series(docs)
lda = pipeline__lda.fit_transform(docs)
beta_matrix = pd.DataFrame(
    data=pipeline__lda['lda'].gensim_model.expElogbeta,
    columns=sorted(pipeline__lda['cv'].vocabulary_)
).T

top_words = {}
for col in beta_matrix.columns:
    topic = beta_matrix[col].sort_values(ascending=False)
    topic_key = '%s, %s, %s' % (topic.index[0], topic.index[1], topic.index[2])
    top_words[topic_key] = list(topic.iloc[3:10].index)

In [32]:
pd.DataFrame(top_words)

Unnamed: 0,"know, right, ll","model, learning, energy","function, like, know","know, going, thank","energy, know, learning","control, like, actually","know, energy, like","learning, control, know","like, thank, different","model, know, like"
0,just,policy,actually,policy,building,think,different,like,try,building
1,okay,just,thing,just,just,know,thank,state,energy,controller
2,let,reinforcement,control,like,like,right,just,time,data,learning
3,minutes,control,controller,right,yeah,question,kind,actually,agents,state
4,don,agent,use,using,reinforcement,learning,okay,reinforcement,demand,based
5,rl,using,set,control,price,building,actually,just,really,function
6,really,like,point,set,time,model,right,set,just,kind
