## LDA topic modeling
#### Example using the 20 Newsgroups dataset from sklearn
* https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730
* https://medium.com/@yanlinc/how-to-build-a-lda-topic-model-using-from-text-601cdcbfd3a6

In [1]:
# get the data
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = dataset.data

In [2]:
# define a simple tokenizer (NLTK won't be available to us later on, in our Lambda function)
import re

REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(=)|(`)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)|(\n)|(\t)")

def simple_tokenizer(input_text):
    tokens = REPLACE_NO_SPACE.sub("", input_text.lower())
    tokens = REPLACE_WITH_SPACE.sub(" ", tokens)
    return tokens

In [3]:
# tokenize the text
tok_documents = [simple_tokenizer(doc) for doc in documents]
len(tok_documents)

11314

In [4]:
tok_documents[0]

'well im not sure about the story nad it did seem biased what i disagree with is your statement that the us media is out to ruin israels reputation that is rediculous the us media is the most pro israeli media in the world having lived in europe i realize that incidences such as the one described in the letter have occured the us media as a whole seem to try to ignore them the us is subsidizing israels existance and the europeans are not at least not to the same degree so i think that might be a reason they report more clearly on the atrocities  what is a shame is that in austria daily reports of the inhuman acts commited by israeli soldiers and the blessing received from the government makes some of the holocaust guilt go away after all look how the jews are treating other races when they got power it is unfortunate '

In [5]:
# Vectorize the text with TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

no_features = 1000

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(tok_documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [6]:
from sklearn.decomposition import LatentDirichletAllocation

no_topics = 20

# Instantiate the LDA class object from scikitlearn
lda = LatentDirichletAllocation(n_components=no_topics, 
                                max_iter=5, 
                                learning_method='online', 
                                learning_offset=50.,
                                random_state=0)

# Fit our LDA model onto the vectorized text data
lda.fit(tfidf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=50.0,
                          max_doc_update_iter=100, max_iter=5,
                          mean_change_tol=0.001, n_components=20, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [7]:
# column names
topicnames = ['Topic' + str(i) for i in range(lda.n_components)]

In [8]:
# Topic-Keyword Matrix
import pandas as pd
df_topic_keywords = pd.DataFrame(lda.components_)
# Assign Column and Index
df_topic_keywords.columns = tfidf_vectorizer.get_feature_names()
df_topic_keywords.index = topicnames
# View
df_topic_keywords.head()

Unnamed: 0,00,02,04,0t,10,100,1000,11,12,13,...,x11,xt,year,years,yes,york,youll,young,youre,youve
Topic0,0.050064,0.05002,0.05003,0.050017,0.050484,0.050024,1.134016,0.050022,0.05007,0.050123,...,0.050021,0.050017,0.050034,0.166502,0.050538,0.050521,0.050024,0.050085,0.050032,0.050042
Topic1,0.704079,1.422826,1.709286,0.05002,19.092435,9.271399,3.652171,8.272621,12.355965,8.64721,...,0.050019,0.050021,64.334747,79.356512,42.112647,16.139716,10.725409,22.960021,49.971997,18.01373
Topic2,0.05002,0.050016,0.05002,0.050022,0.050019,0.050018,0.050023,0.050021,0.050019,0.050021,...,0.05002,0.050018,0.050019,0.05007,0.05002,0.050021,0.05002,0.050048,0.050021,0.050016
Topic3,0.05002,0.050018,0.05002,0.050022,0.050019,0.050018,0.050021,0.050018,0.050026,0.050021,...,0.050017,0.050018,0.050028,0.050034,0.050072,0.050019,0.050021,0.050034,0.050063,0.05002
Topic4,9.23174,6.486786,3.931163,0.144278,10.494104,5.501869,2.826167,5.728174,7.733292,3.742519,...,12.511049,9.827182,5.553785,2.626648,9.272604,1.292217,7.256,0.32168,8.146721,1.310245


In [9]:
# Get the top 15 keywords for each topic
import numpy as np

def show_topics(vectorizer, lda_model, n_words):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

In [10]:
# Get the top 15 keywords each topic
topic_keywords = show_topics(vectorizer=tfidf_vectorizer, lda_model=lda, n_words=15)
len(topic_keywords)

20

In [11]:
# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords.head(2)

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,israel,israeli,jews,armenian,armenians,turkish,turkey,war,killed,turks,peace,jewish,armenia,land,soviet
Topic 1,people,dont,think,just,god,like,know,time,did,good,say,said,im,right,does


In [12]:
# Extract topic probability scores with LDA Transform
topic_probability_scores = lda.transform(tfidf)
print(len(topic_probability_scores))
print(len(tok_documents))

11314
11314


In [13]:
# create a dataframe with our results
df_final = pd.DataFrame(documents, columns=['text'])
dom_topics=[np.argmax(topic_probability_scores[index]) for index in range(len(topic_probability_scores))]
df_final['pred_topic']=dom_topics
df_final['topic_words']=df_toks['pred_topic'].apply(lambda x: df_topic_keywords.iloc[x].values.tolist())
df_final.head(5)

NameError: name 'df_toks' is not defined

In [None]:
df_final.iloc[9][0]

In [None]:
df_final.iloc[9][2]