# Question Topic Modeling

Steps:
- construct vector space model for documents, resulting in a term-document matrix A.
- apply TFIDF term weight normalisation to A.
- normalize TFIDF vectors to unit length.
- initialise factors using Non-negative Double Singular Value Decomposition (NNDSVD) on A.
- apply projected-gradient Non-negative Matrix Factorization (NMF) to A.



- Basis vector = the topics (clusters).
- Coefficient Matrix = the membership weights for documents relative to each topic (cluster). 

In [8]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [5]:
# Load the questions dataset from a CSV file in the working directory.
q = pd.read_csv("questions.csv")
# Preview the first rows to check which columns were read.
q.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [6]:
# Build TF-IDF features from the question text. English stop words are removed,
# along with terms found in more than 95% of the questions or in fewer than
# 2 questions.
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)
# Learn the vocabulary and produce the document-term matrix in a single pass.
dtm = tfidf.fit_transform(q['Question'])

In [9]:
# Non-negative Matrix Factorization with 20 topics.
# The steps listed at the top of this notebook call for NNDSVD initialisation,
# so it is requested explicitly here: what init=None resolves to is decided by
# the installed scikit-learn release, not by this notebook.
# random_state pins the seed so repeated fits give the same factors.
nmf_model = NMF(n_components=20, init='nndsvd', random_state=44)
nmf_model.fit(dtm)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=20, random_state=44, shuffle=False, solver='cd', tol=0.0001,
  verbose=0)

In [10]:
# Print the 15 highest-weighted words for each of the 20 topics.
#
# The vocabulary is fetched once, up front: the previous version called
# get_feature_names() again for every printed word, rebuilding the whole
# vocabulary list each time. get_feature_names_out() is used when it exists
# because get_feature_names() is absent from recent scikit-learn releases.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = tfidf.get_feature_names()

for index, topic in enumerate(nmf_model.components_):
    print(f"The top 15 words for topic # {index}")
    # argsort() is ascending, so the last 15 positions hold the largest weights.
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

The top 15 words for topic # 0
['thing', 'read', 'place', 'visit', 'places', 'phone', 'buy', 'laptop', 'movie', 'ways', '2016', 'books', 'book', 'movies', 'best']


The top 15 words for topic # 1
['majors', 'recruit', 'sex', 'looking', 'differ', 'use', 'exist', 'really', 'compare', 'cost', 'long', 'feel', 'work', 'mean', 'does']


The top 15 words for topic # 2
['add', 'answered', 'needing', 'post', 'easily', 'improvement', 'delete', 'asked', 'google', 'answers', 'answer', 'ask', 'question', 'questions', 'quora']


The top 15 words for topic # 3
['using', 'website', 'investment', 'friends', 'black', 'internet', 'free', 'home', 'easy', 'youtube', 'ways', 'earn', 'online', 'make', 'money']


The top 15 words for topic # 4
['balance', 'earth', 'day', 'death', 'changed', 'live', 'want', 'change', 'moment', 'real', 'important', 'thing', 'meaning', 'purpose', 'life']


The top 15 words for topic # 5
['reservation', 'engineering', 'minister', 'president', 'company', 'china', 'business', 'coun

In [12]:
# Per-question topic weights produced by the fitted NMF model.
topic_results = nmf_model.transform(dtm)
# Label each question with the index of its highest-weight topic. (The argmax
# used to be evaluated a second time on its own line with the result thrown
# away; that dead statement is removed.)
q['Topic'] = topic_results.argmax(axis=1)
q.head()

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,16
2,How can I increase the speed of my internet co...,17
3,Why am I mentally very lonely? How can I solve...,11
4,"Which one dissolve in water quikly sugar, salt...",14


# News Article Topic Modeling

In [1]:
import pandas as pd
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Load the NPR articles dataset from a CSV file in the working directory.
npr = pd.read_csv('npr.csv')
# Preview the first rows to check which columns were read.
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [3]:
# Number of rows (articles) in the dataset.
len(npr)

11992

In [4]:
# Bag-of-words counts for the articles. English stop words are ignored, as are
# terms found in more than 90% of the articles and terms found in fewer than
# 2 articles.
cv = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.9,
)
# Document-term matrix: fit the vocabulary and count the terms in one pass.
dtm = cv.fit_transform(npr['Article'])
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [5]:
# Fit a 7-topic Latent Dirichlet Allocation model on the count matrix;
# random_state pins the seed so the fit is repeatable.
LDA = LatentDirichletAllocation(n_components = 7, random_state = 44)
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=7, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=44, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [6]:
# Vocabulary learned by the CountVectorizer. get_feature_names_out() is used
# when it exists because get_feature_names() is absent from recent
# scikit-learn releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = cv.get_feature_names()

# random.randint() includes both end points, so randint(0, 54777) could return
# 54777, one past the last valid index of a 54777-word vocabulary. randrange()
# excludes the upper bound, and sizing it from the vocabulary itself removes
# the hard-coded count.
random_word_id = random.randrange(len(feature_names))
# grab the word of the vocabulary
feature_names[random_word_id]

'undercard'

In [7]:
# Number of components in the fitted model; the model was built with
# n_components = 7, so this is the number of topics.
len(LDA.components_)

7

In [8]:
# The components themselves: one row of weights per topic, which later cells
# index by vocabulary position to find each topic's strongest words.
LDA.components_

array([[1.44046744e-01, 2.42446204e+02, 3.14227403e+00, ...,
        1.43348891e-01, 1.42857149e-01, 1.95893786e-01],
       [1.46832143e+01, 1.65457566e+03, 1.42857151e-01, ...,
        6.14192361e+00, 1.43024727e-01, 1.42857180e-01],
       [1.18194891e+01, 4.60986276e+02, 1.42857149e-01, ...,
        1.42861840e-01, 1.42857145e-01, 1.43002371e-01],
       ...,
       [8.39830311e+00, 8.29002990e+02, 1.42857152e-01, ...,
        1.42911501e-01, 1.21401285e+00, 1.42857186e-01],
       [1.10551225e+01, 1.12332326e+03, 1.42857150e-01, ...,
        1.42857145e-01, 1.42857145e-01, 1.42857174e-01],
       [4.25817140e+01, 6.00628946e+01, 1.43410749e-01, ...,
        1.43182225e-01, 1.07153383e+00, 1.58426300e-01]])

In [9]:
# Take the first component (topic #0) to inspect on its own.
single_topic = LDA.components_[0]

In [10]:
# argsort() returns index positions ordered from the lowest to the highest
# value, so the final ten entries are the index positions of the ten
# highest-weighted words for this topic.
top_ten_words = single_topic.argsort()[-10:]  # index positions, not the words themselves

In [11]:
# visualizing the top ten words
#
# The vocabulary is fetched once instead of on every iteration, since each
# call rebuilds the full list. get_feature_names_out() is used when it
# exists because get_feature_names() is absent from recent scikit-learn
# releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = cv.get_feature_names()

for index in top_ten_words:
    print(feature_names[index])

prison
team
reports
npr
world
death
years
virus
zika
said


In [12]:
# For each topic, print its 15 highest-weighted words.
#
# The vocabulary is fetched once, up front: the previous version called
# get_feature_names() again for every printed word, rebuilding the whole
# vocabulary list each time. get_feature_names_out() is used when it exists
# because get_feature_names() is absent from recent scikit-learn releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = cv.get_feature_names()

for i, topic in enumerate(LDA.components_):
    print(f"THE TOP 15 WORDS FOR TOPIC #{i}")
    # argsort() is ascending, so the last 15 positions hold the largest weights.
    print([feature_names[index] for index in topic.argsort()[-15:]])
    print('\n')
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['country', 'says', 'told', 'time', 'year', 'prison', 'team', 'reports', 'npr', 'world', 'death', 'years', 'virus', 'zika', 'said']




THE TOP 15 WORDS FOR TOPIC #1
['country', 'world', 'percent', '000', 'just', 'years', 'city', 'company', 'china', 'year', 'like', 'new', 'people', 'food', 'says']




THE TOP 15 WORDS FOR TOPIC #2
['voters', 'donald', 'party', 'republican', 'white', 'election', 'new', 'state', 'obama', 'house', 'campaign', 'clinton', 'president', 'said', 'trump']




THE TOP 15 WORDS FOR TOPIC #3
['patients', 'new', 'don', 'just', 'children', 'study', 'women', 'students', 'percent', 'like', 'care', 'school', 'people', 'health', 'says']




THE TOP 15 WORDS FOR TOPIC #4
['scientists', 'way', 'science', 'war', 'human', 'said', 'water', 'world', 'new', 'time', 'years', 'just', 'like', 'people', 'says']




THE TOP 15 WORDS FOR TOPIC #5
['department', 'new', 'rights', 'country', 'states', 'president', 'government', 'federal', 'court', 'state',

In [13]:
# Topic weights for every article from the fitted LDA model.
topic_results = LDA.transform(dtm)
# Shape check: expect one row per article and one column per topic.
topic_results.shape

(11992, 7)

In [14]:
# Label each article with the index of its highest-weight topic.
npr['Topic'] = topic_results.argmax(axis=1)

In [17]:
# Preview the first ten articles together with their assigned topic.
npr.head(10)

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",2
1,Donald Trump has used Twitter — his prefe...,2
2,Donald Trump is unabashedly praising Russian...,2
3,"Updated at 2:50 p. m. ET, Russian President Vl...",2
4,"From photography, illustration and video, to d...",5
5,I did not want to join yoga class. I hated tho...,3
6,With a who has publicly supported the debunk...,3
7,"I was standing by the airport exit, debating w...",1
8,"If movies were trying to be more realistic, pe...",4
9,"Eighteen years ago, on New Year’s Eve, David F...",1
