In [27]:
# Importing all the necessary libraries
import os
import numpy as np
import pandas as pd

In [28]:
# Read the Quora questions CSV into a DataFrame.
# The original dataset consists of around 800,000 records; only the first
# 200,000 rows are read here to keep processing manageable.
df = pd.read_csv(
    "../AI2/quora_questions.csv",
    nrows=200_000,
)

In [29]:
# Preview the first rows of the loaded DataFrame to check its columns.
df.head()

Unnamed: 0,question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
# Configure the TF-IDF vectorizer that turns question text into term weights.
tfidf = TfidfVectorizer(
    stop_words="english",  # use scikit-learn's built-in English stop-word list
    min_df=10,             # integer threshold: drop terms found in fewer than 10 documents
    max_df=0.30,           # float threshold: drop terms found in more than 30% of documents
)

In [42]:
# Learn the vocabulary from the "question" column and build the TF-IDF
# document-term matrix (one row per question) in a single pass.
doc_term_matrix_tfidf = tfidf.fit_transform(raw_documents=df["question"])

In [43]:
# Echo the matrix repr to check its shape, dtype and number of stored elements.
doc_term_matrix_tfidf

<200000x9790 sparse matrix of type '<class 'numpy.float64'>'
	with 913044 stored elements in Compressed Sparse Row format>

In [44]:
from sklearn.decomposition import NMF

In [45]:
# Set up the NMF topic model.
nmf_model = NMF(
    random_state=1,   # fixed seed so repeated runs give the same factorization
    n_components=20,  # number of topics to extract
)

In [46]:
# Fit the NMF model on the TF-IDF matrix; fit() returns the estimator itself,
# so the notebook echoes its configuration below the cell.
nmf_model.fit(X=doc_term_matrix_tfidf)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=20, random_state=1, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [47]:
# Spot-check the learned vocabulary by looking up the term stored at index 500.
# Prefer get_feature_names_out() where the installed scikit-learn provides it
# and fall back to the legacy get_feature_names() otherwise, so the cell runs
# on both old and new releases.
_get_vocabulary = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names
_get_vocabulary()[500]

'aliexpress'

In [48]:
# Print the highest-weighted vocabulary terms of every NMF topic and collect
# them for later use.
#   word_list        -- one single-element list ([term]) per printed term
#   probability_list -- the vocabulary index of each printed term
# NOTE(review): despite its name, probability_list stores vocabulary indices,
# not the term weights (prob_num[number]) -- confirm which one is intended.
word_list = []
probability_list = []

top_number = 50

# Resolve the vocabulary once instead of rebuilding it for every printed word.
# get_feature_names_out() is preferred where available; older scikit-learn
# releases only offer get_feature_names().
_vocabulary_getter = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names
feature_names = _vocabulary_getter()

for topic_idx, prob_num in enumerate(nmf_model.components_):
    print(f"Top {top_number} words for topic {topic_idx} are: ")
    # argsort() is ascending, so the last `top_number` positions hold the
    # indices of the most heavily weighted terms for this topic.
    for number in prob_num.argsort()[-top_number:]:
        word = feature_names[number]
        print([word], end="")
        word_list.append([word])
        probability_list.append(number)

    print("\n")
        

Top 50 words for topic 0 are: 
['apps']['preparation']['seo']['college']['ve']['games']['service']['gift']['software']['beginners']['download']['bollywood']['digital']['company']['sites']['institute']['marketing']['smartphone']['course']['delhi']['watch']['coaching']['songs']['android']['world']['learning']['app']['engineering']['mobile']['friend']['hollywood']['free']['read']['online']['thing']['site']['website']['place']['visit']['places']['phone']['buy']['ways']['laptop']['movie']['2016']['books']['book']['movies']['best']

Top 50 words for topic 1 are: 
['support']['increase']['pay']['means']['energy']['man']['god']['new']['exactly']['works']['symbol']['dog']['girl']['average']['stand']['facebook']['water']['woman']['phrase']['com']['guy']['look']['use']['matter']['cause']['dream']['person']['love']['help']['say']['differ']['word']['need']['affect']['come']['universities']['says']['majors']['grads']['recruit']['looking']['really']['sex']['exist']['compare']['cost']['long']['feel'][

['muscle']['face']['increase']['food']['height']['advice']['day']['25']['really']['water']['possible']['20']['want']['effectively']['30']['faster']['old']['year']['green']['slowly']['eat']['week']['eating']['tea']['days']['weeks']['months']['body']['kg']['plan']['belly']['loose']['losing']['safely']['doing']['diet']['reduce']['healthy']['pounds']['help']['month']['exercise']['quickly']['loss']['fat']['fast']['ways']['gain']['lose']['weight']

Top 50 words for topic 16 are: 
['yes']['greatest']['thing']['universe']['free']['visit']['short']['10']['period']['backward']['women']['past']['fall']['woman']['exist']['light']['faster']['speed']['right']['years']['notice']['unexpected']['movie']['space']['gifts']['invited']['foreign']['visitor']['bring']['jobs']['manage']['real']['visitors']['change']['having']['long']['spend']['machine']['stop']['job']['feel']['movies']['favorite']['home']['person']['love']['sex']['possible']['travel']['time']

Top 50 words for topic 17 are: 
['successful']['f

In [49]:
# Project every question onto the fitted topics: the result has one row per
# question and one column per NMF component.
question_topics = nmf_model.transform(X=doc_term_matrix_tfidf)

In [50]:
# Echo the question-by-topic weight array (NumPy abbreviates the output).
question_topics

array([[3.05765024e-04, 1.96703808e-07, 1.09313703e-05, ...,
        0.00000000e+00, 0.00000000e+00, 8.16804805e-04],
       [3.35233642e-04, 3.16154574e-04, 0.00000000e+00, ...,
        0.00000000e+00, 2.99618500e-04, 1.67678068e-04],
       [2.35799586e-04, 6.25435177e-04, 1.35696397e-03, ...,
        8.93308458e-04, 4.27922633e-04, 2.99266804e-03],
       ...,
       [0.00000000e+00, 1.05394361e-03, 2.05308783e-03, ...,
        3.67063863e-05, 6.36355609e-03, 5.06219398e-04],
       [4.94091563e-03, 2.51074138e-04, 0.00000000e+00, ...,
        0.00000000e+00, 3.48972349e-03, 0.00000000e+00],
       [2.30320256e-03, 6.62936439e-04, 8.05777675e-07, ...,
        7.28488884e-06, 3.40439757e-03, 0.00000000e+00]])

In [51]:
# Sanity check: expect (number of questions, number of topics).
question_topics.shape

(200000, 20)

In [53]:
# Topic weights of the question at row index 1 (one value per topic).
question_topics[1]

array([3.35233642e-04, 3.16154574e-04, 0.00000000e+00, 0.00000000e+00,
       2.37279636e-04, 1.52183052e-04, 1.50760062e-04, 0.00000000e+00,
       3.01256009e-04, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       6.99983586e-04, 1.40437556e-05, 3.28912020e-04, 0.00000000e+00,
       1.49484922e-04, 0.00000000e+00, 2.99618500e-04, 1.67678068e-04])

In [54]:
# Topic weights of the question at row index 2, rounded to two decimals.
# The raw weights are small enough that every entry is displayed as 0.
question_topics[2].round(2)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])

In [55]:
# Index of the highest-weighted topic for the question at row index 3.
question_topics[3].argmax()

13

In [56]:
# Assign each question the topic with the largest weight in its row.
# A single vectorized argmax over axis 1 replaces the per-row Python loop;
# wrapping it in list() keeps topic_list a list of NumPy integers as before.
topic_list = list(question_topics.argmax(axis=1))

# Store the dominant topic next to each question.
df["Topic Number"] = topic_list

In [58]:
# Inspect the first 60 questions together with their assigned topic number.
df.head(60)

Unnamed: 0,question,Topic Number
0,What is the step by step guide to invest in sh...,6
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,12
2,How can I increase the speed of my internet co...,19
3,Why am I mentally very lonely? How can I solve...,13
4,"Which one dissolve in water quikly sugar, salt...",15
5,Astrology: I am a Capricorn Sun Cap moon and c...,1
6,Should I buy tiago?,0
7,How can I be a good geologist?,8
8,When do you use ã‚· instead of ã—?,9
9,Motorola (company): Can I hack my Charter Moto...,9
