In [2]:
import pandas as pd

# Load the questions CSV from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="quora_questions.csv")

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(404289, 1)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import re
from nltk.corpus import stopwords
import nltk

In [6]:
def preprocess_text(text):
    """Normalise one raw question into lowercase, space-separated tokens.

    Steps, in order:

    1. Convert the input with ``str()`` and replace every non-word
       character (punctuation, symbols) with a space.  Letters of any
       script, digits and underscores are word characters and are kept.
    2. Lowercase the text.
    3. Drop every stand-alone single ASCII letter, i.e. a letter with
       whitespace or a string edge on both sides.  This covers the start,
       the middle and the end of the text, including runs of adjacent
       single letters such as ``"x y z"``.
    4. Collapse whitespace runs to one space and trim both ends.

    Parameters
    ----------
    text : object
        Raw question.  Non-string values are cleaned as their ``str()``
        representation.

    Returns
    -------
    str
        The cleaned text; empty when nothing but punctuation, whitespace
        or single letters was present.
    """
    text = re.sub(r'\W', ' ', str(text)).lower()
    # Lookarounds instead of consuming the neighbouring whitespace: a
    # consuming pattern (\s+[a-z]\s+) eats the separator that the next single
    # letter needs, so only every other letter in a run would be removed.
    # "Not preceded / not followed by a non-space" also matches at the very
    # start and end of the string, so no separate leading-letter rule is
    # needed.
    text = re.sub(r'(?<!\S)[a-z](?!\S)', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

# Run every raw question through preprocess_text and keep the result next to
# the original in a new column.
df['cleaned_question'] = df['Question'].apply(preprocess_text)

# TF-IDF vocabulary: English stop words removed, terms must appear in at least
# 2 documents (min_df) and in at most 95% of documents (max_df).
tfidf_vect = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

# Learn the vocabulary and build the document-term matrix in one pass.
dtm = tfidf_vect.fit_transform(df['cleaned_question'])


In [7]:
# Show the repr of the matrix returned by tfidf_vect.fit_transform.
dtm

<404289x38669 sparse matrix of type '<class 'numpy.float64'>'
	with 2002912 stored elements in Compressed Sparse Row format>

In [8]:
from sklearn.decomposition import NMF

# Number of components (topics) the factorisation is asked for.
n_components = 20

# Fixed random_state so repeated runs start from the same initialisation.
nmf_model = NMF(
    random_state=42,
    n_components=n_components,
)

# Fit on the TF-IDF document-term matrix.
nmf_model.fit(dtm)





In [9]:
def display_topics(model, feature_names, no_top_words):
    """Print each topic's highest-weighted terms.

    For every row of ``model.components_`` this prints a ``Topic <index>:``
    header line, followed by one line holding the ``no_top_words`` terms
    with the largest weights in that row, heaviest first, separated by
    single spaces.

    Parameters
    ----------
    model : object
        Anything exposing ``components_`` as an iterable of 1-D weight
        arrays that support ``argsort()``.
    feature_names : sequence
        Term for each column index of ``components_``.
    no_top_words : int
        How many terms to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading slice.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[col] for col in ranked]
        print("Topic %d:" % (topic_idx))
        print(" ".join(top_terms))

# Vocabulary terms, indexed by document-term-matrix column.
feature_names = tfidf_vect.get_feature_names_out()

# Print the top 10 words for each topic.
display_topics(nmf_model, feature_names, no_top_words=10)


Topic 0:
best movies book books 2016 ways movie laptop buy phone
Topic 1:
does mean work feel long cost compare really exist use
Topic 2:
quora questions question ask answer answers google asked delete improvement
Topic 3:
money make online earn ways youtube easy home free internet
Topic 4:
life purpose meaning thing important real moment change want live
Topic 5:
india pakistan war spotify job available olympics country business china
Topic 6:
learn programming language start learning java languages python want hacking
Topic 7:
trump donald clinton president hillary win did election better vote
Topic 8:
world war did start iii country end happen pakistan place
Topic 9:
like feel sex look girl live girls work women culture
Topic 10:
good books bad ways engineering work job start read business
Topic 11:
500 notes 1000 rs rupee indian black banning ban government
Topic 12:
know new things day going employees don year 2017 girl
Topic 13:
english improve skills writing speaking pronunciati

In [15]:
# Project the document-term matrix onto the fitted NMF components.
topic_results = nmf_model.transform(X=dtm)

In [14]:
# Label each question with the index of its highest-weighted column in
# topic_results. The argmax is computed once here; a bare
# `topic_results.argmax(axis=1)` statement earlier in the cell was never
# displayed (only a cell's final expression is rendered) and just repeated
# the work.
df['Topic'] = topic_results.argmax(axis=1)

# Preview the first ten rows with the new Topic column.
df.head(10)

Unnamed: 0,Question,cleaned_question,Topic
0,What is the step by step guide to invest in sh...,what is the step by step guide to invest in sh...,5
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,what is the story of kohinoor koh noor diamond,16
2,How can I increase the speed of my internet co...,how can increase the speed of my internet conn...,17
3,Why am I mentally very lonely? How can I solve...,why am mentally very lonely how can solve it,11
4,"Which one dissolve in water quikly sugar, salt...",which one dissolve in water quikly sugar salt ...,14
5,Astrology: I am a Capricorn Sun Cap moon and c...,astrology am capricorn sun cap moon and cap ri...,1
6,Should I buy tiago?,should buy tiago,0
7,How can I be a good geologist?,how can be good geologist,10
8,When do you use シ instead of し?,when do you use シ instead of し,19
9,Motorola (company): Can I hack my Charter Moto...,motorola company can hack my charter motorolla...,17
