# Topic Modelling for quora questions

In [2]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv('C:\\Users\\HP\\OneDrive\\Desktop\\UPDATED_NLP_COURSE\\05-Topic-Modeling\\quora_questions.csv')

In [7]:
df.head()

Unnamed: 0,Question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."


In [9]:
len(df)

404289

In [11]:
data = df[0:50000]

In [14]:
data.isnull().sum()

Question    0
dtype: int64

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_df = 0.90, min_df = 2, stop_words='english')
dtm = cv.fit_transform(data['Question'])

In [19]:
dtm

<50000x13573 sparse matrix of type '<class 'numpy.int64'>'
	with 238154 stored elements in Compressed Sparse Row format>

In [23]:
from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(n_components = 7)

LDA.fit(dtm)

LatentDirichletAllocation(n_components=7)

In [27]:
len(LDA.components_[0])  # Each word is treated as a feature, for which we have 50000x13573 words, and this returns the probability of each word/feature belonging to that topic

13573

In [30]:
first_topic = LDA.components_[0]

In [32]:
first_topic.argsort()

array([ 346, 2328, 8062, ..., 6984, 7392, 1496], dtype=int64)

In [35]:
first_topic[346]  # Least Probability of the word belonging to first topic

0.14285716827814351

In [37]:
first_topic[1496] # Highest Probability of the word belonging to first topic

1295.8476384240719

In [39]:
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['earn', 'start', 'ways', 'programming', 'lose', 'language', 'use', 'weight', 'india', 'way', 'online', 'money', 'learn', 'make', 'best']


THE TOP 15 WORDS FOR TOPIC #1
['did', 'people', 'black', 'math', 'rs', 'improve', 'war', 'does', 'notes', '1000', 'love', '500', 'indian', 'india', 'world']


THE TOP 15 WORDS FOR TOPIC #2
['did', 'women', 'make', 'used', 'word', 'sex', 'ask', 'like', 'examples', 'thing', 'questions', 'does', 'life', 'people', 'quora']


THE TOP 15 WORDS FOR TOPIC #3
['hillary', 'iphone', 'did', 'movies', 'new', 'clinton', 'english', 'movie', 'instagram', 'president', 'donald', 'does', 'best', 'know', 'trump']


THE TOP 15 WORDS FOR TOPIC #4
['quora', 'does', 'study', 'exam', 'science', 'computer', 'prepare', 'data', 'way', 'book', 'books', 'engineering', 'difference', 'good', 'best']


THE TOP 15 WORDS FOR TOPIC #5
['com', 'india', 'phone', 'app', 'android', 'time', 'think', 'feel', 'best', 'account', 'facebook', 'stop', 'girl', 'does

In [42]:
topic_results = LDA.transform(dtm)

In [44]:
topic_results[0] # Probability of each document belonging to each topic 

array([0.76219992, 0.01787775, 0.14841885, 0.0178632 , 0.01788413,
       0.01786298, 0.01789317])

In [46]:
data['Topic'] = topic_results.argmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Topic'] = topic_results.argmax(axis=1)


In [47]:
data.head()

Unnamed: 0,Question,Topic
0,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,5
2,How can I increase the speed of my internet co...,5
3,Why am I mentally very lonely? How can I solve...,2
4,"Which one dissolve in water quikly sugar, salt...",2
