<a href="https://colab.research.google.com/github/vispute/StackOverflow_semantic_search_engine/blob/master/3_LDA_Topic_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Acquiring preprocessed_dataset
tbs_df = pd.read_csv('/content/drive/My Drive/AAIC Course/Personal case study - StackOverflow/tbs_df.csv')

In [None]:
# defining a function to remove stop_words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.add('would')
stop_words.update([chr(c) for c in range(97, 123)])
# stop_words.remove('no'); stop_words.remove('not'); stop_words.remove('nor')

def stopwrd_removal(sent):
  lst = []
  for wrd in sent.split():
    if wrd not in stop_words:
      lst.append(wrd)
  return " ".join(lst)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def text_preprocessor(column):
  """pass any column with Text in it from tbs_df | Note: returns nothing makes inplace changes in tbs_df"""
  # 1. remove html tags, html urls, replace html comparison operators
  text = tbs_df[column].values
  tbs_df[column] = [re.sub('<.*?>', '', i) for i in text]
  tbs_df[column] = tbs_df[column].str.replace('&lt;', '<')\
                                          .str.replace('&gt;', '>')\
                                          .str.replace('&le;', '<=' )\
                                          .str.replace('&ge;', '>=')

  # 2. remove latex i,e., mostly formulas since it's mathematics based dataset
  tbs_df[column] = [re.sub('\$.*?\$', '', i) for i in text]

  # 3. all lowercase 
  tbs_df[column] = tbs_df[column].str.lower()

  # 4. decontractions
  tbs_df[column] = tbs_df[column].str.replace("won't", "will not").str.replace("can\'t", "can not").str.replace("n\'t", " not").str.replace("\'re", " are").str.\
                                                replace("\'s", " is").str.replace("\'d", " would").str.replace("\'ll", " will").str.\
                                                replace("\'t", " not").str.replace("\'ve", " have").str.replace("\'m", " am")

  # 5. remove all special-characters other than alpha-numericals
  tbs_df[column] = [re.sub('\W', ' ', i) for i in text]
  # remove all digits
  tbs_df[column] = [re.sub('\d', ' ', i) for i in text]

  # 6. Stop_word removal
  tbs_df[column] = [stopwrd_removal(i) for i in text]

  # 7. remove all white-space i.e., \n, \t, and extra_spaces
  tbs_df[column] = [re.sub('  +', ' ', i) for i in text]
  tbs_df[column] = tbs_df[column].str.replace("\n", " ").str.replace("\t", " ").str.strip()

**Note: For LDA modelling I am removing all digits and stopwords**

In [None]:
# 1. train_test split
train_set = 0.80
test_set = 1 - train_set
text_preprocessor('combined_text')

# 2. splitting 'combined_text'
title_body_train = tbs_df['combined_text'].values[:int(tbs_df.shape[0]*train_set)]
title_body_test = tbs_df['combined_text'].values[-int(tbs_df.shape[0]*test_set):]

# 3. tags splitting
tags = tbs_df['tag_pred1'].str.cat(tbs_df['tag_pred2'], sep = ' ').str.cat(tbs_df['tag_pred3'], sep = ' ').str.cat(tbs_df['tag_pred4'], sep = ' ').str.cat(tbs_df['tag_pred5'], sep = ' ').tolist()
tags_train = np.array(tags[:int(tbs_df.shape[0]*train_set)])
tags_test = np.array(tags[-int(tbs_df.shape[0]*test_set):])

title_body_train.shape, title_body_test.shape, tags_train.shape, tags_test.shape

((145631,), (36407,), (145631,), (36407,))

In [None]:
# 4. truncating title_body on 60 words
title_body_train = [' '.join(i.split(' ')[:60]) for i in title_body_train]
title_body_test = [' '.join(i.split(' ')[:60]) for i in title_body_test]

In [None]:
# 5. cleaning tags i.e., removing special characters
clean_train_tags = []
for i in tags_train:
  clean_train_tags.append(re.sub('[<>-]', "", i).strip())

clean_test_tags = []
for i in tags_test:
  clean_test_tags.append(re.sub('[<>-]', "", i).strip())

In [None]:
# 6. Joining 'combined_text' + 'Tags'
final_train = [i + ' ' + j for i, j in zip(title_body_train, clean_train_tags)]
final_test = [i + ' ' + j for i, j in zip(title_body_test, clean_test_tags)]

In [None]:
final_train[2480:2490]

['symbolic computer algebra statistics functionality exist cas specifically geared toward statistics symbolic algebra systems like mathematica maple often used calculus logic physics problems rarely used statistics statistical constructs could added symbolic algebra system improve use field specific code samples many people like able please think following three users research statistician non statistics researcher using statistics another field biology statistics student computational_statistics',
 'use rejection sampling generate draws unit exponential working practice test problems one says design rejection sampling algorithm produce draws unit exponential using draws gamma understand possible impression envelope function needs scalable manner constant see way gamma going little mass around exponential function mass around kind transformation need gamma function allow function envelope using tried flipping make inverse gamma adequately capture random_generation',
 ' mathematical_sta

# 8.1. LDA Model : Training

In [None]:
# 1. creating BOW Matrix - gensim returns tuple of (token_id in dict, frequency)
# https://radimrehurek.com/gensim/models/ldamodel.html
import gensim
from gensim import corpora

train_tokens = [i.split(' ') for i in final_train]
dictionary = corpora.Dictionary(train_tokens)
train_BOW = [dictionary.doc2bow(i) for i in train_tokens]

In [None]:
import pickle
with open('/content/drive/My Drive/AAIC Course/Personal case study - StackOverflow/LDA_dictionary.pickle', 'wb') as handle:
    pickle.dump(dictionary, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# 2. Preparing test set
test_tokens = [i.split(' ') for i in final_test]
test_BOW = [dictionary.doc2bow(i) for i in test_tokens]

In [None]:
# 3. Training LDA model on BOW train matrix
from gensim.models.ldamodel import LdaModel
ldamodel_title_body_tag = LdaModel(train_BOW, num_topics = 250, id2word = dictionary, passes = 10, random_state = 101, update_every = 128)
ldamodel_title_body_tag.save('/content/drive/My Drive/AAIC Course/Personal case study - StackOverflow/ldamodel_title_body_tag')
ldamodel_title_body_tag = LdaModel.load('/content/drive/My Drive/AAIC Course/Personal case study - StackOverflow/ldamodel_title_body_tag')

#### 4. Testing LDA model

In [None]:
# 4. Testing LDA model
x = ldamodel_title_body_tag[test_BOW[:5]]
for i in x:
  print(i)

[(42, 0.39686412), (113, 0.103119574), (128, 0.14167082), (149, 0.13142478), (191, 0.10757557), (239, 0.103853054)]
[(33, 0.071046695), (72, 0.11917574), (117, 0.5029193), (180, 0.19618227), (233, 0.09486958)]
[(17, 0.06788025), (167, 0.12746437), (195, 0.1024162), (236, 0.53147554), (244, 0.15495716)]
[(1, 0.08113217), (29, 0.04023653), (44, 0.018637098), (71, 0.1350893), (80, 0.20825763), (166, 0.2095908), (214, 0.06706375), (217, 0.08086122), (224, 0.062055305), (226, 0.034199007), (233, 0.04745785)]
[(42, 0.19511972), (166, 0.33886144), (229, 0.44585556)]


In [None]:
x = ldamodel_title_body_tag.get_document_topics(test_BOW[:5], minimum_probability = 0.20)
for i in x:
  print(i)

[(42, 0.39710984)]
[(117, 0.5304948)]
[(236, 0.4874936)]
[(80, 0.21931224)]
[(166, 0.33872607), (229, 0.4466301)]


In [None]:
ldamodel_title_body_tag.get_topic_terms(73, topn = 5),  ldamodel_title_body_tag.show_topic(73, topn = 5)

([(697, 0.02759998),
  (135, 0.025255308),
  (298, 0.023426188),
  (78, 0.021936793),
  (692, 0.01676521)],
 [('survey', 0.02759998),
  ('scale', 0.025255308),
  ('likert', 0.023426188),
  ('data', 0.021936793),
  ('questions', 0.01676521)])

In [None]:
len(ldamodel_title_body_tag.print_topics(num_topics = -1, num_words = 5))

250

In [None]:
x = ldamodel_title_body_tag[test_BOW[16]]
[ldamodel_title_body_tag.show_topic(topicid = i[0], topn = 5) for i in x], final_test[100]

([[('na', 0.05231732),
   ('data', 0.0281994),
   ('curves', 0.014549139),
   ('like', 0.009442198),
   ('set', 0.0067460616)],
  [('problem', 0.05177094),
   ('np', 0.030071674),
   ('complete', 0.019165054),
   ('set', 0.01636283),
   ('polynomial', 0.014421305)]],
 'checking combinatorics modelling flash memory system requests writes take cycles complete read take system handle requests proportions based observed values timings arbitrary calculation assuming system fully loaded means probability read write read writes read writes read writes happily adds correct combinatorics rusty online calculators allow proportions combinatorics')

# 8.2 LDA Model : predicting whole dataset

In [None]:
# 1. Loading trained LDA model and LDA_dictionary
import pickle
from gensim.models.ldamodel import LdaModel

handle = open('/content/drive/My Drive/AAIC Course/Personal case study - StackOverflow/LDA_dictionary.pickle', 'rb')
dictionary = pickle.load(handle)

ldamodel_title_body_tag = LdaModel.load('/content/drive/My Drive/AAIC Course/Personal case study - StackOverflow/ldamodel_title_body_tag')

In [None]:
# 2. input text preprocessor
def text_preprocessor(corpus, stop_word = False, remove_digits = False):
  clean_corpus = []
  for doc in corpus:
    # 1. remove html tags, html urls, replace html comparison operators
    clean_str = re.sub('<.*?>', '', doc)
    clean_str = clean_str.replace('&lt;', '<')\
                .replace('&gt;', '>')\
                .replace('&le;', '<=' )\
                .replace('&ge;', '>=')

    # 2. remove latex i,e., mostly formulas since it's mathematics based dataset
    clean_str = re.sub('\$.*?\$', '', clean_str)

    # 3. all lowercase 
    clean_str = clean_str.lower()

    # 4. decontractions
    clean_str = clean_str.replace("won't", "will not").replace("can\'t", "can not").replace("n\'t", " not").replace("\'re", " are").\
                                                  replace("\'s", " is").replace("\'d", " would").replace("\'ll", " will").\
                                                  replace("\'t", " not").replace("\'ve", " have").replace("\'m", " am")

    # 5. remove all special-characters other than alpha-numericals
    clean_str = re.sub('\W', ' ', clean_str)
    if remove_digits == True:
      clean_str = re.sub('\d', ' ', clean_str)

    # 6. Stop_word removal
    if stop_word == True:
      clean_str = stopwrd_removal(clean_str)

    # 7. remove all white-space i.e., \n, \t, and extra_spaces
    clean_str = re.sub('  +', ' ', clean_str)
    clean_str = clean_str.replace("\n", " ").replace("\t", " ").strip()

    clean_corpus.append(clean_str)

  return clean_corpus

In [None]:
# 3. defining a final topic prediction function
def final_topic_prediction(corpus):
  clean_corpus = text_preprocessor(corpus, stop_word = True, remove_digits = True)
  tokens_corpus = [i.split(' ') for i in clean_corpus]
  BOW_corpus = [dictionary.doc2bow(i) for i in tokens_corpus]

  topics_pred = []
  for BOW_query in BOW_corpus:
    topic_proba_tuple = ldamodel_title_body_tag.get_document_topics(BOW_query, minimum_probability = 0.20)
    topics_pred.append(list(dict(topic_proba_tuple).keys()))
  return topics_pred

In [None]:
# 4. preparing whole dataset : title + predicted_tags
title = tbs_df['Title'].values
tags = tbs_df['tag_pred1'].str.cat(tbs_df['tag_pred2'], sep = ' ').str.cat(tbs_df['tag_pred3'], sep = ' ').str.cat(tbs_df['tag_pred4'], sep = ' ').str.cat(tbs_df['tag_pred5'], sep = ' ').tolist()
clean_tags = []
for i in tags:
  clean_tags.append(re.sub('[<>-]', "", i).strip())
corpus = [i + ' ' + j for i, j in zip(title, clean_tags)]

In [None]:
%%time
# 5. predicting topics for whole dataset
topic_id_lst = final_topic_prediction(corpus)
topic_id_lst[:5]

CPU times: user 2min 57s, sys: 522 ms, total: 2min 58s
Wall time: 2min 58s


In [None]:
# 4. creating new colummns with predicted topics
print('maximum no.of topics one doc can have:', max([len(i) for i in topic_id_lst]))
tbs_df = pd.concat([tbs_df, pd.DataFrame(topic_id_lst, columns = ['topic_pred1', 'topic_pred2', 'topic_pred3', 'topic_pred4'])], axis = 1)
# replacing nan values with 1000 to maintain the pandas series dtype = numerics
tbs_df = tbs_df.fillna(1000)

maximum no.of topics one doc can have: 4


In [None]:
tbs_df.to_csv('/content/drive/My Drive/AAIC Course/Personal case study - StackOverflow/tbs_df.csv', index = False)

In [None]:
tbs_df.iloc[2480:2490, :]

Unnamed: 0,Title,Body,Tags,CreationDate,LastActivityDate,Score,ViewCount,AnswerCount,CommentCount,FavoriteCount,Comments,index_left,tag_1,tag_2,tag_3,tag_4,tag_5,index,combined_text,sentiment_comb,subjectivity_comb,sentiment_comments,subjectivity_comments,UNIX_CreationDate,Title_1,tag_pred1,tag_pred2,tag_pred3,tag_pred4,tag_pred5,topic_pred1,topic_pred2,topic_pred3,topic_pred4
2480,Symbolic computer algebra for statistics,<p>What functionality should exist in a <a hre...,<python><computational_statistics><computing><...,2011-05-04T20:32:27.960,2019-01-19T23:04:45.170,8,415,1,5,1,"There's a package dedicated to statistics, see...",65729,<computational_statistics>,-,-,-,-,2480,symbolic computer algebra statistics functiona...,0.122222,0.427778,0.27375,0.556944,1304541000.0,symbolic computer algebra for statistics,<computational_statistics>,-,-,-,-,179.0,1000.0,1000.0,1000.0
2481,How to use rejection sampling to generate draw...,"<p>I'm working on some practice test problems,...",<self_study><monte_carlo><simulation>,2011-05-04T21:17:49.260,2011-05-05T21:09:49.177,4,1114,1,3,0,Wikipedia provides (without explanation) an [a...,65730,<self_study>,<monte_carlo>,<simulation>,-,-,2481,use rejection sampling generate draws unit exp...,0.143229,0.633333,0.0375,0.283333,1304544000.0,how to use rejection sampling to generate draw...,<random_generation>,-,-,-,-,226.0,1000.0,1000.0,1000.0
2482,"$\operatorname{Var}(X^2)$, if $\operatorname{V...","<p>What would be <span class=""math-container"">...",<mathematical_statistics><variance>,2011-05-05T03:42:06.087,2018-12-18T23:35:56.927,6,18479,4,2,1,$Var[X] \stackrel{d}{=} \mathbb{E}[X^2] - (\ma...,65731,<mathematical_statistics>,<variance>,-,-,-,2482,,0.0,0.0,0.09,0.54,1304567000.0,if,<mathematical_statistics>,-,-,-,-,236.0,1000.0,1000.0,1000.0
2483,Bootstrapping data envelopment analysis effici...,<p>I want to perform bootstrapping for calcula...,<r><bootstrap><efficiency>,2011-05-05T05:47:30.180,2012-10-09T09:41:04.573,3,1757,1,3,0,"@user4472, please provide some context and mor...",65732,<bootstrap>,-,-,-,-,2483,bootstrapping data envelopment analysis effici...,-0.125,0.375,0.283333,0.45,1304574000.0,bootstrapping data envelopment analysis effici...,<bootstrap>,-,-,-,-,206.0,227.0,1000.0,1000.0
2484,"Panel Data: In a fixed effects model, does aut...","<p>Given a panel of countries over time, a fix...",<autocorrelation><panel_data><fixed_effects_mo...,2011-05-05T08:06:46.827,2011-05-05T08:06:46.827,4,2855,0,1,2,Wooldridge in his [book](http://books.google.c...,65733,<autocorrelation>,<panel_data>,<fixed_effects_model>,-,-,2484,panel data fixed effects model auto correlatio...,0.126667,0.296667,0.104,0.505,1304583000.0,panel data in a fixed effects model does auto ...,<panel_data>,<fixed_effects_model>,-,-,-,242.0,1000.0,1000.0,1000.0
2485,"When estimating variance, why do unbiased esti...",<p>I am totally confused: On the one hand you ...,<normal_distribution><variance><unbiased_estim...,2011-05-05T08:11:02.280,2019-03-02T22:56:03.470,7,1831,3,0,4,-,65734,<normal_distribution>,<variance>,<unbiased_estimator>,-,-,2485,estimating variance unbiased estimators divide...,0.175463,0.541759,0.0,0.0,1304583000.0,when estimating variance why do unbiased estim...,<unbiased_estimator>,<maximum_likelihood>,-,-,-,201.0,236.0,1000.0,1000.0
2486,Is it problematic if one predictor in a set ac...,<p>I am running a logistic regression with cus...,<logistic><modeling>,2011-05-05T09:34:46.440,2011-05-05T10:32:03.557,3,94,1,3,0,@ayush Could you edit your question clarifying...,65735,<logistic>,<modeling>,-,-,-,2486,problematic one predictor set accounts almost ...,0.087326,0.450694,0.074578,0.515931,1304588000.0,is it problematic if one predictor in a set ac...,<predictor>,-,-,-,-,42.0,1000.0,1000.0,1000.0
2487,Making a heatmap with a precomputed distance m...,<p>I have made a heatmap based upon a regular ...,<r><data_visualization>,2011-05-05T09:39:26.173,2019-01-15T23:26:36.430,3,5130,2,6,3,@Lars What do you want to modify: the heatmap ...,65736,<data_visualization>,-,-,-,-,2487,making heatmap precomputed distance matrix dat...,-0.041667,0.330128,0.360417,0.7125,1304588000.0,making a heatmap with a precomputed distance m...,<matrix>,<data_visualization>,-,-,-,31.0,168.0,1000.0,1000.0
2488,Advice on missing value imputation,<p>I am working on insurance data in which a c...,<data_imputation>,2011-05-05T11:20:13.343,2012-12-19T20:10:02.963,5,646,2,6,1,"In my opinion, if the variable is significant ...",65737,<data_imputation>,-,-,-,-,2488,advice missing value imputation working insura...,0.207213,0.510014,0.070455,0.513182,1304594000.0,advice on missing value imputation,<missing_data>,<data_imputation>,-,-,-,207.0,1000.0,1000.0,1000.0
2489,How to analyse repeated measure ANOVA with thr...,<h3>Context:</h3>\n\n<p>My question concerns a...,<hypothesis_testing><anova><repeated_measures>,2011-05-05T12:26:43.317,2014-12-06T05:40:54.650,8,1730,1,7,5,Could you specify what your question is? Is th...,65738,<hypothesis_testing>,<anova>,<repeated_measures>,-,-,2489,analyse repeated measure anova three condition...,0.011835,0.460504,-0.014794,0.437511,1304598000.0,how to analyse repeated measure anova with thr...,<repeated_measures>,<anova>,-,-,-,244.0,1000.0,1000.0,1000.0
