<a href="https://colab.research.google.com/github/vispute/StackOverflow_semantic_search_engine/blob/master/4_BM25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

import warnings
warnings.filterwarnings('ignore')
!pip install rank-bm25

Collecting rank-bm25
  Downloading https://files.pythonhosted.org/packages/16/5a/23ed3132063a0684ea66fb410260c71c4ffda3b99f8f1c021d1e245401b5/rank_bm25-0.2.1-py3-none-any.whl
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.1


In [None]:
# Load the pre-processed dataset from Drive and replace every missing value
# with a single space.
tbs_df = pd.read_csv(
    '/content/drive/My Drive/AAIC Course/Personal case study - StackOverflow/tbs_df.csv'
).fillna(' ')

In [None]:
# defining a function to remove stop_words
import nltk
from nltk.corpus import stopwords
# Fetch NLTK's stop-word corpus, then build the stop-word set as the union of
# the English list, the extra word 'would', and the single letters a-z
# (code points 97..122).
nltk.download('stopwords')
stop_words = (
    set(stopwords.words('english'))
    | {'would'}
    | {chr(code) for code in range(97, 123)}
)
# Disabled: enabling the line below would take the negations out of the stop-word set.
# stop_words.remove('no'); stop_words.remove('not'); stop_words.remove('nor')

def stopwrd_removal(sent):
  """Return `sent` without the tokens that appear in the module-level `stop_words`.

  The sentence is split on whitespace and the surviving tokens are re-joined
  with single spaces, so any run of whitespace in the input is collapsed.
  """
  return " ".join(token for token in sent.split() if token not in stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def text_preprocessor(column):
  """Clean the text stored in ``tbs_df[column]``, in place.

  The steps are cumulative: each one works on the output of the previous one.
    1. strip HTML tags and decode the HTML comparison entities
    2. drop LaTeX spans delimited by ``$...$``
    3. lower-case
    4. expand English contractions
    5. replace non-word characters and digits with spaces
    6. remove stop words (via ``stopwrd_removal``)
    7. collapse whitespace and strip the ends

  Args:
    column: name of a text column of the module-level ``tbs_df``. The column
      is expected to hold strings only (missing values already filled).

  Returns:
    None. ``tbs_df[column]`` is overwritten with the cleaned text.
  """
  # Work on one Series that is threaded through every step. Re-reading the
  # raw column values at each step would discard all earlier cleaning.
  cleaned = tbs_df[column]

  # 1. remove html tags, then decode html comparison operators
  cleaned = cleaned.str.replace(r'<.*?>', '', regex=True)
  for entity, symbol in (('&lt;', '<'), ('&gt;', '>'), ('&le;', '<='), ('&ge;', '>=')):
    cleaned = cleaned.str.replace(entity, symbol, regex=False)

  # 2. remove latex, i.e. mostly formulas (non-greedy, so each $...$ pair is
  #    removed separately)
  cleaned = cleaned.str.replace(r'\$.*?\$', '', regex=True)

  # 3. all lowercase
  cleaned = cleaned.str.lower()

  # 4. decontractions. Order matters: the specific forms ("won't", "can't")
  #    must be handled before the generic "n't" / "'t" suffixes.
  decontractions = (
      ("won't", "will not"), ("can't", "can not"), ("n't", " not"),
      ("'re", " are"), ("'s", " is"), ("'d", " would"), ("'ll", " will"),
      ("'t", " not"), ("'ve", " have"), ("'m", " am"),
  )
  for contraction, expansion in decontractions:
    cleaned = cleaned.str.replace(contraction, expansion, regex=False)

  # 5. replace every non-word character, then every digit, with a space
  cleaned = cleaned.str.replace(r'\W', ' ', regex=True)
  cleaned = cleaned.str.replace(r'\d', ' ', regex=True)

  # 6. stop-word removal
  cleaned = cleaned.map(stopwrd_removal)

  # 7. collapse any whitespace run (spaces, \n, \t) to one space and trim
  cleaned = cleaned.str.replace(r'\s+', ' ', regex=True).str.strip()

  tbs_df[column] = cleaned
  

**Note: For BM25 modelling I am removing all digits and stopwords**

In [None]:
# 1. Prepare the dataset for BM25: cleaned "title + body", truncated to a
#    fixed number of words per document.
MAX_DOC_WORDS = 40

# 1.1 title_body preprocessing (modifies tbs_df['combined_text'] in place)
text_preprocessor('combined_text')
title_body = tbs_df['combined_text'].values

# 1.2 keep only the first MAX_DOC_WORDS words of each document.
# split() without an argument splits on any whitespace run, so repeated
# spaces cannot produce empty "words" that use up part of the budget.
title_body = [' '.join(doc.split()[:MAX_DOC_WORDS]) for doc in title_body]

len(title_body), title_body[:5]

(182039,
 ['euler cycles biconnected components graph euler cycle biconnected components euler cycles well',
  'two neighbors graph depth dfs tree undirected graph two nodes identical distance root dfs tree neighbors original graph thinking sure back edges',
  'unique path directed graph designing algorithm class determine directed graph unique respect vertex one path started using bfs breadth first search find shortest path another vertex running bfs see alternate path found think time consuming however anyone hints solution found',
  'call average include outliers call average include outliers example set avg excluding outlier avg describe average statistics',
  'correcting outliers running average daemon reads data sensors among things calculates besides simply reporting state average time takes sensors change one value another keeps running average datapoints assumes runtime fairly constant unfortunately demonstrated graph input data pristine line represents different'])

**BM25 Model :**

In [None]:
from rank_bm25 import BM25Okapi

# Build the BM25 index over the prepared documents.
# Tokenise with split() (no argument): it never yields '' tokens, so an empty
# document contributes no terms. With split(' ') an empty document becomes
# [''] and the empty string is indexed as a word, which lets a query with a
# stray space match empty documents.
final_data = title_body
train_tokens = [doc.split() for doc in final_data]
bm25 = BM25Okapi(train_tokens)

**Testing :**

In [None]:
query = "meaning of the intercept in regression with binary explanatory variables"
# Lower-case to match the lower-cased index text; split() avoids empty tokens
# caused by stray spaces.
tokenized_query = query.lower().split()

# Titles in index order. get_top_n scores the query itself, so no separate
# get_scores call is needed.
corpus = tbs_df.Title.tolist()
bm25.get_top_n(tokenized_query, corpus, n = 5)

['What is the meaning of the intercept in regression with binary explanatory variables?',
 'Standard error and explanatory (independent) variables',
 'Standard Error of Intercept in Multiple Linear Regression',
 'Interpretation of intercept of a regression line in time series data',
 'When is it valid to include interaction terms in a regression model?']

In [None]:
query = "normal distribution vs uniform dist"
# Lower-case to match the lower-cased index text; split() avoids empty tokens
# caused by stray spaces.
tokenized_query = query.lower().split()

# Titles in index order. get_top_n scores the query itself, so no separate
# get_scores call is needed.
corpus = tbs_df.Title.tolist()
bm25.get_top_n(tokenized_query, corpus, n = 5)

['Random Number generated from Normal Distribution N(0,1)',
 'How to calculate mean, median, mode, std dev from distribution',
 'Normal Distribution with Uniform Mean',
 'What is the ratio of uniform and normal distribution?',
 'Distribution of MLE of $N$ based on a random sample of size n from discrete uniform dist.(1,2,...,$N$)']

In [None]:
query = "difference between tensorflow and pytorch"
# Lower-case to match the lower-cased index text; split() avoids empty tokens
# caused by stray spaces.
tokenized_query = query.lower().split()

# Titles in index order. get_top_n scores the query itself, so no separate
# get_scores call is needed.
corpus = tbs_df.Title.tolist()
bm25.get_top_n(tokenized_query, corpus, n = 5)

["Pytorch's pack_padded_sequence in Tensorflow?",
 'What are the differences between TensorFlow and PyTorch?',
 'Is there a reason to use TensorFlow over PyTorch for research purposes?',
 'Tensorflow (or Keras) vs. Pytorch vs. some other ML library for implementing a CNN',
 'Tutorial for restricted Boltzmann machine using PyTorch or Tensorflow?']

In [None]:
query = "difference between tensorflow and pytorch"
# Lower-case to match the lower-cased index text; split() avoids empty tokens
# caused by stray spaces.
tokenized_query = query.lower().split()

# Titles for exactly the documents the model indexed: get_top_n requires one
# title per indexed document. Slicing by bm25.corpus_size replaces the
# `train_set` fraction, which is not defined anywhere in this notebook.
corpus = tbs_df.Title.tolist()[:bm25.corpus_size]
bm25.get_top_n(tokenized_query, corpus, n = 5)

["Pytorch's pack_padded_sequence in Tensorflow?",
 'What are the differences between TensorFlow and PyTorch?',
 'CrossMapLRN2d in pytorch',
 'Places365 for pytorch',
 'PyTorch vs. Tensorflow eager']

In [None]:
query = "difference between tf and pytorch"
# Lower-case to match the lower-cased index text; split() avoids empty tokens
# caused by stray spaces.
tokenized_query = query.lower().split()

# Titles for exactly the documents the model indexed: get_top_n requires one
# title per indexed document. Slicing by bm25.corpus_size replaces the
# `train_set` fraction, which is not defined anywhere in this notebook.
corpus = tbs_df.Title.tolist()[:bm25.corpus_size]
bm25.get_top_n(tokenized_query, corpus, n = 5)

['Difference between tf-idf and tf with Random Forests',
 'Is there a difference between tf.nn.conv1d and tf.nn.convolution in Tensorflow?',
 'Difference in tf-idf values in R',
 'Difference between train.RMSPropOptimizer and tf.train.GradientDescentOptimizer (tensorflow)',
 'PyTorch: How to use pytorch pretrained for single channel image']

In [None]:
query = "installing nltk "
# split() (no argument) ignores the trailing space. split(" ") would add an
# empty-string token, which matches documents whose indexed text is empty.
# Lower-casing matches the lower-cased index text.
tokenized_query = query.lower().split()

# Titles in index order. get_top_n scores the query itself, so no separate
# get_scores call is needed.
corpus = tbs_df.Title.tolist()
bm25.get_top_n(tokenized_query, corpus, n = 5)

['Installing NLTK using WHL file -',
 '$\\operatorname{Var}(X^2)$, if $\\operatorname{Var}(X)=\\sigma^2$',
 'Difference between from nltk import word_tokenize and from nltk.tokenize import word_tokenize?',
 'How was the perplexity of the Brown corpus measured?',
 'How to change plot size in nltk.plot()']

In [None]:
query = "installing nltk "
# split() (no argument) ignores the trailing space. split(" ") would add an
# empty-string token, which matches documents whose indexed text is empty.
# Lower-casing matches the lower-cased index text.
tokenized_query = query.lower().split()

# Titles for exactly the documents the model indexed: get_top_n requires one
# title per indexed document. Slicing by bm25.corpus_size replaces the
# `train_set` fraction, which is not defined anywhere in this notebook.
corpus = tbs_df.Title.tolist()[:bm25.corpus_size]
bm25.get_top_n(tokenized_query, corpus, n = 5)

['Installing NLTK using WHL file -',
 'Error while installing grpcio while installing tensorflow in Linux',
 'About installing Theano',
 'Error while using NLTK/ How should I read paragraph using NLTK?',
 'Difference between from nltk import word_tokenize and from nltk.tokenize import word_tokenize?']

In [None]:
query = "how to install nltk"
# Lower-case to match the lower-cased index text; split() avoids empty tokens
# caused by stray spaces.
tokenized_query = query.lower().split()

# Titles for exactly the documents the model indexed: get_top_n requires one
# title per indexed document. Slicing by bm25.corpus_size replaces the
# `train_set` fraction, which is not defined anywhere in this notebook.
corpus = tbs_df.Title.tolist()[:bm25.corpus_size]
bm25.get_top_n(tokenized_query, corpus, n = 5)

['normal conda install vs forge install',
 'Manual install vs conda install tensorflow-gpu',
 'Error while using NLTK/ How should I read paragraph using NLTK?',
 "can't install tensorflow with gpu",
 'How to install fGarch package']

In [None]:
query = "how to install pip"
# Lower-case to match the lower-cased index text; split() avoids empty tokens
# caused by stray spaces.
tokenized_query = query.lower().split()

# Titles in index order. get_top_n scores the query itself, so no separate
# get_scores call is needed.
corpus = tbs_df.Title.tolist()
bm25.get_top_n(tokenized_query, corpus, n = 5)

['Can not install spacy package on windows 10 via pip',
 'what if conda has no package?',
 'How to use plot model in keras?',
 'Getting errors while trying to install tensorflow on ubuntu',
 'getting error while installing install_tensorflow()']

In [None]:
query = "how to install pip"
# Lower-case to match the lower-cased index text; split() avoids empty tokens
# caused by stray spaces.
tokenized_query = query.lower().split()

# Titles for exactly the documents the model indexed: get_top_n requires one
# title per indexed document. Slicing by bm25.corpus_size replaces the
# `train_set` fraction, which is not defined anywhere in this notebook.
corpus = tbs_df.Title.tolist()[:bm25.corpus_size]
bm25.get_top_n(tokenized_query, corpus, n = 5)

['"Pip install tensorflow" only installs version 1.14 – how to install version 2 (with eager mode enabled?)',
 'Can not install spacy package on windows 10 via pip',
 'normal conda install vs forge install',
 'Manual install vs conda install tensorflow-gpu',
 'How to install fGarch package']

In [None]:
query = "how to install pip"
# Lower-case to match the lower-cased index text; split() avoids empty tokens
# caused by stray spaces.
tokenized_query = query.lower().split()

# Titles for exactly the documents the model indexed: get_top_n requires one
# title per indexed document. Slicing by bm25.corpus_size replaces the
# `df` / `train_set` names, neither of which is defined in this notebook.
corpus = tbs_df.Title.tolist()[:bm25.corpus_size]
bm25.get_top_n(tokenized_query, corpus, n = 5)

['How to use plot model in keras?',
 'How to integrate google cloud with dropbox and jupyter notebook using tensorflow',
 'Installing NLTK using WHL file -',
 'Different available packages in TensorFlow virtualenv?',
 "What's the meaning of a posterior inclusion probability in Bayesian?"]

In [None]:
query = "change backend keras"
# Lower-case to match the lower-cased index text; split() avoids empty tokens
# caused by stray spaces.
tokenized_query = query.lower().split()

# Titles in index order. get_top_n scores the query itself, so no separate
# get_scores call is needed.
corpus = tbs_df.Title.tolist()
bm25.get_top_n(tokenized_query, corpus, n = 5)

['how to change keras backend in windows?',
 'Keras backend (tensorflow) vs tensorflow',
 "AttributeError: module 'keras.backend' has no attribute 'backend'",
 'Keras backend function equivalent for str.format',
 'Is it possible to call from Keras unsupported backend function directly from tensorflow?']

In [None]:
query = "change backend keras"
# Lower-case to match the lower-cased index text; split() avoids empty tokens
# caused by stray spaces.
tokenized_query = query.lower().split()

# Titles in index order. get_top_n scores the query itself, so no separate
# get_scores call is needed.
corpus = tbs_df.Title.tolist()
bm25.get_top_n(tokenized_query, corpus, n = 5)

['how to change keras backend in windows?',
 "AttributeError: module 'keras.backend' has no attribute 'backend'",
 'Why does Keras need TensorFlow as backend?',
 'Switching Keras backend Tensorflow to GPU',
 'Axis parameter in the Keras backend sum']

In [None]:
query = "change backend keras"
# Lower-case to match the lower-cased index text; split() avoids empty tokens
# caused by stray spaces.
tokenized_query = query.lower().split()

# Titles for exactly the documents the model indexed: get_top_n requires one
# title per indexed document. Slicing by bm25.corpus_size replaces the
# `df` / `train_set` names, neither of which is defined in this notebook.
corpus = tbs_df.Title.tolist()[:bm25.corpus_size]
bm25.get_top_n(tokenized_query, corpus, n = 5)

['how to change keras backend in windows?',
 'Keras backend (tensorflow) vs tensorflow',
 'Choosing between TensorFlow or Theano as backend for Keras',
 'Keras backend function equivalent for str.format',
 'What is the meaning of fuzz factor?']

In [None]:
query = "what is euler cycle"
# Lower-case to match the lower-cased index text; split() avoids empty tokens
# caused by stray spaces.
tokenized_query = query.lower().split()

# Titles in index order. get_top_n scores the query itself, so no separate
# get_scores call is needed.
corpus = tbs_df.Title.tolist()
bm25.get_top_n(tokenized_query, corpus, n = 5)

['Euler cycles in biconnected components',
 'Given the same set of nodes, why is it (generally) easier to find a Euler cycle than a Hamilton cycle?',
 'Cycle of length k with no repeated edges',
 'Prove: A connected graph contains an Eulerian cycle iff every vertex has even degree',
 'Euler Circuit with least deviation from input']

In [None]:
query = "what is euler cycle"
# Lower-case to match the lower-cased index text; split() avoids empty tokens
# caused by stray spaces.
tokenized_query = query.lower().split()

# Titles for exactly the documents the model indexed: get_top_n requires one
# title per indexed document. Slicing by bm25.corpus_size replaces the
# `train_set` fraction, which is not defined anywhere in this notebook.
corpus = tbs_df.Title.tolist()[:bm25.corpus_size]
bm25.get_top_n(tokenized_query, corpus, n = 5)

['Given the same set of nodes, why is it (generally) easier to find a Euler cycle than a Hamilton cycle?',
 'Euler cycles in biconnected components',
 'Project Euler Problem 213 (continued...)',
 'Understanding the correctness of the Euler Tour Technique',
 'Time complexity of euler totient function']

In [None]:
query = "accuracy stuck in keras"
# Lower-case to match the lower-cased index text; split() avoids empty tokens
# caused by stray spaces.
tokenized_query = query.lower().split()

# Titles in index order. get_top_n scores the query itself, so no separate
# get_scores call is needed.
corpus = tbs_df.Title.tolist()
bm25.get_top_n(tokenized_query, corpus, n = 10)

['Training Accuracy stuck in Keras',
 'ANN on Pattern Recognition',
 'Why does my loss value start at approximately -10,000 and my accuracy not improve?',
 'Keras del stuck with constant loss and accuracy',
 'Deep Neural Network using Keras/Tensorflow solves Spiral Dataset Classification. But Accuracy is stuck around 50%',
 "Accuracy doesn't match in Keras",
 'Model Validation accuracy stuck at 0.65671 Keras',
 'Accuracy of RNN getting stuck after 90%',
 'FaceNet training, tripletloss not decrease but accuracy increase then stuck,what are possible causes?',
 'Keras Neural Network training is stuck (gets stuck around epoch 6)']

In [None]:
query = "accuracy stuck in keras"
# Lower-case to match the lower-cased index text; split() avoids empty tokens
# caused by stray spaces.
tokenized_query = query.lower().split()

# Titles for exactly the documents the model indexed: get_top_n requires one
# title per indexed document. Slicing by bm25.corpus_size replaces the
# `train_set` fraction, which is not defined anywhere in this notebook.
corpus = tbs_df.Title.tolist()[:bm25.corpus_size]
bm25.get_top_n(tokenized_query, corpus, n = 5)

['Keras LSTM accuracy stuck at 50%',
 'Training Accuracy stuck in Keras',
 'Model Validation accuracy stuck at 0.65671 Keras',
 'Keras del stuck with constant loss and accuracy',
 'Keras Neural Network training is stuck (gets stuck around epoch 6)']