### 1. Loading training dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import re
import warnings
warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


In [None]:
# 1. Load the preprocessed dataset and replace every missing value with a
#    single space in one chained expression.
tbs_df = pd.read_csv(
    '/content/drive/My Drive/AAIC Course/Personal case study - StackOverflow/tbs_df.csv'
).fillna(' ')

### 2. Tag Predictor

In [None]:
import tensorflow as tf
# tf.compat.v1.enable_eager_execution()
from tensorflow.keras.layers import Input, Softmax, GRU, LSTM, RNN, Embedding, Dense, RepeatVector, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model

import numpy as np

In [None]:
# 1. Loading saved tokenizers
import pickle

# NOTE(review): pickle.load can execute code embedded in the file; only load
# these pickles from a trusted location.
# The `with` blocks close each file handle as soon as the object is loaded.
with open('/content/drive/My Drive/AAIC Course/Personal case study - StackOverflow/tag_predictor_tokenizer.pickle', 'rb') as handle:
    token_text = pickle.load(handle)

with open('/content/drive/My Drive/AAIC Course/Personal case study - StackOverflow/tag_predictor_token_tar.pickle', 'rb') as handle:
    token_tar = pickle.load(handle)

# word -> integer index lookups for the input text and for the target tags
text_vocab = token_text.word_index
tar_vocab = token_tar.word_index

In [None]:
# 2. loading saved w2v model
from gensim.models import Word2Vec
w2v_model_sg = Word2Vec.load("/content/drive/My Drive/AAIC Course/Personal case study - StackOverflow/word2vec_sg.model")
# number of words in the loaded model's vocabulary (shown as the cell output)
len(w2v_model_sg.wv.vocab)

117779

In [None]:
# 3. creating Embedding Matrix with Word2Vec representations
max_words = 20000
w2v_vocab = set(w2v_model_sg.wv.vocab)
# Every row starts as a small random-normal vector (mean 0, std 0.15); row i is
# then overwritten with the Word2Vec vector of the word whose tokenizer index
# is i, provided the index is within max_words and the word is in the model.
embedding_matrix = np.random.normal(loc = 0, scale = 0.15, size = (max_words+1, 100))
for word, i in text_vocab.items():
    if i <= max_words and word in w2v_vocab:
        # Look the vector up through `.wv` (the same accessor used for the
        # vocabulary above) instead of indexing the model object directly,
        # which gensim has deprecated.
        embedding_matrix[i] = w2v_model_sg.wv[word]
embedding_matrix.shape

(20001, 100)

In [None]:
# 4. Creating freezed 'Embedding layer'
from tensorflow.keras.initializers import Constant
# Weights are initialised from embedding_matrix and kept frozen
# (trainable = False); mask_zero = True treats index 0 as padding to be masked.
text_embedding_layer = Embedding(input_dim = max_words+1, output_dim= 100, embeddings_initializer = Constant(embedding_matrix),
                               mask_zero = True, trainable = False, name = 'text_embed')

In [None]:
# 5. Constructing a model
tf.keras.backend.clear_session()

# Encoder: 250-token input -> frozen embedding layer -> bidirectional GRU.
# return_sequences is not set, so the encoder emits one vector per input.
enc_inputs = Input(name = 'text_seq', shape = (250,))
enc_embed = text_embedding_layer(enc_inputs)
encoder = Bidirectional(GRU(name = 'ENCODER', units = 128, dropout = 0.2))

enc_out = encoder(enc_embed)

# Decoder: despite the `dec_lstm` name this layer is a GRU.
dec_lstm = GRU(name = 'DECODER', units = 256, dropout = 0.2, return_sequences= True, return_state= True)

# The encoder vector is repeated 5 times, so the decoder runs for 5 time steps.
repeat = RepeatVector(5)(enc_out)
dec_out, dec_hidden = dec_lstm(repeat)

# Softmax over len(tar_vocab)+1 classes at every decoder time step.
dec_dense = Dense(units = len(tar_vocab)+1, activation = 'softmax')
out = dec_dense(dec_out)

model = Model(inputs = enc_inputs, outputs = out)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_seq (InputLayer)        [(None, 250)]             0         
_________________________________________________________________
text_embed (Embedding)       (None, 250, 100)          2000100   
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               176640    
_________________________________________________________________
repeat_vector (RepeatVector) (None, 5, 256)            0         
_________________________________________________________________
DECODER (GRU)                [(None, 5, 256), (None, 2 394752    
_________________________________________________________________
dense (Dense)                (None, 5, 243)            62451     
Total params: 2,633,943
Trainable params: 633,843
Non-trainable params: 2,000,100
_____________________________________________

In [None]:
# 6. loading model weights
# Restores weights from the HDF5 checkpoint into `model`; the architecture
# defined for `model` has to match the one the checkpoint was saved from.
model.load_weights('/content/drive/My Drive/AAIC Course/Personal case study - StackOverflow/weights--019--2.4615.hdf5')

In [None]:
# building the stop-word set used by the stop-word removal function
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# NLTK English stop words + 'would' + every single lowercase letter a-z
stop_words = (
    set(stopwords.words('english'))
    | {'would'}
    | {chr(code) for code in range(ord('a'), ord('z') + 1)}
)
# stop_words.remove('no'); stop_words.remove('not'); stop_words.remove('nor')

def stopwrd_removal(sent):
  """Drop every whitespace-separated token of `sent` that is in `stop_words`.

  The surviving tokens are re-joined with single spaces.
  """
  return " ".join([token for token in sent.split() if token not in stop_words])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# 7. input text preprocessor
def text_preprocessor(corpus, stop_word = False, remove_digits = False):
  """Clean raw question text.

  Steps applied, in order, to every document:
    1. strip HTML tags, then decode the &lt; &gt; &le; &ge; entities
    2. drop inline LaTeX spans ($...$)
    3. lowercase
    4. expand common English contractions
    5. replace every non-word character (and optionally every digit) by a space
    6. optionally remove stop words via stopwrd_removal
    7. collapse repeated spaces and trim

  Args:
    corpus: iterable of strings.
    stop_word: when true, remove stop words (step 6).
    remove_digits: when true, replace digits by spaces (step 5).

  Returns:
    List of cleaned strings, one per input document, in input order.
  """
  clean_corpus = []
  for doc in corpus:
    # 1. remove html tags, html urls, replace html comparison operators
    clean_str = re.sub(r'<.*?>', '', doc)
    clean_str = clean_str.replace('&lt;', '<')\
                .replace('&gt;', '>')\
                .replace('&le;', '<=' )\
                .replace('&ge;', '>=')

    # 2. remove latex i,e., mostly formulas since it's mathematics based dataset
    # (raw string: '\$' in a normal string literal is an invalid escape sequence)
    clean_str = re.sub(r'\$.*?\$', '', clean_str)

    # 3. all lowercase
    clean_str = clean_str.lower()

    # 4. decontractions -- order matters: the specific forms ("won't", "can't")
    #    must be rewritten before the generic "n't" / "'t" suffixes
    clean_str = clean_str.replace("won't", "will not").replace("can't", "can not").replace("n't", " not").replace("'re", " are").\
                          replace("'s", " is").replace("'d", " would").replace("'ll", " will").\
                          replace("'t", " not").replace("'ve", " have").replace("'m", " am")

    # 5. remove all special-characters other than alpha-numericals
    clean_str = re.sub(r'\W', ' ', clean_str)
    if remove_digits:
      clean_str = re.sub(r'\d', ' ', clean_str)

    # 6. Stop_word removal
    if stop_word:
      clean_str = stopwrd_removal(clean_str)

    # 7. remove all white-space i.e., \n, \t, and extra_spaces
    clean_str = re.sub(r'  +', ' ', clean_str)
    clean_str = clean_str.replace("\n", " ").replace("\t", " ").strip()

    clean_corpus.append(clean_str)

  return clean_corpus

def padded_sequence(clean_corpus):
    """Turn cleaned documents into fixed-length integer sequences.

    Each document is tokenised with the fitted `token_text` tokenizer and the
    resulting id lists are padded at the end ('post') to a length of 250.
    """
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    # 8. words -> integer token ids
    token_ids = token_text.texts_to_sequences(clean_corpus)

    # 9. pad to the fixed model input length
    return pad_sequences(token_ids, maxlen = 250, padding = 'post')

In [None]:
# inverse of tar_vocab: integer tag index -> tag word
tar_vocab_reverse = {v:k for k,v in tar_vocab.items()}
def final_tag_prediction(corpus):
  """Predict tags for every document in `corpus`.

  1. Text preprocessing on corpus
  2. Convert clean corpus to padded sequence
  3. Pass the sequence through the model
  4. Convert the model output into a list of tags per document

  Args:
    corpus: iterable of raw question strings.

  Returns:
    One list per document holding the distinct predicted tags, each wrapped
    as '<tag>', in order of first appearance across the decoder time steps.
    A time step whose arg-max index has no entry in `tar_vocab_reverse`
    contributes no tag.
  """
  # 10. model prediction
  clean_corpus = text_preprocessor(corpus,  stop_word = False, remove_digits = False)
  sequence = padded_sequence(clean_corpus)
  model_out = model.predict(sequence)

  # 11. converting model prediction to human readable tags
  final_lst = []
  for dp in model_out:
    tags = []
    for time_step in dp:
      tar_wrd = tar_vocab_reverse.get(int(np.argmax(time_step)))
      if tar_wrd is None:
        # The output layer has one more class than the vocabulary has words,
        # so some index (presumably 0, the padding slot -- confirm) has no
        # word; skip it instead of raising KeyError.
        continue
      tag = '<' + tar_wrd + '>'
      # de-duplicate while keeping a deterministic first-seen order
      if tag not in tags:
        tags.append(tag)
    final_lst.append(tags)

  return final_lst

In [None]:
# 12. Creating a dictionary of {tag : datapoint_idx}
# distinct values found across the five tag_pred columns
unique_tags = list(set(np.concatenate(
    [tbs_df[col].unique() for col in ('tag_pred1', 'tag_pred2', 'tag_pred3', 'tag_pred4', 'tag_pred5')],
    axis = 0)))
unique_tags.remove('-')  # '-' gets no entry (presumably the "no tag" filler -- confirm)
# for every tag: row indices where any of the five columns equals that tag
idx = [
    list(tbs_df[(tbs_df['tag_pred1'] == tag) | (tbs_df['tag_pred2'] == tag) | (tbs_df['tag_pred3'] == tag)
                | (tbs_df['tag_pred4'] == tag) | (tbs_df['tag_pred5'] == tag)].index)
    for tag in unique_tags
]
tag_idx_dict = dict(zip(unique_tags, idx))

In [None]:
# 13. final indices of tag_corpus
def tag_corpus(tags_pred):
  """Return the dataset row indices of the questions that carry any predicted tag.

  Args:
    tags_pred: iterable of tag strings, looked up in `tag_idx_dict`.

  Returns:
    List of unique row indices (order not significant). A tag with no entry
    in `tag_idx_dict` contributes nothing instead of raising KeyError, so a
    tag that was never indexed cannot abort a search.
  """
  tag_corpus_idx = []
  for tag in tags_pred:
    tag_corpus_idx += tag_idx_dict.get(tag, [])
  return list(set(tag_corpus_idx))

In [None]:
%%time
# FINAL TESTING
# Predict tags for one dataset title, collect the rows sharing those tags and
# print how many rows were found together with the predicted tags.
corpus = [tbs_df['Title'][129792]]
tags_pred = final_tag_prediction(corpus)
tag_corpus_idx = tag_corpus(tags_pred[0])
print(len(tag_corpus_idx), tags_pred)

807 [['<outliers>']]
CPU times: user 32.6 ms, sys: 2.98 ms, total: 35.6 ms
Wall time: 34.2 ms


### 3. LDA

In [None]:
# 1. Loading LDA model and LDA_dictionary
from gensim.models.ldamodel import LdaModel

# NOTE(review): pickle.load can execute code embedded in the file; only load
# this pickle from a trusted location.
# The `with` block closes the file handle once the dictionary is loaded.
with open('/content/drive/My Drive/AAIC Course/Personal case study - StackOverflow/LDA_dictionary.pickle', 'rb') as handle:
    dictionary = pickle.load(handle)

ldamodel_title_body_tag = LdaModel.load('/content/drive/My Drive/AAIC Course/Personal case study - StackOverflow/ldamodel_title_body_tag')

In [None]:
# 2. defining a final topic prediction function
def final_topic_prediction(corpus):
  """Predict LDA topic ids for every document in `corpus`.

  Each document is cleaned (stop words and digits removed), split on single
  spaces, converted to a bag-of-words with `dictionary` and passed to the LDA
  model with minimum_probability = 0.20.

  Returns:
    One list of topic ids per input document.
  """
  cleaned_docs = text_preprocessor(corpus, stop_word = True, remove_digits = True)

  topics_pred = []
  for doc in cleaned_docs:
    bow = dictionary.doc2bow(doc.split(' '))
    doc_topics = ldamodel_title_body_tag.get_document_topics(bow, minimum_probability = 0.20)
    topics_pred.append([topic_id for topic_id, _ in doc_topics])
  return topics_pred

In [None]:
# 3. Creating a dictionary of {topic_id : datapoint_idx}
# distinct values found across the four topic_pred columns
topics = list(set(np.concatenate(
    [tbs_df[col].unique() for col in ('topic_pred1', 'topic_pred2', 'topic_pred3', 'topic_pred4')],
    axis = 0)))
topics.remove(1000)  # 1000 gets no entry (presumably the "no topic" filler -- confirm)
# for every topic: row indices where any of the four columns equals that topic
idx = [
    list(tbs_df[(tbs_df['topic_pred1'] == topic) | (tbs_df['topic_pred2'] == topic)
                | (tbs_df['topic_pred3'] == topic) | (tbs_df['topic_pred4'] == topic)].index)
    for topic in topics
]
topic_idx_dict = dict(zip(topics, idx))

In [None]:
# 4. final indices of topic_corpus
def topic_corpus(topics_pred):
  """Return the dataset row indices of the questions assigned any predicted topic.

  Args:
    topics_pred: iterable of topic ids, looked up in `topic_idx_dict`.

  Returns:
    List of unique row indices (order not significant). A topic id with no
    entry in `topic_idx_dict` contributes nothing instead of raising KeyError.
  """
  topic_corpus_idx = []
  for topic in topics_pred:
    topic_corpus_idx += topic_idx_dict.get(topic, [])
  return list(set(topic_corpus_idx))

In [None]:
%%time
# FINAL TESTING
# Predict topics for one dataset title, collect the rows sharing those topics
# and print how many rows were found together with the predicted topic ids.
topics_pred = final_topic_prediction([tbs_df['Title'].values[129792]])
topics_corpus_idx = topic_corpus(topics_pred[0])
print(len(topics_corpus_idx), topics_pred)

4778 [[38, 81]]
CPU times: user 2.21 ms, sys: 18 µs, total: 2.22 ms
Wall time: 2.58 ms


### 4. BM25

In [None]:
!pip install rank-bm25
from rank_bm25 import BM25Okapi

Collecting rank-bm25
  Downloading https://files.pythonhosted.org/packages/16/5a/23ed3132063a0684ea66fb410260c71c4ffda3b99f8f1c021d1e245401b5/rank_bm25-0.2.1-py3-none-any.whl
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.1


In [None]:
# 1. preparing dataset for BM25 : truncated "title + body"
corpus =  tbs_df['combined_text'].values

# clean the combined text (digits and stop words removed), then keep only the
# first 40 space-separated words of every document
title_body = [
    ' '.join(doc.split(' ')[:40])
    for doc in text_preprocessor(corpus, remove_digits= True, stop_word=True)
]

len(title_body)

182039

In [None]:
# 2. Training BM25 model
# BM25Okapi is fitted on token lists: one list of space-separated words per document
train_tokens = [i.split(' ') for i in title_body]
bm25 = BM25Okapi(train_tokens)

In [None]:
# 3. Defining a final function
def BM25_corpus(query, train_data, n_results):
  """Return the `n_results` best BM25 matches for `query`.

  Args:
    query: raw query string; cleaned with text_preprocessor (digits and stop
      words removed) and split on single spaces before scoring.
    train_data: the tokenised corpus the global `bm25` model was fitted on;
      only its length is used, to map ranking positions to document indices.
    n_results: number of top documents to return.

  Returns:
    (BM25_corpus_idx, BM25_scores): a list of document indices ordered from
    best to worst score, and a numpy array with the matching scores.
  """
  # finding results indices
  query = text_preprocessor([query], remove_digits= True, stop_word=True)[0]
  tokenized_query = query.split(" ")

  # Score the whole corpus once; both the ranking and the returned scores are
  # derived from this single pass (scoring is the expensive step).
  doc_scores = bm25.get_scores(tokenized_query)
  top_positions = np.argsort(doc_scores)[::-1][:n_results]

  idx = range(len(train_data))
  BM25_corpus_idx = [idx[i] for i in top_positions]

  # getting scores associated with each result
  BM25_scores = doc_scores[top_positions]

  return BM25_corpus_idx, BM25_scores

In [None]:
%%time
# Final testing
# Retrieve the 100 best BM25 matches for one dataset title.
# NOTE(review): the print below dumps all 100 indices, scores and titles.
query = tbs_df['Title'][129792]
BM25_corpus_idx, BM25_scores = BM25_corpus(query, train_data = train_tokens, n_results = 100)
print(BM25_corpus_idx, BM25_scores, tbs_df.Title.values[BM25_corpus_idx])

[37109, 129792, 96480, 151612, 55051, 32678, 109072, 11557, 11499, 120617, 104985, 94943, 71311, 23136, 132652, 133116, 87293, 151886, 86985, 126892, 165859, 92932, 25974, 65588, 98599, 178384, 108891, 158708, 6261, 60018, 172472, 163430, 62860, 8748, 41627, 125235, 131273, 181968, 86475, 95657, 10030, 125371, 58555, 86480, 70097, 86753, 151008, 35655, 47587, 174849, 1577, 150926, 62428, 78067, 141734, 123314, 30327, 52858, 95652, 115305, 78564, 140858, 17938, 62582, 18813, 68310, 71043, 89320, 88394, 49729, 181299, 3663, 295, 97846, 130609, 77222, 112365, 58508, 104081, 58557, 84749, 88322, 13084, 49690, 89552, 75008, 11717, 154358, 19535, 45270, 32553, 8287, 120912, 159719, 110735, 68339, 129500, 162342, 125298, 160735] [21.78167401 20.77296241 20.49769274 18.20527936 17.99528755 17.84799521
 17.48663342 16.79578485 16.12831391 15.84872905 15.79784663 15.69476294
 15.60062952 15.55997442 15.24151023 15.20392383 15.20392383 14.84913141
 14.79660187 14.79660187 14.79660187 14.57498002 

In [None]:
# number of distinct candidate rows once the tag, topic and BM25 index lists are merged
len(set(tag_corpus_idx + topics_corpus_idx + BM25_corpus_idx))

4839

### 5. Combining all corpus indices : 'tag_corpus' + 'topic_corpus' + 'BM25_corpus'

In [None]:
def all_results_idx(query):
  """Collect the candidate row indices for `query` from all three retrievers.

  Args:
    query: raw query string.

  Returns:
    (all_idx, BM25_dict): the de-duplicated union of the tag-based,
    topic-based and BM25 candidate row indices, and a dict mapping each of
    the top-100 BM25 row indices to its BM25 score.
  """
  # 1. tag_predictor
  tags_pred = final_tag_prediction([query])
  tag_corpus_idx = tag_corpus(tags_pred[0])

  # 2. LDA - topic prediction on "query + predicted tags".
  # Predicted tags are wrapped as '<tag>', and the text preprocessor used by
  # final_topic_prediction deletes anything of the form <...> as an HTML tag.
  # The brackets are stripped here so the tag words actually reach the topic model.
  tag_words = ' '.join(tag.strip('<>') for tag in tags_pred[0])
  topics_pred = final_topic_prediction([query + ' ' + tag_words]) # adding tags to query
  topics_corpus_idx = topic_corpus(topics_pred[0])

  # 3. BM25 results
  BM25_corpus_idx, BM25_scores = BM25_corpus(query, train_tokens, n_results = 100)

  all_idx = list(set(tag_corpus_idx + topics_corpus_idx + BM25_corpus_idx))
  return all_idx, dict(zip(BM25_corpus_idx, BM25_scores))

In [None]:
%%time
# testing
# Run the combined candidate retrieval for one dataset title and print the
# number of candidates next to the query text.
query = tbs_df['Title'][129792]
all_idx, BM25_dict = all_results_idx(query)

print(len(all_idx), query)

4839 Which features to use while detecting outliers in data
CPU times: user 642 ms, sys: 2 ms, total: 644 ms
Wall time: 642 ms


### 6.Sentence Embeddings


### 6.1 BERT

In [None]:
# 1. Loading BERT vector representation of all questions in the dataset
bert_embeddings = np.load('/content/drive/My Drive/AAIC Course/Personal case study - StackOverflow/bert_train_out.npy')

# 2. Loading pretrained BERT model
import tensorflow as tf
import tensorflow_hub as hub
# NOTE(review): unpinned install -- the transformers version is not fixed.
!pip install transformers
from transformers import BertTokenizer, TFBertModel

# Load pretrained model/tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')



Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# 3. BERT model
# Three int64 inputs of length 44 each: token ids, attention mask, token type ids.
bert_input_ids = Input(name = 'bert_input_ids', shape = (44,), dtype = 'int64')
bert_attn_mask = Input(name = 'bert_attn_mask', shape = (44,), dtype = 'int64')
bert_token_typ = Input(name = 'bert_token_typ', shape = (44,), dtype = 'int64')

bert_output = bert_model([bert_input_ids, bert_attn_mask, bert_token_typ])
# Keep only sequence position 0 of the first model output
# (presumably the [CLS] token vector -- confirm).
bert_output = bert_output[0][:,0,:]
# bert_output = bert_output[1][:]

BERT = Model(inputs = [bert_input_ids, bert_attn_mask, bert_token_typ], outputs = bert_output)
BERT.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
bert_input_ids (InputLayer)     [(None, 44)]         0                                            
__________________________________________________________________________________________________
bert_attn_mask (InputLayer)     [(None, 44)]         0                                            
__________________________________________________________________________________________________
bert_token_typ (InputLayer)     [(None, 44)]         0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 44, 768), (N 109482240   bert_input_ids[0][0]             
                                                                 bert_attn_mask[0][0]       

In [None]:
# 4. input text preprocessor
def text_preprocessor(corpus, stop_word = False, remove_digits = False):
  """Clean raw question text.

  NOTE(review): a function with this name and the same steps is already
  defined earlier in the notebook; this definition rebinds the name.

  Steps applied, in order, to every document:
    1. strip HTML tags, then decode the &lt; &gt; &le; &ge; entities
    2. drop inline LaTeX spans ($...$)
    3. lowercase
    4. expand common English contractions
    5. replace every non-word character (and optionally every digit) by a space
    6. optionally remove stop words via stopwrd_removal
    7. collapse repeated spaces and trim

  Args:
    corpus: iterable of strings.
    stop_word: when true, remove stop words (step 6).
    remove_digits: when true, replace digits by spaces (step 5).

  Returns:
    List of cleaned strings, one per input document, in input order.
  """
  clean_corpus = []
  for doc in corpus:
    # 1. remove html tags, html urls, replace html comparison operators
    clean_str = re.sub(r'<.*?>', '', doc)
    clean_str = clean_str.replace('&lt;', '<')\
                .replace('&gt;', '>')\
                .replace('&le;', '<=' )\
                .replace('&ge;', '>=')

    # 2. remove latex i,e., mostly formulas since it's mathematics based dataset
    # (raw string: '\$' in a normal string literal is an invalid escape sequence)
    clean_str = re.sub(r'\$.*?\$', '', clean_str)

    # 3. all lowercase
    clean_str = clean_str.lower()

    # 4. decontractions -- order matters: the specific forms ("won't", "can't")
    #    must be rewritten before the generic "n't" / "'t" suffixes
    clean_str = clean_str.replace("won't", "will not").replace("can't", "can not").replace("n't", " not").replace("'re", " are").\
                          replace("'s", " is").replace("'d", " would").replace("'ll", " will").\
                          replace("'t", " not").replace("'ve", " have").replace("'m", " am")

    # 5. remove all special-characters other than alpha-numericals
    clean_str = re.sub(r'\W', ' ', clean_str)
    if remove_digits:
      clean_str = re.sub(r'\d', ' ', clean_str)

    # 6. Stop_word removal
    if stop_word:
      clean_str = stopwrd_removal(clean_str)

    # 7. remove all white-space i.e., \n, \t, and extra_spaces
    clean_str = re.sub(r'  +', ' ', clean_str)
    clean_str = clean_str.replace("\n", " ").replace("\t", " ").strip()

    clean_corpus.append(clean_str)

  return clean_corpus

In [None]:
# 5. A function to create vector representation of query
def BERT_sentence_vec(query):
  """Return the BERT vector of a single query string.

  The query is cleaned with text_preprocessor (stop words and digits kept),
  tokenised to exactly 44 token ids (longer inputs are truncated, shorter
  ones padded) and passed through the `BERT` Keras model.

  Returns:
    The output of BERT.predict for the one-element batch.
  """
  clean_query = text_preprocessor([query],  stop_word = False, remove_digits = False)
  # padding = 'max_length' pads every sequence up to max_length; it is the
  # replacement for the deprecated pad_to_max_length = True flag.
  tokens = bert_tokenizer.batch_encode_plus(clean_query, truncation = True, max_length =  44, padding = 'max_length')

  input_ids = np.array(tokens['input_ids'])
  attn_mask = np.array(tokens['attention_mask'])
  token_typ_ids = np.array(tokens['token_type_ids'])
  bert_out = BERT.predict([input_ids, attn_mask, token_typ_ids])
  return bert_out

In [None]:
# cross checking the current results with previous results
# NOTE(review): this sums signed differences, so positive and negative
# deviations can cancel; a max-abs difference would be a stricter check.
query = tbs_df['Title'][50]
sum(bert_embeddings[50] - BERT_sentence_vec(query)[0]), query

(5.094334483146667e-07,
 'How would you explain Markov Chain Monte Carlo (MCMC) to a layperson?')

In [None]:
%%time
# 6. Final testing of BERT model with query point
# Embed one free-text query and print the shape of the returned array.
query = "How does deepmind's Atari game AI work?"
bert_out = BERT_sentence_vec(query)
print(bert_out.shape)

(1, 768)
CPU times: user 70.4 ms, sys: 3.03 ms, total: 73.5 ms
Wall time: 61.9 ms


### 6.2. USE Embeddings

In [None]:
# 1. Loading USE vector representation of all questions in the dataset
# (array saved in .npy format; later indexed by dataset row number)
use_embeddings = np.load('/content/drive/My Drive/AAIC Course/Personal case study - StackOverflow/use_embeddings.npy')

In [None]:
# 2. Loading pretrained USE model
# (fetched from TF-Hub by URL: universal-sentence-encoder-large, version 5)
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-large/5'.
INFO:absl:Downloaded https://tfhub.dev/google/universal-sentence-encoder-large/5, Total size: 577.10MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder-large/5'.


In [None]:
# 3. A function to create vector representation of query
def USE_sentence_vec(query):
  """Return the USE model output for a single query string.

  The query is cleaned with text_preprocessor (stop words and digits kept)
  and the resulting one-element list is fed to `use_model`.
  """
  return use_model(text_preprocessor([query],  stop_word = False, remove_digits = False))

In [None]:
# cross checking the current results with previous results
# NOTE(review): this sums signed differences, so positive and negative
# deviations can cancel; a max-abs difference would be a stricter check.
query = tbs_df['Title'][100000]
sum(use_embeddings[100000] - USE_sentence_vec(query)[0]), query

(<tf.Tensor: shape=(), dtype=float32, numpy=7.386261e-07>,
 'Why would a BaggingRegressor only use a subset of samples and features during fitting?')

In [None]:
%%time
# 4. Final testing of USE model with query point
# Embed one free-text query and print the shape of the returned tensor.
query = "How does deepmind's Atari game AI work?"
use_out = USE_sentence_vec(query)
print(use_out.shape)

(1, 512)
CPU times: user 34.2 ms, sys: 4.3 ms, total: 38.5 ms
Wall time: 29.3 ms


### 7. Ranking : compute cos-sim based results

In [None]:
from numpy.linalg import norm
def cos_sim(a, b):
  """Cosine similarity between two 1-D vectors.

  Returns:
    dot(a, b) / (|a| * |b|), or 0.0 when either vector has zero norm. The
    guard keeps NaN out of the similarity list, where it would make the
    ranking sort order undefined.
  """
  denom = norm(a)*norm(b)
  if denom == 0:
    return 0.0
  return np.dot(a, b)/denom

def inverse_euc_dist(a, b):
  """Reciprocal of the Euclidean distance between `a` and `b`."""
  return 1/norm(a-b)

**Final search results using : BERT embeddings**

In [52]:
def BERT_results(query, n = 10):
  """Return the titles of the `n` candidates most similar to `query` under BERT.

  Candidates come from all_results_idx; each candidate's stored BERT vector
  is compared to the query vector with cos_sim and the candidates are
  ordered by decreasing similarity.
  """
  all_idx, BM25_dict = all_results_idx(query)
  query_vector = BERT_sentence_vec(query)[0]

  # (row index, similarity) pairs, best match first
  scored = [(row, cos_sim(query_vector, vec)) for row, vec in zip(all_idx, bert_embeddings[all_idx])]
  ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
  cos_sim_idx = [row for row, _ in ranked]

  return tbs_df.Title.values[cos_sim_idx][:n]

In [54]:
%%time
# Top-10 BERT-ranked titles for one sample query.
# some sample queries to try: 
# tensorflow vs pytorch # difference between tensorflow and pytorch # keras accuracy stuck # change keras backend
# what is the best deep learning library for scala # install nltk # optimizing overfitted models
results = BERT_results(query = "change keras backend", n = 10)
print(results)

['Keras transpose' 'Reflective padding as pure keras verision'
 'Problems with accuracy.score sklearn'
 'SKLearn DT regressor - good enough score?'
 'Keras bug NasNetlarge no top' 'What to do after GridSearchCV()?'
 'Keras prediction' 'how to change keras backend in windows?'
 'Tips for optimizing my Keras Model'
 'Error in training a merged model in Keras']
CPU times: user 504 ms, sys: 5.74 ms, total: 510 ms
Wall time: 495 ms


**Final search results using : USE embeddings**

In [57]:
def USE_results(query, n = 10):
  """Return the titles of the `n` candidates most similar to `query` under USE.

  Args:
    query: raw query string.
    n: number of titles to return (default 10).

  Returns:
    Array of up to `n` question titles, ordered by decreasing cosine
    similarity between the query's USE vector and each candidate's stored
    USE vector. Candidates come from all_results_idx.
  """
  all_idx, BM25_dict = all_results_idx(query)
  USE_corpus = use_embeddings[all_idx]

  query_vector = USE_sentence_vec(query)[0]
  cos_sim_lst = [cos_sim(query_vector, b) for b in USE_corpus]

  d = dict(zip(all_idx, cos_sim_lst))
  cos_sim_idx = list(dict(sorted(d.items(), key=lambda x: x[1], reverse=True)).keys())

  # honour the caller's `n` rather than a fixed cut-off of 10
  return tbs_df.Title.values[cos_sim_idx][:n]

In [58]:
%%time
# Top-10 USE-ranked titles for the same sample query used in the BERT demo.
results = USE_results(query = "change keras backend", n = 10)
print(results)

['Switching Keras backend Tensorflow to GPU'
 'how to change keras backend in windows?'
 'Converting a Keras model to PyTorch' 'Changing padding values in Keras'
 'Keras custom layer using tensorflow function' 'Keras Import Error'
 'How do i pass data into keras?' 'Keras update N batches'
 'Feed data into Keras LSTM layer' 'Feed data into Keras LSTM layer']
CPU times: user 505 ms, sys: 4.79 ms, total: 510 ms
Wall time: 501 ms


# Crucial points to note:
**1. The purpose of this case study is to build simple search engine with LOW latency.**

**2. By using the right data structures in our mechanism, we get results in under 800-900 milliseconds on a normal 8 GB machine.**

# Observations : 
    1. The pretrained BERT model is not trained to capture semantic relationships in the first place. It is trained on two tasks: NSP (next sentence prediction) and MLM (masked language modelling).
    2. The corpus on which the BERT model is trained is general Wikipedia data, but our StackOverflow corpus is full of mathematical, computer science and machine learning technical terms.
    3. The Universal Sentence Encoder (USE) model, by contrast, is trained to capture semantic relationships with contextual meaning.
    
# Conclusion :
    1. Hence the BERT model fails to give good results compared to the USE model.
    2. We need to fine-tune the BERT model on our technical corpus to get good results with BERT.
    
- In our case the USE embeddings performed best. A search mechanism built on USE vectors gives great results, with a clear semantic relationship between the query and the returned results.

# 8. Comparison with stackoverflow.com results

find all results here : https://imgur.com/a/9XRVEOd

### q1  =  'How to reverse a linked list in python'
**top 5 results :**

 <img src='https://i.imgur.com/rbCbWua.png' width="800">

 **next 5 results :**

 <img src='https://i.imgur.com/6B4bNU1.png' width="800">


In [60]:
%%time
# q1, ranked with BERT embeddings
results = BERT_results(query = "How to reverse a linked list in python", n = 10)
print(results)

['How to anonymize (de-identify) data in Python?'
 'How to run AgglomerativeClustering on a big data in python?'
 'How to deal with non-numeric missing values with python'
 'How to convert a SQLContext Dataframe to RDD of vectors in Python?'
 'How to stratify a dataset to keep groups of data together in Python?'
 'How to work with large amount of data overcoming RAM issues in python'
 'How to create indexes from "open" variables'
 'How to convert categorical data to numerical data in Pyspark'
 'How to replace short words into full words from tweets using python'
 'How can I detect partially obscured objects using Python?']
CPU times: user 680 ms, sys: 1.61 ms, total: 682 ms
Wall time: 664 ms


In [63]:
%%time
# q1, ranked with USE embeddings
results = USE_results(query = "How to reverse a linked list in python", n = 10)
print(results)

['print the nodes of an immutable single-linked list in reverse order'
 'Using singly linked list instead of a doubly linked list?'
 'How to find middle element of doubly linked list using head and tail?'
 'How to find middle element of linked list in one pass?'
 'Correct way to implement linked list'
 'Sort doubly linked list efficiently'
 'Finding the nth to last node in a linked list'
 'concatenating the content of list in python'
 'A question about linked list' 'Linked lists with auxiliary data']
CPU times: user 671 ms, sys: 4.44 ms, total: 676 ms
Wall time: 676 ms


### q2  = 'valueerror'

**top 5 results :**

<img src='https://i.imgur.com/8SlM6lH.png' width="800">

**next 5 results :**

 <img src='https://i.imgur.com/AYcDUsI.png' width="800">

In [64]:
%%time
# q2, ranked with BERT embeddings
results = BERT_results(query = "valueerror", n = 10)
print(results)

['Keras save model FailedPreconditionError'
 'Keras mnist.load_data() unshuffled?' 'Groupby product, return tuple'
 "AttributeError: 'numpy.ndarray' object has no attribute 'predict'"
 'LogisticRegression - binary classification, "custom threshold"'
 'Keras input dimension bug?' 'XGBClassifier default scoring metric'
 'OneVsRestClassifier and predict_proba' 'Keras Custom Loss Function'
 'liblinear one vs rest learn parameters']
CPU times: user 294 ms, sys: 6.08 ms, total: 300 ms
Wall time: 285 ms


In [66]:
%%time
# q2, ranked with USE embeddings
results = USE_results(query = "valueerror", n = 10)
print(results)

['ValueError from statsmodels ExponentialSmoothing'
 'Getting a ValueError from train_test_split'
 'XGBClassifier error! ValueError: feature_names mismatch:'
 'train_test_split ValueError: Input contains NaN'
 'ValueError: not enough values to unpack (expected 4, got 2)'
 "ValueError: ('Error when checking model input: expected no data, but got:', array)"
 'ValueError while Comparing with Multilevel Index'
 'ValueError while using linear regression'
 "TypeError: unhashable type: 'numpy.ndarray'"
 "TypeError: unhashable type: 'numpy.ndarray''"]
CPU times: user 272 ms, sys: 6.96 ms, total: 279 ms
Wall time: 273 ms


### q3  = 'matplotlib'

**top 5 results :**

<img src='https://i.imgur.com/EryAZE7.png' width="800">

**next 5 results :**

 <img src='https://i.imgur.com/aBc6X0O.png' width="800">

In [67]:
%%time
# q3, ranked with BERT embeddings
results = BERT_results(query = "matplotlib", n = 10)
print(results)

['Conceptual clustering with sklearn?'
 'Sklearn StratifiedKFold code explanation'
 'Should I delete the intercept' 'Feature Importance from a GridSearchCV'
 'Does $\\mathbb{P}(XY=a)=\\mathbb{P}(X=a)\\mathbb{P}(Y=a)$ for $X,Y$ id?'
 'Decision Trees split in scikit' 'mvBacon from R to Python'
 'MinMaxScaler broadcast shapes'
 "Scikit-learn's SGDClassifier code question"
 'Using TF-IDF with other features in SKLearn']
CPU times: user 282 ms, sys: 5.84 ms, total: 288 ms
Wall time: 273 ms


In [65]:
%%time
# q3, ranked with USE embeddings
results = USE_results(query = "matplotlib", n = 10)
print(results)

['Using Matplotlib' 'matplotlib graph to plot values and variance'
 'How to plot a contour map in python using matplotlib?'
 'How do I fix mis-rendered matplotlib?'
 'How to annotate labels in a 3D matplotlib scatter plot?'
 'Error plotting with datetime and value using matplotlib in python'
 'A better way of visualizing extreme oscillation curve in matplotlib?'
 'How to plot a 3-axis bar chart with matplotlib (and pandas + jupyter)'
 'How to plot 2D or 3D graph using Python?'
 'Matplotlib Plot Difference Between Two Unsorted Value Series']
CPU times: user 277 ms, sys: 1.71 ms, total: 279 ms
Wall time: 269 ms
