# The Task

Based on a user query, return an existing question which most closely resembles the user's query

In [52]:
# import libraries
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from sklearn import preprocessing

import spacy
from spacy.lang.en import English
import spacy
# Load the spaCy 'en_core_web_sm' pipeline once; tokenize_text() uses its tokenizer.
EN = spacy.load('en_core_web_sm')

from IPython.display import HTML
import logging
logging.getLogger('tensorflow').disabled = True #OPTIONAL - to disable outputs from Tensorflow

## Import the Data

In [67]:
# Load the preprocessed question table. Later cells read its title, URL, content,
# score and sentiment columns when ranking and rendering results.
data = pd.read_csv('Preprocessed_data.csv')
data

Unnamed: 0,post_corpus,question_content,question_url,tags,overall_scores_norm,sentiment_polarity,sentiment_subjectivity,processed_title,original_title,overall_scores
0,using match attribute python objects array nt ...,using match attribute python objects array nt ...,https://stackoverflow.com/questions/683,python|arrays|iteration,0.012227,0.163567,0.568209,using match attribute python objects array,Using 'in' to match an attribute of Python obj...,84
1,specifying mysql enum django model go specifyi...,specifying mysql enum django model go specifyi...,https://stackoverflow.com/questions/21454,python|mysql|django|django-models|enums,0.029648,0.035299,0.536159,specifying mysql enum django model,Specifying a mySQL ENUM in a Django model,196
2,unicode vs utf8 confusion python django stumbl...,unicode vs utf8 confusion python django stumbl...,https://stackoverflow.com/questions/22149,python|django|unicode,0.007405,0.082857,0.403250,unicode vs utf8 confusion python django,Unicode vs UTF-8 confusion in Python / Django?,53
3,cleanest fastest server setup django deploy me...,cleanest fastest server setup django deploy me...,https://stackoverflow.com/questions/26025,python|django|apache|hosting,0.008494,0.205105,0.483688,cleanest fastest server setup django,Cleanest & Fastest server setup for Django,60
4,always including user django template context ...,always including user django template context ...,https://stackoverflow.com/questions/41547,python|django|authentication|session|cookies,0.015027,0.052020,0.628824,always including user django template context,Always including the user in the django templa...,102
5,generate urls django django template language ...,generate urls django django template language ...,https://stackoverflow.com/questions/43290,python|django|url|django-urls,0.006472,0.080000,0.468333,generate urls django,How to generate urls in django,47
6,project design fs layout large django projects...,project design fs layout large django projects...,https://stackoverflow.com/questions/44135,python|django,0.004916,0.124153,0.470832,project design fs layout large django projects,Project design / FS layout for large django pr...,37
7,would make comma separated string list strings...,would make comma separated string list strings...,https://stackoverflow.com/questions/44778,python|list,0.139619,-0.014071,0.537851,would make comma separated string list strings,How would you make a comma-separated string fr...,903
8,best django search app building django project...,best django search app building django project...,https://stackoverflow.com/questions/55056,python|django|search|search-engine,0.026226,0.181002,0.492883,best django search app,What's the best Django search app?,174
9,ignore case python strings easiest way compare...,ignore case python strings easiest way compare...,https://stackoverflow.com/questions/62567,python|string|case-insensitive,0.013160,0.075911,0.541590,ignore case python strings,Ignore case in Python strings,90


## Import saved WordEmbeddings

In [54]:
# Load the saved Word2Vec model from disk; it is passed to question_to_vec()
# as the source of per-word vectors.
import gensim
w2v_model = gensim.models.word2vec.Word2Vec.load('models/SO_word2vec_embeddings.bin')

## Calculate Sentence Embeddings
In order to calculate the embedding for an entire sentence, I defined the following function, which averages the embeddings of each valid token (a token is valid if it is present in the word-embedding vocabulary).

In [55]:
def question_to_vec(question, embeddings, dim=300):
    """Embed a question as the mean of the vectors of its known words.

    Parameters
    ----------
    question : str
        Text to embed. It is split on single spaces; pieces that are not
        found in ``embeddings`` are ignored.
    embeddings : mapping-like or gensim Word2Vec model
        Any object supporting ``word in embeddings`` and ``embeddings[word]``.
        If the object exposes a ``wv`` attribute (as gensim ``Word2Vec``
        models do), lookups go through ``embeddings.wv`` instead, because
        gensim 4 no longer allows indexing the model object directly.
    dim : int, optional
        Length of the returned vector; it must match the length of the
        stored word vectors.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the vectors of all known words, or a zero
        vector of length ``dim`` when no word is known.
    """
    # Prefer the keyed-vector store when present; plain dicts have no `wv`
    # attribute and are used as-is.
    vectors = getattr(embeddings, 'wv', embeddings)

    total = np.zeros(dim)
    valid_words = 0
    for word in question.split(' '):
        if word in vectors:
            valid_words += 1
            total += vectors[word]

    # Avoid dividing by zero when nothing in the question is in the vocabulary.
    if valid_words > 0:
        return total / valid_words
    return total

In [5]:
# Disabled one-off precomputation, kept inside a string literal so it does not run:
# it builds one question_to_vec() embedding per processed title and writes them
# to models/title_embeddings.csv.
'''
all_title_embeddings = []
for title in data.processed_title:
    all_title_embeddings.append(question_to_vec(title, w2v_model))
all_title_embeddings = np.array(all_title_embeddings)

embeddings = pd.DataFrame(data = all_title_embeddings)
embeddings.to_csv('models/title_embeddings.csv', index=False)
'''

  """
  import sys


Since the set of titles is fixed, I saved the sentence embeddings for all titles in a .csv file to save computation time on future runs.

In [56]:
# Load the cached title embeddings as a plain NumPy array (.values drops the DataFrame wrapper).
all_title_embeddings = pd.read_csv('models/title_embeddings.csv').values

## Import the saved model

In [57]:
import keras.backend as K

# Custom loss function to handle multilabel classification task
def multitask_loss(y_true, y_pred):
    """Per-sample summed binary cross-entropy, averaged over the batch.

    Predictions are clipped to [epsilon, 1 - epsilon] so neither logarithm
    is evaluated at zero. The per-label terms are summed along axis 1 and
    the mean of those sums is returned.
    """
    eps = K.epsilon()
    clipped = K.clip(y_pred, eps, 1 - eps)
    # Contribution of the labels that are set ...
    positive_term = - y_true * K.log(clipped)
    # ... and of the labels that are not set.
    negative_term = (1 - y_true) * K.log(1 - clipped)
    per_sample = K.sum(positive_term - negative_term, axis=1)
    return K.mean(per_sample)

Using TensorFlow backend.


In [58]:
from keras.models import load_model
import keras.losses

# Attach the custom loss to the keras.losses module before loading the saved model.
# NOTE(review): presumably this lets load_model resolve 'multitask_loss' by name while
# deserializing; passing custom_objects={'multitask_loss': multitask_loss} to load_model
# is the documented alternative - confirm against the installed Keras version.
keras.losses.multitask_loss = multitask_loss
model = load_model('models/Tag_predictor.h5')

In [59]:
def predict_tags(text, include_neutral=True):
    """Predict tags for ``text`` with the loaded tag-prediction model.

    The text is turned into an index sequence by ``tokenizer``, padded to
    ``MAX_SEQUENCE_LENGTH`` and scored by ``model``. Every score strictly
    above 0.5 becomes 1 and every other score becomes 0; the resulting row
    is decoded with ``tag_encoder.inverse_transform`` and returned.

    ``include_neutral`` is accepted but not used.

    NOTE(review): ``tag_encoder`` is not defined in the visible part of this
    notebook - confirm it is created before this function is called.
    """
    sequences = tokenizer.texts_to_sequences([text])
    x_test = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    scores = model.predict([x_test])[0]
    # Binarise at the 0.5 threshold while keeping the score array's dtype.
    binarised = np.where(scores > 0.5, 1, 0).astype(scores.dtype)
    return tag_encoder.inverse_transform(np.array([binarised]))

In [60]:
import re
import nltk
import inflect
from nltk.corpus import stopwords

def tokenize_text(text):
    """Split ``text`` with the spaCy tokenizer and return lower-cased token strings.

    Tokens that spaCy marks as whitespace are left out.
    """
    lowered = []
    for tok in EN.tokenizer(text):
        if tok.is_space:
            continue
        lowered.append(tok.text.lower())
    return lowered

def to_lowercase(words):
    """Return a new list holding the lower-cased form of every token, in order."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from each token and drop tokens that end up empty.

    Every character that is neither a word character nor whitespace is
    removed; tokens consisting only of such characters disappear from the
    result.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Parameters
    ----------
    words : iterable of str
        Tokens to filter; they are compared as-is (no case folding here).

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stop word list, in their
        original order.
    """
    # Fetch the stop word list once instead of once per token, and keep it in
    # a set so each membership test is a hash lookup rather than a list scan.
    english_stopwords = frozenset(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def normalize(words):
    """Lower-case the tokens, then strip punctuation, then drop stop words."""
    lowered = to_lowercase(words)
    without_punctuation = remove_punctuation(lowered)
    return remove_stopwords(without_punctuation)

def tokenize_code(text):
    """A very basic procedure for tokenizing code strings.

    Returns every maximal run of word characters (letters, digits and
    underscore) found in ``text``, in order of appearance.

    The previous body called ``RegexpTokenizer``, a name this notebook never
    imports, so any call raised ``NameError``. ``re.findall`` with the same
    pattern yields the same tokens using the already-imported ``re`` module.
    """
    return re.findall(r'\w+', text)

def preprocess_text(text):
    """Tokenize ``text``, normalize the tokens and join them with single spaces."""
    tokens = tokenize_text(text)
    cleaned = normalize(tokens)
    return ' '.join(cleaned)

## Import the saved Tokenizer

In [61]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Passed as `maxlen` to pad_sequences() inside predict_tags().
MAX_SEQUENCE_LENGTH = 300
import pickle
# NOTE(review): pickle.load can execute arbitrary code from the file it reads;
# only load a tokenizer pickle that comes from a trusted source.
with open('models/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

## Getting the most similar results
The way we calculate the most similar results is by comparing how far each result is from the query in terms of distance. This can only be done if both the query and the results are in a shared vector space. **Fortunately, that is exactly what our word embeddings are for**. They represent each sentence as a vector in the embedding space, which makes it easy for us to compare them.

After we have those vectors, we can assign a **Similarity Measure** as a metric which measures the closeness of two vectors. Common examples are Cosine distance, Euclidean distance and more.

**However, for this specific task, I decided to assign a custom similarity measure**. It is defined as follows:

![Similarity Measure](jupyter_imgs/similaritymeasure.png)

- It uses the cosine similarity as the base measure
- It takes into account the popularity of the post based on the votes it has received by users at StackOverflow
- It takes into account the overall sentiment of the responses that people have made. A positive sentiment entails that the answers were helpful and thus is a good post 

In [95]:
import html
import logging

from IPython.display import HTML, display
from sklearn.metrics.pairwise import cosine_similarity

search_string = "Combine lists of lists"
# Apply the same tokenization/normalization pipeline to the query before embedding it
search_string = ' '.join(normalize(tokenize_text(search_string)))
results_returned = "5"
# Vectorize the user query; wrapped in a list so it is a single-row 2-D array
search_vect = np.array([question_to_vec(search_string, w2v_model)])

# Calculate cosine similarities between the query and all title embeddings
cosine_similarities = pd.Series(cosine_similarity(search_vect, all_title_embeddings)[0])

# Custom similarity measure: scale the cosine similarity by the normalized
# vote score (weight 0.4) and the sentiment polarity (weight 0.1)
cosine_similarities = cosine_similarities*(1 + 0.4*data.overall_scores_norm + 0.1*(data.sentiment_polarity))

output = ""
# Series.items() is used because Series.iteritems() was removed in pandas 2.0
for idx, score in cosine_similarities.nlargest(int(results_returned)).items():
    # Escape everything that comes from the dataset before placing it in HTML,
    # and quote the href so URLs containing special characters stay intact.
    url = html.escape(str(data.question_url[idx]), quote=True)
    title = html.escape(str(data.original_title[idx]))
    output += '<a target="_blank" href="' + url + '"><h2>' + title + '</h2></a>'
    output += '<h3> Similarity Score: ' + str(score) + '</h3>'
    output += '<h3> Stackover Votes: ' + str(data.overall_scores[idx]) + '</h3>'
    output += '<p style="font-family:verdana; font-size:110%;"> '
    # Preview the first 50 characters of the question content, word by word
    for word in data.question_content[idx][:50].split():
        shown = html.escape(word)
        # NOTE(review): this is a substring test against the whole query string,
        # so a word is bolded whenever it occurs anywhere inside the query text.
        if word.lower() in search_string:
            output += " <b>" + shown + "</b>"
        else:
            output += " " + shown
    output += "</p><hr>"

output = '<h3>Results:</h3>' + output
display(HTML(output))

  """
  import sys
