# Question Similarity Engine

## 0. Load required packages, functions and pickle objects

In [228]:
from IPython.display import HTML
import random

def hide_toggle(for_next=False):
    """Build a clickable link that shows/hides the input area of a notebook cell.

    The returned HTML defines a uniquely named JavaScript function that
    toggles ``div.input`` of the targeted cell, plus an anchor that calls it.

    Args:
        for_next: When false (default) the link controls the currently
            selected cell. When true it controls the cell that follows, and
            the current cell's own input is hidden as soon as the script runs.

    Returns:
        An ``IPython.display.HTML`` object wrapping the script and the link.
    """
    # jQuery selector for the cell that is selected while this code runs.
    current = """$('div.cell.code_cell.rendered.selected')"""

    if for_next:
        selector = current + '.next()'
        link_label = 'Toggle show/hide' + ' next cell'
        # Only when steering the next cell do we hide this cell's own code.
        hide_current_js = current + '.find("div.input").hide();'
    else:
        selector = current
        link_label = 'Toggle show/hide'
        hide_current_js = ''

    # A random suffix keeps function names from colliding when several
    # toggles are rendered in the same page.
    function_name = 'code_toggle_{}'.format(str(random.randint(1,2**64)))

    template = """
        <script>
            function {f_name}() {{
                {cell_selector}.find('div.input').toggle();
            }}

            {js_hide_current}
        </script>

        <a href="javascript:{f_name}()">{toggle_text}</a>
    """
    markup = template.format(
        toggle_text=link_label,
        js_hide_current=hide_current_js,
        cell_selector=selector,
        f_name=function_name,
    )
    return HTML(markup)

# Start with loading all necessary libraries
#Importing general purpose python libraries below
import pandas as pd
import numpy as np
import sklearn
import warnings
warnings.filterwarnings("ignore")
import datetime
#tqdm is great for measuring progress on long-running tasks
from tqdm import tqdm, tqdm_notebook
from tqdm.notebook import tqdm
tqdm.pandas()
import pickle

#For charts and visualizations
import matplotlib.pyplot as plt
#% matplotlib inline
import seaborn as sns

#For Wordclouds
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

#For NLP & Feature Extraction
from textatistic import Textatistic
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

#For modeling
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
from sklearn.metrics import precision_recall_curve, auc, roc_curve
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score

# Import the classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score

#Read dataset
# Loaded once at module level; run_demo() selects its qid1, question1,
# q1_cleaned, entities1, entity_types1, q1len and q1_n_words columns.
df = pd.read_csv('final/df_final_20210405.csv')

#Initialize Functions
def normalized_word_share(row):
    """Return the share of distinct words common to ``question1`` and ``question2``.

    Each question has its ``?`` characters removed, is split on single
    spaces, and every piece is lower-cased and stripped. The score is
    ``2 * |shared| / (|words1| + |words2|)``, so identical word sets give 1.0
    and disjoint ones give 0.0.

    Note that splitting on ``" "`` yields empty-string tokens for empty
    questions or consecutive spaces, and those count as words here.

    Args:
        row: Mapping (e.g. a DataFrame row) with string values under
            ``'question1'`` and ``'question2'``.

    Returns:
        The normalized word share as a float.
    """
    def _word_set(question):
        pieces = question.replace('?','').split(" ")
        return {piece.lower().strip() for piece in pieces}

    first_words = _word_set(row['question1'])
    second_words = _word_set(row['question2'])
    overlap = first_words & second_words
    return 2.0 * len(overlap)/(len(first_words) + len(second_words))
def preprocess_entities(text):
    """Extract the named entities found in *text*.

    The module-level ``nlp`` pipeline is run with its parser disabled.

    Args:
        text: Raw text to analyse.

    Returns:
        A 2-tuple ``(texts, labels)`` of parallel lists: the surface text of
        each entity and its entity label.
    """
    entities = nlp(text, disable = ['parser']).ents
    entity_texts = []
    entity_labels = []
    for entity in entities:
        entity_texts.append(entity.text)
        entity_labels.append(entity.label_)
    return entity_texts, entity_labels
#Tokenization and Lemmatization
def preprocess(text):
    """Lower-case, lemmatize and filter *text* into a space-joined string.

    The module-level ``nlp`` pipeline is run on the lower-cased text with the
    NER and parser components disabled. A lemma is kept only if it is purely
    alphabetic and is not in ``STOP_WORDS``.

    Args:
        text: Raw text to clean.

    Returns:
        The surviving lemmas joined by single spaces (possibly an empty string).
    """
    doc = nlp(text.lower(), disable=['ner', 'parser'])
    kept_lemmas = []
    for token in doc:
        lemma = token.lemma_
        # Drop punctuation/numbers/mixed tokens first, then stop words.
        if not lemma.isalpha():
            continue
        if lemma in STOP_WORDS:
            continue
        kept_lemmas.append(lemma)
    return ' '.join(kept_lemmas)

#Load Pickle Objects
# NOTE(security): unpickling can execute arbitrary code. Only load artifacts
# under final/ that were produced by this project.
def _load_pickle(path):
    """Unpickle and return the object stored at *path*.

    The file is opened in a ``with`` block so its handle is closed as soon as
    loading finishes instead of being left open.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


tfid_vectorizer = _load_pickle("final/tfid_vectorizer.p")
q1weights = _load_pickle("final/q1weights.p")
scaler = _load_pickle("final/scaler.p")
model = _load_pickle("final/model.p")
clf = _load_pickle("final/clf.p")
doc_embeddings1 = _load_pickle("final/doc_embeddings1.p")
sbert_model = _load_pickle("final/sbert_model.p")
features = _load_pickle("final/features.p")

#load nlp spacy lg model
# final/nlp.p is intentionally not unpickled any more: the name `nlp` was
# rebound by the spacy.load() call below before anything used it, so the
# pickled copy was loaded only to be discarded.
nlp = spacy.load('en_core_web_lg')

def _content_words(text):
    """Return the non-stop-word tokens of *text* as an insertion-ordered dict.

    *text* is passed through ``str()``, lower-cased and split on whitespace.
    A dict (token -> None) is used so duplicates collapse, first-seen order
    is kept, and membership tests are O(1).
    """
    return dict.fromkeys(
        word for word in str(text).lower().split() if word not in STOP_WORDS
    )


def _tfidf_word_match(q1_texts, q1_weights, q2_words, q2_weights):
    """Score each stored question against the user question by shared TF-IDF weight.

    For every text in *q1_texts* (position ``ix``) the score is the sum of the
    weights of words present in both questions divided by the sum of the
    weights of all words of both questions. ``q1_weights[ix]`` supplies the
    weights for the stored question, *q2_weights* those of the user question.

    Returns:
        A list with one score per entry of *q1_texts*; 0 when either question
        has no non-stop-word tokens.
    """
    scores = []
    for ix, q1_text in enumerate(q1_texts):
        q1_words = _content_words(q1_text)
        if not q1_words or not q2_words:
            # The computer-generated chaff includes a few questions that are nothing but stopwords
            scores.append(0)
            continue

        row_weights = q1_weights[ix]
        shared_weights = (
            [row_weights[w] for w in q1_words if w in row_weights and w in q2_words]
            + [q2_weights[w] for w in q2_words if w in q2_weights and w in q1_words]
        )
        total_weights = (
            [row_weights[w] for w in q1_words if w in row_weights]
            + [q2_weights[w] for w in q2_words if w in q2_weights]
        )
        # If no word of either question carries a weight this is 0/0 -> NaN;
        # run_demo() later passes the scaled matrix through np.nan_to_num.
        scores.append(np.sum(shared_weights) / np.sum(total_weights))
    return scores


def run_demo(user_input_question=None):
    """Score every stored question in ``df`` against one user question.

    Builds the pairwise features named in ``features`` (length and word-count
    differences, word share, TF-IDF word match, sentence-embedding cosine
    similarity, entity-count difference), scales them with ``scaler`` and
    attaches the model outputs.

    Args:
        user_input_question: The question to match. When ``None`` (default)
            it is read from standard input via ``input()``, which is what the
            notebook relies on.

    Returns:
        A new DataFrame with one row per row of ``df`` holding the stored
        question columns, the computed feature columns, ``model_score``
        (``model.predict_proba`` column 1) and ``model_pred`` (``clf.predict``).
    """
    if user_input_question is None:
        user_input_question = input()

    # initial set of features
    # .copy() makes it explicit that columns are added to a private frame and
    # never written back to the module-level `df`.
    demo_df = df[['qid1', 'question1', 'q1_cleaned', 'entities1',
                  'entity_types1', 'q1len', 'q1_n_words']].copy()
    demo_df['question2'] = user_input_question
    demo_df['q2len'] = demo_df['question2'].str.len()
    demo_df['qlen_diff'] = abs(demo_df['q1len'] - demo_df['q2len'])
    demo_df['q2_n_words'] = demo_df['question2'].apply(lambda row: len(row.split(" ")))
    demo_df['q_n_words_diff'] = abs(demo_df['q1_n_words'] - demo_df['q2_n_words'])
    demo_df['word_share'] = demo_df.apply(normalized_word_share, axis=1)

    # preprocess user input
    q2_cleaned = preprocess(user_input_question)
    demo_df['q2_cleaned'] = q2_cleaned

    # create feature tfidf_word_match
    demo_tfidf_wm = tfid_vectorizer.transform([user_input_question])
    # Column i of the transformed row belongs to the i-th vocabulary term of
    # the vectorizer. The original code read these names from an undefined
    # global (`tfidf_tokens`); take them from the vectorizer instead.
    # get_feature_names_out() exists from scikit-learn 1.0; older releases
    # only provide get_feature_names().
    get_names = getattr(tfid_vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = tfid_vectorizer.get_feature_names
    vocabulary_terms = get_names()
    q2_row = demo_tfidf_wm.toarray()[0]
    demo_q2weights = {
        str(vocabulary_terms[col]): q2_row[col]
        for col in np.flatnonzero(q2_row > 0)
    }

    # The user's word set is the same for every row, so build it once.
    q2_words = _content_words(q2_cleaned)
    demo_df['tfidf_word_match'] = _tfidf_word_match(
        demo_df['q1_cleaned'], q1weights, q2_words, demo_q2weights
    )

    #bert cosine similarity scores
    input_embedding = sbert_model.encode(user_input_question)
    input_norm = np.linalg.norm(input_embedding, ord=2)  # loop-invariant
    # doc_embeddings1 is indexed positionally exactly as before; its container
    # type comes from a pickle and is not visible here.
    bert_cosine_sim = [
        float(np.dot(doc_embeddings1[i], input_embedding.T)
              / (np.linalg.norm(doc_embeddings1[i], ord=2) * input_norm))
        for i in range(len(doc_embeddings1))
    ]
    demo_df['bert_cosine_sim_'] = bert_cosine_sim

    # Named Entity Recognition Features
    q2_ents = len(preprocess_entities(user_input_question)[0])
    #Compare entities between questions
    # NOTE(review): `df` comes from read_csv, so `entities1` may hold the
    # string form of a list, in which case len() counts characters rather
    # than entities. Left unchanged to stay consistent with however the
    # model's training features were computed -- confirm.
    demo_df['diff_num_entities'] = (demo_df['entities1'].map(len) - q2_ents).abs()

    X = scaler.transform(demo_df[features])
    X = np.nan_to_num(X)
    print('Model Ready!')
    # NOTE(review): the score comes from `model` while the label comes from
    # `clf`; presumably intentional, verify they are meant to differ.
    yproba = model.predict_proba(X)[:, 1]
    y_pred = clf.predict(X)

    demo_df['model_score'] = yproba
    demo_df['model_pred'] = y_pred

    return demo_df

## 1. Enter Question

In [238]:
# Reads one question from standard input and returns the scored question table.
df1=run_demo()

How can I combat climate change?
Model Ready!


## 2. View Similar Questions

In [239]:
# Show the five stored questions with the highest model_score.
df1[['question1','model_score','model_pred']].nlargest(5,'model_score')

Unnamed: 0,question1,model_score,model_pred
40083,What important steps can we take as individuals to combat climate change?,0.631991,1
20290,Can climate change be reversed?,0.622994,1
35692,Is Climate change real?,0.599,1
64612,Will technology ever allow us to alter the climate?,0.545552,1
41759,What's the difference between climate change and global warming?,0.520883,1
