<h1>Get results functions</h1>

In this file we define the result functions for both technologies (Word2Vec and TF-IDF): each one finds the records in our data that are most similar to the user input and returns them to the chatbot.

In [1]:
import pandas as pd
import gensim
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
%run create_embeddings.ipynb
import preprocessing
import tfidf
import scipy
#pd.set_option('max_colwidth', 5000)

<h2>Import data</h2>

In [2]:
#Data on Stack Overflow DB already preprocessed, indexed by question id
data_stack = pd.read_csv('DB/Preprocessed_data.csv').set_index('id')

#Rows usable by the tf-idf search: every column must be present, because
#tfidf_stack_result reads title/tags from this frame BY POSITION
data_stack_tfidf = data_stack.dropna()

#The word2vec search only requires a processed title
data_stack = data_stack[data_stack['processed_title'].notna()]

#Data on Ontology DB already preprocessed
data_onto = pd.read_csv('DB/Preprocessed_ontology.csv')

#Corpus for stack tfidf processing.
#It is built from data_stack_tfidf (not data_stack) so that document k of the
#corpus is guaranteed to be row k of data_stack_tfidf. Building it from
#data_stack and dropping NaN afterwards only removes rows with a missing text
#field, whereas data_stack_tfidf also loses rows with a NaN in any other
#column; the two would then be shifted relative to each other and the tf-idf
#search would return the title/tags of the wrong question.
#NOTE(review): the three fields are concatenated without a separator, so the
#last token of one field may fuse with the first token of the next unless the
#preprocessed text already ends with whitespace - confirm.
stack_corpus = (
    data_stack_tfidf['processed_title']
    + data_stack_tfidf['questions_processed']
    + data_stack_tfidf['answers_processed']
)

#Corpus for ontology tfidf processing
onto_corpus = data_onto.description_processed

In [3]:
#Load the Word2Vec model stored in DB/word2vec_trained.bin; the w2v_* result
#functions below pass it to questionToVec to vectorize the user query
word_2_vec_model = gensim.models.word2vec.Word2Vec.load('DB/word2vec_trained.bin')
#Bare expression so the notebook displays the loaded model object
word_2_vec_model

<gensim.models.word2vec.Word2Vec at 0x1bf17762ac0>

In [4]:
def normalize(txt):
    """Clean raw text by delegating to ``preprocessing.clear_text``.

    Parameters
    ----------
    txt
        Raw text, e.g. the user query.

    Returns
    -------
    Whatever ``preprocessing.clear_text`` returns for ``txt``.
    """
    return preprocessing.clear_text(txt)

In [5]:
#Load the title embeddings table. The 'id' column is split off into idList,
#so `embeddings` keeps only the remaining columns and row k of `embeddings`
#corresponds to idList[k].
embeddings = pd.read_csv('DB/titleEmbeddings.csv')
idList = embeddings.pop('id')


<h2>Word to vec results</h2>

In [6]:
def w2v_stack_result(txt, numberResult=5):
    """Return the Stack Overflow questions most similar to ``txt`` (Word2Vec).

    The raw text is cleaned with ``normalize``, vectorized with
    ``questionToVec`` and compared by cosine similarity against every row of
    the module-level ``embeddings`` table.

    Parameters
    ----------
    txt : str
        Raw user query.
    numberResult : int, optional
        How many of the best-scoring questions to return (default 5).

    Returns
    -------
    list of tuple
        ``(stack_id, similarity_score, tags, stack_title)`` tuples ordered
        from the highest to the lowest similarity. ``tags`` is the list
        obtained by splitting the question's ``tags`` field on ``'|'``
        (empty list when the field is missing).
    """
    stringSearch = normalize(txt)

    # Vectorize the user query (2-D array with a single row, as required by
    # cosine_similarity)
    vectorSearch = np.array([questionToVec(stringSearch, word_2_vec_model)])

    # Cosine similarity between the query vector and every title embedding
    similarityCosine = pd.Series(
        cosine_similarity(vectorSearch, embeddings.values)[0]
    )

    results = []

    # Keep only the `numberResult` highest scores.
    # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
    for i, similarity_score in similarityCosine.nlargest(numberResult).items():
        stack_id = idList[i]
        # Single lookup of the matching question row (indexed by question id)
        row = data_stack.loc[stack_id]
        raw_tags = row['tags']
        # A missing tags value is read by pandas as NaN (a float), which has
        # no .split(); report it as "no tags" instead of crashing
        tags = raw_tags.split('|') if isinstance(raw_tags, str) else []

        results.append((stack_id, similarity_score, tags, row['title']))

    return results

In [7]:
def w2v_ontology_result(txt, numberResult=3):
    """Return the ontology functions most similar to ``txt`` (Word2Vec).

    The raw text is cleaned with ``normalize``, vectorized with
    ``questionToVec`` and compared by cosine similarity against the
    embeddings stored in ``DB/functionEmbeddings.csv``.

    Parameters
    ----------
    txt : str
        Raw user query.
    numberResult : int, optional
        How many of the best-scoring functions to return (default 3).

    Returns
    -------
    list of tuple
        ``(function, description, similarity_score)`` tuples ordered from the
        highest to the lowest similarity.
    """
    stringSearch = normalize(txt)

    # Vectorize the user query (2-D array with a single row, as required by
    # cosine_similarity)
    vectorSearch = np.array([questionToVec(stringSearch, word_2_vec_model)])

    # Embeddings to match against; row k is paired with row k of data_onto
    # below. NOTE(review): the file is re-read on every call.
    allFuncEmb = pd.read_csv('DB/functionEmbeddings.csv').values

    # Cosine similarity between the query vector and every function embedding
    similarityCosine = pd.Series(cosine_similarity(vectorSearch, allFuncEmb)[0])

    results = []

    # Keep only the `numberResult` highest scores.
    # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
    for i, similarity_score in similarityCosine.nlargest(numberResult).items():
        # i is a row position in the embeddings table, hence positional lookup
        match = data_onto.iloc[[i]]
        function = match['function'].item()
        description = match['description'].item()

        results.append((function, description, similarity_score))

    return results

In [8]:
def tfidf_stack_result(txt):
    """Return the Stack Overflow questions matched to ``txt`` by TF-IDF.

    The raw text is cleaned with ``normalize`` and handed, together with
    ``stack_corpus``, to ``tfidf.get_results``. Each index it yields is used
    as a row position in ``data_stack_tfidf``.

    Returns
    -------
    list of tuple
        ``(stack_id, 0, tags, stack_title)`` tuples in the order produced by
        ``tfidf.get_results``. The score slot is always ``0``; ``tags`` is the
        ``tags`` field split on ``'|'``.
    """
    query = normalize(txt)
    hits = tfidf.get_results(stack_corpus, query)

    def _as_result(position):
        # One positional lookup per hit; the row's name is the question id
        row = data_stack_tfidf.iloc[position]
        return (row.name, 0, row.tags.split('|'), row.title)

    return [_as_result(position) for position in hits]

In [9]:
def tfidf_onto_result(txt):
    """Return the ontology functions matched to ``txt`` by TF-IDF.

    The raw text is cleaned with ``normalize`` and handed, together with
    ``onto_corpus`` (the processed function descriptions), to
    ``tfidf.get_results``. Each index it yields is used as a row position in
    ``data_onto``.

    Returns
    -------
    list of tuple
        ``(function, description, 0)`` tuples in the order produced by
        ``tfidf.get_results``. The score slot is always ``0``.
    """
    query = normalize(txt)

    results = []
    for position in tfidf.get_results(onto_corpus, query):
        # Single-row frame for this hit; .item() unwraps the one cell value
        match = data_onto.iloc[[position]]
        results.append((match.function.item(), match.description.item(), 0))

    return results

Example results 

In [10]:
#Example: Word2Vec search over Stack Overflow titles; each tuple is (id, cosine similarity, tags, title)
w2v_stack_result('combine list of list')

[(32840468,
  0.9165012689195305,
  ['list'],
  'combining list of list to single list Python'),
 (53249949,
  0.9130771175085204,
  ['list', 'zip', 'tuples'],
  'How to combine a string list with a list of lists of integers'),
 (67668893,
  0.9116054752341257,
  ['list', 'numpy'],
  'How to change two separate list of list to single list of list?'),
 (64295178,
  0.9079844119741829,
  ['list'],
  'Appending list to a list does not generate list of lists'),
 (38604805, 0.9056062631734002, ['list'], 'Convert list into list of lists')]

In [11]:
#Example: same query through the TF-IDF search; the second tuple field is a constant 0 placeholder, not a score
tfidf_stack_result('combine list of list')

[(62794627,
  0,
  ['list'],
  'Combine list correspond to the row index of 2D list'),
 (43774214,
  0,
  ['list'],
  'Is there a way to combine the indexes inside a list of lists'),
 (60413135, 0, ['python-3.x', 'list'], 'How to combine two set of list?'),
 (19012467,
  0,
  ['list'],
  'Python: removing first item in each list of a group of lists, starting with the second list')]