<h1>Get results functions</h1>

In this file we define the results function for both tecnologies that calculates the most similar records to the user input in our data and return it to the chatbot.

In [None]:
import pandas as pd
import gensim
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
%run create_embeddings.ipynb
import preprocessing
import tfidf
import scipy
#pd.set_option('max_colwidth', 5000)

<h2>Import data</h2>

In [None]:
#Data on Stack Overflow DB already preprocessed
data_stack = pd.read_csv('DB/Preprocessed_data.csv')
data_stack.set_index('id', inplace=True)

#Create the data to return results for tfidf
data_stack_tfidf = data_stack.dropna()

data_stack = data_stack[data_stack['processed_title'].notna()]

#Data on Ontology DB already preprocessed
data_onto = pd.read_csv('DB/Preprocessed_ontology.csv')

#Matrix for stack tfidf processing 
stack_corpus = data_stack['processed_title'] + data_stack['questions_processed'] + data_stack['answers_processed'] 
stack_corpus = stack_corpus.dropna()

#Matrix for ontology tfidf processing
onto_corpus = data_onto.description_processed

In [None]:
#Import word to vec model 
word_2_vec_model = gensim.models.word2vec.Word2Vec.load('DB/word2vec_trained.bin')
word_2_vec_model

In [None]:
#Fucntion to clear raw txt 
def normalize(txt):
    phrase = preprocessing.clear_text(txt)
    return phrase

In [None]:
#Import title embeddings 
embeddings = pd.read_csv('DB/titleEmbeddings.csv')

idList = embeddings['id']
embeddings = embeddings.drop(columns = ['id'])


<h2>Word to vec results</h2>

In [None]:
#Function to calculate cosine similarity between user raw txt and stack overflow processed title with word to vec 

def w2v_stack_result(txt):
    stringSearch = normalize(txt)

    numberResult = 5

    #Vectorize user query
    vectorSearch = np.array([questionToVec(stringSearch, word_2_vec_model)])
    
    #Load embeddings 
    allTitleEmb = embeddings.values

    #Cosine similarity between titles and user query vectorized
    similarityCosine = pd.Series(cosine_similarity(vectorSearch, allTitleEmb)[0])
    
    #Lista che conterrà coppie i,j con i indice e j score di similarità
    results = []
    
    #Una volta calcolati gli score di similarità rendo i 10 più alti
    for i,similiarity_score in similarityCosine.nlargest(numberResult).iteritems():
        #Filtro i tag da rendere come risultato, eliminando quelli che sono già presenti nella string search per facilitare prossimi passaggi 
        idResults = idList[i]
        tags = data_stack.loc[idResults].tags.split('|')
        stack_id = idResults
        stack_title = data_stack.loc[idResults].title
        
        results.append(tuple([stack_id, similiarity_score, tags, stack_title ]))
        
    return results

In [None]:
##Function to calculate cosine similarity between raw txt and function description of ontology with word to vec 
def w2v_ontology_result(txt):
    stringSearch = normalize(txt)

    #print(stringSearch)

    numberResult = 3

    #Viene vettorizzata la query dell'utente

    vectorSearch = np.array([questionToVec(stringSearch, word_2_vec_model)])
    
    #Carico il file con gli embeddings dei titoli sui quali effettuare la corrispondenza
    allFuncEmb = pd.read_csv('DB/functionEmbeddings.csv').values

    #Calcolo della similarità del coseno for le query e tutti i titoli

    similarityCosine = pd.Series(cosine_similarity(vectorSearch, allFuncEmb)[0])
    
    #Lista che conterrà coppie i,j con i indice e j score di similarità
    results = []
    
      
    #Una volta calcolati gli score di similarità rendo i 10 più alti
    for i,similiarity_score in similarityCosine.nlargest(numberResult).iteritems():
        #print(i, data.Body[i])
        #Filtro i tag da rendere come risultato, eliminando quelli che sono già presenti nella string search per facilitare prossimi passaggi 
        description = data_onto.iloc[[i]].description.item()
        function = data_onto.iloc[[i]].function.item()
                    
        results.append(tuple([function,description, similiarity_score]))
      
    return results

In [None]:
#Function to calculate cosine similarity between user raw txt and stack overflow processed title with tf-idf 
def tfidf_stack_result(txt):
    
    stringSearch = normalize(txt)
    related_docs_indices,similarity_values = tfidf.get_results(stack_corpus, stringSearch)
    
    results = []
    i = 0 
    
    for idx in related_docs_indices:
        
        stack_id = data_stack_tfidf.iloc[idx].name
        stack_title = data_stack_tfidf.iloc[idx].title
        tags = data_stack_tfidf.iloc[idx].tags.split('|')
        sim_value = similarity_values[i]
        
        i = i+1
        
        results.append(tuple([stack_id, sim_value, tags, stack_title ]))
    
    return results

In [25]:
#Function to calculate cosine similarity between user raw txt and stack overflow processed title with tf-idf 
def tfidf_onto_result(txt):
    stringSearch = normalize(txt)
    related_docs_indices,similarity_values = tfidf.get_results(onto_corpus, stringSearch)
    
    results = []
    i=0
    
    for idx in related_docs_indices:
        
        description = data_onto.iloc[[idx]].description.item()
        function = data_onto.iloc[[idx]].function.item()
        sim_value = similarity_values[i]
        
        i = i+1
        
        results.append(tuple([function,description, sim_value]))
    
    return results

Example results 

In [27]:
w2v_stack_result('problem loading csv')

[(52303121, 0.7828488355884394, ['csv', 'geocode'], 'CSV uploading issue'),
 (45473164,
  0.7819319109892543,
  ['csv', 'neo4j', 'cypher'],
  'Loading Data from CSV to Neo4j'),
 (51687602,
  0.7669548453682289,
  ['numpy'],
  'issue when loading a data file with numpy'),
 (30718958,
  0.7659812488548408,
  ['csv'],
  'Issue with Outputting data from CSV File'),
 (62863078,
  0.7638483273148219,
  ['pandas'],
  'Problem loading a compressed (.gz) .csv file from url')]

In [30]:
tfidf_stack_result('problem loading csv')

[0.49044952 0.4693495  0.46120036 0.4378283 ]


[(17673314,
  0.4904495215566453,
  ['mysql', 'datetime'],
  'Date Time convert from CSV to MySQL'),
 (68513445,
  0.4693494967501211,
  ['pandas', 'dataframe', 'csv', 'export-to-csv'],
  'Append Pandas dataframe to top of csv file without loading csv file content'),
 (56635383,
  0.46120035712587226,
  ['csv'],
  'Compare two CSV files and create a new CSV'),
 (29016807,
  0.43782829543979157,
  ['file', 'csv', 'import-from-csv'],
  'CSV file based on other files data')]

In [26]:
tfidf_onto_result('How to switch axes in matplotlib?')

[('javax.management.monitor.CounterMonitorMBean-getNotify()',
  "Gets the notification's on/off switch value.   ",
  0.4709165875675452),
 ('javax.management.monitor.StringMonitorMBean-getNotifyMatch()',
  "Gets the matching notification's on/off switch value.   ",
  0.3967464503165317),
 ('javax.management.monitor.GaugeMonitorMBean-getNotifyLow()',
  "Gets the low notification's on/off switch value.   ",
  0.3878863765875934),
 ('javax.management.monitor.GaugeMonitorMBean-getNotifyHigh()',
  "Gets the high notification's on/off switch value.   ",
  0.38516534840187544)]

In [29]:
w2v_stack_result('combine list of list')

[(32840468,
  0.9165012689195305,
  ['list'],
  'combining list of list to single list Python'),
 (53249949,
  0.9130771175085204,
  ['list', 'zip', 'tuples'],
  'How to combine a string list with a list of lists of integers'),
 (67668893,
  0.9116054752341257,
  ['list', 'numpy'],
  'How to change two separate list of list to single list of list?'),
 (64295178,
  0.9079844119741829,
  ['list'],
  'Appending list to a list does not generate list of lists'),
 (38604805, 0.9056062631734002, ['list'], 'Convert list into list of lists')]