<h1>Get results functions</h1>

In this file we define the results function for both tecnologies that calculates the most similar records to the user input in our data and return it to the chatbot.

In [1]:
import pandas as pd
import gensim
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
%run create_embeddings.ipynb
import preprocessing
import tfidf
import scipy
#pd.set_option('max_colwidth', 5000)

<h2>Import data</h2>

In [2]:
#Data on Stack Overflow DB already preprocessed
data_stack = pd.read_csv('DB/Preprocessed_data.csv')
data_stack.set_index('id', inplace=True)
data_stack = data_stack.dropna()

#Data on Ontology DB already preprocessed
data_onto = pd.read_csv('DB/Preprocessed_ontology.csv')

#Matrix for stack tfidf processing 
stack_corpus = data_stack['processed_title'] + data_stack['questions_processed'] + data_stack['answers_processed'] 
stack_corpus = stack_corpus.dropna()

#Matrix for ontology tfidf processing
onto_corpus = data_onto.description_processed

In [3]:
#Import word to vec model 
word_2_vec_model = gensim.models.word2vec.Word2Vec.load('DB/word2vec_trained.bin')
word_2_vec_model

<gensim.models.word2vec.Word2Vec at 0x2c558359c40>

In [4]:
#Fucntion to clear raw txt 
def normalize(txt):
    phrase = preprocessing.clear_text(txt)
    return phrase

In [5]:
#Import title embeddings 
embeddings = pd.read_csv('DB/titleEmbeddings.csv')

idList = embeddings['id']
embeddings = embeddings.drop(columns = ['id'])


<h2>Word to vec results</h2>

In [6]:
#Function to calculate cosine similarity between user raw txt and stack overflow processed title with word to vec 

def w2v_stack_result(txt):
    stringSearch = normalize(txt)

    numberResult = 5

    #Vectorize user query
    vectorSearch = np.array([questionToVec(stringSearch, word_2_vec_model)])
    
    #Load embeddings 
    allTitleEmb = embeddings.values

    #Cosine similarity between titles and user query vectorized
    similarityCosine = pd.Series(cosine_similarity(vectorSearch, allTitleEmb)[0])
    
    #Personalizzazione della misura di similarità
    similarityCosine = similarityCosine*(1+0.4*data_stack.score)
    
    #Lista che conterrà coppie i,j con i indice e j score di similarità
    results = []
    
    #Una volta calcolati gli score di similarità rendo i 10 più alti
    for i,similiarity_score in similarityCosine.nlargest(numberResult).iteritems():
        #Filtro i tag da rendere come risultato, eliminando quelli che sono già presenti nella string search per facilitare prossimi passaggi 
        idResults = idList[i]
        tags = data_stack.loc[idResults].tags.split('|')
        stack_id = idResults
        stack_title = data_stack.loc[idResults].title
        
        results.append(tuple([stack_id, similiarity_score, tags, stack_title ]))
        
    return results

In [7]:
##Function to calculate cosine similarity between raw txt and function description of ontology with word to vec 
def w2v_ontology_result(txt):
    stringSearch = normalize(txt)

    #print(stringSearch)

    numberResult = 3

    #Viene vettorizzata la query dell'utente

    vectorSearch = np.array([questionToVec(stringSearch, word_2_vec_model)])
    
    #Carico il file con gli embeddings dei titoli sui quali effettuare la corrispondenza
    allFuncEmb = pd.read_csv('DB/functionEmbeddings.csv').values

    #Calcolo della similarità del coseno for le query e tutti i titoli

    similarityCosine = pd.Series(cosine_similarity(vectorSearch, allFuncEmb)[0])
    
    #Lista che conterrà coppie i,j con i indice e j score di similarità
    results = []
    
      
    #Una volta calcolati gli score di similarità rendo i 10 più alti
    for i,similiarity_score in similarityCosine.nlargest(numberResult).iteritems():
        #print(i, data.Body[i])
        #Filtro i tag da rendere come risultato, eliminando quelli che sono già presenti nella string search per facilitare prossimi passaggi 
        description = data_onto.iloc[[i]].description.item()
        function = data_onto.iloc[[i]].function.item()
                    
        results.append(tuple([function,description, similiarity_score]))
      
    return results

In [8]:
#Function to calculate cosine similarity between user raw txt and stack overflow processed title with tf-idf 
def tfidf_stack_result(txt):
    stringSearch = normalize(txt)
    related_docs_indices = tfidf.get_results(stack_corpus, stringSearch)
    
    results = []
    
    for idx in related_docs_indices:
        
        stack_id = data_stack.iloc[idx].name
        stack_title = data_stack.iloc[idx].title
        tags = data_stack.iloc[idx].tags.split('|')
        
        results.append(tuple([stack_id, 0, tags, stack_title ]))
    
    return results

In [17]:
#Function to calculate cosine similarity between user raw txt and stack overflow processed title with tf-idf 
def tfidf_onto_result(txt):
    stringSearch = normalize(txt)
    related_docs_indices = tfidf.get_results(onto_corpus, stringSearch)
    
    results = []
    
    for idx in related_docs_indices:
        
        description = data_onto.iloc[[idx]].description.item()
        function = data_onto.iloc[[idx]].function.item()
        
        results.append(tuple([function,description, 0]))
    
    return results

Example results 

In [10]:
w2v_stack_result('indexing problem on dataframe')

[(60473294,
  0.4343128309518088,
  ['pandas', 'dummy-variable', 'dummy-data'],
  'Create dummy coded columns for a column and concatenate it to the dataset'),
 (58060307, 0.4172855542502418, ['pandas'], 'Filter columns by same value'),
 (68163767,
  0.34741993813297584,
  ['python-3.x', 'pandas', 'dataframe'],
  'Group by a pandas dataframe by 15 min time intervals but for the whole day'),
 (58397836,
  0.2845990069755504,
  ['pandas', 'datetime'],
  'How to do date subraction of date condering only weekdays when there are null values?'),
 (51062770,
  0.2804673807754901,
  ['pandas'],
  'Python Pandas - Lambda apply keep initial format')]

In [11]:
preprocessing.clear_text('how to add image to code')

'add image code '

In [12]:
preprocessing.clear_text('problem to add image to code')

'problem add image code '

In [13]:
w2v_ontology_result('how to add image to code')

[('javax.swing.ImageIcon-getImageObserver()',
  'Returns the image observer for the image.   ',
  0.6364787123557293),
 ('java.awt.Component-createImage(java.awt.image.ImageProducer)',
  'Creates an image from the specified image producer.  ',
  0.6057158511295015),
 ('java.awt.Toolkit-createImage(java.awt.image.ImageProducer)',
  'Creates an image with the specified image producer.  ',
  0.6057158511295015)]

In [14]:
tfidf_stack_result('indexing problem on dataframe')

[(22976831, 0, ['pandas'], 'Boolean indexing of multi-index Dataframes'),
 (46520813,
  0,
  ['arrays', 'numpy'],
  'Curious numpy advanced indexing selection case'),
 (23887135, 0, ['pandas'], 'pandas indexing in multiindex dataframe'),
 (51867278, 0, ['numpy'], 'how to understand this python code ,thanks a lot')]

In [18]:
tfidf_onto_result('Python Pandas Data frame creation')

[('jdk.internal.org.objectweb.asm.tree.analysis.Frame-init(jdk.internal.org.objectweb.asm.tree.analysis.Frame)',
  'Copies the state of the given frame into this frame.   ',
  0),
 ('jdk.internal.org.objectweb.asm.tree.analysis.Frame-merge(jdk.internal.org.objectweb.asm.tree.analysis.Frame-jdk.internal.org.objectweb.asm.tree.analysis.Interpreter)',
  'Merges this frame with the given frame.   ',
  0),
 ('javax.swing.JOptionPane-getRootFrame()',
  'Returns the Frame to use for the class methods in  which a frame is not provided.   ',
  0),
 ('javax.swing.JOptionPane-getFrameForComponent(java.awt.Component)',
  "Returns the specified component's Frame.   ",
  0)]