<h1>Get results functions</h1>

In this file we define the results function for both tecnologies that calculates the most similar records to the user input in our data and return it to the chatbot.

In [1]:
import pandas as pd
import gensim
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import spacy
%run create_embeddings.ipynb
import preprocessing
import tfidf
import scipy
#pd.set_option('max_colwidth', 5000)

<h2>Import data</h2>

In [2]:
#Data on Stack Overflow DB already preprocessed
data_stack = pd.read_csv('DB/Preprocessed_data.csv')
data_stack.set_index('id', inplace=True)
data_stack = data_stack.dropna()

#Data on Ontology DB already preprocessed
data_onto = pd.read_csv('DB/Preprocessed_ontology.csv')

#Matrix for stack tfidf processing 
stack_corpus = data_stack['processed_title'] + data_stack['questions_processed'] + data_stack['answers_processed'] 
stack_corpus = stack_corpus.dropna()

#Matrix for ontology tfidf processing
onto_corpus = data_onto.description_processed

In [3]:
#Import word to vec model 
word_2_vec_model = gensim.models.word2vec.Word2Vec.load('DB/word2vec_trained.bin')
word_2_vec_model

<gensim.models.word2vec.Word2Vec at 0x109417c70>

In [4]:
#Fucntion to clear raw txt 
def normalize(txt):
    phrase = preprocessing.clear_text(txt)
    return phrase

In [5]:
#Import title embeddings 
embeddings = pd.read_csv('DB/titleEmbeddings.csv')

idList = embeddings['id']
embeddings = embeddings.drop(columns = ['id'])


<h2>Word to vec results</h2>

In [44]:
#Function to calculate cosine similarity between user raw txt and stack overflow processed title with word to vec 

def w2v_stack_result(txt):
    stringSearch = normalize(txt)

    numberResult = 5

    #Vectorize user query
    vectorSearch = np.array([questionToVec(stringSearch, word_2_vec_model)])
    
    #Load embeddings 
    allTitleEmb = embeddings.values

    #Cosine similarity between titles and user query vectorized
    similarityCosine = pd.Series(cosine_similarity(vectorSearch, allTitleEmb)[0])
    
    #Personalizzazione della misura di similarità
    similarityCosine = similarityCosine*(1+0.4*data_stack.score)
    
    #Lista che conterrà coppie i,j con i indice e j score di similarità
    results = []
    
    #Una volta calcolati gli score di similarità rendo i 10 più alti
    for i,similiarity_score in similarityCosine.nlargest(numberResult).iteritems():
        #Filtro i tag da rendere come risultato, eliminando quelli che sono già presenti nella string search per facilitare prossimi passaggi 
        idResults = idList[i]
        tags = data_stack.loc[idResults].tags.split('|')
        stack_id = idResults
        stack_title = data_stack.loc[idResults].title
        
        results.append(tuple([stack_id, similiarity_score, tags, stack_title ]))
        
    return results

In [7]:
##Function to calculate cosine similarity between raw txt and function description of ontology with word to vec 
def w2v_ontology_result(txt):
    stringSearch = normalize(txt)

    #print(stringSearch)

    numberResult = 3

    #Viene vettorizzata la query dell'utente

    vectorSearch = np.array([questionToVec(stringSearch, word_2_vec_model)])
    
    #Carico il file con gli embeddings dei titoli sui quali effettuare la corrispondenza
    allFuncEmb = pd.read_csv('DB/functionEmbeddings.csv').values

    #Calcolo della similarità del coseno for le query e tutti i titoli

    similarityCosine = pd.Series(cosine_similarity(vectorSearch, allFuncEmb)[0])
    
    #Lista che conterrà coppie i,j con i indice e j score di similarità
    results = []
    
      
    #Una volta calcolati gli score di similarità rendo i 10 più alti
    for i,similiarity_score in similarityCosine.nlargest(numberResult).iteritems():
        #print(i, data.Body[i])
        #Filtro i tag da rendere come risultato, eliminando quelli che sono già presenti nella string search per facilitare prossimi passaggi 
        description = data_onto.iloc[[i]].description.item()
        function = data_onto.iloc[[i]].function.item()
                    
        results.append(tuple([function,description, similiarity_score]))
      
    return results

In [34]:
#Function to calculate cosine similarity between user raw txt and stack overflow processed title with tf-idf 
def tfidf_stack_result(txt):
    stringSearch = normalize(txt)
    related_docs_indices = tfidf.get_results(stack_corpus, stringSearch)
    
    results = []
    
    for idx in related_docs_indices:
        
        stack_id = data_stack.iloc[idx].name
        stack_title = data_stack.iloc[idx].title
        tags = data_stack.iloc[idx].tags.split('|')
        
        results.append(tuple([stack_id, 0, tags, stack_title ]))
    
    return results

Example results 

In [47]:
w2v_stack_result('indexing problem on dataframe')

[(58060307, 0.4904165268263586, ['pandas'], 'Filter columns by same value'),
 (60473294,
  0.4649300092825572,
  ['pandas', 'dummy-variable', 'dummy-data'],
  'Create dummy coded columns for a column and concatenate it to the dataset'),
 (68163767,
  0.3776741724396832,
  ['python-3.x', 'pandas', 'dataframe'],
  'Group by a pandas dataframe by 15 min time intervals but for the whole day'),
 (58397836,
  0.32831126267868566,
  ['pandas', 'datetime'],
  'How to do date subraction of date condering only weekdays when there are null values?'),
 (40011539,
  0.3149059198015368,
  ['java', 'c++', 'string', 'algorithm'],
  'How to find the non-repeating string from the list of string'),
 (36892138,
  0.29621322121576854,
  ['python-2.7', 'dictionary'],
  'How to access list value inside dictionary'),
 (51062770,
  0.280001261279483,
  ['pandas'],
  'Python Pandas - Lambda apply keep initial format'),
 (49517830,
  0.278060874881881,
  ['regex', 'pandas'],
  'Incorrect regex identification usi

In [12]:
preprocessing.clear_text('how to add image to code')

'add image code '

In [13]:
preprocessing.clear_text('problem to add image to code')

'problem add image code '

In [14]:
w2v_ontology_result('how to add image to code')

[('java.awt.Robot-createScreenCapture(java.awt.Rectangle)',
  'Creates an image containing pixels read from the screen.  This image does  not include the mouse cursor.  ',
  0.6254305493934694),
 ('java.awt.image.PixelGrabber-grabPixels()',
  'Request the Image or Image eProducer to start delivering pixels and  wait for all of the pixels in the rectangle of interest to be  delivered.  ',
  0.531585316854855),
 ('javax.swing.AbstractButton-getIconTextGap()',
  'Returns the amount of space between the text and the icon  displayed in this button.   ',
  0.4561271081460775)]

In [48]:
tfidf_stack_result('indexing problem on dataframe')

[(22976831, 0, ['pandas'], 'Boolean indexing of multi-index Dataframes'),
 (46520813,
  0,
  ['arrays', 'numpy'],
  'Curious numpy advanced indexing selection case'),
 (23887135, 0, ['pandas'], 'pandas indexing in multiindex dataframe'),
 (51867278, 0, ['numpy'], 'how to understand this python code ,thanks a lot'),
 (60154213,
  0,
  ['arrays', 'numpy', 'indexing'],
  'Is there a way to do array-based indexing in NumPy so that indices corresponding to all possible combinations of indexing array elements are selected?'),
 (29819233, 0, ['pandas'], 'Indexing on DataFrame with MultiIndex'),
 (36394194, 0, ['pandas'], 'Pandas indexing confusion'),
 (66950849,
  0,
  ['pandas', 'numpy'],
  'array[row][col] vs array[row,col] in Python'),
 (43547506, 0, ['numpy', 'indexing'], 'Numpy Chain Indexing')]