## Imports

In [1]:
import pickle
import os
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk import jaccard_distance
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dib_n\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Load Models

In [2]:
def getModels():
    list_of_models = {}
    for f in os.listdir('./pickle/'):
        if(os.path.getsize('./pickle/'+f)>0):
            name = f.replace('.nav','')
            #print(f)
            list_of_models[name] = pickle.load(open('./pickle/'+f,'rb'))
        #else:
            #print(f)
    return list_of_models

### Load Dataframe

In [3]:
df = pd.read_csv('../Data/cleanDF.csv',encoding='latin',index_col=0)

In [4]:
df.loc[df['java']==1].head(15)

Unnamed: 0,Title,javascript,java,c#,php,android,jquery,python,html,.net,...,ios,mysql,css,sql,objective-c,ruby-on-rails,c,iphone,angularjs,arrays
20,What code analysis tools do you use for your J...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28,Packaging Java apps for the Windows/Linux desktop,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48,What's the best way to get started with OSGI ?,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67,Best Method to run a Java Application as a *ni...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
77,Why aren't Enumerations Iterable ?,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
88,Java Logging vs Log4J,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
91,Passing null to a method,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
93,Alternatives to System .exit(1),0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95,How do you get the ethernet address using Java ?,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
117,Regex to match against something that is not a...,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## String distance

In [5]:
def cleanSentence(s):
    #Basic cleanUp
    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(s) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    filtered_sentence = [] 
    for w in word_tokens: 
        if w not in stop_words: 
            filtered_sentence.append(w)
    #specif cleanup
    categories = ['javascript', 'java', 'c#', 'php', 'android', 'jquery',
       'python', 'html', '.net', 'c++', 'ios', 'mysql', 'css', 'sql',
       'objective-c', 'ruby-on-rails', 'c', 'iphone', 'angularjs', 'arrays']
    for c in categories:
        if(c in filtered_sentence):
            filtered_sentence.remove(c)
    sentence = ' '.join(filtered_sentence)
    sentence = sentence.replace('.','')
    sentence = sentence.replace(',','')
    sentence = sentence.replace('?','')
    sentence = sentence.replace('"','')
    sentence = sentence.replace("'",'')
    while('  ' in sentence):
        sentence = sentence.replace('  ',' ')
    return sentence

In [6]:
def levenshtein(s1, s2):
    if len(s1) < len(s2):
        s1, s2 = s2, s1

    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]/float(len(s1))

In [7]:
#return 5 closest strings
def closestString(local_df,texto):
    texto = cleanSentence(texto)
    w2 = []
    for w in texto.split(' '):
        if(w not in w2):
            w2.append(w)
    distances = {}
    for te in local_df.Title:
        t = cleanSentence(te)
        w1 = []
        total_d=0
        for w in t.split(' '):
            if(w not in w1):
                w1.append(w)
        for i in w1:
            d=0
            for j in w2:
                temp = levenshtein(i,j)
                if((temp<d) | (d==0)):
                    d=temp
            total_d+=d
        total_d=total_d/len(w1)
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform((t,texto))
        result_cos = 1 - cosine_similarity(tfidf_matrix[0:1], tfidf_matrix)[0][1]
        total_d+=result_cos
        distances[te] = total_d
    distances = sorted(distances.items(), key=lambda x: x[1])
    new_d = {}
    for i in distances[:5]:
        new_d[i[0]]=i[1]
    return new_d

In [8]:
closestString(df.loc[df['c#']==1],'break a loop in c#')

{'Loop Reversal in C# Speeds Up app': 1.4247332788790377,
 'Counter in foreach loop in C#': 1.4251586146580233,
 'How to escape a while loop in C#': 1.4561109956104041,
 'Loop through all the resources in a  .resx file': 1.4644340716603925,
 'Captured variable in a loop in C#': 1.4644443289437374}

## GetRealProposals

In [13]:
def AllClosestStrings(question):
    list_of_models = getModels()
    predicts = {}
    possibleEqualQuestion = {}
    df = pd.read_csv('../Data/cleanDF.csv',encoding='latin',index_col=0)
    for m in list_of_models.items():
        predicts[m[0]] = m[1].predict([question])[0]
    for i in predicts.items():
        if(i[1]==1):
            try:
                tempDict = closestString(df.loc[df[i[0]]==1],question)
            except:
                print(i[0],"couldn't be used, sorry")
    possibleEqualQuestion = sorted(tempDict.items(), key=lambda x: x[1])[:5]
    pQ = {}
    for item in possibleEqualQuestion:
        pQ[possibleEqualQuestion[0]]=possibleEqualQuestion[1]
    return pQ

In [14]:
AllClosestStrings('python input')

{('Python nonblocking console input',
  1.3115800799674338): ('Evaluate math equations from unsafe user input in Python', 1.5028034864202064)}

In [15]:
def Suggest(question):
    cS = AllClosestStrings(question)
    menu = {}
    print('Hey, do you wanna use one of this existing questions?')
    for i,s in enumerate(cS.keys()):
        menu[i] = s
        print(i,':',s[0])
    a = input('If yes, type the question number, if no, type any other thing\n')
    if((a==None) | (a not in menu)):
        return None
    else:
        return menu[a]

In [17]:
q = input('Type your question: ')
retorno = Suggest(q)

Type your question: Can't connecct to database in c#
Hey, do you wanna use one of this existing questions?
0 : Copy Data from a table in one Database to another separate database
If yes, type the question number, if no, type any other thing
Can't connecct to database in c#
None
