In [16]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import word_tokenize
import numpy as np
import string
import time
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import random
from sklearn.utils import shuffle
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.metrics.pairwise import cosine_similarity
from joblib import dump, load

def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    temp = text.replace("\\r", " ")
    temp = temp.replace("\\n", " ")
    return [stemmer.stem(w) for w in word_tokenize(temp)]

random.seed(42)

stop_words = stopwords.words('english')+list(string.punctuation)
stemmed_stop_words = stemming_tokenizer(" ".join(stop_words))

In [8]:
import gensim
# https://github.com/alexandres/lexvec - https://www.dropbox.com/s/flh1fjynqvdsj4p/lexvec.commoncrawl.300d.W.pos.vectors.gz?dl=1
model = gensim.models.KeyedVectors.load_word2vec_format('lexvec.commoncrawl.300d.W.pos.neg3.vectors', binary=False)

In [18]:
def get_vector(text, debug=False):
    vec = np.zeros(300)
    found_count = 0
    total_count = 0
    for word in stemming_tokenizer(text):
        if word.lower() in stemmed_stop_words:
            continue
        total_count += 1
        if word.lower() in model:
            vec = vec + model[word.lower()]
            found_count += 1
    if debug:
        print(found_count, total_count)
    return vec

def save_model(model, filename='default.joblib'):
    dump(model, filename)
    
def load_model(filename='default.joblib'):
    return load(filename)

In [29]:
# load data
data = pd.read_csv('..\miniature.csv')

In [32]:
# Create vectors
X_issue_title_wordvec = [get_vector(text, True) for text in data['IssueTitle']]
X_issue_description_wordvec = [get_vector(text, True) for text in data['IssueDescription']]
X_issue_label_wordvec = [get_vector(text, True) for text in data['Label']]
Y_pr_title_wordvec = [get_vector(text, True) for text in data['PrTitle']]
Y_pr_description_wordvec = [get_vector(text, True) for text in data['PrDescription']]

5 5
5 5
3 3
6 6
8 8
4 5
3 4
7 7
5 9
1 2
4 5
5 7
8 8
0 2
10 10
40 70
88 131
92 129
42 53
15 16
63 78
7 12
40 47
102 137
546 763
188 295
33 36
12 14
56 71
34 44
3 5
5 7
4 4
0 2
1 4
0 2
0 1
0 1
0 1
0 2
0 2
0 3
2 5
0 1
4 6
6 7
5 6
7 7
3 3
7 7
4 6
6 6
6 6
6 6
2 3
1 2
3 5
10 10
1 2
12 12
44 54
8 13
9 10
116 144
95 115
17 21
19 20
9 10
13 18
14 22
2 3
147 178
125 139
5 7
25 31


In [45]:
# Train-test split for all the data
X_issue_title_wordvec_train, X_issue_title_wordvec_test, X_issue_description_wordvec_train, X_issue_description_wordvec_test, X_issue_label_wordvec_train, X_issue_label_wordvec_test, Y_pr_title_wordvec_train, Y_pr_title_wordvec_test, Y_pr_description_wordvec_train, Y_pr_description_wordvec_test = train_test_split(X_issue_title_wordvec, X_issue_description_wordvec, X_issue_label_wordvec, Y_pr_title_wordvec, Y_pr_description_wordvec, test_size=0.25, random_state=33, shuffle=False)

In [46]:
# Model
classifier = MLPRegressor(hidden_layer_sizes=(100), max_iter=500)
classifier.fit(X_issue_description_wordvec_train, Y_pr_description_wordvec_train)
modelFileName = "testmodel.joblib"
save_model(classifier, modelFileName)

In [47]:
# Prediction
fromFile = True
if fromFile:
    classifier = load_model(modelFileName)

def predict_and_get_cosine_sim(x, y, clf):
    y_pred = clf.predict(x)
    return cosine_similarity(y_pred, y)

predict_and_get_cosine_sim(X_issue_description_wordvec_train, Y_pr_description_wordvec_train, classifier)

array([[0.99997806, 0.38256743, 0.61445984, 0.76526712, 0.74092145,
        0.62459737, 0.54547813, 0.58231153, 0.61073225, 0.58135237,
        0.36798945],
       [0.42385593, 0.96421456, 0.55547583, 0.44032939, 0.51191259,
        0.35696198, 0.4630528 , 0.45743175, 0.5761138 , 0.37627579,
        0.33240415],
       [0.61568919, 0.52033091, 0.98597826, 0.63781865, 0.66851146,
        0.56573205, 0.55031831, 0.60056007, 0.64549759, 0.55262375,
        0.39008007],
       [0.76546671, 0.40633637, 0.61404987, 0.99999071, 0.9463143 ,
        0.69843998, 0.60037007, 0.60548278, 0.69139797, 0.68410482,
        0.37672298],
       [0.74088647, 0.46978611, 0.63986108, 0.94634743, 0.99999951,
        0.67722365, 0.60797223, 0.62716448, 0.70273058, 0.68903977,
        0.37545292],
       [0.62381208, 0.33081087, 0.55952719, 0.70111347, 0.68032929,
        0.99870317, 0.53643193, 0.53219947, 0.56548714, 0.57746467,
        0.41126143],
       [0.54548568, 0.43505981, 0.5534485 , 0.59985528, 0.

In [49]:
def get_top_k_predictions(x, y, clf, k):
    cosineMat = predict_and_get_cosine_sim(x, y, clf)
    # Sort in descending order
    return np.argsort(-1*cosineMat)[:, :k]

# Test with PRs (train+test)
get_top_k_predictions(X_issue_description_wordvec, Y_pr_description_wordvec, classifier, 4)

array([[ 0,  3, 12,  4],
       [ 1,  8,  2, 11],
       [ 2, 11,  4, 12],
       [ 3,  4, 12, 11],
       [ 4,  3, 12, 11],
       [ 5, 11, 12,  3],
       [ 6, 11,  4,  3],
       [ 7, 11,  4, 12],
       [ 8, 11,  4,  3],
       [ 9, 12,  4,  3],
       [11,  5, 12,  2],
       [ 3,  4, 12, 11],
       [ 4,  3, 12, 11],
       [ 0,  2,  3, 12],
       [ 0, 12,  3,  4]], dtype=int64)

In [31]:
data['IssueDescription'][0]

'The spacing on markdown preview has headings (h2,h3) in the middle between text, but they should be spaced less with the content underneath so it looks like a section. Right now the spacing looks even with the content below and above.\\r\\n\\r\\nHere is an example (using grey matter theme):\\r\\n<img width="676" alt="markdown-preview" src="https://user-images.githubusercontent.com/936006/87095255-50561f80-c238-11ea-8bcf-c4c2f023c3c1.png">\\r\\n\\r\\nI\'ve adjusted on photoshop what it should look like:\\r\\n<img width="676" alt="markdown-preview (1)" src="https://user-images.githubusercontent.com/936006/87095426-a034e680-c238-11ea-9897-b661e828630d.png">\\r\\n\\r\\nIt\'s only slight but the bottom one brings the heading down slightly so its more together.\\r\\nI looked into doing this but i could only find https://github.com/microsoft/vscode/blob/06f85af581281a3f45783329d375ecc7694930b4/extensions/markdown-language-features/media/markdown.css\\r\\n\\r\\n@mjbvz'