In [16]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import word_tokenize
import numpy as np
import string
import time
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import random
from sklearn.utils import shuffle
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.metrics.pairwise import cosine_similarity
from joblib import dump, load

def stemming_tokenizer(text):
    """Tokenize `text` with NLTK and return the Porter stem of every token.

    The two-character sequences backslash+r and backslash+n (escaped line
    breaks stored literally in the text) are replaced by spaces before
    tokenizing.
    """
    cleaned = text
    for escaped_break in ("\\r", "\\n"):
        cleaned = cleaned.replace(escaped_break, " ")
    porter = PorterStemmer()
    return list(map(porter.stem, word_tokenize(cleaned)))

# Seed Python's built-in `random` module only.
random.seed(42)

# English stop words followed by every character of string.punctuation,
# then the same list pushed through stemming_tokenizer so it can be
# compared against stemmed tokens.
stop_words = [*stopwords.words('english'), *string.punctuation]
stemmed_stop_words = stemming_tokenizer(" ".join(stop_words))

In [8]:
import gensim
# Pre-trained LexVec word embeddings, read as a word2vec *text* file (binary=False).
# Project:  https://github.com/alexandres/lexvec
# Download: https://www.dropbox.com/s/flh1fjynqvdsj4p/lexvec.commoncrawl.300d.W.pos.vectors.gz?dl=1
model = gensim.models.KeyedVectors.load_word2vec_format(
    'lexvec.commoncrawl.300d.W.pos.neg3.vectors',
    binary=False,
)

In [18]:
def get_vector(text, debug=False):
    """Return the sum of the word vectors of all non-stop-word tokens in `text`.

    The text is tokenized and stemmed with `stemming_tokenizer`. Tokens found
    in `stemmed_stop_words` are skipped; the embeddings of the remaining
    tokens that exist in `model` are added up. Tokens missing from `model`
    contribute nothing, so a text without any known token yields the
    all-zero vector.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float NaN (what pandas produces for an empty
        CSV cell) are treated as empty text; any other non-string value is
        converted with ``str``.
    debug : bool, default False
        When true, print ``found_count total_count``: how many non-stop-word
        tokens had an embedding, and how many non-stop-word tokens there were.

    Returns
    -------
    numpy.ndarray
        1-D vector of length ``model.vector_size``.
    """
    # Missing values would otherwise crash inside stemming_tokenizer
    # (a float has no .replace), so normalise them to empty text first.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        text = ""
    elif not isinstance(text, str):
        text = str(text)

    # Take the dimensionality from the loaded embeddings instead of
    # hard-coding 300, so a different embedding file keeps working.
    vec = np.zeros(model.vector_size)
    # Set membership avoids a linear scan of the stop-word list per token.
    stop_tokens = set(stemmed_stop_words)
    found_count = 0
    total_count = 0
    for word in stemming_tokenizer(text):
        token = word.lower()
        if token in stop_tokens:
            continue
        total_count += 1
        if token in model:
            vec = vec + model[token]
            found_count += 1
    if debug:
        print(found_count, total_count)
    return vec

def save_model(model, filename='default.joblib'):
    """Write `model` to `filename` using joblib's `dump`."""
    dump(model, filename)
    
def load_model(filename='default.joblib'):
    """Return the object stored in `filename`, read with joblib's `load`.

    NOTE: joblib files are pickle-based, so only load files from a trusted
    source.
    """
    return load(filename)

In [10]:
# load data
# Forward slash instead of '..\miniature.csv': the backslash form relied on
# the invalid escape sequence \m (deprecated by Python) and only resolved on
# Windows; '../' works on every platform, Windows included.
data = pd.read_csv('../miniature.csv')

In [12]:
# Create vectors
# One summed word vector per row for each text column. debug=True makes
# get_vector print its "found total" token counts for every text.
X_issue_title_wordvec = [
    get_vector(title, debug=True) for title in data['IssueTitle']
]
X_issue_description_wordvec = [
    get_vector(description, debug=True) for description in data['IssueDescription']
]
X_issue_label_wordvec = [
    get_vector(label, debug=True) for label in data['Label']
]
Y_pr_title_wordvec = [
    get_vector(title, debug=True) for title in data['PrTitle']
]
Y_pr_description_wordvec = [
    get_vector(description, debug=True) for description in data['PrDescription']
]

5 5
5 5
3 3
6 6
8 8
40 69
88 131
92 129
42 53
15 16
3 5
5 7
4 4
0 2
1 4
6 7
5 6
7 7
3 3
7 7
44 54
8 13
9 10
116 144
95 115


In [13]:
# Train-test split for all the data
# All five vector lists go through a single train_test_split call; each
# input comes back as a (train, test) pair, in the order it was passed.
(
    X_issue_title_wordvec_train,
    X_issue_title_wordvec_test,
    X_issue_description_wordvec_train,
    X_issue_description_wordvec_test,
    X_issue_label_wordvec_train,
    X_issue_label_wordvec_test,
    Y_pr_title_wordvec_train,
    Y_pr_title_wordvec_test,
    Y_pr_description_wordvec_train,
    Y_pr_description_wordvec_test,
) = train_test_split(
    X_issue_title_wordvec,
    X_issue_description_wordvec,
    X_issue_label_wordvec,
    Y_pr_title_wordvec,
    Y_pr_description_wordvec,
    test_size=0.25,
    random_state=33,
)

In [20]:
# Model
# Regress PR-description vectors from issue-description vectors.
# - hidden_layer_sizes takes one entry per hidden layer; `(100)` is just the
#   integer 100, so the single 100-unit layer is spelled as the tuple `(100,)`.
# - random_state pins weight initialisation and shuffling so the fit is
#   reproducible; the notebook's random.seed(42) does not seed scikit-learn.
classifier = MLPRegressor(hidden_layer_sizes=(100,), random_state=42)
classifier.fit(X_issue_description_wordvec_train, Y_pr_description_wordvec_train)
# Persist the fitted estimator so the prediction cell can reload it.
modelFileName = "testmodel.joblib"
save_model(classifier, modelFileName)

In [25]:
# Prediction
# When fromFile is True, `classifier` is replaced by the estimator stored in
# modelFileName; otherwise the in-memory `classifier` is used as is.
fromFile = True
if fromFile:
    classifier = load_model(modelFileName)

def predict_and_get_cosine_sim(x, y, clf):
    """Predict with `clf` on `x` and compare every prediction with every row of `y`.

    Returns the matrix produced by `cosine_similarity(predictions, y)`:
    entry [i, j] is the cosine similarity between the prediction for x[i]
    and y[j].
    """
    return cosine_similarity(clf.predict(x), y)

# Similarity of each predicted PR-description vector to each actual one (training rows).
predict_and_get_cosine_sim(
    x=X_issue_description_wordvec_train,
    y=Y_pr_description_wordvec_train,
    clf=classifier,
)

array([[0.99996713, 0.76536384, 0.74104923],
       [0.76518317, 0.99999533, 0.94625214],
       [0.74091432, 0.94633572, 0.99999021]])

In [26]:
def get_top_k_predictions(x, y, clf, k):
    """For each row of `x`, return the indices of the `k` rows of `y` most similar to its prediction.

    Similarities come from `predict_and_get_cosine_sim`; each output row lists
    column indices ordered from highest to lowest similarity.
    """
    similarities = predict_and_get_cosine_sim(x, y, clf)
    # argsort is ascending, so rank the negated scores to put the best match first.
    ranked = np.argsort(-similarities)
    return ranked[:, :k]

# Two best-matching PR descriptions for every training issue description.
get_top_k_predictions(
    x=X_issue_description_wordvec_train,
    y=Y_pr_description_wordvec_train,
    clf=classifier,
    k=2,
)

array([[0, 1],
       [1, 2],
       [2, 1]], dtype=int64)

In [53]:
# Inspect the raw text of the first PR description; the displayed value shows
# that line breaks are stored as literal backslash-r / backslash-n sequences.
data['PrDescription'][0]

'This PR fixes #102036\\r\\n\\r\\n- So the first issue is that most elements have a margin-top and margin-bottom.\\r\\nIt made sense to remove the margin-top and let top elements push others down. (otherwise you fall into margin-collapse hell)\\r\\n- This means the body needs a padding on the top so that any first element has a gap (and not just the H1)\\r\\n- Headings have a smaller bottom margin so as to look more like a section\\r\\n- If 2 paragraphs are next to each other, the second paragraph reduces its top margin to bring them closer together.\\r\\n\\r\\n## Before\\r\\n![image](https://user-images.githubusercontent.com/936006/87351758-ce286c80-c551-11ea-80e5-13af706cd8ac.png)\\r\\n\\r\\n\\r\\n## After\\r\\n![image](https://user-images.githubusercontent.com/936006/87352076-5d358480-c552-11ea-9fa0-c5fcf3861712.png)\\r\\n\\r\\npings @mjbvz \\r\\n\\r\\n\\r\\n'