In [91]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import word_tokenize
import numpy as np
import string
import time
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import random
from sklearn.utils import shuffle
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.metrics.pairwise import cosine_similarity
from joblib import dump, load
from sklearn.preprocessing import normalize
import gensim

def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    temp = text.replace("\\r", " ")
    temp = temp.replace("\\n", " ")
    return [stemmer.stem(w) for w in word_tokenize(temp)]

def save_model(model, filename='default.joblib'):
    dump(model, filename)
    
def load_model(filename='default.joblib'):
    return load(filename)

random.seed(42)

stop_words = stopwords.words('english')+list(string.punctuation)
stemmed_stop_words = stemming_tokenizer(" ".join(stop_words))

In [99]:
load_binary = True
if load_binary:
    model = load_model('word2vec.joblib')
else:
    # https://github.com/alexandres/lexvec - https://www.dropbox.com/s/flh1fjynqvdsj4p/lexvec.commoncrawl.300d.W.pos.vectors.gz?dl=1
    model = gensim.models.KeyedVectors.load_word2vec_format('lexvec.commoncrawl.300d.W.pos.neg3.vectors', binary=False)
    save_model(model, 'word2vec.joblib')

In [100]:
def get_vector(text, debug=False):
    vec = np.zeros(300)
    found_count = 0
    total_count = 0
    for word in stemming_tokenizer(text):
        if word.lower() in stemmed_stop_words:
            continue
        total_count += 1
        if word.lower() in model:
            vec = vec + model[word.lower()]
            found_count += 1
    if debug:
        print(found_count, total_count)
    return vec

In [101]:
# load data
data = pd.read_csv('..\miniature.csv', na_filter=False)

In [102]:
# Create vectors
X_issue_title_wordvec = [get_vector(text, True) for text in data['IssueTitle']]
X_issue_description_wordvec = [get_vector(text, True) for text in data['IssueDescription']]
X_issue_label_wordvec = [get_vector(text, True) for text in data['Label']]
Y_pr_title_wordvec = [get_vector(text, True) for text in data['PrTitle']]
Y_pr_description_wordvec = [get_vector(text, True) for text in data['PrDescription']]

5 5
5 5
3 3
6 6
8 8
4 5
3 4
7 7
5 9
1 2
4 5
5 7
8 8
0 2
10 10
3 4
5 6
3 4
1 3
4 5
5 5
5 5
40 70
88 131
92 129
42 53
15 16
63 78
7 12
40 47
102 137
546 763
188 295
33 36
12 14
56 71
34 44
72 102
49 73
227 378
112 171
26 29
22 41
41 144
3 5
5 7
4 4
0 2
1 4
0 2
0 1
0 1
0 1
0 2
0 2
0 3
2 5
0 1
4 6
0 0
0 1
0 1
0 1
0 2
0 4
1 3
6 7
5 6
7 7
3 3
7 7
4 6
6 6
6 6
6 6
2 3
1 2
3 5
10 10
1 2
12 12
2 3
3 6
5 7
3 3
2 4
4 4
3 4
44 54
8 13
9 10
116 144
95 115
17 21
19 20
9 10
13 18
14 22
2 3
147 178
125 139
5 7
25 31
2 5
7 12
20 22
4 6
22 25
2 3
29 44


In [103]:
# Train-test split for all the data
X_issue_title_wordvec_train, X_issue_title_wordvec_test, X_issue_description_wordvec_train, X_issue_description_wordvec_test, X_issue_label_wordvec_train, X_issue_label_wordvec_test, Y_pr_title_wordvec_train, Y_pr_title_wordvec_test, Y_pr_description_wordvec_train, Y_pr_description_wordvec_test = train_test_split(X_issue_title_wordvec, X_issue_description_wordvec, X_issue_label_wordvec, Y_pr_title_wordvec, Y_pr_description_wordvec, test_size=0.25, random_state=33, shuffle=False)

In [104]:
# Model
classifier = MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=500)
# to do: normalize before adding
def input_feature_extraction(x_title, x_desc):
    x1 = np.array(x_title)
    x2 = np.array(x_desc)
    a = 10
    b = 1
    return a*normalize(x1 - np.mean(x1, axis=0)) + b*normalize(x2 - np.mean(x2, axis=0))

def output_feature_extraction(y_title, y_desc):
    y1 = y_title
    y2 = y_desc
    a = 10
    b = 1
    return a*normalize(y1 - np.mean(y1, axis=0)) + b*normalize(y2 - np.mean(y2, axis=0))

train_input_features = input_feature_extraction(X_issue_title_wordvec_train, X_issue_description_wordvec_train)
train_output_features = output_feature_extraction(Y_pr_title_wordvec_train, Y_pr_description_wordvec_train)

classifier.fit(train_input_features, train_output_features)
modelFileName = "testmodel.joblib"
save_model(classifier, modelFileName)

In [105]:
# Prediction
fromFile = True
if fromFile:
    classifier = load_model(modelFileName)

def predict_and_get_cosine_sim(x, y, clf):
    y_pred = clf.predict(x)
    return cosine_similarity(y_pred, y)

predict_and_get_cosine_sim(train_input_features, train_output_features, classifier)

array([[ 0.9996252 , -0.14653443, -0.13501176,  0.3985193 , -0.01730709,
         0.00842595, -0.1138409 , -0.24380555, -0.05726075, -0.01602731,
         0.2360312 , -0.21346409, -0.18998543,  0.23461879, -0.05166702,
         0.18754926],
       [-0.14598183,  0.99965121,  0.11152513, -0.24101133, -0.23515215,
        -0.04267059,  0.0092502 ,  0.01235577,  0.06926906, -0.17171409,
        -0.19201252, -0.0770869 , -0.0829371 , -0.15019516, -0.15395998,
        -0.18957513],
       [-0.13684962,  0.1130802 ,  0.99976013, -0.39928509, -0.13443254,
        -0.09200742, -0.09220348,  0.04732881,  0.15580321, -0.13878742,
        -0.3691687 , -0.14551688, -0.04804459, -0.35313238,  0.03671019,
        -0.35447825],
       [ 0.39994478, -0.24736316, -0.40290132,  0.9982376 ,  0.00324931,
        -0.07265365, -0.15958564, -0.22315705, -0.17928653,  0.26029775,
         0.47473269,  0.10846793, -0.18544807,  0.48997506, -0.18295003,
         0.42679576],
       [-0.01543395, -0.23471856, -0

In [106]:
def get_top_k_predictions(x, y, clf, k):
    cosineMat = predict_and_get_cosine_sim(x, y, clf)
    # Sort in descending order
    return np.argsort(-1*cosineMat)[:, :k]

# Test with PRs (train+test)
get_top_k_predictions(input_feature_extraction(X_issue_title_wordvec, X_issue_description_wordvec), output_feature_extraction(Y_pr_title_wordvec, Y_pr_description_wordvec), classifier, 4)

array([[ 0, 18,  3, 17],
       [ 1, 16,  2,  8],
       [ 2, 16,  8,  1],
       [ 3, 13, 10, 15],
       [ 4, 14, 15, 12],
       [ 5, 20, 12,  0],
       [ 6, 20, 19,  1],
       [ 7, 16, 11,  2],
       [ 8,  2, 17, 16],
       [ 9, 10, 13, 15],
       [10, 13, 15, 18],
       [11, 16,  7, 13],
       [12,  4,  5, 14],
       [13, 10, 15, 18],
       [14,  4,  2, 20],
       [15, 13, 10, 18],
       [15, 11, 10,  3],
       [10, 13,  9, 15],
       [13, 10, 15, 18],
       [13, 10, 15, 18],
       [ 9, 12,  3,  5],
       [16,  8,  9, 11]], dtype=int64)

In [109]:
# metrics for performance
from sklearn.metrics import accuracy_score
y_pred = get_top_k_predictions(input_feature_extraction(X_issue_title_wordvec, X_issue_description_wordvec), output_feature_extraction(Y_pr_title_wordvec, Y_pr_description_wordvec), classifier, 1)
print(accuracy_score(range(len(y_pred)), y_pred))

def top_k_accuracy(y_true, y_pred):
    total = 0
    correct = 0
    for i in range(len(y_pred)):
        total += 1
        if y_true[i] in y_pred[i]:
            correct += 1
    return correct/total

y_pred = get_top_k_predictions(input_feature_extraction(X_issue_title_wordvec, X_issue_description_wordvec), output_feature_extraction(Y_pr_title_wordvec, Y_pr_description_wordvec), classifier, 4)
print(top_k_accuracy(range(len(y_pred)), y_pred))

0.7272727272727273
0.7727272727272727
