In [91]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import word_tokenize
import numpy as np
import string
import time
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import random
from sklearn.utils import shuffle
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.metrics.pairwise import cosine_similarity
from joblib import dump, load
from sklearn.preprocessing import normalize
import gensim

def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    temp = text.replace("\\r", " ")
    temp = temp.replace("\\n", " ")
    return [stemmer.stem(w) for w in word_tokenize(temp)]

def save_model(model, filename='default.joblib'):
    dump(model, filename)
    
def load_model(filename='default.joblib'):
    return load(filename)

random.seed(42)

stop_words = stopwords.words('english')+list(string.punctuation)
stemmed_stop_words = stemming_tokenizer(" ".join(stop_words))

In [99]:
load_binary = True
if load_binary:
    model = load_model('word2vec.joblib')
else:
    # https://github.com/alexandres/lexvec - https://www.dropbox.com/s/flh1fjynqvdsj4p/lexvec.commoncrawl.300d.W.pos.vectors.gz?dl=1
    model = gensim.models.KeyedVectors.load_word2vec_format('lexvec.commoncrawl.300d.W.pos.neg3.vectors', binary=False)
    save_model(model, 'word2vec.joblib')

In [100]:
def get_vector(text, debug=False):
    vec = np.zeros(300)
    found_count = 0
    total_count = 0
    for word in stemming_tokenizer(text):
        if word.lower() in stemmed_stop_words:
            continue
        total_count += 1
        if word.lower() in model:
            vec = vec + model[word.lower()]
            found_count += 1
    if debug:
        print(found_count, total_count)
    return vec

In [147]:
# load data
data = pd.read_csv('..\mergedDataset.csv', na_filter=False)

In [148]:
# Create vectors
X_issue_title_wordvec = [get_vector(text, True) for text in data['IssueTitle']]
X_issue_description_wordvec = [get_vector(text, True) for text in data['IssueDescription']]
X_issue_label_wordvec = [get_vector(text, True) for text in data['Label']]
Y_pr_title_wordvec = [get_vector(text, True) for text in data['PrTitle']]
Y_pr_description_wordvec = [get_vector(text, True) for text in data['PrDescription']]

2 3
8 9
4 6
3 4
3 3
3 3
4 4
7 7
4 4
0 1
5 6
3 3
2 3
2 2
2 6
1 3
3 5
2 4
3 3
1 2
8 8
5 5
4 5
8 8
4 6
9 9
4 8
5 5
3 4
3 3
6 8
6 6
3 4
3 3
6 8
4 4
4 6
3 4
2 2
2 2
4 4
4 4
3 4
5 6
5 5
3 3
4 5
4 4
4 4
5 5
4 4
3 4
4 4
2 3
7 7
1 3
3 3
5 9
4 5
6 6
5 5
2 5
3 3
6 8
4 4
4 6
3 4
2 2
2 2
4 4
4 4
3 4
5 6
5 5
3 3
4 5
4 4
4 4
5 5
4 4
3 4
4 4
2 3
7 7
1 3
3 3
5 9
4 5
6 6
5 5
2 5
4 4
3 3
5 6
4 5
2 3
3 3
6 7
4 6
3 3
7 7
5 6
6 7
4 4
4 5
2 3
4 4
5 5
5 5
4 4
4 4
3 4
4 5
4 5
6 9
5 7
5 5
7 7
5 6
2 2
4 4
3 4
3 3
4 5
3 4
7 7
5 7
5 5
3 3
3 4
8 8
5 6
8 8
6 6
3 4
8 8
5 6
2 4
4 8
6 6
6 6
7 8
4 5
4 4
5 5
5 5
6 7
3 4
5 6
1 3
4 5
0 1
2 2
4 5
2 3
1 3
2 3
0 1
0 1
6 6
0 4
3 3
3 3
3 4
3 3
3 5
0 2
4 4
5 6
7 7
6 9
7 8
7 8
6 7
6 7
6 7
6 7
6 7
6 7
6 7
4 5
4 4
3 4
5 6
4 5
3 3
6 7
4 4
7 7
7 7
3 6
5 7
2 3
3 4
3 3
5 6
3 4
2 2
5 5
2 2
2 5
3 3
4 4
4 7
7 7
4 4
6 6
0 1
1 2
5 5
1 2
4 4
3 4
4 5
3 4
3 3
4 5
5 6
3 4
7 8
4 5
8 8
5 5
5 8
5 6
3 4
8 8
3 3
2 2
4 4
3 3
2 2
2 3
0 1
3 4
8 8
2 6
0 1
3 6
7 7
0 1
6 7
9 12
6 6
6 6
7 9
2 3
2 3
5 5
4 5

In [149]:
# Train-test split for all the data
X_issue_title_wordvec_train, X_issue_title_wordvec_test, X_issue_description_wordvec_train, X_issue_description_wordvec_test, X_issue_label_wordvec_train, X_issue_label_wordvec_test, Y_pr_title_wordvec_train, Y_pr_title_wordvec_test, Y_pr_description_wordvec_train, Y_pr_description_wordvec_test = train_test_split(X_issue_title_wordvec, X_issue_description_wordvec, X_issue_label_wordvec, Y_pr_title_wordvec, Y_pr_description_wordvec, test_size=0.25, random_state=33, shuffle=False)

In [281]:
# Model
classifier = MLPRegressor(hidden_layer_sizes=(100), max_iter=500, activation='tanh', alpha=1e-2, batch_size=50, random_state=42, verbose=True)

def input_feature_extraction(x_title, x_desc, x_label):
    x1 = np.array(x_title)
    x2 = np.array(x_desc)
    x3 = np.array(x_label)
    a = 10
    b = 1
    c = 1
    return a*normalize(x1 - np.mean(x1, axis=0)) + b*normalize(x2 - np.mean(x2, axis=0)) + c*normalize(x3 - np.mean(x3, axis=0))

def output_feature_extraction(y_title, y_desc):
    y1 = y_title
    y2 = y_desc
    a = 10
    b = 1
    return a*normalize(y1 - np.mean(y1, axis=0)) + b*normalize(y2 - np.mean(y2, axis=0))

train_input_features = input_feature_extraction(X_issue_title_wordvec_train, X_issue_description_wordvec_train, X_issue_label_wordvec_train)
train_output_features = output_feature_extraction(Y_pr_title_wordvec_train, Y_pr_description_wordvec_train)

classifier.fit(train_input_features, train_output_features)
modelFileName = "testmodel.joblib"
save_model(classifier, modelFileName)

Iteration 1, loss = 0.27471621
Iteration 2, loss = 0.24897706
Iteration 3, loss = 0.23180254
Iteration 4, loss = 0.21790562
Iteration 5, loss = 0.20620478
Iteration 6, loss = 0.19666046
Iteration 7, loss = 0.18853964
Iteration 8, loss = 0.18154160
Iteration 9, loss = 0.17515327
Iteration 10, loss = 0.16939781
Iteration 11, loss = 0.16399593
Iteration 12, loss = 0.15911405
Iteration 13, loss = 0.15471032
Iteration 14, loss = 0.15075169
Iteration 15, loss = 0.14680159
Iteration 16, loss = 0.14330341
Iteration 17, loss = 0.14013619
Iteration 18, loss = 0.13718653
Iteration 19, loss = 0.13437464
Iteration 20, loss = 0.13162192
Iteration 21, loss = 0.12915063
Iteration 22, loss = 0.12688371
Iteration 23, loss = 0.12456670
Iteration 24, loss = 0.12246027
Iteration 25, loss = 0.12063497
Iteration 26, loss = 0.11865305
Iteration 27, loss = 0.11712394
Iteration 28, loss = 0.11527627
Iteration 29, loss = 0.11370841
Iteration 30, loss = 0.11229607
Iteration 31, loss = 0.11075621
Iteration 32, los

In [282]:
# Prediction
fromFile = True
if fromFile:
    classifier = load_model(modelFileName)

def predict_and_get_cosine_sim(x, y, clf):
    y_pred = clf.predict(x)
    return cosine_similarity(y_pred, y)

predict_and_get_cosine_sim(train_input_features, train_output_features, classifier)

array([[ 0.92830755,  0.17120565,  0.17120565, ...,  0.22679256,
         0.08633174,  0.25841246],
       [ 0.14874005,  0.97073672,  0.97073672, ...,  0.79695263,
        -0.22464493,  0.79527526],
       [ 0.15877815,  0.96923836,  0.96923836, ...,  0.82246806,
        -0.28314497,  0.81890822],
       ...,
       [ 0.19240391,  0.78948961,  0.78948961, ...,  0.9866365 ,
        -0.35998846,  0.97898145],
       [ 0.08414204, -0.26030847, -0.26030847, ..., -0.4035613 ,
         0.93129759, -0.38741315],
       [ 0.22081053,  0.78064457,  0.78064457, ...,  0.97094026,
        -0.35239709,  0.97657045]])

In [283]:
def get_top_k_predictions(x, y, clf, k):
    cosineMat = predict_and_get_cosine_sim(x, y, clf)
    # Sort in descending order
    return np.argsort(-1*cosineMat)[:, :k]
# Test with PRs (train+test)
get_top_k_predictions(input_feature_extraction(X_issue_title_wordvec, X_issue_description_wordvec, X_issue_label_wordvec), output_feature_extraction(Y_pr_title_wordvec, Y_pr_description_wordvec), classifier, 4)

array([[  0,  96, 126,  13],
       [  1,   2, 246, 211],
       [  1,   2, 246, 211],
       ...,
       [141, 186, 208, 196],
       [203,  45,  74,  49],
       [239, 288,  59,  88]], dtype=int64)

In [284]:
# metrics for performance
from sklearn.metrics import accuracy_score

# Top 1 accuracy
print('Total accuracy')
y_pred = get_top_k_predictions(input_feature_extraction(X_issue_title_wordvec, X_issue_description_wordvec, X_issue_label_wordvec), output_feature_extraction(Y_pr_title_wordvec, Y_pr_description_wordvec), classifier, 1)
print(accuracy_score(range(len(y_pred)), y_pred))
print('Train accuracy with all PRs')
y_pred = get_top_k_predictions(input_feature_extraction(X_issue_title_wordvec_train, X_issue_description_wordvec_train, X_issue_label_wordvec_train), output_feature_extraction(Y_pr_title_wordvec, Y_pr_description_wordvec), classifier, 1)
print(accuracy_score(range(0, len(y_pred)), y_pred))
print('Train accuracy with train PRs')
y_pred = get_top_k_predictions(input_feature_extraction(X_issue_title_wordvec_train, X_issue_description_wordvec_train, X_issue_label_wordvec_train), output_feature_extraction(Y_pr_title_wordvec_train, Y_pr_description_wordvec_train), classifier, 1)
print(accuracy_score(range(0, len(y_pred)), y_pred))
print('Test accuracy with all PRs')
y_pred = get_top_k_predictions(input_feature_extraction(X_issue_title_wordvec_test, X_issue_description_wordvec_test, X_issue_label_wordvec_test), output_feature_extraction(Y_pr_title_wordvec, Y_pr_description_wordvec), classifier, 1)
print(accuracy_score(range(len(X_issue_title_wordvec_train), len(X_issue_title_wordvec)), y_pred))
print('Test accuracy with only test PRs')
y_pred = get_top_k_predictions(input_feature_extraction(X_issue_title_wordvec_test, X_issue_description_wordvec_test, X_issue_label_wordvec_test), output_feature_extraction(Y_pr_title_wordvec_test, Y_pr_description_wordvec_test), classifier, 1)
print(accuracy_score(range(0, len(y_pred)), y_pred))
print()

def top_k_accuracy(y_true, y_pred):
    total = 0
    correct = 0
    for i in range(len(y_pred)):
        total += 1
        if y_true[i] in y_pred[i]:
            correct += 1
    return correct/total

# Top 5 accuracy
print('Total accuracy')
y_pred = get_top_k_predictions(input_feature_extraction(X_issue_title_wordvec, X_issue_description_wordvec, X_issue_label_wordvec), output_feature_extraction(Y_pr_title_wordvec, Y_pr_description_wordvec), classifier, 5)
print(top_k_accuracy(range(len(y_pred)), y_pred))
print('Train accuracy with all PRs')
y_pred = get_top_k_predictions(input_feature_extraction(X_issue_title_wordvec_train, X_issue_description_wordvec_train, X_issue_label_wordvec_train), output_feature_extraction(Y_pr_title_wordvec, Y_pr_description_wordvec), classifier, 5)
print(top_k_accuracy(range(0, len(y_pred)), y_pred))
print('Train accuracy with train PRs')
y_pred = get_top_k_predictions(input_feature_extraction(X_issue_title_wordvec_train, X_issue_description_wordvec_train, X_issue_label_wordvec_train), output_feature_extraction(Y_pr_title_wordvec_train, Y_pr_description_wordvec_train), classifier, 5)
print(top_k_accuracy(range(0, len(y_pred)), y_pred))
print('Test accuracy with all PRs')
y_pred = get_top_k_predictions(input_feature_extraction(X_issue_title_wordvec_test, X_issue_description_wordvec_test, X_issue_label_wordvec_test), output_feature_extraction(Y_pr_title_wordvec, Y_pr_description_wordvec), classifier, 5)
print(top_k_accuracy(range(len(X_issue_title_wordvec_train), len(X_issue_title_wordvec)), y_pred))
print('Test accuracy with only test PRs')
y_pred = get_top_k_predictions(input_feature_extraction(X_issue_title_wordvec_test, X_issue_description_wordvec_test, X_issue_label_wordvec_test), output_feature_extraction(Y_pr_title_wordvec_test, Y_pr_description_wordvec_test), classifier, 5)
print(top_k_accuracy(range(0, len(y_pred)), y_pred))

Total accuracy
0.5836177474402731
Train accuracy with all PRs
0.7488584474885844
Train accuracy with train PRs
0.7488584474885844
Test accuracy with all PRs
0.08108108108108109
Test accuracy with only test PRs
0.25675675675675674

Total accuracy
0.764505119453925
Train accuracy with all PRs
0.958904109589041
Train accuracy with train PRs
0.9680365296803652
Test accuracy with all PRs
0.24324324324324326
Test accuracy with only test PRs
0.3783783783783784


In [None]:
# Total accuracy
# 0.5836177474402731
# Train accuracy with all PRs
# 0.7488584474885844
# Train accuracy with train PRs
# 0.7488584474885844
# Test accuracy with all PRs
# 0.08108108108108109
# Test accuracy with only test PRs
# 0.25675675675675674

# Total accuracy
# 0.764505119453925
# Train accuracy with all PRs
# 0.958904109589041
# Train accuracy with train PRs
# 0.9680365296803652
# Test accuracy with all PRs
# 0.24324324324324326
# Test accuracy with only test PRs
# 0.3783783783783784