Copyright (c) Snap Inc. 2020. This sample code is made available by Snap Inc. for informational purposes only. It is provided as-is, without warranty of any kind, express or implied, including any warranties of merchantability, fitness for a particular purpose, or non-infringement. In no event will Snap Inc. be liable for any damages arising from the sample code or your use thereof.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import hashlib
import spacy
import os
import re
import json
from collections import OrderedDict
from operator import itemgetter
from spacy.lang.en.stop_words import STOP_WORDS
import string
import gensim
from sklearn.metrics.pairwise import cosine_similarity
from xgboost import XGBClassifier
import sys
sys.path.append('../')

from tf_idf_vectorizer import *
from utils.snap_preprocessed_df_handle import *
from utils.EstimatorSelectionHelper import EstimatorSelectionHelper
from utils.classifier_setup import *

# SIF Classification
from sentence_transformers import SentenceTransformer
from scipy.stats import pearsonr, spearmanr
import pickle

In [None]:
def prepare_dataframe_tf_idf(PATH):
    """
    Load the dataframe stored at PATH.
    :param PATH: location handed unchanged to get_dataframe
    :return: whatever get_dataframe(PATH) produces
    """
    return get_dataframe(PATH)

In [None]:
# Pickled train / test pair dataframes (paths are relative to the notebook's working directory).
TRAIN_PATH = '../../data/dataframes/df_train_bugrepo_eclipse.pkl'
TEST_PATH = '../../data/dataframes/df_test_bugrepo_eclipse.pkl'
# NOTE(review): unpickling can execute arbitrary code — only load these files from a trusted source.
train_df = pd.read_pickle(TRAIN_PATH)
test_df = pd.read_pickle(TEST_PATH)

In [None]:
# Display the column labels of the training frame.
train_df.columns

# Setup

In [None]:
def _stack_titles(pairs):
    """
    Stack both sides of a pair dataframe into one (id, title) table.
    :param pairs: dataframe with columns id1, title1, id2, title2
    :return: dataframe with columns id, title; exact duplicate rows removed, index reset
    """
    first_side = pairs[['id1', 'title1']].rename(columns={'id1': 'id', 'title1': 'title'})
    second_side = pairs[['id2', 'title2']].rename(columns={'id2': 'id', 'title2': 'title'})
    return pd.concat([first_side, second_side]).drop_duplicates().reset_index(drop=True)


# One row per distinct article id (the first title seen for an id is kept).
articles_train = _stack_titles(train_df)
non_dup_articles_train = articles_train['id'].drop_duplicates().index
articles_train = articles_train.loc[non_dup_articles_train].reset_index(drop=True)

articles_test = _stack_titles(test_df)
non_dup_articles_test = articles_test['id'].drop_duplicates().index
articles_test = articles_test.loc[non_dup_articles_test].reset_index(drop=True)

In [None]:
import sys
sys.path.append('../SIF/src/')
import data_io, params, SIF_embedding

In [None]:
# Input files and hyper-parameters used by the SIF embedding cells below.
wordfile = '../../data/pretrained/glove.840B.300d.txt' # word vector file, can be downloaded from GloVe website
weightfile = '../SIF/auxiliary_data/enwiki_vocab_min200.txt' # each line is a word and its frequency
weightpara = 1e-3 # the parameter in the SIF weighting scheme, usually in the range [3e-5, 3e-3]
rmpc = 1 # number of principal components to remove in SIF weighting scheme

In [None]:
# Punctuation that is neither preceded nor followed by a digit, so separators inside
# or next to numbers ("3.5", "1,000", "-5") survive. A raw string keeps ``\d`` a regex
# escape instead of an invalid Python string escape.
# NOTE: the backslash character itself is not in the set, exactly as in the original pattern.
_PUNCTUATION_RE = re.compile(r"""(?<!\d)[!"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~](?!\d)""")


def preprocessor(text):
    """
    Strip punctuation that does not touch a digit and lower-case the text.
    :param text: input string
    :return: the cleaned, lower-cased string
    """
    return _PUNCTUATION_RE.sub("", text).lower()

def getWordmap(textfile):
    """
    Load a space-separated word-vector text file.
    Each usable line is "<word> <float> <float> ...". Blank lines are ignored; lines whose
    components cannot be parsed as floats are reported on stdout and skipped.
    :param textfile: path of the word-vector file
    :return: (words, We). words[word] is the row of We holding that word's vector,
             We is a numpy array with one row per loaded word
    """
    words = {}
    We = []
    # Stream the file line by line instead of readlines(): the whole file is never held in memory.
    with open(textfile, 'r') as f:
        for line in f:
            fields = line.split(" ")
            if len(fields) < 2:
                # Blank line or a token without any vector component.
                continue
            try:
                vector = [float(value) for value in fields[1:]]
            except ValueError:
                print('Not working for - ', fields[0])
                continue
            # Index by the row actually appended, not by the file line number:
            # otherwise every word after a skipped line would point at the wrong row of We.
            words[fields[0]] = len(We)
            We.append(vector)
    return (words, np.array(We))

def getWeight(words, word2weight):
    """
    Translate word weights into weights keyed by word index.
    :param words: a dictionary, words['str'] is the index of the word 'str'
    :param word2weight: a dictionary, word2weight['str'] is the weight of the word 'str'
    :return: weight4ind, weight4ind[i] is the weight of the i-th word (1.0 when the word has no entry)
    """
    return {ind: word2weight.get(word, 1.0) for word, ind in words.items()}

def getWordWeight(weightfile, a=1e-3):
    """
    Read a "<word> <frequency>" file and turn the frequencies into SIF weights.
    Lines that do not split into exactly two fields are printed and skipped.
    :param weightfile: path of the frequency file
    :param a: the SIF weighting parameter; a value <= 0 is replaced by 1.0
    :return: word2weight, word2weight['str'] = a / (a + frequency('str') / total frequency)
    """
    if a <= 0:
        # A non-positive parameter makes no sense; fall back to 1.0.
        a = 1.0

    frequencies = {}
    total = 0
    with open(weightfile) as f:
        for raw_line in f:
            fields = raw_line.strip().split()
            if not fields:
                continue
            if len(fields) != 2:
                print(fields)
                continue
            frequencies[fields[0]] = float(fields[1])
            total += float(fields[1])
    return {word: a / (a + freq/total) for word, freq in frequencies.items()}

def sentences2idx(sentences, words):
    """
    Convert sentences into the index / mask pair consumed by the embedding code.
    :param sentences: a list of sentences
    :param words: a dictionary, words['str'] is the index of the word 'str'
    :return: x1, m1 as produced by data_io.prepare_data from the per-sentence index sequences
             (x1[i, :] word indices of sentence i, m1[i, :] its mask, 0 meaning no word at that position)
    """
    index_sequences = [data_io.getSeq(sentence, words) for sentence in sentences]
    indices, mask = data_io.prepare_data(index_sequences)
    return indices, mask

def seq2weight(seq, mask, weight4ind):
    """
    Look up the weight of every real word position.
    :param seq: seq[i, j] is the word index at position j of sentence i
    :param mask: mask[i, j] > 0 marks a position that holds a word
    :param weight4ind: weight4ind[i] is the weight of the i-th word
    :return: float32 array shaped like seq; masked-out or negative-index positions stay 0
    """
    weight = np.zeros(seq.shape, dtype='float32')
    usable = (mask > 0) & (seq >= 0)
    # np.nonzero walks the positions in row-major order, like the nested index loops it replaces.
    for row, col in zip(*np.nonzero(usable)):
        weight[row, col] = weight4ind[seq[row, col]]
    return weight

from sklearn.decomposition import TruncatedSVD


def get_weighted_average(We, x, w):
    """
    Average the word vectors of each sentence using the given weights.
    :param We: We[i,:] is the vector for word i
    :param x: x[i, :] are the indices of the words in sentence i
    :param w: w[i, :] are the weights for the words in sentence i
    :return: emb[i, :] is the weighted sum of sentence i's word vectors divided by its number of non-zero weights
    """
    emb = np.zeros((x.shape[0], We.shape[1]))
    for row in range(emb.shape[0]):
        weights = w[row, :]
        vectors = We[x[row, :], :]
        # A sentence whose weights are all zero divides by 0 here and yields NaNs.
        emb[row, :] = weights.dot(vectors) / np.count_nonzero(weights)
    return emb

def compute_pc(X,npc=1):
    """
    Fit a truncated SVD on X and return its components. The data is NOT centred first.
    :param X: X[i,:] is a data point
    :param npc: number of principal components to compute
    :return: component_[i,:] is the i-th pc
    """
    return TruncatedSVD(n_components=npc, n_iter=7, random_state=0).fit(X).components_

def remove_pc(X, npc=1):
    """
    Subtract from every data point its projection on the principal components of X.
    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: XX[i, :] is the data point after removing its projection
    """
    pc = compute_pc(X, npc)
    coefficients = X.dot(pc.transpose())
    if npc == 1:
        # Single component: broadcast the (n, 1) coefficients over the (1, d) component.
        return X - coefficients * pc
    return X - coefficients.dot(pc)


def interpolate_nans(X):
    """
    Replace the NaNs of every column, in place, by linear interpolation over the row positions.
    :param X: 2-D float array; it is modified and also returned
    :return: X with the NaN entries overwritten
    """
    for col in range(X.shape[1]):
        missing = np.isnan(X[:, col])
        present = ~missing
        missing_rows = np.flatnonzero(missing)
        known_rows = np.flatnonzero(present)
        X[missing, col] = np.interp(missing_rows, known_rows, X[present, col])
    return X

def SIF_embedding(We, x, w, params):
    """
    Build sentence embeddings: weighted average of word vectors, NaN interpolation, then optional
    removal of the projection on the leading principal component(s).
    :param We: We[i,:] is the vector for word i
    :param x: x[i, :] are the indices of the words in the i-th sentence
    :param w: w[i, :] are the weights for the words in the i-th sentence
    :param params.rmpc: if >0, number of principal components whose projection is removed
    :return: emb, emb[i, :] is the embedding for sentence i
    """
    averaged = get_weighted_average(We, x, w)
    filled = interpolate_nans(averaged)
    if params.rmpc > 0:
        return remove_pc(filled, params.rmpc)
    return filled


### Getting Train embeddings

In [None]:
# Build SIF embeddings for the de-duplicated training titles.
# The names bound here (words, We, weight4ind) are reused by the test-embedding cell.
sentences_train = list(articles_train['title'].apply(preprocessor))
(words, We) = getWordmap(wordfile)
# load word weights
word2weight = getWordWeight(weightfile, weightpara) # word2weight['str'] is the weight for the word 'str'
weight4ind = getWeight(words, word2weight) # weight4ind[i] is the weight for the i-th word
# load sentences
x, m = sentences2idx(sentences_train, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
w = seq2weight(x, m, weight4ind) # get word weights
# `params` must still be the SIF params module here; a later cell rebinds the name to a dict.
param = params.params()
param.rmpc = rmpc
# get SIF embedding
embedding = SIF_embedding(We, x, w, param) # embedding[i,:] is the embedding for sentence i
embedding_train = embedding

### Getting Test embeddings

In [None]:
# Build SIF embeddings for the de-duplicated test titles, reusing words / weight4ind / We
# from the training cell. SIF_embedding fits the removed principal component on the array
# it is given, i.e. on the test embeddings themselves.
sentences_test = list(articles_test['title'].apply(preprocessor))
x, m = sentences2idx(sentences_test, words) # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
w = seq2weight(x, m, weight4ind) # get word weights
param = params.params()
param.rmpc = rmpc
# get SIF embedding
embedding = SIF_embedding(We, x, w, param) # embedding[i,:] is the embedding for sentence i
embedding_test = embedding

### Getting similarity scores

In [None]:
# Keep each article's row position as a column; it is used to pick the matching embedding row.
articles_train['index'] = articles_train.index
articles_test['index'] = articles_test.index

In [None]:
# Attach to every article the embedding row at its position (row i of embedding_* belongs to article i).
articles_train['embed'] = articles_train['index'].apply(lambda x: embedding_train[x])
articles_test['embed'] = articles_test['index'].apply(lambda x: embedding_test[x])

In [None]:
# Map each article id to its SIF embedding once, instead of scanning the whole of
# articles_train for every pair row. Ids are unique in articles_train (it was de-duplicated on 'id').
train_embed_by_id = dict(zip(articles_train['id'], articles_train['embed']))
train_df['sif_embedding1'] = train_df['id1'].apply(lambda x: train_embed_by_id[x])
train_df['sif_embedding2'] = train_df['id2'].apply(lambda x: train_embed_by_id[x])
# Cosine similarity between the two title embeddings of every training pair.
train_df['sif_similarity'] = train_df[['sif_embedding1', 'sif_embedding2']].apply(
    lambda row: cosine_similarity(row['sif_embedding1'].reshape(1, -1),
                                  row['sif_embedding2'].reshape(1, -1))[0][0],
    axis=1)

In [None]:
# Map each article id to its SIF embedding once, instead of scanning the whole of
# articles_test for every pair row. Ids are unique in articles_test (it was de-duplicated on 'id').
test_embed_by_id = dict(zip(articles_test['id'], articles_test['embed']))
test_df['sif_embedding1'] = test_df['id1'].apply(lambda x: test_embed_by_id[x])
test_df['sif_embedding2'] = test_df['id2'].apply(lambda x: test_embed_by_id[x])
# Cosine similarity between the two title embeddings of every TEST pair.
# The rows must come from test_df: computing them from train_df would assign the training
# pairs' similarities to the test rows, aligned by index label.
test_df['sif_similarity'] = test_df[['sif_embedding1', 'sif_embedding2']].apply(
    lambda row: cosine_similarity(row['sif_embedding1'].reshape(1, -1),
                                  row['sif_embedding2'].reshape(1, -1))[0][0],
    axis=1)

In [None]:
# train_df.to_pickle('../../data/dataframes/df_train_bugrepo_sif_similarity.pkl')
# test_df.to_pickle('../../data/dataframes/df_test_bugrepo_sif_similarity.pkl')

In [None]:
# Estimators and hyper-parameter candidates handed to EstimatorSelectionHelper below.
models = {
           "XGBoost" : XGBClassifier()
}

# NOTE(review): this rebinds the name `params`, which the embedding cells use as the SIF
# `params` module (`params.params()`); re-running those cells after this one fails until
# the module is imported again.
params = {'XGBoost':  {"colsample_bytree": [0.3,0.5,0.8,1],"gamma":[0,10,50,100],
                        "max_depth": [2,4,6], # default 3
                        "n_estimators": [50,100], # default 100
                        "subsample": [0.3,0.5,0.8,1]}
}

def custom_scorer(y_true, y_pred):
    """
    Score predictions with f1_score using average='macro'.
    :param y_true: ground-truth labels
    :param y_pred: predicted labels
    :return: the value returned by f1_score
    """
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1

# Event Similarity


In [None]:
# Shuffle the rows of both frames. No random_state is passed, so the order differs between runs.
train_df = train_df.sample(frac=1).reset_index(drop=True)
test_df = test_df.sample(frac=1).reset_index(drop=True)

In [None]:
# Model-selection helper for the event-level ('dup_issue') task; `params` is the hyper-parameter dict here.
helper_event = EstimatorSelectionHelper(models, params)

In [None]:
# Fit on a single feature — the SIF cosine similarity reshaped to an (n, 1) column — against 'dup_issue',
# scored with custom_scorer (macro F1).
# Presumably a 5-fold cross-validated search over `params`; EstimatorSelectionHelper is defined in utils — confirm.
helper_event.fit(train_df['sif_similarity'].values.reshape(-1, 1),
            train_df['dup_issue'],
            cv = 5,
            scoring=make_scorer(custom_scorer, greater_is_better=True), n_jobs=16, refit=True)

In [None]:
# Evaluate on the test pairs.
# NOTE(review): fit() received an (n, 1) array but summary() is given the 1-D Series —
# confirm that summary() reshapes it before predicting.
helper_event.summary(test_df['sif_similarity'], test_df['dup_issue'])

In [None]:
# helper_event.save_models('../../data/models/', 'bugrepo_sif_event')

# Topical Similarity

In [None]:
# Show how many training pairs fall under each 'dup_group' value.
train_df['dup_group'].value_counts()

In [None]:
# Shuffle the rows of both frames again. No random_state is passed, so the order differs between runs.
train_df = train_df.sample(frac=1).reset_index(drop=True)
test_df = test_df.sample(frac=1).reset_index(drop=True)

In [None]:
# Model-selection helper for the topic-level ('dup_group') task; `params` is the hyper-parameter dict here.
helper_topic = EstimatorSelectionHelper(models, params)

In [None]:
# Fit on a single feature — the SIF cosine similarity reshaped to an (n, 1) column — against 'dup_group',
# scored with custom_scorer (macro F1).
# Presumably a 5-fold cross-validated search over `params`; EstimatorSelectionHelper is defined in utils — confirm.
helper_topic.fit(train_df['sif_similarity'].values.reshape(-1, 1),
            train_df['dup_group'],
            cv = 5,
            scoring=make_scorer(custom_scorer, greater_is_better=True), n_jobs=16, refit=True)

In [None]:
# Evaluate on the test pairs.
# NOTE(review): fit() received an (n, 1) array but summary() is given the 1-D Series —
# confirm that summary() reshapes it before predicting.
helper_topic.summary(test_df['sif_similarity'], test_df['dup_group'])

In [None]:
# helper_topic.save_models('../../data/models/', 'bugrepo_sif_topic')