Copyright (c) Snap Inc. 2020. This sample code is made available by Snap Inc. for informational purposes only. It is provided as-is, without warranty of any kind, express or implied, including any warranties of merchantability, fitness for a particular purpose, or non-infringement. In no event will Snap Inc. be liable for any damages arising from the sample code or your use thereof.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import hashlib
import spacy
import os
import re
import string
import sys
from xgboost import XGBClassifier
from scipy.stats import uniform, randint
sys.path.append('../')

from wmd_vectorizer import *
from utils.snap_preprocessed_df_handle import *
from utils.EstimatorSelectionHelper import EstimatorSelectionHelper
from utils.classifier_setup import *

In [None]:
def prepare_dataframe_wmd(subset):
    """Load one split of the Eclipse bug-repo frame and run the WMD step on its titles.

    Args:
        subset: Split name spliced into the pickle filename
            (``df_<subset>_bugrepo_eclipse.pkl``). The commented-out calls in
            this notebook pass ``'train'`` and ``'test'``.

    Returns:
        Whatever ``get_wmd_gensim`` returns for the loaded frame and the
        ``'title'`` column -- presumably the same frame with a
        ``wmd_similarity`` column added (TODO confirm in the helper's source).
    """
    pickle_path = '../../data/dataframes/df_' + subset + '_bugrepo_eclipse.pkl'
    bug_frame = pd.read_pickle(pickle_path)
    return get_wmd_gensim(bug_frame, 'title')

In [None]:
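# One-off precomputation (left disabled): build the train/test frames with the
# WMD step applied. Uncomment to regenerate them.
# train_df = prepare_dataframe_wmd('train')
# test_df = prepare_dataframe_wmd('test')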

In [None]:
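# One-off caching (left disabled): persist the frames so later runs can load
# them with pd.read_pickle instead of recomputing.
# train_df.to_pickle('../../data/dataframes/df_train_bugrepo_with_wmd_similarity.pkl')
# test_df.to_pickle('../../data/dataframes/df_test_bugrepo_with_wmd_similarity.pkl')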

In [None]:
# Load the cached train/test frames from the *_with_wmd_similarity pickles
# (the files the commented-out to_pickle cell writes).
train_df = pd.read_pickle('../../data/dataframes/df_train_bugrepo_with_wmd_similarity.pkl')
test_df = pd.read_pickle('../../data/dataframes/df_test_bugrepo_with_wmd_similarity.pkl')

In [None]:
# Candidate estimators for the model search, keyed by display name
# (the keys mirror those of `params`).
models = {"XGBoost": XGBClassifier()}

# Hyper-parameter grid per estimator name (the keys mirror those of `models`).
# The "default" remarks are the original author's notes and may depend on the
# installed XGBoost version.
params = {
    'XGBoost': {
        "colsample_bytree": [0.3, 0.5, 0.8, 1],
        "gamma": [0, 10, 50, 100],
        "max_depth": [2, 4, 6],        # default 3
        "n_estimators": [50, 100],     # default 100
        "subsample": [0.3, 0.5, 0.8, 1],
    },
}

def custom_scorer(y_true, y_pred):
    """Return the macro-averaged F1 of ``y_pred`` measured against ``y_true``.

    Thin wrapper around ``f1_score(..., average='macro')``; this notebook
    wraps it with ``make_scorer`` to use it as the model-selection metric.
    """
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1

# Event Similarity


In [None]:
# Shuffle the rows of both frames and renumber the index from 0.
# A fixed random_state makes the permutation repeatable, so steps that depend
# on row order (e.g. fold assignment by an unshuffled CV splitter) give the
# same result on every run instead of changing with each execution.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Model-search helper for the event-similarity task (target column
# `dup_issue`), built from the shared `models` / `params` dictionaries.
helper_event = EstimatorSelectionHelper(models, params)

In [None]:
# Fit the search on the single `wmd_similarity` feature, reshaped into a
# one-column 2-D array, against the `dup_issue` labels; candidates are scored
# with macro-F1 through `custom_scorer`.
# NOTE(review): cv / n_jobs / refit are presumably forwarded to scikit-learn's
# GridSearchCV -- confirm in EstimatorSelectionHelper.fit. n_jobs=16 is
# hard-coded; adjust to the machine's core count if needed.
helper_event.fit(
    train_df['wmd_similarity'].values.reshape(-1, 1),
    train_df['dup_issue'],
    cv=3,
    scoring=make_scorer(custom_scorer, greater_is_better=True),
    n_jobs=16,
    refit=True,
)

In [None]:
# Summarize the event-similarity search using the test split's feature/labels.
# NOTE(review): fit() was given a 2-D array (.values.reshape(-1, 1)) but the
# raw 1-D Series is passed here -- confirm summary() reshapes it internally.
helper_event.summary(test_df['wmd_similarity'], test_df['dup_issue'])

In [None]:
# helper_event.save_models('../../data/models/', 'bugrepo_wmd_event')

# Topical Similarity

In [None]:
# Show how many training rows carry each `dup_group` label.
train_df['dup_group'].value_counts()

In [None]:
# Re-shuffle both frames and renumber the index before the topical-similarity
# search. A fixed random_state makes this permutation repeatable for a given
# input row order, instead of producing a different ordering (and therefore
# different order-dependent CV folds) on every execution.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Model-search helper for the topical-similarity task (target column
# `dup_group`), built from the shared `models` / `params` dictionaries.
helper_topic = EstimatorSelectionHelper(models, params)

In [None]:
# Fit the search on the single `wmd_similarity` feature, reshaped into a
# one-column 2-D array, against the `dup_group` labels; candidates are scored
# with macro-F1 through `custom_scorer`.
# NOTE(review): cv / n_jobs / refit are presumably forwarded to scikit-learn's
# GridSearchCV -- confirm in EstimatorSelectionHelper.fit. n_jobs=16 is
# hard-coded; adjust to the machine's core count if needed.
helper_topic.fit(
    train_df['wmd_similarity'].values.reshape(-1, 1),
    train_df['dup_group'],
    cv=5,
    scoring=make_scorer(custom_scorer, greater_is_better=True),
    n_jobs=16,
    refit=True,
)

In [None]:
# Summarize the topical-similarity search using the test split's feature/labels.
# NOTE(review): fit() was given a 2-D array (.values.reshape(-1, 1)) but the
# raw 1-D Series is passed here -- confirm summary() reshapes it internally.
helper_topic.summary(test_df['wmd_similarity'], test_df['dup_group'])