In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import validation_curve
from sklearn.metrics import make_scorer
from sklearn import metrics as mt
from sklearn.metrics import f1_score as f1
from sklearn.svm import SVC
import seaborn as sns
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore', DeprecationWarning)
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
import os
from random import sample

## Prepare dataset for cross-validation (CV)

### Random Train/Validation/Test Split

In [None]:
# Fixed seed so the shuffle -- and therefore the content of every fold written
# by the next cell -- is the same after a kernel restart.
RANDOM_SEED = 42

df_all = pd.read_csv("../data/Validation_1000.csv")
# Draw 1000 rows in random order and renumber them 0..999.
df_random = df_all.sample(n=1000, ignore_index=True, random_state=RANDOM_SEED)
df_random.head()

In [None]:
cv = 10

train_path = "../data/cross_validate_10/"

# Split once, outside the loop: the partition does not depend on the fold index.
# Row positions are split with NumPy and applied via .iloc so the chunk sizes
# follow np.array_split without handing a DataFrame to NumPy directly.
fold_positions = np.array_split(np.arange(len(df_random)), cv)
df_split = [df_random.iloc[positions] for positions in fold_positions]

for i in range(cv):
    # Fold i is the test set; the previous fold (wrapping around to the last
    # one for i == 0) is the validation set; everything else is training data.
    val_i = (i - 1) % cv
    df_test = df_split[i]
    df_val = df_split[val_i]
    frame_train = [chunk for index, chunk in enumerate(df_split) if index != i and index != val_i]
    df_train = pd.concat(frame_train)

    fold_dir = os.path.join(train_path, str(i))
    os.makedirs(fold_dir, exist_ok=True)

    df_train.to_csv(os.path.join(fold_dir, "train.csv"), index=None)
    df_val.to_csv(os.path.join(fold_dir, "val.csv"), index=None)
    df_test.to_csv(os.path.join(fold_dir, "test.csv"), index=None)

### Pattern Feature Extraction

In [None]:
def pattern_matrix(df, col):
    """Build the pattern vocabulary of ``df[col]`` and a per-row count matrix.

    Each cell of ``df[col]`` is expected to be a comma-separated string of
    pattern identifiers. Cells that are not strings (for example the ``0``
    used as a fill value for missing entries) contribute no patterns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; any index is accepted.
    col : str
        Name of the column holding the comma-separated patterns.

    Returns
    -------
    tuple(set, pandas.DataFrame)
        The set of distinct (whitespace-stripped, non-empty) patterns, and a
        frame with one row per input row and one column per pattern holding
        how many times the pattern occurs in that row.
    """
    # Tokenise every cell once. Iterating the column directly (instead of
    # df.loc[i]) makes the function independent of the frame's index labels.
    token_rows = []
    for cell in df[col]:
        if isinstance(cell, str):
            stripped = (token.strip() for token in cell.split(","))
            # Strip before the emptiness check so " " never becomes a pattern.
            token_rows.append([token for token in stripped if token])
        else:
            token_rows.append([])

    pattern_code = set()
    for tokens in token_rows:
        pattern_code.update(tokens)

    columns = list(pattern_code)
    pattern_list = []
    for tokens in token_rows:
        pattern_val = dict.fromkeys(columns, 0)
        for token in tokens:
            pattern_val[token] += 1
        pattern_list.append(pattern_val)

    # Passing columns keeps the pattern columns even when df has no rows.
    return pattern_code, pd.DataFrame(pattern_list, columns=columns)

In [None]:
def pattern_matrix_testing(pattern_code, df, col):
    """Count occurrences of a fixed pattern vocabulary in ``df[col]``.

    Parameters
    ----------
    pattern_code : iterable of str
        The known patterns; one output column is produced per pattern.
    df : pandas.DataFrame
        Input frame; any index is accepted.
    col : str
        Name of the column holding comma-separated pattern strings. Cells that
        are not strings (for example the ``0`` fill value) yield an all-zero row.

    Returns
    -------
    pandas.DataFrame
        One row per input row, one column per known pattern, holding how many
        times that pattern occurs in the row. Patterns not in ``pattern_code``
        and the literal token ``'0'`` are ignored.
    """
    # De-duplicate while keeping the caller's iteration order for the columns.
    columns = list(dict.fromkeys(pattern_code))
    known = set(columns)

    pattern_list = []
    for cell in df[col]:
        pattern_val = dict.fromkeys(columns, 0)
        if isinstance(cell, str):
            for raw in cell.split(","):
                # Strip BEFORE the membership test: the vocabulary holds
                # stripped tokens, so " b" must be looked up as "b".
                pattern = raw.strip()
                if pattern and pattern != '0' and pattern in known:
                    pattern_val[pattern] += 1
        pattern_list.append(pattern_val)

    return pd.DataFrame(pattern_list, columns=columns)

In [None]:
df = pd.read_csv("../data/PatternDiscovery_1000.csv")
# Fill missing pattern strings with 0 (the sentinel pattern_matrix skips).
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection, which does not reliably update `df` under Copy-on-Write.
df = df.fillna({"predicted_ETD": 0, "predicted_PS": 0})
df

In [None]:
# Learn the ETD and PS pattern vocabularies (and their count matrices) from the
# pattern-discovery frame; the vocabularies are reused for every CV fold below.
(etd_pattern_code, df_etd), (ps_pattern_code, df_ps) = (
    pattern_matrix(df, pattern_column)
    for pattern_column in ("predicted_ETD", "predicted_PS")
)

In [None]:
# For every cross-validation fold: turn the predicted_ETD / predicted_PS
# columns of train/test/val into count matrices over the fixed vocabularies
# and write them next to the fold's CSV files.
path = "../data/cross_validate_10/"

dir_list = sorted(os.listdir(path))

for folder in dir_list:
    fold_dir = os.path.join(path, folder)
    if not os.path.isdir(fold_dir):
        # Skip stray files (e.g. OS metadata) that are not fold directories.
        continue

    for split in ("train", "test", "val"):
        df_part = pd.read_csv(os.path.join(fold_dir, split + ".csv")).fillna(0)

        df_etd_part = pattern_matrix_testing(etd_pattern_code, df_part, "predicted_ETD")
        df_ps_part = pattern_matrix_testing(ps_pattern_code, df_part, "predicted_PS")

        out_dir = os.path.join(fold_dir, split)
        os.makedirs(out_dir, exist_ok=True)

        df_etd_part.to_csv(os.path.join(out_dir, "etd_pattern_" + split + ".csv"), index=None)
        df_ps_part.to_csv(os.path.join(out_dir, "ps_pattern_" + split + ".csv"), index=None)

### N-gram Feature Extraction

In [None]:
import pandas as pd
import numpy as py
from preprocess_helper import PorterStemmer
import os
import re
from bs4 import BeautifulSoup
from string import punctuation
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def hasAlphanumeric(term):
    """Return True when at least one character of ``term`` is alphanumeric."""
    return any(character.isalnum() for character in term)

In [None]:
class Dataset_Preprocess:
    """Text-cleaning pipeline applied to issue texts before n-gram extraction.

    ``call`` strips HTML, drops non-ASCII characters, replaces punctuation with
    spaces and stems the remaining tokens. The tag table read from
    ``../data/tag_dict.csv`` supplies the reserved keywords and is also used to
    keep tag names out of the stop-word set.
    """

    def __init__(self):
        # All three are populated lazily by call() / the create_* methods.
        self.stopwords_list = None
        self.tag_list = None
        self.reservedkeywords = None

    def create_reservedkeywords(self, path, num_words = 100):
        """Load the tag table at ``path`` and derive the reserved keywords.

        The CSV must have ``Tag`` and ``Count`` columns. ``tag_list`` receives
        every tag ordered by descending count; ``reservedkeywords`` becomes the
        set of the ``num_words`` most frequent tags plus a fixed list of
        technology names that contain punctuation.
        """
        df = pd.read_csv(path)
        df = df.sort_values(by=['Count'], ascending=False, ignore_index=True)
        self.tag_list = list(df["Tag"].values)
        reservedkeywords_extra = ["c#","f#","c++","node.js","nodejs",".json",".js",".net","objective-c",
                                  "asp.net","ruby-on-rails","angular.js"]
        self.reservedkeywords = set(self.tag_list[:num_words])
        self.reservedkeywords.update(reservedkeywords_extra)

    def create_stopwords(self):
        """Build the stop-word set: NLTK English stop words plus extras, minus tags.

        Any stop word that is also a tag in ``tag_list`` is removed so that tag
        names are never treated as stop words.
        """
        stop_words_extra = ["i'd","sometime","sometimes","something","someone","somebody","anything","anyone","anybody",
                            "everytime","everything","everyone","everybody","e.g.","e.g","e.g.,","i.e.","i.e","i.e.,","love",
                            "know","'s","wonder"]
        candidates = set(stopwords.words('english'))
        candidates.update(stop_words_extra)
        # tag_list is None until the tag table has been loaded; treat as empty.
        self.stopwords_list = candidates.difference(self.tag_list or [])

    def remove_non_ascii(self,sentence):
        """Return ``sentence`` without characters whose code point is >= 128."""
        return ''.join(char for char in sentence if ord(char) < 128)

    def html_Filter(self, sentence):
        """Return the text content of ``sentence`` with HTML markup removed."""
        return BeautifulSoup(sentence, "lxml").text

    def keywords_transform(self, sentence):
        """Lower-case ``sentence`` and rewrite spaced technology names.

        NOTE(review): "angular js" is rewritten to "angular-js" while the
        reserved keyword list contains "angular.js" -- confirm which spelling
        is intended. This method is currently not used by call().
        """
        sentence = sentence.lower()
        sentence = sentence.replace("node js","node.js")
        sentence = sentence.replace("objective c","objective-c")
        sentence = sentence.replace("ruby on rails","ruby-on-rails")
        sentence = sentence.replace("angular js","angular-js")

        return sentence

    def remove_specialchar(self, sentence, char_to_keep = None): #'#','+','.','-','\'','"',':','?','!',',','_'
        """Replace every punctuation character with a single space.

        Characters listed in ``char_to_keep`` (any iterable of single
        characters; default: none) are left untouched.
        """
        punct_set = set(punctuation).difference(char_to_keep or ())
        # Each punctuation character maps to a space (not to the empty string),
        # so words separated only by punctuation stay separate tokens.
        return sentence.translate(str.maketrans({char: " " for char in punct_set}))

    def sentence_stem(self, sentence):
        """Stem the purely alphanumeric tokens of ``sentence``.

        Tokens are obtained by splitting on single spaces. Alphanumeric tokens
        are stemmed with the project's PorterStemmer; other tokens are kept
        only when they are reserved keywords, and dropped otherwise.
        """
        p = PorterStemmer()
        kept = []

        for token in sentence.split(' '):
            if token.isalnum():
                kept.append(p.stem(token, 0,len(token)-1))
            elif token in self.reservedkeywords:
                kept.append(token)

        return ' '.join(kept).strip()


    def call(self, dataset = None, keywords = None):
        """Clean every text in ``dataset`` and return the cleaned strings.

        Parameters
        ----------
        dataset : iterable of str, optional
            Raw texts. ``None`` is treated as an empty dataset.
        keywords : collection of str, optional
            Reserved keywords to use instead of the ones derived from the tag
            table. The tag table is read in either case.

        Returns
        -------
        list of str
            One cleaned string per input text, in input order.
        """
        tag_path = "../data/tag_dict.csv"

        #initialize tag_list
        if keywords is None:
            self.create_reservedkeywords(tag_path)
        else:
            df = pd.read_csv(tag_path)
            self.tag_list = list(df["Tag"].values)
            self.reservedkeywords = keywords

        #initialize stopwords
        self.create_stopwords()

        if dataset is None:
            return []

        data_clean = []
        for sentence in dataset:
            sentence = self.html_Filter(sentence)
            sentence = self.remove_non_ascii(sentence)
            sentence = self.remove_specialchar(sentence)
            sentence = self.sentence_stem(sentence)
            # The former POS-tag bookkeeping (`pos` / `posTag`) referenced
            # names that were never defined and raised NameError; it is gone.
            data_clean.append(sentence.strip())

        return data_clean

In [None]:
# For every cross-validation fold: clean the issue texts, learn a binary
# {1,2,3}-gram vocabulary on train + val, and write the resulting indicator
# matrices for train / test / val next to the fold's CSV files.
path = "../data/cross_validate_10/"
data_preprocess = Dataset_Preprocess()
dir_list = sorted(os.listdir(path))
for folder in dir_list:
    fold_dir = os.path.join(path, folder)
    if not os.path.isdir(fold_dir):
        # Skip stray files (e.g. OS metadata) that are not fold directories.
        continue

    df_train = pd.read_csv(os.path.join(fold_dir, "train.csv"))
    df_test = pd.read_csv(os.path.join(fold_dir, "test.csv"))
    df_val = pd.read_csv(os.path.join(fold_dir, "val.csv"))

    issues_clean_train = data_preprocess.call(dataset=list(df_train["issue_clean"]))
    issues_clean_test = data_preprocess.call(dataset=list(df_test["issue_clean"]))
    issues_clean_val = data_preprocess.call(dataset=list(df_val["issue_clean"]))
    # Texts are cleaned independently of each other, so the train + val corpus
    # is just the two cleaned lists back to back -- no second cleaning pass.
    issues_clean = issues_clean_train + issues_clean_val

    #{1,2,3}-grams
    count_vect_all = CountVectorizer(ngram_range = (1,3), binary = True)
    X_counts = count_vect_all.fit_transform(issues_clean)
    print(X_counts.shape)

    feature_names = count_vect_all.get_feature_names_out()
    # With a fixed vocabulary nothing is learned from the data, so plain
    # transform() is used for every split (the test split in particular).
    count_vect = CountVectorizer(ngram_range = (1,3), binary = True, vocabulary = feature_names)
    X_train_counts = count_vect.transform(issues_clean_train)
    df_train_ngram = pd.DataFrame(data = X_train_counts.toarray(), columns = feature_names)
    X_test_counts = count_vect.transform(issues_clean_test)
    df_test_ngram = pd.DataFrame(data = X_test_counts.toarray(), columns = feature_names)
    X_val_counts = count_vect.transform(issues_clean_val)
    df_val_ngram = pd.DataFrame(data = X_val_counts.toarray(), columns = feature_names)

    for split in ("train", "test", "val"):
        os.makedirs(os.path.join(fold_dir, split), exist_ok=True)

    df_train_ngram.to_csv(os.path.join(fold_dir, "train", "ngram_train.csv"), index=None)
    df_test_ngram.to_csv(os.path.join(fold_dir, "test", "ngram_test.csv"), index=None)
    df_val_ngram.to_csv(os.path.join(fold_dir, "val", "ngram_val.csv"), index=None)

## Load training, validation, and test data

In [None]:
# Module-level accumulators filled by prepare_10fold_data: one entry per fold.
# Each name is bound to its own fresh list (the generator yields a new [] every
# time), so no two accumulators share storage.

# Feature matrices of the training split.
(X_etd_train_list, X_ps_train_list, X_ngram_train_list,
 X_etd_ngram_train_list, X_ps_ngram_train_list) = ([] for _ in range(5))

# Feature matrices of the test split.
(X_etd_test_list, X_ps_test_list, X_ngram_test_list,
 X_etd_ngram_test_list, X_ps_ngram_test_list) = ([] for _ in range(5))

# Feature matrices of the validation split.
(X_etd_val_list, X_ps_val_list, X_ngram_val_list,
 X_etd_ngram_val_list, X_ps_ngram_val_list) = ([] for _ in range(5))

# Label vectors (ETD and PS) for each split.
(y_etd_train_list, y_ps_train_list,
 y_etd_test_list, y_ps_test_list,
 y_etd_val_list, y_ps_val_list) = ([] for _ in range(6))

In [None]:
def prepare_10fold_data(path):
    """Load every fold under ``path`` into the module-level ``*_list`` accumulators.

    For each fold directory (visited in sorted name order) and each split
    (train / test / val) this reads the pattern and n-gram feature CSVs plus the
    split's label CSV, builds the combined pattern + n-gram matrices with
    ``np.hstack``, and appends one entry per fold to the matching global lists.

    Parameters
    ----------
    path : str
        Directory containing one sub-directory per fold.

    Notes
    -----
    The function appends and never clears, so calling it twice adds every fold
    twice; re-create the accumulator lists before re-running.
    """
    # Per split: the global lists, in the same order as the tuple built below.
    sinks = {
        "train": (X_etd_train_list, X_ps_train_list, X_ngram_train_list,
                  X_etd_ngram_train_list, X_ps_ngram_train_list,
                  y_etd_train_list, y_ps_train_list),
        "test": (X_etd_test_list, X_ps_test_list, X_ngram_test_list,
                 X_etd_ngram_test_list, X_ps_ngram_test_list,
                 y_etd_test_list, y_ps_test_list),
        "val": (X_etd_val_list, X_ps_val_list, X_ngram_val_list,
                X_etd_ngram_val_list, X_ps_ngram_val_list,
                y_etd_val_list, y_ps_val_list),
    }

    for folder in sorted(os.listdir(path)):
        fold_dir = os.path.join(path, folder)
        if not os.path.isdir(fold_dir):
            # Skip stray files that are not fold directories.
            continue

        # Read the whole fold first so a missing file leaves the accumulators
        # untouched for this fold instead of half-filled.
        loaded = {}
        for split in sinks:
            split_dir = os.path.join(fold_dir, split)
            # The former read of train/pos_train.csv is dropped: nothing in the
            # notebook writes that file and its content was never used.
            X_etd = pd.read_csv(os.path.join(split_dir, "etd_pattern_" + split + ".csv")).values
            X_ps = pd.read_csv(os.path.join(split_dir, "ps_pattern_" + split + ".csv")).values
            X_ngram = pd.read_csv(os.path.join(split_dir, "ngram_" + split + ".csv")).values
            df_label = pd.read_csv(os.path.join(fold_dir, split + ".csv"))

            loaded[split] = (
                X_etd,
                X_ps,
                X_ngram,
                np.hstack((X_etd, X_ngram)),
                np.hstack((X_ps, X_ngram)),
                df_label["y_ETD"].values,
                df_label["y_PS"].values,
            )

        for split, arrays in loaded.items():
            for sink, array in zip(sinks[split], arrays):
                sink.append(array)


In [None]:
# Fill the module-level *_list accumulators with one entry per fold.
prepare_10fold_data(path="../data/cross_validate_10/")

## Model Training

In [None]:
def cross_validate(X_train_list, y_train_list, X_test_list, y_test_list, X_val_list, y_val_list):
    """Tune, train and evaluate an SVC on every fold and report pooled metrics.

    For each fold, ``C`` is selected by a 5-fold grid search fitted on the
    validation split; a fresh SVC with the selected parameters is then fitted
    on the training split and used to predict the test split. Precision, recall
    and F1 are computed over the test predictions of all folds together and
    printed.

    Returns
    -------
    tuple
        ``(clf, ytest, yhat)``: the classifier of the last fold (``None`` when
        no fold was given), the pooled true test labels, and the pooled
        predictions, both as lists in fold order.
    """
    param_grid = {'C': np.linspace(0.001, 100, 20)}
    yhat, ytest = [], []
    clf = None

    folds = zip(X_train_list, y_train_list, X_test_list, y_test_list, X_val_list, y_val_list)
    for X_train, y_train, X_test, y_test, X_val, y_val in folds:
        # Hyper-parameter selection uses the validation split only.
        grid_search = GridSearchCV(SVC(), param_grid, cv=5)
        grid_search.fit(X_val, y_val)
        print('Best parameters: {}'.format(grid_search.best_params_))

        # Final model of this fold: trained on train, evaluated on test.
        clf = SVC(**grid_search.best_params_)
        clf.fit(X_train, y_train)

        for truth, guess in zip(y_test, clf.predict(X_test)):
            yhat.append(guess)
            ytest.append(truth)

    precision_value = mt.precision_score(ytest, yhat)
    recall_value = mt.recall_score(ytest, yhat)
    f1_value = mt.f1_score(ytest, yhat)

    print("precision:",round(precision_value,3),"recall:",round(recall_value,3),"F1:",round(f1_value,3))

    return clf, ytest, yhat

## Evaluation and Inference

In [None]:
def result_save(input_path, yhat_ETD = None, yhat_PS = None):
    """Join the test rows of all folds with the model predictions.

    Parameters
    ----------
    input_path : str
        Directory containing one sub-directory per fold, each with a
        ``test.csv``. Folds are concatenated in sorted directory-name order,
        which must match the order in which the predictions were produced.
    yhat_ETD, yhat_PS : sequence, optional
        Predictions (list or array) for the ETD / PS task, one per test row.
        For each one given, the matching ``predicted_*``, ``y_*`` and
        ``yhat_*`` columns are added to the result.

    Returns
    -------
    pandas.DataFrame
        The ``issue`` column of all test rows plus the columns described
        above, indexed 0..n-1.

    Raises
    ------
    ValueError
        If ``input_path`` contains no fold directory.
    """
    #merge all test folds
    test_list = []
    for folder in sorted(os.listdir(input_path)):
        fold_dir = os.path.join(input_path, folder)
        if os.path.isdir(fold_dir):
            test_list.append(pd.read_csv(os.path.join(fold_dir, "test.csv")))
    if not test_list:
        raise ValueError("no fold directories found under " + repr(input_path))

    # Every test.csv is indexed from 0; renumber so the index is unique.
    df_test_all = pd.concat(test_list, ignore_index=True)

    df_final = pd.DataFrame({"issue": df_test_all["issue"]})

    # `is not None` (instead of `!= None`) also works for NumPy arrays, whose
    # elementwise comparison cannot be used as a condition.
    if yhat_ETD is not None:
        df_final["predicted_ETD"] = df_test_all["predicted_ETD"]
        df_final["y_ETD"] = df_test_all["y_ETD"]
        df_final["yhat_ETD"] = yhat_ETD
    if yhat_PS is not None:
        df_final["predicted_PS"] = df_test_all["predicted_PS"]
        df_final["y_PS"] = df_test_all["y_PS"]
        df_final["yhat_PS"] = yhat_PS

    # The precision/recall/F1 values formerly computed here were discarded
    # (cross_validate already prints them), so that computation is gone.
    return df_final

In [None]:
# 1. ETD: pattern
# SVM on the ETD pattern-count features only.
clf1, etd_test, etd_hat = cross_validate(
    X_train_list=X_etd_train_list,
    y_train_list=y_etd_train_list,
    X_test_list=X_etd_test_list,
    y_test_list=y_etd_test_list,
    X_val_list=X_etd_val_list,
    y_val_list=y_etd_val_list,
)

In [None]:
# Persist the ETD pattern-only predictions next to the test rows.
train_path = "../data/cross_validate_10/"
df_final = result_save(train_path, yhat_ETD = etd_hat)
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("../experiment/svm/", exist_ok=True)
df_final.to_csv("../experiment/svm/ETD_pattern_result.csv",index =None)

In [None]:
# 2. ETD: n-gram
# SVM on the n-gram indicator features only, ETD labels.
clf3,etd_test3, etd_hat3 = cross_validate(X_ngram_train_list, y_etd_train_list, X_ngram_test_list, y_etd_test_list,X_ngram_val_list, y_etd_val_list)
train_path = "../data/cross_validate_10/"
df_final = result_save(train_path, yhat_ETD = etd_hat3)
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("../experiment/svm/", exist_ok=True)
df_final.to_csv("../experiment/svm/ETD_ngram_result.csv",index =None)

In [None]:
# 3. ETD: pattern + n-gram
# SVM on the horizontally stacked ETD pattern and n-gram features.
clf5,etd_test5, etd_hat5 = cross_validate(X_etd_ngram_train_list, y_etd_train_list, X_etd_ngram_test_list, y_etd_test_list, X_etd_ngram_val_list, y_etd_val_list)
train_path = "../data/cross_validate_10/"
df_final = result_save(train_path, yhat_ETD = etd_hat5)
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("../experiment/svm/", exist_ok=True)
df_final.to_csv("../experiment/svm/ETD_pattern_ngram_result.csv",index =None)

In [None]:
# 4. PS: pattern
# SVM on the PS pattern-count features only.
clf8,ps_test, ps_hat =cross_validate(X_ps_train_list, y_ps_train_list, X_ps_test_list, y_ps_test_list, X_ps_val_list, y_ps_val_list)
train_path = "../data/cross_validate_10/"
df_final = result_save(train_path, yhat_PS = ps_hat)
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("../experiment/svm/", exist_ok=True)
df_final.to_csv("../experiment/svm/ps_pattern_result.csv",index =None)

In [None]:
# 5. PS: n-gram
# SVM on the n-gram indicator features only, PS labels.
clf10,ps_test3, ps_hat3 = cross_validate(X_ngram_train_list, y_ps_train_list, X_ngram_test_list, y_ps_test_list,X_ngram_val_list, y_ps_val_list)
train_path = "../data/cross_validate_10/"
df_final = result_save(train_path, yhat_PS = ps_hat3)
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("../experiment/svm/", exist_ok=True)
df_final.to_csv("../experiment/svm/ps_ngram_result.csv",index =None)

In [None]:
# 6. PS: pattern + n-gram
# SVM on the horizontally stacked PS pattern and n-gram features.
clf12,ps_test5, ps_hat5 = cross_validate(X_ps_ngram_train_list, y_ps_train_list, X_ps_ngram_test_list, y_ps_test_list,X_ps_ngram_val_list, y_ps_val_list)
train_path = "../data/cross_validate_10/"
df_final = result_save(train_path, yhat_PS = ps_hat5)
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("../experiment/svm/", exist_ok=True)
df_final.to_csv("../experiment/svm/ps_pattern_ngram_result.csv",index =None)