In [1]:
import pandas as pd
import random 
import transformers
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.classification import MultiLabelClassificationModel
import os

import inspect
import re
import tensorflow as tf

import json, re, nltk, string

from transformers import TFBertModel, BertTokenizer

In [2]:
MDL_LABEL_NUM = 89
JRA_LABEL_NUM = 142
ISLANDORA_LABEL_NUM = 67
INFRA_LABEL_NUM = 51
HIVE_LABEL_NUM = 65
HBASE_LABEL_NUM = 68
HADOOP_LABEL_NUM = 37
FCREPO_LABEL_NUM = 22
CONF_LABEL_NUM = 128
CB_LABEL_NUM = 64
CASSANDRA_LABEL_NUM = 15
BAM_LABEL_NUM = 96

labels_Num = {'MDL': MDL_LABEL_NUM, 
    'JRA': JRA_LABEL_NUM, 'ISLANDORA': ISLANDORA_LABEL_NUM, 
    'INFRA': INFRA_LABEL_NUM, 'HIVE': HIVE_LABEL_NUM, 'HBASE': HBASE_LABEL_NUM, 'HADOOP': HADOOP_LABEL_NUM, 'FCREPO': FCREPO_LABEL_NUM, 'CONF': CONF_LABEL_NUM,
    'CB': CB_LABEL_NUM, 'CASSANDRA': CASSANDRA_LABEL_NUM, 'BAM': BAM_LABEL_NUM
    }

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
class ML_Classification:
    def __init__(self, dataset_name, augmenter_name, augment_size, nlp_model_name):
        self.dataset_name = dataset_name
        self.labels_num = labels_Num[dataset_name]

        if augmenter_name == 'OCR' or augmenter_name == 'Keyboard':
            self.augmentation_type = 'char'
        else:
            self.augmentation_type = 'word'

        self.augmenter_name = augmenter_name
        self.aug_mul = augment_size
        self.nlp_model_name = nlp_model_name
        self.nlp_model = {'bert': 'bert-base-uncased', 'roberta': 'roberta-base', 'xlnet': 'xlnet-base-uncased', 'distilbert': 'distilbert-base-uncased', 'xlm': 'xlm-roberta-base', 'electra': 'google/electra-base-discriminator'}


        # 데이터 위치 data location
        self.data_location_ori = '../Dataset/Deepsoft_IssueData/{}.csv'.format(self.dataset_name)

        self.data_location_aug = '../Dataset/Deepsoft_IssueData_Aug/{}_{}_{}.csv'.format(self.dataset_name, self.augmentation_type, self.augmenter_name)     
        

        # 데이터 변수 입력
        self.data = pd.read_csv(self.data_location_aug) # 증강 데이터
        self.data_ori = pd.read_csv(self.data_location_ori) # 원본 데이터
        self.len_data = len(self.data_ori)
        self.eval_index = []
        self.test_index = []



    def refine_origin_data(self):
        data_ori = self.data_ori
        
        data_onehot = data_ori.drop(columns = ['issuekey', 'title', 'description', 'component'])
        data_label = []
        for i in range(len(data_onehot)):
            data_label.append(list(data_onehot.iloc[i]))

        # make 'data' value
        data_text = pd.Series(list(data_ori["title"] + ' ' + data_ori['description']), index = data_ori.index)
        #data = data.drop(columns = ['issuekey', 'title', 'description', 'component'])
        

        refined_data = []
        for item in data_text:
            #1. Remove \r 
            current_desc = item.replace('\r', ' ')    
            #2. Remove URLs
            current_desc = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', current_desc)    
            #4. Remove hex code
            current_desc = re.sub(r'(\w+)0x\w+', '', current_desc) 
            #5. Change to lower case
            current_desc = current_desc.lower()   
            #6. Tokenize
            #current_desc_tokens = tokenizer(current_desc, add_special_tokens= True)
            #7. Strip trailing punctuation marks    
            #current_desc_filter = [word.strip(string.punctuation) for word in current_desc_tokens]     
            #8. Join the lists
            #current_data = current_desc_filter
            #current_data = list(filter(None, current_data))
            refined_data.append(current_desc)

        #data_ori = pd.DataFrame(data = {'text': refined_data, 'labels': data_label})
        data_ori['text'] = refined_data
        data_ori['labels'] = data_label
        self.data_ori = data_ori
        # 오리지날 데이터를 train, eval데이터로 분리

        train_size = 0.6
        test_size = 0.2
        eval_size = 0.2
        self.train_data_ori, self.eval_data_ori = train_test_split(data_ori, train_size = train_size)
        self.eval_data_ori, self.test_data_ori = train_test_split(self.eval_data_ori, test_size = 0.5)
        self.eval_data = self.eval_data_ori
        self.train_data = self.train_data_ori
        self.test_data = self.test_data_ori


    
    # 불러온 정제된 데이터 one hot을 str에서 list로 바꾸는 작업
    def labels_to_int(self):
        if self.aug_mul <= 1:
            return        

        data = self.data[: self.len_data * self.aug_mul] 

        refined_data = []
        for item in data['text']:
            #1. Remove \r 
            current_desc = item.replace('\r', ' ')    
            #2. Remove URLs
            current_desc = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', current_desc)    
            #4. Remove hex code
            current_desc = re.sub(r'(\w+)0x\w+', '', current_desc) 
            #5. Change to lower case
            current_desc = current_desc.lower()   
            #6. Tokenize
            #current_desc_tokens = tokenizer(current_desc, add_special_tokens= True)
            #7. Strip trailing punctuation marks    
            #current_desc_filter = [word.strip(string.punctuation) for word in current_desc_tokens]     
            #8. Join the lists
            #current_data = current_desc_filter
            #current_data = list(filter(None, current_data))
            refined_data.append(current_desc)

        data['text'] = refined_data

        changeChar = ' [],'
        for i in range(len(data)):
            for chanChar in changeChar:
                data['labels'][i] = data['labels'][i].replace(chanChar, '')
            data['labels'][i] = list(data['labels'][i])
            data['labels'][i] = list(map(int, data['labels'][i]))
        
        # 증강데이터의 train_data에서 evaluation부분 제거
        eval_index_list = list(self.eval_data.index)
        test_index_list = list(self.test_data.index)
        
        for aug_num in range(self.aug_mul):
            iidf2 = [i + self.len_data* aug_num for i in eval_index_list]
            self.eval_index = self.eval_index + iidf2
            iidf3 = [x + self.len_data* aug_num for x in test_index_list]
            self.test_index = self.test_index + iidf3


        self.data = data
        self.train_data = data.drop(self.eval_index)
        self.train_data = self.train_data.drop(self.test_index)
        self.train_data = self.train_data.sample(frac=1).reset_index(drop=True)

# 모델 parameter 설정
    def set_model(self): # epochs: 200, batch size: 100, learning rate 0.002
        self.model = MultiLabelClassificationModel(self.nlp_model_name, self.nlp_model[self.nlp_model_name], num_labels = self.labels_num, 
        args = {'output_dir': '/data/a22106/Deepsoft_C_Multilabel/{}_{}_{}_{}/'.format(self.dataset_name, self.nlp_model_name, self.augmenter_name, self.aug_mul), 
        'overwrite_output_dir': True, 'save_steps': -1, 'num_train_epochs': 50, 'train_batch_size': 100, 'eval_batch_size': 100, 'max_seq_length': 128, 'learning_rate': 0.002})
        

    def train_model(self):
        self.model.train_model(self.train_data)
    
    def eval_model(self):
        self.result, self.model_outputs, self.wrong_predictions = self.model.eval_model(self.eval_data)

    def test_model(self):
        self.to_predict = list(self.test_data['text'].apply(lambda x: x.replace('\n', ' ')).tolist())
        preds, outputs = self.model.predict(self.to_predict)

        sub_df = pd.DataFrame(outputs, columns = list(ml.data_ori.columns[4:-2]))

        sub_df.to_csv('outputs/submission.csv', index = False)




In [4]:
ml = ML_Classification("HADOOP", "Split", 7, "distilbert")
ml.refine_origin_data()
ml.labels_to_int()
print(ml.data)

                                                    text  \
0      tool to mount ndfs on linux tool to mount ndfs...   
1      make configuration an interface the configurat...   
2      df enhancement: performance and win xp support...   
3      adding some uniformity/convenience to environm...   
4      buffersize argument is ignored in filesystem.c...   
...                                                  ...   
43059  switch to v2 of the s3 li st objects api in s3...   
43060  namenode connect t ime out in cluster with 65 ...   
43061  eliminate needless uses of filesystem. exists,...   
43062  dis pose of unnecess ary sasl s ervers the ipc...   
43063  optimize and fix getfilestatus in s3a currentl...   

                                                  labels  
0      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...  
1      [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
2      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...  
3      [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,

In [5]:
ml.train_data

Unnamed: 0,text,labels
0,te stfileappend2. testcomplexappe nd somet ime...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,hadoop distribution tarball bundle some librar...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,libhadoop. so: dlopen sho uld be better at loc...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,namenode schema for httpfilesystem this issue ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
4,should run old version of unit tes ts to che c...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
25832,inconsistent configuration values and incorrec...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, ..."
25833,s3n files are not getting split by default ru...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
25834,p ort co nf servlet to dump running configurat...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
25835,problem staging 0. 21. 0 artifacts to apache n...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [6]:
ml.eval_data

Unnamed: 0,issuekey,title,description,component,auto-failover,azure,benchmarks,bin,build,conf,...,test,tools,tools/distcp,tracing,trash,util,viewfs,yetus,text,labels
4752,HADOOP-11204,Fix incorrect property in hadoop-kms/src/main/...,{{hadoop.security.keystore.JavaKeyStoreProvide...,kms,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,fix incorrect property in hadoop-kms/src/main/...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4992,HADOOP-11609,Correct credential commands info in CommandsMa...,"""-i"" is not supported, so would you remove ...","documentation,security",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,correct credential commands info in commandsma...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
1770,HADOOP-6626,NPE in TestIPC with kerberos,Running TestIPC with {{hadoop.security.authent...,ipc,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,npe in testipc with kerberos running testipc w...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
775,HADOOP-3406,Document controls for profiling maps & reduces,HADOOP-2367 and further improvements added the...,documentation,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,document controls for profiling maps & reduces...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
28,HADOOP-167,reducing the number of Configuration & JobConf...,"Currently, Configuration and JobConf objects a...",conf,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,reducing the number of configuration & jobconf...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2917,HADOOP-8308,Support cross-project Jenkins builds,This issue is to change test-patch to run only...,build,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,support cross-project jenkins builds this issu...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1007,HADOOP-4217,TestLimitTasksPerJobTaskSchedule test is faili...,TestLimitTasksPerJobTaskSchedule test is faili...,test,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,testlimittasksperjobtaskschedule test is faili...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3594,HADOOP-9380,Add totalLength to rpc response,,ipc,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,add totallength to rpc response none,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3202,HADOOP-8756,Fix SEGV when libsnappy is in java.library.pat...,"We use {{System.loadLibrary(""snappy"")}} from t...",native,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,fix segv when libsnappy is in java.library.pat...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
ml.test_data

Unnamed: 0,issuekey,title,description,component,auto-failover,azure,benchmarks,bin,build,conf,...,test,tools,tools/distcp,tracing,trash,util,viewfs,yetus,text,labels
3503,HADOOP-9242,Duplicate surefire plugin config in hadoop-common,Unfortunately in HADOOP-9217 a duplicated conf...,test,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,duplicate surefire plugin config in hadoop-com...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5665,HADOOP-12697,IPC retry policies should recognise that SASL ...,SLIDER-1050 shows that if you don't have the r...,ipc,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,ipc retry policies should recognise that sasl ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4512,HADOOP-10825,Refactor class creation logic in Configuration...,This first patch refactors class creation insi...,conf,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,refactor class creation logic in configuration...,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1479,HADOOP-6140,DistributedCache.addArchiveToClassPath doesn't...,addArchiveToClassPath is a method of Distribut...,fs,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,distributedcache.addarchivetoclasspath doesn't...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
5684,HADOOP-12745,stop shelltest profile active by default,the shelltest profile is enabled by default. ...,build,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,stop shelltest profile active by default the s...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3821,HADOOP-9744,TestNetUtils test fails,- testNormalizeHostName(org.apache.hadoop.net....,test,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,testnetutils test fails - testnormalizehostnam...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1527,HADOOP-6217,Hadoop Doc Split: Common Docs,Hadoop Doc Split: Common Docs Please note t...,documentation,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,hadoop doc split: common docs hadoop doc split...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
4520,HADOOP-10834,Typo in CredentialShell usage,There is a typo in one of the informational me...,security,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,typo in credentialshell usage there is a typo ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3239,HADOOP-8809,RPMs should skip useradds if the users already...,The hadoop.spec preinstall script creates user...,scripts,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,rpms should skip useradds if the users already...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
