In [None]:
import pandas as pd
import random 
import transformers
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.classification import MultiLabelClassificationModel
import os

import inspect
import re
import tensorflow as tf

import json, re, nltk, string

from transformers import TFBertModel, BertTokenizer

In [None]:
# Label counts per dataset, keyed by the dataset name handed to
# ML_Classification; the selected value is forwarded as `num_labels`
# when the classifier is built.
labels_Num = {
    'MDL': 89,
    'JRA': 142,
    'ISLANDORA': 67,
    'INFRA': 51,
    'HIVE': 65,
    'HBASE': 68,
    'HADOOP': 37,
    'FCREPO': 22,
    'CONF': 128,
    'CB': 64,
    'CASSANDRA': 15,
    'BAM': 96,
}

# Individual constants are kept as module-level names, derived from the table
# above so each count is written down exactly once.
MDL_LABEL_NUM = labels_Num['MDL']
JRA_LABEL_NUM = labels_Num['JRA']
ISLANDORA_LABEL_NUM = labels_Num['ISLANDORA']
INFRA_LABEL_NUM = labels_Num['INFRA']
HIVE_LABEL_NUM = labels_Num['HIVE']
HBASE_LABEL_NUM = labels_Num['HBASE']
HADOOP_LABEL_NUM = labels_Num['HADOOP']
FCREPO_LABEL_NUM = labels_Num['FCREPO']
CONF_LABEL_NUM = labels_Num['CONF']
CB_LABEL_NUM = labels_Num['CB']
CASSANDRA_LABEL_NUM = labels_Num['CASSANDRA']
BAM_LABEL_NUM = labels_Num['BAM']

# Load the pretrained 'bert-base-uncased' tokenizer once at module level.
# NOTE(review): in the visible cells it is referenced only by commented-out
# tokenization steps inside ML_Classification — confirm whether it is still needed.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [75]:
class ML_Classification:
    """Multi-label classification experiment for one issue dataset.

    The constructor loads the original issue CSV and its augmented counterpart.
    ``refine_origin_data`` cleans the original text and splits the rows into
    train / eval / test; ``labels_to_int`` prepares the augmented rows and
    removes every augmented copy of an eval/test row from the training pool;
    ``set_model`` / ``train_model`` / ``eval_model`` / ``test_model`` wrap a
    simpletransformers ``MultiLabelClassificationModel``.

    Expected call order: ``refine_origin_data()`` -> ``labels_to_int()`` ->
    ``set_model()`` -> ``train_model()`` -> ``eval_model()`` / ``test_model()``.
    """

    # Columns of the original CSV that are not one-hot label columns.
    _META_COLUMNS = ['issuekey', 'title', 'description', 'component']

    # Patterns shared by both cleaning passes (original and augmented text).
    _URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    # NOTE(review): this only matches "0x..." when at least one word character
    # precedes it, so a standalone "0x1f" is left in place. Kept unchanged so
    # the preprocessing stays comparable with earlier runs — confirm intent.
    _HEX_PATTERN = re.compile(r'(\w+)0x\w+')

    # Fraction of the original rows used for training; the remainder is split
    # evenly into eval and test.
    _TRAIN_FRACTION = 0.6

    def __init__(self, dataset_name, augmenter_name, augment_size, nlp_model_name):
        """Load the original and augmented CSV files for one dataset.

        Args:
            dataset_name: Key into the module-level ``labels_Num`` table; also
                used to build both CSV paths.
            augmenter_name: Name of the augmenter used in the augmented file
                name. 'OCR' and 'Keyboard' select the 'char' file, anything
                else the 'word' file.
            augment_size: How many ``len(original)``-sized blocks of the
                augmented file to use. Values <= 1 disable augmentation.
            nlp_model_name: Key into ``self.nlp_model``; passed to
                simpletransformers as the model type.

        Raises:
            KeyError: if ``dataset_name`` is not in ``labels_Num``.
        """
        self.dataset_name = dataset_name
        self.labels_num = labels_Num[dataset_name]

        self.augmentation_type = 'char' if augmenter_name in ('OCR', 'Keyboard') else 'word'

        self.augmenter_name = augmenter_name
        self.aug_mul = augment_size
        self.nlp_model_name = nlp_model_name
        # 'xlnet-base-uncased' is not a published checkpoint; XLNet base is
        # released as 'xlnet-base-cased'.
        # NOTE(review): the key 'xlm' is passed to simpletransformers as the
        # model type together with the 'xlm-roberta-base' weights; XLM-R
        # presumably needs the 'xlmroberta' model type — verify before use.
        self.nlp_model = {'bert': 'bert-base-uncased', 'roberta': 'roberta-base', 'xlnet': 'xlnet-base-cased', 'distilbert': 'distilbert-base-uncased', 'xlm': 'xlm-roberta-base', 'electra': 'google/electra-base-discriminator'}

        # Data locations
        self.data_location_ori = '../Dataset/Deepsoft_IssueData/{}.csv'.format(self.dataset_name)
        self.data_location_aug = '../Dataset/Deepsoft_IssueData_Aug/{}_{}_{}.csv'.format(self.dataset_name, self.augmentation_type, self.augmenter_name)

        # Load the data
        self.data = pd.read_csv(self.data_location_aug)      # augmented data
        self.data_ori = pd.read_csv(self.data_location_ori)  # original data
        self.len_data = len(self.data_ori)
        self.eval_index = []
        self.test_index = []

    @classmethod
    def _clean_text(cls, text):
        """Normalise one text: drop '\\r', URLs and hex tokens, then lower-case."""
        if not isinstance(text, str):
            # Missing cells arrive as NaN; treat them as empty text instead of crashing.
            text = '' if pd.isna(text) else str(text)
        text = text.replace('\r', ' ')
        text = cls._URL_PATTERN.sub('', text)
        text = cls._HEX_PATTERN.sub('', text)
        return text.lower()

    @staticmethod
    def _parse_labels(raw):
        """Turn a stringified one-hot list such as '[0, 1, 0]' into [0, 1, 0].

        Every character other than space, brackets and comma is read as one
        integer, so only single-digit entries are supported. Values that are
        already sequences are returned as a list unchanged.
        """
        if not isinstance(raw, str):
            return list(raw)
        return [int(ch) for ch in raw if ch not in ' [],']

    def refine_origin_data(self, random_state=None):
        """Clean the original data and split it into train / eval / test.

        Adds a cleaned 'text' column (title + ' ' + description) and a 'labels'
        column (the one-hot label columns of each row as a list) to
        ``self.data_ori``, then splits the rows 60 / 20 / 20.

        Args:
            random_state: Optional seed forwarded to both ``train_test_split``
                calls. ``None`` (default) keeps the unseeded behaviour.

        Raises:
            KeyError: if a metadata column is missing from the original CSV.
        """
        data_ori = self.data_ori

        # Everything that is neither metadata nor a previously added
        # 'text'/'labels' column is a one-hot label column; ignoring the
        # latter keeps this method safe to call more than once.
        label_frame = data_ori.drop(columns=self._META_COLUMNS).drop(
            columns=['text', 'labels'], errors='ignore')
        self.label_columns = list(label_frame.columns)
        data_label = label_frame.values.tolist()

        # A missing title/description would make the concatenation NaN.
        raw_text = data_ori['title'].fillna('') + ' ' + data_ori['description'].fillna('')

        data_ori['text'] = [self._clean_text(item) for item in raw_text]
        data_ori['labels'] = data_label
        self.data_ori = data_ori

        # Split the original data: 60% train, then the rest half eval / half test.
        self.train_data_ori, held_out = train_test_split(
            data_ori, train_size=self._TRAIN_FRACTION, random_state=random_state)
        self.eval_data_ori, self.test_data_ori = train_test_split(
            held_out, test_size=0.5, random_state=random_state)
        self.eval_data = self.eval_data_ori
        self.train_data = self.train_data_ori
        self.test_data = self.test_data_ori

    def labels_to_int(self, random_state=None):
        """Prepare the augmented data and rebuild ``self.train_data`` from it.

        Does nothing when ``self.aug_mul <= 1``. Otherwise keeps the first
        ``len_data * aug_mul`` augmented rows, cleans their text, converts the
        stringified labels to integer lists, and drops every row whose position
        corresponds to an eval/test row in any augmentation block (row ``i`` of
        the original maps to ``i + len_data * k`` in block ``k``). The remaining
        rows are shuffled and stored as ``self.train_data``.

        Args:
            random_state: Optional seed for the final shuffle. ``None``
                (default) keeps the unseeded behaviour.

        Raises:
            AttributeError: if ``refine_origin_data()`` has not been called.
        """
        if self.aug_mul <= 1:
            return

        if not hasattr(self, 'eval_data') or not hasattr(self, 'test_data'):
            raise AttributeError('refine_origin_data() must be called before labels_to_int()')

        # Work on an explicit copy so column assignments never write through a
        # slice of the loaded frame (no chained-assignment warnings/no-ops).
        data = self.data.iloc[: self.len_data * self.aug_mul].copy()

        data['text'] = [self._clean_text(item) for item in data['text']]
        data['labels'] = pd.Series(
            [self._parse_labels(raw) for raw in data['labels']], index=data.index)

        # Remove the eval/test rows (and all their augmented copies) from the
        # training pool. The index lists are rebuilt from scratch on each call
        # so repeated calls do not accumulate duplicates.
        eval_index_list = list(self.eval_data.index)
        test_index_list = list(self.test_data.index)
        offsets = [self.len_data * aug_num for aug_num in range(self.aug_mul)]
        self.eval_index = [i + offset for offset in offsets for i in eval_index_list]
        self.test_index = [i + offset for offset in offsets for i in test_index_list]

        self.data = data
        train_pool = data.drop(index=self.eval_index + self.test_index)
        self.train_data = train_pool.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Model parameter setup
    def set_model(self, num_train_epochs=50, train_batch_size=100, eval_batch_size=100,
                  max_seq_length=128, learning_rate=0.002,
                  output_root='/data/a22106/Deepsoft_C_Multilabel'):
        """Create the simpletransformers multi-label model as ``self.model``.

        All arguments default to the values this notebook has always used, so
        ``set_model()`` with no arguments is unchanged.

        NOTE(review): a learning rate of 0.002 is far above the 1e-5..5e-5
        range commonly used to fine-tune transformer encoders — confirm it is
        intended.

        Raises:
            KeyError: if ``self.nlp_model_name`` is not a key of ``self.nlp_model``.
        """
        output_dir = '{}/{}_{}_{}_{}/'.format(
            output_root, self.dataset_name, self.nlp_model_name, self.augmenter_name, self.aug_mul)
        self.model = MultiLabelClassificationModel(
            self.nlp_model_name,
            self.nlp_model[self.nlp_model_name],
            num_labels=self.labels_num,
            args={'output_dir': output_dir,
                  'overwrite_output_dir': True, 'save_steps': -1,
                  'num_train_epochs': num_train_epochs,
                  'train_batch_size': train_batch_size,
                  'eval_batch_size': eval_batch_size,
                  'max_seq_length': max_seq_length,
                  'learning_rate': learning_rate})

    def train_model(self):
        """Train ``self.model`` on ``self.train_data``."""
        self.model.train_model(self.train_data)

    def eval_model(self):
        """Evaluate on ``self.eval_data``; store result, outputs and wrong predictions."""
        self.result, self.model_outputs, self.wrong_predictions = self.model.eval_model(self.eval_data)

    def test_model(self):
        """Predict on the test split and write 'outputs/submission.csv'.

        The model is given the cleaned test texts as a plain list. The raw
        model outputs are written one column per label, preceded by an 'id'
        column holding the issue key of each test row. Predictions are stored
        in ``self.preds``.

        Returns:
            The submission DataFrame that was written to disk.
        """
        texts = self.test_data['text'].tolist()
        self.preds, outputs = self.model.predict(texts)

        label_columns = getattr(self, 'label_columns', None)
        if label_columns is None:
            # Fallback: assumes the four metadata columns come first and
            # 'text'/'labels' were appended last — TODO confirm.
            label_columns = list(self.data_ori.columns[4:-2])

        sub_df = pd.DataFrame(outputs, columns=label_columns)
        # Positional assignment: test_data keeps its original (shuffled) index,
        # while sub_df is indexed 0..n-1 in prediction order.
        sub_df.insert(0, 'id', self.test_data['issuekey'].to_numpy())

        os.makedirs('outputs', exist_ok=True)
        sub_df.to_csv('outputs/submission.csv', index=False)
        return sub_df




In [76]:
# Build the HADOOP experiment: "Split" augmenter, 7 augmentation blocks,
# DistilBERT as the underlying model.
ml = ML_Classification(
    dataset_name="HADOOP",
    augmenter_name="Split",
    augment_size=7,
    nlp_model_name="distilbert",
)

# Clean the original issues and split them into train / eval / test.
ml.refine_origin_data()

# Prepare the augmented rows and rebuild the training set without any
# copy of an eval/test row.
ml.labels_to_int()

print(ml.data)

                                                    text  \
0      tool to mount ndfs on linux tool to mount ndfs...   
1      make configuration an interface the configurat...   
2      df enhancement: performance and win xp support...   
3      adding some uniformity/convenience to environm...   
4      buffersize argument is ignored in filesystem.c...   
...                                                  ...   
43059  switch to v2 of the s3 li st objects api in s3...   
43060  namenode connect t ime out in cluster with 65 ...   
43061  eliminate needless uses of filesystem. exists,...   
43062  dis pose of unnecess ary sasl s ervers the ipc...   
43063  optimize and fix getfilestatus in s3a currentl...   

                                                  labels  
0      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...  
1      [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
2      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...  
3      [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,

In [77]:
# Training set: the shuffled augmented rows left after removing every copy of an eval/test row.
ml.train_data

Unnamed: 0,text,labels
0,update j unit dependency simp le upd ate of th...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,blockdecompressorstream get eof exception when...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,native client: im plement hdfsmove and hdfscop...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,w ritablecomparator ' s constructor should be ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,add build instruction for dock er toolbox inst...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
...,...,...
25832,testviewfstrash assumes the user ' s home dire...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
25833,reduce task gett ing map output ov er http sho...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
25834,include librecordio as part of the rele ase no ne,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
25835,adding service - level authorization to hadoop...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [78]:
# Evaluation split, taken from the original (non-augmented) data; keeps the original row index.
ml.eval_data

Unnamed: 0,issuekey,title,description,component,auto-failover,azure,benchmarks,bin,build,conf,...,test,tools,tools/distcp,tracing,trash,util,viewfs,yetus,text,labels
1290,HADOOP-5353,add progress callback feature to the slow File...,This is something only of relevance of people ...,fs,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,add progress callback feature to the slow file...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
464,HADOOP-2390,Document the user-controls for intermediate/ou...,We should document the user-controls for compr...,documentation,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,document the user-controls for intermediate/ou...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
4230,HADOOP-10399,FileContext API for ACLs.,Add new methods to AbstractFileSystem and File...,fs,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,filecontext api for acls. add new methods to a...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
356,HADOOP-1772,Hadoop does not run in Cygwin in Windows,the hostname commands are slightly different i...,scripts,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,hadoop does not run in cygwin in windows the ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2547,HADOOP-7818,DiskChecker#checkDir should fail if the direct...,DiskChecker#checkDir fails if a directory can'...,util,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,diskchecker#checkdir should fail if the direct...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4979,HADOOP-11592,"IPC error extraction fails ""getLength on unini...",I'm seeing {{java.lang.IllegalArgumentExcepti...,ipc,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"ipc error extraction fails ""getlength on unini...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4132,HADOOP-10226,Help! My Hadoop doesn't work!,I have installed hadoop but it it is failing ...,bin,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,help! my hadoop doesn't work! i have installed...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5386,HADOOP-12277,releasedocmaker index mode should create a rea...,"The content should be the same, however, rathe...",yetus,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,releasedocmaker index mode should create a rea...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4613,HADOOP-10979,Auto-entries in hadoop_usage,It would make adding common options to hadoop_...,scripts,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,auto-entries in hadoop_usage it would make add...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [79]:
# Test split, taken from the original (non-augmented) data; keeps the original row index.
ml.test_data

Unnamed: 0,issuekey,title,description,component,auto-failover,azure,benchmarks,bin,build,conf,...,test,tools,tools/distcp,tracing,trash,util,viewfs,yetus,text,labels
5553,HADOOP-12537,s3a: Add flag for session ID to allow Amazon S...,Amazon STS allows you to issue temporary acces...,fs/s3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,s3a: add flag for session id to allow amazon s...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
58,HADOOP-319,"FileSystem ""close"" does not remove the closed ...",The close methods of both DistributedFileSyste...,fs,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"filesystem ""close"" does not remove the closed ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
3833,HADOOP-9756,Additional cleanup RPC code,HADOOP-9754 already did good job to address mo...,ipc,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,additional cleanup rpc code hadoop-9754 alread...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4197,HADOOP-10349,TaskUmbilicalProtocol always uses TOKEN authen...,Since job tokens are always created. HADOOP-96...,security,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,taskumbilicalprotocol always uses token authen...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1213,HADOOP-5072,testSequenceFileGzipCodec won't pass without n...,"Somehow, SequenceFile requires native gzip cod...",test,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,testsequencefilegzipcodec won't pass without n...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,HADOOP-683,bin/hadoop.sh doesn't work for /bin/dash (eg u...,bin/hadoop.sh has a conditional which doesn't ...,scripts,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,bin/hadoop.sh doesn't work for /bin/dash (eg u...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3156,HADOOP-8695,TestPathData fails intermittently with JDK7,Failed tests: testWithDirStringAndConf(org.a...,test,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,testpathdata fails intermittently with jdk7 fa...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5912,HADOOP-13108,dynamic subcommands need a way to manipulate a...,It would be extremely useful to be able to man...,scripts,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,dynamic subcommands need a way to manipulate a...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1732,HADOOP-6562,FileContextSymlinkBaseTest should use FileCont...,FileContextSymlinkBaseTest should use FileCont...,test,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,filecontextsymlinkbasetest should use filecont...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
