In [5]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout
import re
import nltk
import chart_studio
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))
from bs4 import BeautifulSoup
import plotly.graph_objects as go
import chart_studio.plotly as py
import cufflinks
from IPython.core.interactiveshell import InteractiveShell
import plotly.figure_factory as ff
InteractiveShell.ast_node_interactivity = 'all'
from chart_studio.plotly import iplot
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

In [6]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 250

EMBEDDING_DIM = 100

In [7]:
class Lstm_issue():

    def __init__(self, data_location, data_location_ori = 'data/hadoop/HADOOP.csv', aug_mul = 2):
        self.data_ori = pd.read_csv(data_location_ori) # 원본 데이터
        self.aug_mul = aug_mul
        self.df = pd.read_csv(data_location, encoding='cp949')
        print(self.df.info())

        print(self.df.component.value_counts())

        
        self.data_ori['text'] = list(self.data_ori.title + " " + self.data_ori.description)

        for x in range(len(self.df.component)):
            self.df.component[x] = self.df.component[x].split(',')[0]

        self.Y = pd.get_dummies(self.df[set(self.df.component)])
        print(self.df.component.value_counts())

    def print_plot(self, index):
        example = self.df[self.df.index == index][['text', 'component']].values[0]
        if len(example) > 0:
            print(example[0])
            print('component:', example[1])

    def preprocessing(self):
        self.df = self.df.reset_index(drop=True)
        self.df = self.df.astype(str)
    
    def clean_text(self):
        '''self.df['text'] = self.df['text'].apply(_clean_text)
        self.df['text'] = self.df['text'].str.replace('\d+', '')'''
        refined_data = []
        for item in self.df['text']:
            #1. Remove \r 
            current_desc = item.replace('\r', ' ')    
            #2. Remove URLs
            current_desc = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', current_desc)    
            #4. Remove hex code
            current_desc = re.sub(r'(\w+)0x\w+', '', current_desc) 
            #5. Change to lower case
            current_desc = current_desc.lower()   
            #6. Tokenize
            #current_desc_tokens = tokenizer(current_desc, add_special_tokens= True)
            #7. Strip trailing punctuation marks    
            #current_desc_filter = [word.strip(string.punctuation) for word in current_desc_tokens]     
            #8. Join the lists
            #current_data = current_desc_filter
            #current_data = list(filter(None, current_data))
            refined_data.append(current_desc)
        self.df['text'] = refined_data
        self.df['text'] = self.df['text'].str.replace('\d+', '')
    
    def tokenize_df(self):
        '''# The maximum number of words to be used. (most frequent)
        MAX_NB_WORDS = 50000
        # Max number of words in each complaint.
        MAX_SEQUENCE_LENGTH = 250
        # This is fixed.
        EMBEDDING_DIM = 100'''
        self.tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
        self.tokenizer.fit_on_texts(self.df['text'].values)
        word_index = self.tokenizer.word_index
        print('Found %s unique tokens.' % len(word_index))

    # 불러온 정제된 데이터 one hot을 str에서 list로 바꾸는 작업
    def labels_to_int(self):
        x, y = train_test_split(self.data_ori, test_size = 0.2, random_state=42)
        self.X_ori = x.text
        self.Y_ori = y.component

        #self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(self.X_ori, self.Y_ori, test_size = 0.20, random_state = 21)

        # 증강데이터의 train_data에서 test index부분 제거
        y_index = list(y.index)
        #test_index_list = list(self.test_data.index)
        test_index =[]
        for aug_num in range(self.aug_mul):
            iidf2 = [i + 6152* aug_num for i in y_index]
            test_index = test_index + iidf2

        # labeling y
        #self.Y = pd.get_dummies(self.df['component'])
        df_compset = list(set(self.df.component.values))
        #self.Y = pd.get_dummies(self.df[set(self.df.component)])
        #self.Y = pd.get_dummies(self.df['fs'])
        
        self.X_train = self.df['text'].drop(test_index)
        self.X_train = self.X_train.sample(frac=1).reset_index(drop=True)
        X_train_index = list(self.X_train.index)
        self.X_test = self.df['text'].drop(X_train_index)
        self.X_test = self.X_test.sample(frac=1).reset_index(drop=True)

        self.Y_train = self.Y.drop(test_index)
        self.Y_train = self.Y_train.sample(frac=1).reset_index(drop=True)
        self.Y_test = self.Y.drop(X_train_index)
        self.Y_test = self.Y_test.sample(frac=1).reset_index(drop=True)
        # tokenize x, y
        self.X_train = self.tokenizer.texts_to_sequences(self.X_train.values)
        #self.X = self.tokenizer.texts_to_sequences(df_text_ori)
        self.X_train = pad_sequences(self.X_train, maxlen=MAX_SEQUENCE_LENGTH)

        self.X_test = self.tokenizer.texts_to_sequences(self.X_test.values)
        #self.X = self.tokenizer.texts_to_sequences(df_text_ori)
        self.X_test = pad_sequences(self.X_test, maxlen=MAX_SEQUENCE_LENGTH)
        print('X Shape of data tensor:', self.X_train.shape, self.X_test.shape)

        # tokenize x, y ori 
        self.X_ori = self.tokenizer.texts_to_sequences(self.X_ori.values)
        #self.X = self.tokenizer.texts_to_sequences(df_text_ori)
        self.X_ori = pad_sequences(self.X_ori, maxlen=MAX_SEQUENCE_LENGTH)
        print('X_ori Shape of data tensor:', self.X_train.shape, self.X_ori.shape)
        self.Y_ori = pd.get_dummies(self.Y_ori).values
        print('Shape of label tensor:', self.Y_ori.shape)

        '''print('xori, yori: ', self.X_ori.shape, self.Y_ori.shape)
        self.X_train_ori, self.X_test_ori, self.Y_train_ori, self.Y_test_ori = train_test_split(self.X_ori, self.Y_ori, test_size=0.2, random_state=42)'''
        
        '''self.Y_train = pd.get_dummies(self.Y_train).values
        self.Y_test = pd.get_dummies(self.Y_test).values'''
        print('Y Shape of label tensor:', self.Y_train.shape, self.Y_test.shape)

        print('X train Shape of data tensor:', self.X_train.shape,'X test: ', self.X_test.shape)
        print('Y Shape of label tensor:', self.Y_train.shape,'X test: ', self.Y_test.shape)
        
     
    def set_model_lstm(self, topk_num = 5):
        self.model = Sequential()
        self.model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=self.X_train.shape[1]))
        self.model.add(SpatialDropout1D(0.2))
        self.model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
        self.model.add(Dense(37, activation='sigmoid'))
        self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.Recall(top_k = topk_num), 'accuracy'])
    
    def set_model_cnn(self, topk_num = 5):
        filter_length = 300
        num_classes = 37
        self.modelCNN = tf.keras.Sequential()
        self.modelCNN.add(Embedding(MAX_NB_WORDS, 20, input_length= MAX_SEQUENCE_LENGTH))
        self.modelCNN.add(Dropout(0.1))
        self.modelCNN.add(tf.keras.layers.Conv1D(filter_length, 3, padding = 'valid', activation = 'relu', strides = 1))
        self.modelCNN.add(tf.keras.layers.GlobalMaxPool1D())
        self.modelCNN.add(Dense(num_classes))
        self.modelCNN.add(tf.keras.layers.Activation('sigmoid'))
        self.modelCNN.compile(optimizer = 'adam', loss  = 'binary_crossentropy', metrics = [tf.keras.metrics.Recall(top_k = topk_num)])

    def run_model(self):
        epochs = 20
        batch_size = 64

        self.history = self.model.fit(self.X_train, self.Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
    
    def run_model_cnn(self):
        self.modelCNN.summary()
        self.modelCNN.fit(self.X_train, self.Y_train, epochs=15, batch_size = 64, validation_split=0.1)

    def run_model_ori(self):
        epochs = 5
        batch_size = 64

        self.history = self.model.fit(self.X_train_ori, self.Y_train_ori, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
    
    def test_model(self):
        self.accr = self.model.evaluate(self.X_test, self.Y_test)
        print('Test set\n Loss: {:0.3f}\n Accuracy: {0.3f}'.format(self.accr[0], self.accr[1]))

    def test_model_ori(self):
        self.accr = self.model.evaluate(self.X_test_ori, self.Y_test_ori)
        print('Test set\n Loss: {:0.3f}\n Accuracy: {0.3f}'.format(self.accr[0], self.accr[1]))


In [8]:
dataloc1 = "data/hadoop/HADOOP_char_Keyboard_ori.csv"
pius_word1 = Lstm_issue(dataloc1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43064 entries, 0 to 43063
Data columns (total 40 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   text                    43064 non-null  object
 1   labels                  43064 non-null  object
 2   component               43064 non-null  object
 3   auto-failover           43064 non-null  int64 
 4   azure                   43064 non-null  int64 
 5   benchmarks              43064 non-null  int64 
 6   bin                     43064 non-null  int64 
 7   build                   43064 non-null  int64 
 8   conf                    43064 non-null  int64 
 9   contrib/cloud           43064 non-null  int64 
 10  contrib/eclipse-plugin  43064 non-null  int64 
 11  contrib/hod             43064 non-null  int64 
 12  contrib/serialization   43064 non-null  int64 
 13  documentation           43064 non-null  int64 
 14  filecache               43064 non-null  int64 
 15  fs

dataloc1 = "data/hadoop/HADOOP_char_Keyboard_ori.csv"
dataloc2 = "data/hadoop/HADOOP_char_OCR_ori.csv"
dataloc3 = "data/hadoop/HADOOP_word_Antonym_ori.csv"
dataloc4 = "data/hadoop/HADOOP_word_Spelling_ori.csv"
dataloc5 = "data/hadoop/HADOOP_word_Split_ori.csv"
dataloc6 = "data/hadoop/HADOOP_word_Synonym_ori.csv"
dataloc7 = "data/hadoop/HADOOP_word_TfidfAug_ori.csv"
dataloc8 = "data/hadoop/HADOOP_word_ContextualWordEmbs_ori.csv"

pius_word1 = Lstm_issue(dataloc1)
pius_word2 = Lstm_issue(dataloc2)
pius_word3 = Lstm_issue(dataloc3)
pius_word4 = Lstm_issue(dataloc4)
pius_word5 = Lstm_issue(dataloc5)
pius_word6 = Lstm_issue(dataloc6)
pius_word7 = Lstm_issue(dataloc7)
pius_word8 = Lstm_issue(dataloc8)

In [10]:
#pius_word1.print_plot(2)
pius_word1.preprocessing()
pius_word1.clean_text()
pius_word1.tokenize_df()
pius_word1.labels_to_int()

word_hist1 = []
for topk in list(range(5, 16, 5)):
    pius_word1.set_model_cnn(topk_num=topk)
    pius_word1.run_model_cnn()

    accr = pius_word1.modelCNN.evaluate(pius_word1.X_test, pius_word1.Y_test)
    print('Test set\n Loss: {:0.3f}\n Accuracy: {:0.3f}'.format(accr[0], accr[1]))

    word_hist1.append(pius_word1.history)


The default value of regex will change from True to False in a future version.



Found 175288 unique tokens.
X Shape of data tensor: (40602, 250) (2462, 250)
X_ori Shape of data tensor: (40602, 250) (4921, 250)
Shape of label tensor: (1231, 92)
Y Shape of label tensor: (40602, 37) (2462, 37)
X train Shape of data tensor: (40602, 250) X test:  (2462, 250)
Y Shape of label tensor: (40602, 37) X test:  (2462, 37)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 250, 20)           1000000   
                                                                 
 dropout (Dropout)           (None, 250, 20)           0         
                                                                 
 conv1d (Conv1D)             (None, 248, 300)          18300     
                                                                 
 global_max_pooling1d (Globa  (None, 300)              0         
 lMaxPooling1D)                                      

In [None]:
word_hist1

Error: Session cannot generate requests

TypeError: unsupported operand type(s) for -: 'function' and 'float'