In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from tqdm import tqdm

In [3]:
def ppr_data(df, df_name):
    df = df.groupby('ticketno').apply(custom_info).reset_index()  # ticketno 기준 병합 및 전처리 수행

    if df_name == 'train' and 'root_cause_type' in df.columns:
        print(f"Null 확인 : {df.isnull().values.any()}")
        df['root_cause_type'] = df['root_cause_type'].apply(lambda x: 0 if x == 'PowerFail' else (1 if x == 'LinkCut' else 2)) # 레이블 인코딩
        #df['root_cause_type'].value_counts().plot(kind = 'bar')

    x_df = df.iloc[:, :-1]
    y_df = pd.DataFrame(df['root_cause_type'], columns=['root_cause_type'])
    df.to_csv(f"Q2_{df_name}_ppr.csv", index=False)
    x_df.to_csv(f"x_{df_name}_df.csv", index=False)
    pd.concat([x_df['alarmmsg_original'],y_df['root_cause_type']],axis=1).to_csv(f"{df_name}.csv", index=False)
    y_df.to_csv(f"y_{df_name}_df.csv", index=False)

    return x_df, y_df

def custom_info(group):  # 임의로 작성된 전처리 코드입니다.
    d = {}
    group.sort_values(by='alarmtime', ascending=True, inplace=True)  # 경보 순서 정렬
    d['alarmmsg_original'] = ' '.join(group['alarmmsg_original'])  # 메시지 단순 병합
    if 'root_cause_type' in group.columns:  # 레이블 추출
        d['root_cause_type'] = group['root_cause_type'].iloc[0]  # 동일한 ticketno는 동일한 root_cause_type을 가짐
    else:
        d['root_cause_type'] = None  # 테스트 세트의 경우 정답 컬럼 없음
    return pd.Series(d, index=['alarmmsg_original', 'root_cause_type'])

train_df = pd.read_csv("Q2_train.csv")
test_df = pd.read_csv("Q2_test.csv")

x_train_df, y_train_df = ppr_data(train_df, "train")
x_test_df, _ = ppr_data(test_df, "test")  # 테스트 세트의 경우 정답 컬럼 없음

Null 확인 : False


### 토큰화

Bert 토큰화

Huggingface 토큰화

In [18]:
from tokenizers import BertWordPieceTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = BertWordPieceTokenizer(lowercase=False)
data_file = 'x_train_df.csv'
vocab_size = 30000
limit_alphabet = 6000
min_frequency = 5

tokenizer.train(files=data_file,
                vocab_size=vocab_size,
                limit_alphabet=limit_alphabet,
                min_frequency=min_frequency)

tokenizer.save_model('./')

# encoded = tokenizer.encode('ETH-NO-RX-TRAFFIC')
# print('토큰화 결과 :',encoded.tokens)
# print('정수 인코딩 :',encoded.ids)
# print('디코딩 :',tokenizer.decode(encoded.ids))

X_train = []
X_test = []
for i, sentence in enumerate(x_train_df['alarmmsg_original']):
    encoded = tokenizer.encode(sentence)
    temp_X = []
    temp_X = encoded.ids
    X_train.append(temp_X)
    if i < 1:
        print("sentence:", sentence)
        print('토큰화 결과 :',encoded.tokens)
        print('정수 인코딩 :',encoded.ids)
        print('디코딩 :',tokenizer.decode(encoded.ids))
    
for sentence in x_test_df['alarmmsg_original']:
    temp_X = []
    temp_X = tokenizer.encode(sentence).ids
    X_test.append(temp_X)

Y_train = [[y_train_df['root_cause_type'][i]] for i in range(len(y_train_df))]

# ----------------- Padding ------------------ #
max_len = max(len(l) for l in X_train)
# #print('문장의 최대 길이 :', max_len)
# for i, l in enumerate(X_train):
#     if len(l) < max_len:
#         l += [0] * (max_len - len(l))
#     else:
#         X_train[i] = l[:max_len]

# for i in range(len(Y_train)):
#     for j in range(max_len):
#         Y_train[i].append(0)
# Y_train = np.array(Y_train)

X_train = pad_sequences(X_train, maxlen=max_len, padding='post', value=0.0)
Y_train = pad_sequences(Y_train, maxlen=max_len, padding='post', value=0.0)

print("-----------------")

# print('훈련용 메시지 수 :', len(X_train))
# print('훈련용 레이블 수 :', len(Y_train))
# print('테스트용 메시지 수 :', len(X_test))
# print('테스트용 레이블 수 :', len(y_test))

print('X_train(Decoded) : ', tokenizer.decode(X_train[5]))
print('X_train : ', X_train[:3])
print('Y_train : ', Y_train[:3])
print('X_train.shape : ', X_train.shape)
print('Y_train.shape : ', Y_train.shape)

sentence: PSU-FAIL ETH-LINK-FAIL PSU-FAIL ETH-LINK-FAIL ETH-LINK-FAIL ETH-NO-RX-TRAFFIC
토큰화 결과 : ['PSU', '-', 'FAIL', 'ETH', '-', 'LINK', '-', 'FAIL', 'PSU', '-', 'FAIL', 'ETH', '-', 'LINK', '-', 'FAIL', 'ETH', '-', 'LINK', '-', 'FAIL', 'ETH', '-', 'NO', '-', 'RX', '-', 'TRAFFIC']
정수 인코딩 : [131, 6, 123, 117, 6, 134, 6, 123, 131, 6, 123, 117, 6, 134, 6, 123, 117, 6, 134, 6, 123, 117, 6, 141, 6, 238, 6, 146]
디코딩 : PSU - FAIL ETH - LINK - FAIL PSU - FAIL ETH - LINK - FAIL ETH - LINK - FAIL ETH - NO - RX - TRAFFIC
-----------------
X_train(Decoded) :  ETH - LINK - FAIL ETH - LINK - FAIL PSU - FAIL PSU - FAIL ETH - LINK - FAIL ETH - NO - RX - TRAFFIC
X_train :  [[131   6 123 ...   0   0   0]
 [117   6 134 ...   0   0   0]
 [131   6 123 ...   0   0   0]]
Y_train :  [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
X_train.shape :  (1114, 2345)
Y_train.shape :  (1114, 2345)


Pytorch 토큰화 (torchtext)

In [7]:
sentences = []
for i, sentence in enumerate(x_train_df['alarmmsg_original']):
    sentences.append(sentence.lower().replace("[^a-zA-Z]","").split(' '))
    print(i, end='\r')
print(sentences)

[['psu-fail', 'eth-link-fail', 'psu-fail', 'eth-link-fail', 'eth-link-fail', 'eth-no-rx-traffic'], ['eth-link-fail', 'psu-fail', 'psu-fail', 'eth-link-fail'], ['psu-fail', 'psu-fail', 'eth-link-fail', 'eth-link-fail', 'eth-link-fail'], ['psu-fail', 'psu-fail'], ['eth-link-fail', 'eth-link-fail', 'psu-fail', 'psu-fail', 'eth-link-fail', 'eth-no-rx-traffic'], ['eth-link-fail', 'eth-link-fail', 'psu-fail', 'psu-fail', 'eth-link-fail', 'eth-no-rx-traffic'], ['psu-fail', 'psu-fail', 'eth-link-fail', 'eth-link-fail'], ['fan-48v-fail', 'fan-48v-fail', 'fan-fail', 'fan-fail', 'fan-fail', '48v-fail'], ['opt-remove', 'opt-los'], ['nvram-fail'], ['psu-fail', 'psu-fail'], ['psu-fail'], ['psu-fail', 'psu-fail'], ['eth-link-fail', 'psu-fail', 'eth-link-fail', 'psu-fail', 'eth-link-fail'], ['psu-fail', 'eth-link-fail', 'psu-fail', 'eth-link-fail'], ['batt-low'], ['psu-fail'], ['eth-link-fail', 'psu-fail', 'eth-link-fail', 'psu-fail'], ['psu-fail', 'psu-fail'], ['psu-fail', 'psu-fail', 'eth-link-fail'

Word2Vec

In [9]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

model = Word2Vec(sentences=x_train_df['alarmmsg_original'], vector_size=100, window=5, workers=4, sg=0)

model_result = model.wv.most_similar("O")
print(model_result)

# model.wv.save_word2vec_format('Q2_w2v') # 모델 저장
# loaded_model = KeyedVectors.load_word2vec_format("Q2_w2v") # 모델 로드

[('P', 0.968186616897583), ('W', 0.8076967000961304), ('L', 0.7916989326477051), ('-', 0.6977741122245789), (' ', 0.6806632876396179), ('C', 0.40683889389038086), ('R', 0.3078726530075073), ('T', 0.29804983735084534), ('S', 0.2779005765914917), ('d', 0.04886201024055481)]


GloVe

In [10]:
from gensim.models import FastText

model = FastText(x_train_df['alarmmsg_original'], vector_size=100, window=5, min_count=5, workers=4, sg=1)

model_result = model.wv.most_similar("O")
print(model_result)

# model.wv.save_word2vec_format('Q2_ft') # 모델 저장
# loaded_model = KeyedVectors.load_word2vec_format("Q2_ft") # 모델 로드

[('P', 0.8458264470100403), ('L', 0.6633667349815369), ('W', 0.6344912052154541), (' ', 0.5976385474205017), ('-', 0.5788556933403015), ('C', 0.48218047618865967), ('T', 0.3953852653503418), ('S', 0.39458295702934265), ('R', 0.3309755027294159), ('D', 0.2986486554145813)]
