In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import autokeras as ak
import konlpy, re, tqdm, os
from keras import losses
import keras
from konlpy.tag import Mecab
import tqdm
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from transformers import *

Using TensorFlow backend.


In [2]:
from sklearn.metrics import roc_curve
def Find_Optimal_Cutoff(target, predicted):

    fpr, tpr, threshold = roc_curve(target, predicted)
    i = np.arange(len(tpr))
    roc = pd.DataFrame({'tf' : pd.Series(tpr-(1-fpr), index = i), 'threshold' : pd.Series(threshold, index = i)})
    roc_t = roc.iloc[(roc.tf-0).abs().argsort()[:1]]
    return list(roc_t['threshold'])

In [3]:
PATH = '/Users/roopre/Desktop/STUDY/dacon/NLP Baseline code(수정)/open/'

In [4]:
train=pd.read_csv(PATH+'train.csv')
test=pd.read_csv(PATH+'test.csv')
sample_submission=pd.read_csv(PATH+'sample_submission.csv')

In [5]:
x_train = "과제명 : "+ train['과제명'] + " 요약문_키워드 : " + train['요약문_한글키워드'].fillna('.').astype('str')
y_train = train['label'].copy()
y_train_1 = train['label']
x_test  = "과제명 : "+ test['과제명'] + " 요약문_키워드 : " + test['요약문_한글키워드'].fillna('.').astype('str')

In [6]:
y_train_0_1_split = y_train.copy()
y_train_0_1_split[train['label']!=0]=1

In [7]:
x_train_1 = x_train[train['label']!=0]
y_train_1 = y_train_1[train['label']!=0]


In [8]:
def Preprocessing(text, tagger, remove_stopwords=False, stop_words=[]) :
    # 한글 정규식 정의(띄어쓰기, ㄱ ~ ㅣ, 가 ~ 힣)
    text = re.sub('[^ ㄱ-ㅣ가-힣]+','',text)
    
    # 텍스트를 형태소로 분리후 각 단어로부터 어간을 추출.(stem=True)
    word_text = tagger.morphs(text)
    
    # 불용어 처리.
    if remove_stopwords :
        word_text = [ t for t in word_text if not t in stop_words ]
        
    return word_text

# 한글 불용어 사전 파일 이용.
with open('./korean_stopwords.txt', encoding='utf-8') as fp :
    stop_words = fp.readlines()
    
# \n 제거.
stop_words = [ x.strip() for x in stop_words ]
# print(stop_words)
# print('-'*135)
# print()

# Okt 이용.
tagger = Mecab()

# train, test 데이터를 정제한 텍스트를 담을 리스트 생성.
Clean_train_01_data = []
Clean_train_1_data = []
Clean_test_data  = []

# Train dataset 정제 작업 시작.
for text in tqdm.tqdm(x_train) :
    try :
        Clean_train_01_data.append(Preprocessing(text, tagger, remove_stopwords=True, stop_words=stop_words))
    except :
        Clean_train_01_data.append([])
        
for text in tqdm.tqdm(x_train_1) :
    try :
        Clean_train_1_data.append(Preprocessing(text, tagger, remove_stopwords=True, stop_words=stop_words))
    except :
        Clean_train_1_data.append([])

# est dataset 정제 작업 시작.
for text in tqdm.tqdm(x_test) :
    if type(text) == str :
        Clean_test_data.append(Preprocessing(text, tagger, remove_stopwords=True, stop_words=stop_words))
    else :
        Clean_test_data.append([])   
        
# 정제된 데이터의 수.
print(f'Train data : {len(Clean_train_01_data)}')
print(f'Train data : {len(Clean_train_1_data)}')
print(f'Test data  : {len(Clean_test_data)}')

100%|██████████| 174304/174304 [01:21<00:00, 2138.36it/s]
100%|██████████| 31733/31733 [00:15<00:00, 2112.08it/s]
100%|██████████| 43576/43576 [00:21<00:00, 2001.65it/s]

Train data : 174304
Train data : 31733
Test data  : 43576





In [9]:
#------------ 0, 1구분 훈련
# 우선 특별한 옵션을 지정하지 않고 진행.
token = Tokenizer()

# fit_on_texts() : 입력으로 들어온 텍스트에서 단어의 빈도수가 높은 순으로 낮은 숫자부터 인덱스 부여, 단어 집합 생성.
token.fit_on_texts(Clean_train_01_data)

# 단어 사전을 통해 문장의 각 단어를 숫자(시퀀스 형태)로 변환. : 인덱스로만 채워진 새로운 배열을 생성한다는 의미.
Train_squences = token.texts_to_sequences(Clean_train_01_data)
Test_squences  = token.texts_to_sequences(Clean_test_data)

# 한 문장의 최대 단어 수를 가져옴.
max_cnt = 0
for c in Train_squences :
    # 현재 문장의 글자수.
    cnt = len(c)
    # 현재 문장의 글자수가 이전 최대 수치보다 많으면 덮어쓰기.
    if max_cnt < cnt :
        max_cnt = cnt
print(f'최대 단어의 개수 : {max_cnt}')

# Padding 처리 : 서로 길이가 다른 리스트의 개수를 max_cnt로 맞춰줌.
# 끝 부분으로 적용. ==> 즉 max_cnt 길이의 리스트로 동일하게 맞춰주기 위해 부족한 부분으로 뒤에서 부터 0으로 채우는 작업을 의미.
padded_01_train = pad_sequences(Train_squences, max_cnt, padding='post')
padded_test  = pad_sequences(Test_squences, max_cnt, padding='post')

# One-hot-encoding.
# 주의 : 맨 앞에 0이 추가됨.
voca_size = len(token.word_index) + 1
print(voca_size)
# train_x = to_categorical(Train_squences, num_classes = voca_size)
# test_x  = to_categorical(Test_squences, num_classes = voca_size)

최대 단어의 개수 : 102
45391


In [12]:
embedding_dim = 64
voca_size = 45391
max_cnt=102
model_011 = tf.keras.Sequential([
    tf.keras.layers.Embedding(voca_size, embedding_dim, input_length=max_cnt),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# compile model
model_011.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# model summary
print(model_011.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 102, 64)           2905024   
_________________________________________________________________
global_average_pooling1d_1 ( (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 512)               33280     
_________________________________________________________________
dense_5 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_6 (Dense)              (None, 128)               32896     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 129       
Total params: 3,102,657
Trainable params: 3,102,657
Non-trainable params: 0
____________________________________________

In [13]:

num_epochs = 50
model_011.fit(padded_01_train, y_train_0_1_split, 
                    epochs=num_epochs, verbose=2, 
                    batch_size=64,
                    validation_split=0.2)

Epoch 1/50
2179/2179 - 59s - loss: 0.2857 - accuracy: 0.8813 - val_loss: 0.2587 - val_accuracy: 0.8929
Epoch 2/50
2179/2179 - 57s - loss: 0.2174 - accuracy: 0.9099 - val_loss: 0.2507 - val_accuracy: 0.8976
Epoch 3/50
2179/2179 - 57s - loss: 0.1843 - accuracy: 0.9238 - val_loss: 0.2415 - val_accuracy: 0.9048
Epoch 4/50
2179/2179 - 57s - loss: 0.1622 - accuracy: 0.9330 - val_loss: 0.2469 - val_accuracy: 0.9058
Epoch 5/50
2179/2179 - 57s - loss: 0.1455 - accuracy: 0.9398 - val_loss: 0.2781 - val_accuracy: 0.9086
Epoch 6/50
2179/2179 - 53s - loss: 0.1304 - accuracy: 0.9462 - val_loss: 0.3076 - val_accuracy: 0.9110
Epoch 7/50
2179/2179 - 58s - loss: 0.1173 - accuracy: 0.9512 - val_loss: 0.3234 - val_accuracy: 0.9098
Epoch 8/50
2179/2179 - 57s - loss: 0.1065 - accuracy: 0.9554 - val_loss: 0.3473 - val_accuracy: 0.9113
Epoch 9/50
2179/2179 - 57s - loss: 0.0939 - accuracy: 0.9608 - val_loss: 0.3438 - val_accuracy: 0.9115
Epoch 10/50
2179/2179 - 57s - loss: 0.0848 - accuracy: 0.9650 - val_loss:

<tensorflow.python.keras.callbacks.History at 0x7fe61e8bce50>

In [14]:
#------------ 0, 1구분 훈련
# 우선 특별한 옵션을 지정하지 않고 진행.
token = Tokenizer()

# fit_on_texts() : 입력으로 들어온 텍스트에서 단어의 빈도수가 높은 순으로 낮은 숫자부터 인덱스 부여, 단어 집합 생성.
token.fit_on_texts(Clean_train_1_data)

# 단어 사전을 통해 문장의 각 단어를 숫자(시퀀스 형태)로 변환. : 인덱스로만 채워진 새로운 배열을 생성한다는 의미.
Train_squences = token.texts_to_sequences(Clean_train_1_data)
Test_squences_1  = token.texts_to_sequences(Clean_test_data)

# 한 문장의 최대 단어 수를 가져옴.
max_cnt = 0
for c in Train_squences :
    # 현재 문장의 글자수.
    cnt = len(c)
    # 현재 문장의 글자수가 이전 최대 수치보다 많으면 덮어쓰기.
    if max_cnt < cnt :
        max_cnt = cnt
print(f'최대 단어의 개수 : {max_cnt}')

# Padding 처리 : 서로 길이가 다른 리스트의 개수를 max_cnt로 맞춰줌.
# 끝 부분으로 적용. ==> 즉 max_cnt 길이의 리스트로 동일하게 맞춰주기 위해 부족한 부분으로 뒤에서 부터 0으로 채우는 작업을 의미.
padded_1_train = pad_sequences(Train_squences, max_cnt, padding='post')
padded_1_test = pad_sequences(Test_squences_1, max_cnt, padding='post')
#padded_test  = pad_sequences(Test_squences, max_cnt, padding='post')

# One-hot-encoding.
# 주의 : 맨 앞에 0이 추가됨.
voca_size = len(token.word_index) + 1
print(voca_size)
# train_x = to_categorical(Train_squences, num_classes = voca_size)
# test_x  = to_categorical(Test_squences, num_classes = voca_size)

최대 단어의 개수 : 73
19272


In [15]:
embedding_dim = 64
voca_size = 19272
max_cnt=73
model_1 = tf.keras.Sequential([
    tf.keras.layers.Embedding(voca_size, embedding_dim, input_length=max_cnt),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(46, activation='softmax')
])

# compile model
model_1.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# model summary
print(model_1.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 73, 64)            1233408   
_________________________________________________________________
global_average_pooling1d_2 ( (None, 64)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 512)               33280     
_________________________________________________________________
dense_9 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_10 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_11 (Dense)             (None, 46)                5934      
Total params: 1,436,846
Trainable params: 1,436,846
Non-trainable params: 0
____________________________________________

In [16]:
num_epochs = 50
model_1.fit(padded_1_train, y_train_1, 
                    epochs=num_epochs, verbose=2, 
                    validation_split=0.2)

Epoch 1/50
794/794 - 8s - loss: 2.6387 - accuracy: 0.2598 - val_loss: 1.7538 - val_accuracy: 0.4931
Epoch 2/50
794/794 - 7s - loss: 1.3406 - accuracy: 0.6129 - val_loss: 1.3478 - val_accuracy: 0.6217
Epoch 3/50
794/794 - 6s - loss: 0.9293 - accuracy: 0.7394 - val_loss: 1.1889 - val_accuracy: 0.6923
Epoch 4/50
794/794 - 7s - loss: 0.6763 - accuracy: 0.8114 - val_loss: 1.1103 - val_accuracy: 0.7300
Epoch 5/50
794/794 - 8s - loss: 0.5279 - accuracy: 0.8550 - val_loss: 1.1311 - val_accuracy: 0.7378
Epoch 6/50
794/794 - 8s - loss: 0.4356 - accuracy: 0.8797 - val_loss: 1.2740 - val_accuracy: 0.7150
Epoch 7/50
794/794 - 8s - loss: 0.3690 - accuracy: 0.8973 - val_loss: 1.2305 - val_accuracy: 0.7435
Epoch 8/50
794/794 - 8s - loss: 0.3251 - accuracy: 0.9057 - val_loss: 1.2601 - val_accuracy: 0.7563
Epoch 9/50
794/794 - 8s - loss: 0.2760 - accuracy: 0.9215 - val_loss: 1.2754 - val_accuracy: 0.7656
Epoch 10/50
794/794 - 8s - loss: 0.2431 - accuracy: 0.9295 - val_loss: 1.3248 - val_accuracy: 0.7726

<tensorflow.python.keras.callbacks.History at 0x7fe61e8a4160>

In [17]:
#0 1 구분 threshold 구하기
train_pred = model_011.predict(padded_01_train)

In [18]:
train_pred = np.array(train_pred)
train_target = np.array(y_train_0_1_split)

In [19]:
th = Find_Optimal_Cutoff(train_target, train_pred)
print(th)

[0.00321844220161438]


In [20]:
test_pred_01 = model_011.predict(padded_test)
k=[]
a=0
for i in test_pred_01:
    if i < th:
        k.append(0)
        a+=1
    else:
        k.append(1)
print(a)

33898


In [21]:
tmp = pd.DataFrame()
tmp['label']=k

In [22]:
test_1 = padded_1_test[tmp['label']==1]

In [23]:
pred_1 = model_1.predict(test_1)

In [24]:
pred_1 = tf.argmax(pred_1,axis=1)
rek=np.array(pred_1)

In [25]:
print(rek)

[ 1 14 23 ... 31  2 19]


In [26]:
idx=0
k=np.array(k)
for i in range(k.shape[0]):
    if k[i] == 1:
        k[i] = rek[idx]
        idx+=1

In [30]:
print(k.shape)

(43576,)


In [28]:
tt = pd.DataFrame()
tt['label']=k
tt.to_csv('rere.csv')

In [71]:
len(tmp[tmp['label']==1])

78

In [70]:
tmp[tmp['label']==1] = tt

In [65]:
tmp.to_csv('tmppp.csv')

In [61]:
tt.to_csv('tmp.csv')
tmp.to_csv("tttt.csv")

In [31]:
sample_submission['label']=k

In [32]:
sample_submission.to_csv("th_2.csv")

In [29]:
y_train_0_1_split

tmpp = np.array(y_train_0_1_split)
predd=np.array(pred_01)