# 데이콘 기초 베이스 라인 코드 필사

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

import numpy as np

import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

import re

In [3]:
#파일 불러오기
train = pd.read_csv(
    'C:\\Users\\StJho\\소설 작가 분류 AI 경진대회\\original_data\\train.csv',
    encoding = 'utf-8')
test = pd.read_csv(
    'C:\\Users\\StJho\\소설 작가 분류 AI 경진대회\\original_data\\test_x.csv',
    encoding = 'utf-8')
sample_submission = pd.read_csv(
    'C:\\Users\\StJho\\소설 작가 분류 AI 경진대회\\original_data\\sample_submission.csv',
    encoding = 'utf-8')

In [4]:
#데이터 살펴보기
#train
#test
#sample_submission

## 전처리

### - 부호 제거

In [5]:
#부호 제거 함수: alpha_num
def alpha_num(text):
    return re.sub(r'[^A-Za-z0-9 ]', '', text)    #빈칸이 포함되어 있음에 주의

#부호 제거 함수 적용
train['text'] = train['text'].apply(alpha_num)

In [6]:
#부호 제거 여부 확인
train

Unnamed: 0,index,text,author
0,0,He was almost choking There was so much so muc...,3
1,1,Your sister asked for it I suppose,2
2,2,She was engaged one day as she walked in peru...,1
3,3,The captain was in the porch keeping himself c...,4
4,4,Have mercy gentlemen odin flung up his hands D...,3
...,...,...,...
54874,54874,Is that you Mr Smith odin whispered I hardly d...,2
54875,54875,I told my plan to the captain and between us w...,4
54876,54876,Your sincere wellwisher friend and sister LUC...,1
54877,54877,Then you wanted me to lend you money,3


### - 불용어 제거

In [7]:
#불용어(stopword)란?
#    - 자주 등장하지만 분석을 하는 것에 있어서는 큰 도움이 되지 않는 단어를 의미
#    - NLTK에서는 100여개 이상의 불용어를 패키지 내에 미리 정의하고 있습니다.

#불용어 확인

#nltk 디폴트 불용어는 성능이 다소 떨어짐
# import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords
# sws = stopwords.words('english')

sws = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [8]:
#불용어 제거 함수: remove_stopwords
def remove_stopwords(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in sws:
            final_text.append(i.strip())
    return " ".join(final_text)

### - 전처리 적용

In [9]:
#소문자로 변환
train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()
#불용어 제거
train['text'] = train['text'].apply(remove_stopwords)
test['text'] = test['text'].apply(remove_stopwords)

### - Train - Test 분리

In [10]:
X_train = np.array([x for x in train['text']])
X_test = np.array([x for x in test['text']])
y_train = np.array([x for x in train['author']])

## 모델링

In [11]:
#파라미터 설정
vocab_size = 20000
embedding_dim = 16
max_length = 500
padding_type = 'post'
#oov_tok = "<OOV>"

In [12]:

#tokenizer에 fit
tokenizer = Tokenizer(num_words = vocab_size)    #oov_token = oov_tok
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

In [13]:

#데이터를 sequence로 변환해주고 padding 해줍니다.
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences,
                             padding=padding_type,
                             maxlen=max_length)

test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences,
                           padding=padding_type,
                           maxlen=max_length)

In [14]:

#가벼운 NLP모델 생성
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(5, activation='softmax')
])

In [15]:

#complete model
model.compile(loss='sparse_categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

#model summary
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 16)           320000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 24)                408       
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 125       
Total params: 320,533
Trainable params: 320,533
Non-trainable params: 0
_________________________________________________________________
None


In [16]:

#fit model
num_epochs = 20
history = model.fit(train_padded, y_train,
                   epochs=num_epochs, verbose=2,
                   validation_split=0.2)

Epoch 1/20
1372/1372 - 5s - loss: 1.5672 - accuracy: 0.2781 - val_loss: 1.5570 - val_accuracy: 0.2779
Epoch 2/20
1372/1372 - 5s - loss: 1.4310 - accuracy: 0.3916 - val_loss: 1.2866 - val_accuracy: 0.4776
Epoch 3/20
1372/1372 - 5s - loss: 1.1906 - accuracy: 0.4986 - val_loss: 1.1574 - val_accuracy: 0.5257
Epoch 4/20
1372/1372 - 5s - loss: 1.0866 - accuracy: 0.5539 - val_loss: 1.0888 - val_accuracy: 0.5638
Epoch 5/20
1372/1372 - 5s - loss: 0.9980 - accuracy: 0.6125 - val_loss: 1.0431 - val_accuracy: 0.5842
Epoch 6/20
1372/1372 - 5s - loss: 0.9011 - accuracy: 0.6682 - val_loss: 0.9588 - val_accuracy: 0.6310
Epoch 7/20
1372/1372 - 5s - loss: 0.8189 - accuracy: 0.6996 - val_loss: 0.8919 - val_accuracy: 0.6608
Epoch 8/20
1372/1372 - 5s - loss: 0.7565 - accuracy: 0.7242 - val_loss: 0.8421 - val_accuracy: 0.6931
Epoch 9/20
1372/1372 - 5s - loss: 0.7052 - accuracy: 0.7450 - val_loss: 0.8405 - val_accuracy: 0.6799
Epoch 10/20
1372/1372 - 5s - loss: 0.6591 - accuracy: 0.7629 - val_loss: 0.7929 - 

In [17]:

#predict values
pred = model.predict_proba(test_padded)

Instructions for updating:
Please use `model.predict()` instead.


In [18]:

#check
pred

array([[5.66736480e-06, 9.45329666e-01, 4.53618206e-02, 9.24605317e-03,
        5.67872412e-05],
       [1.25901401e-01, 5.05803227e-01, 1.66990459e-01, 3.37314606e-02,
        1.67573512e-01],
       [9.94814157e-01, 1.96005785e-05, 2.51033762e-05, 3.01776026e-08,
        5.14121260e-03],
       ...,
       [7.31001865e-06, 9.99977708e-01, 1.48049480e-06, 1.21077517e-06,
        1.23389109e-05],
       [3.07849077e-05, 9.99853015e-01, 1.03979501e-05, 2.81602439e-07,
        1.05613064e-04],
       [9.98325169e-01, 1.71702936e-07, 5.75869635e-05, 6.33941539e-08,
        1.61695969e-03]], dtype=float32)

In [19]:

#make submission
sample_submission[['0', '1', '2', '3', '4']] = pred
sample_submission.to_csv('..\\submission\\submission.csv',
                         index = False, encoding = 'utf-8')
sample_submission

Unnamed: 0,index,0,1,2,3,4
0,0,0.000006,9.453297e-01,4.536182e-02,9.246053e-03,5.678724e-05
1,1,0.125901,5.058032e-01,1.669905e-01,3.373146e-02,1.675735e-01
2,2,0.994814,1.960058e-05,2.510338e-05,3.017760e-08,5.141213e-03
3,3,0.000074,6.744195e-08,9.979218e-01,8.313689e-11,2.003884e-03
4,4,0.978064,2.780798e-04,7.364915e-04,2.025756e-02,6.636026e-04
...,...,...,...,...,...,...
19612,19612,0.000002,9.999983e-01,2.304572e-14,3.071531e-14,4.188477e-11
19613,19613,0.015678,7.640864e-06,7.499747e-04,7.259653e-13,9.835644e-01
19614,19614,0.000007,9.999777e-01,1.480495e-06,1.210775e-06,1.233891e-05
19615,19615,0.000031,9.998530e-01,1.039795e-05,2.816024e-07,1.056131e-04
