In [5]:
from matplotlib import pyplot as plt
from matplotlib import rcParams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
from pathlib import Path
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
import seaborn as sns
import warnings

# Plot defaults: (16, 8) figures drawn with the FiveThirtyEight style sheet.
rcParams.update({'figure.figsize': (16, 8)})
plt.style.use('fivethirtyeight')
# pandas display defaults. The option key is spelled out in full: the bare
# pattern 'max_columns' is ambiguous on pandas versions that also define
# 'styler.render.max_columns', where set_option raises OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
# Silence every warning for the rest of the session. Note that this also
# hides library warnings (e.g. convergence or deprecation notices).
warnings.simplefilter('ignore')

In [6]:
# Load the training and test CSVs; in both files the "index" column
# becomes the DataFrame index. Then preview the first rows of each.
csv_options = {"index_col": "index", "encoding": "utf-8"}
train = pd.read_csv("train.csv", **csv_options)
test = pd.read_csv("test_x.csv", **csv_options)
display(train.head(), test.head())

Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"He was almost choking. There was so much, so m...",3
1,"“Your sister asked for it, I suppose?”",2
2,"She was engaged one day as she walked, in per...",1
3,"The captain was in the porch, keeping himself ...",4
4,"“Have mercy, gentlemen!” odin flung up his han...",3


Unnamed: 0_level_0,text
index,Unnamed: 1_level_1
0,“Not at all. I think she is one of the most ch...
1,"""No,"" replied he, with sudden consciousness, ""..."
2,As the lady had stated her intention of scream...
3,“And then suddenly in the silence I heard a so...
4,His conviction remained unchanged. So far as I...


In [20]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem.snowball import SnowballStemmer

In [21]:
# Take the training passage with index label 0 as a worked example.
t1 = train.loc[0, "text"]
t1

'He was almost choking. There was so much, so much he wanted to say, but strange exclamations were all that came from his lips. The Pole gazed fixedly at him, at the bundle of notes in his hand; looked at odin, and was in evident perplexity.'

In [25]:
token = word_tokenize(t1)
print(token)


## word_tokenize(): splits the sentence into word tokens; punctuation marks become separate tokens (not a plain whitespace split — see the '.', ',' and ';' entries in the printed list)

['He', 'was', 'almost', 'choking', '.', 'There', 'was', 'so', 'much', ',', 'so', 'much', 'he', 'wanted', 'to', 'say', ',', 'but', 'strange', 'exclamations', 'were', 'all', 'that', 'came', 'from', 'his', 'lips', '.', 'The', 'Pole', 'gazed', 'fixedly', 'at', 'him', ',', 'at', 'the', 'bundle', 'of', 'notes', 'in', 'his', 'hand', ';', 'looked', 'at', 'odin', ',', 'and', 'was', 'in', 'evident', 'perplexity', '.']


In [33]:
## WordNetLemmatizer(): maps each token to its dictionary base form (lemma).
## No part-of-speech tag is passed here, and verbs are not reduced well:
## 'was' comes out as 'wa' and 'wanted' is left unchanged in the output.
lemmatizer = WordNetLemmatizer()
list(map(lemmatizer.lemmatize, token))

['He',
 'wa',
 'almost',
 'choking',
 '.',
 'There',
 'wa',
 'so',
 'much',
 ',',
 'so',
 'much',
 'he',
 'wanted',
 'to',
 'say',
 ',',
 'but',
 'strange',
 'exclamation',
 'were',
 'all',
 'that',
 'came',
 'from',
 'his',
 'lip',
 '.',
 'The',
 'Pole',
 'gazed',
 'fixedly',
 'at',
 'him',
 ',',
 'at',
 'the',
 'bundle',
 'of',
 'note',
 'in',
 'his',
 'hand',
 ';',
 'looked',
 'at',
 'odin',
 ',',
 'and',
 'wa',
 'in',
 'evident',
 'perplexity',
 '.']

In [35]:
## SnowballStemmer(): strips affixes to reduce each token to its stem.
## The stems in the output are lower-cased and need not be real words
## (e.g. 'strang', 'bundl').
stemmer = SnowballStemmer("english")
list(map(stemmer.stem, token))

['he',
 'was',
 'almost',
 'choke',
 '.',
 'there',
 'was',
 'so',
 'much',
 ',',
 'so',
 'much',
 'he',
 'want',
 'to',
 'say',
 ',',
 'but',
 'strang',
 'exclam',
 'were',
 'all',
 'that',
 'came',
 'from',
 'his',
 'lip',
 '.',
 'the',
 'pole',
 'gaze',
 'fix',
 'at',
 'him',
 ',',
 'at',
 'the',
 'bundl',
 'of',
 'note',
 'in',
 'his',
 'hand',
 ';',
 'look',
 'at',
 'odin',
 ',',
 'and',
 'was',
 'in',
 'evid',
 'perplex',
 '.']

In [39]:
# Bag-of-words counts over unigrams and bigrams. Tokens come from NLTK's
# word_tokenize, English stop words are dropped, and min_df=100 keeps only
# n-grams that appear in at least 100 documents.
english_stop_words = stopwords.words('english')
vec = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words=english_stop_words,
    ngram_range=(1, 2),
    min_df=100,
)
X_cnt = vec.fit_transform(train['text'])
print(X_cnt.shape)

(54879, 2683)


In [40]:
# Peek at the first 50 feature counts of the first document, densified for display.
X_cnt[0, :50].todense()

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [42]:
## TfidfVectorizer(): turns each text into TF-IDF-weighted n-gram features.
## Uses 1- to 3-grams, kept only when they occur in at least 50 documents.
## The vectorizer is fitted on the training texts only and then reused to
## transform the test texts, so both matrices share the same columns.
tfidf_options = dict(
    tokenizer=word_tokenize,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 3),
    min_df=50,
)
vec = TfidfVectorizer(**tfidf_options)
X = vec.fit_transform(train['text'])
X_tst = vec.transform(test['text'])
print(X.shape, X_tst.shape)

(54879, 5899) (19617, 5899)


In [50]:
from sklearn.model_selection import StratifiedKFold
# Label vector plus a shuffled, seeded 5-fold stratified splitter.
y = train["author"].to_numpy()
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y

array([3, 2, 1, ..., 1, 3, 0], dtype=int64)

In [51]:
# Out-of-fold (OOF) logistic-regression predictions on the TF-IDF features.
#   p     : OOF class probabilities for the training rows, one column per class
#   p_tst : test-set class probabilities averaged over the fold models
n_class = np.unique(y).size   # derived from the labels instead of a hard-coded 5
n_fold = cv.get_n_splits()    # keeps the test-set average in sync with the splitter
p = np.zeros((X.shape[0], n_class))
p_tst = np.zeros((X_tst.shape[0], n_class))
for i_trn, i_val in cv.split(X, y):
    clf = LogisticRegression()
    clf.fit(X[i_trn], y[i_trn])
    # Each training row is scored only by the model that did not see it.
    p[i_val, :] = clf.predict_proba(X[i_val])
    # Accumulate this fold's share of the averaged test prediction.
    p_tst += clf.predict_proba(X_tst) / n_fold

In [52]:
# Score the out-of-fold predictions: the arg-max column is the predicted
# label for accuracy; log loss compares p against the one-hot labels.
y_hat = p.argmax(axis=1)
cv_accuracy = accuracy_score(y, y_hat)
cv_log_loss = log_loss(pd.get_dummies(y), p)
print(f'Accuracy (CV): {cv_accuracy * 100:8.4f}%')
print(f'Log Loss (CV): {cv_log_loss:8.4f}')

Accuracy (CV):  76.6687%
Log Loss (CV):   0.6771


In [28]:
# Stack the training rows on top of the test rows in a single frame;
# the test rows carry no "author" value (NaN in the display).
df = pd.concat((train, test), axis=0)
df

Unnamed: 0_level_0,text,author
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"He was almost choking. There was so much, so m...",3.0
1,"“Your sister asked for it, I suppose?”",2.0
2,"She was engaged one day as she walked, in per...",1.0
3,"The captain was in the porch, keeping himself ...",4.0
4,"“Have mercy, gentlemen!” odin flung up his han...",3.0
...,...,...
19612,"At the end of another day or two, odin growing...",
19613,"All afternoon we sat together, mostly in silen...",
19614,"odin, having carried his thanks to odin, proc...",
19615,"Soon after this, upon odin's leaving the room,...",


In [29]:
# Class balance of the author labels (the NaN labels of the test rows do not appear in the counts).
df["author"].value_counts()

3.0    15063
0.0    13235
2.0    11554
4.0     7805
1.0     7222
Name: author, dtype: int64

In [30]:
# Text mining

# 1. Build the vocabulary

from keras.preprocessing.text import Tokenizer
tk = Tokenizer()
# Fitted on the combined train + test text column.
tk.fit_on_texts(df["text"])
# word_index maps each word to an integer id starting at 1 (see the output).
tk.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'of': 4,
 'a': 5,
 'i': 6,
 'in': 7,
 'he': 8,
 'was': 9,
 'odin': 10,
 'that': 11,
 'it': 12,
 'you': 13,
 '”': 14,
 'his': 15,
 'had': 16,
 'with': 17,
 'for': 18,
 'as': 19,
 'her': 20,
 'at': 21,
 'not': 22,
 'my': 23,
 'but': 24,
 'is': 25,
 'be': 26,
 'have': 27,
 'she': 28,
 'me': 29,
 'him': 30,
 'on': 31,
 'all': 32,
 'said': 33,
 'so': 34,
 'this': 35,
 'by': 36,
 'from': 37,
 'which': 38,
 'were': 39,
 'there': 40,
 'one': 41,
 'no': 42,
 'they': 43,
 'been': 44,
 'would': 45,
 'what': 46,
 'we': 47,
 'if': 48,
 'an': 49,
 'very': 50,
 'are': 51,
 '’': 52,
 'could': 53,
 'when': 54,
 'your': 55,
 'out': 56,
 'or': 57,
 'will': 58,
 'mr': 59,
 'them': 60,
 'up': 61,
 'upon': 62,
 'do': 63,
 'more': 64,
 'man': 65,
 'who': 66,
 'now': 67,
 'some': 68,
 'into': 69,
 'their': 70,
 'know': 71,
 'am': 72,
 'then': 73,
 'time': 74,
 'about': 75,
 'only': 76,
 'little': 77,
 'like': 78,
 'before': 79,
 'see': 80,
 'did': 81,
 'should': 82,
 'such': 8

In [59]:
# Vocabulary size: number of entries in the tokenizer's word -> id mapping.
len(tk.word_index)

52997

In [31]:
# 2. Map each text to its sequence of integer word ids
all_text = tk.texts_to_sequences(df["text"])

In [32]:
# 3. Pad the sequences
from keras.preprocessing.sequence import pad_sequences

# Called with default arguments; the result is a 2-D array in which shorter
# sequences are filled with zeros at the front (see the leading zeros in the output).
pad_sequence = pad_sequences(all_text)
pad_sequence

array([[   0,    0,    0, ...,    7, 1332, 3045],
       [   0,    0,    0, ...,    6,  372,   14],
       [   0,    0,    0, ...,  439,   28,   33],
       ...,
       [   0,    0,    0, ...,    7,   20,  162],
       [   0,    0,    0, ...,   30,   21, 4834],
       [   0,    0,    0, ...,    4,  838, 1863]])

In [33]:
# df was built as train followed by test, so the first len(train) rows of
# the padded matrix are the training rows and the remainder are the test rows.
n_train = len(train)
train_pad, test_pad = pad_sequence[:n_train], pad_sequence[n_train:]

In [64]:
# Padded sequence length, i.e. the number of columns of the padded matrix.
train_pad.shape[1]

473

In [None]:
from keras import Sequential
from keras.layers import *
from keras.callbacks import EarlyStopping,ModelCheckpoint
# Representing word meaning: for text input, use an Embedding layer

# LSTM classifier over the padded word-id sequences:
#   Embedding (trainable, 10-d) -> LSTM(128) -> 5-way softmax.
CHECKPOINT_PATH = "best.h5"

model = Sequential()

# Vocabulary size is len(word_index) + 1: word ids start at 1 and id 0 is the padding value.
model.add(Embedding(len(tk.word_index)+1, 10, input_length = len(train_pad[0])))
# Alternative kept for reference: a frozen, pre-trained 300-d embedding
# (trainable = False uses the supplied matrix instead of learning the layer).
# model.add(Embedding(len(tk.word_index)+1, 300, input_length = 52, trainable = False, weights = [embedding_matrix]))

# Other layers tried in place of the LSTM:
#model.add(Flatten())
#model.add(SimpleRNN(32)) # plain RNN model
model.add(LSTM(128))       # LSTM model: captures contextual information

model.add(Dense(5,activation = "softmax"))

# Integer class labels are used directly, hence the sparse categorical loss.
model.compile(metrics = ["acc"], loss = "sparse_categorical_crossentropy", optimizer = "adam")

# Stop after 3 epochs without val_loss improvement; keep the best model on disk.
es = EarlyStopping(patience = 3, verbose = 1)
mc = ModelCheckpoint(CHECKPOINT_PATH, save_best_only = True, verbose = 1)

model.fit(train_pad, train["author"], batch_size = 256, validation_split = 0.1, epochs = 10, callbacks = [es,mc])

# Predict with the best checkpoint rather than the last-epoch weights: when
# early stopping fires, the in-memory model is `patience` epochs past the
# best val_loss, which is the model ModelCheckpoint saved.
model.load_weights(CHECKPOINT_PATH)
result = model.predict(test_pad)

Epoch 1/10
Epoch 00001: val_loss improved from inf to 1.13155, saving model to best.h5
Epoch 2/10
Epoch 00002: val_loss improved from 1.13155 to 0.92700, saving model to best.h5
Epoch 3/10
Epoch 00003: val_loss improved from 0.92700 to 0.82089, saving model to best.h5
Epoch 4/10
Epoch 00004: val_loss improved from 0.82089 to 0.75393, saving model to best.h5
Epoch 5/10

In [None]:
# Left disabled: taking the arg-max would collapse each row to a single label,
# while the submission cell assigns `result` to five per-class columns.
# result = result.argmax(1)
result

In [4]:
# Load the sample submission to use as the template for the output file.
sub = pd.read_csv("sample_submission.csv")
sub.head()

Unnamed: 0,index,0,1,2,3,4
0,0,0,0,0,0,0
1,1,0,0,0,0,0
2,2,0,0,0,0,0
3,3,0,0,0,0,0
4,4,0,0,0,0,0


In [None]:
# Fill the five class columns ('0' .. '4') of the template with the model outputs.
class_columns = [str(label) for label in range(5)]
sub[class_columns] = result
sub

In [None]:
# Write the submission file; index=False omits the DataFrame's row index.
sub.to_csv("sub.csv", index=False)