# 12-4 1D CNN으로 스팸 메일 분류하기

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
data = pd.read_csv('spam.csv', encoding='latin-1')
print('총 샘플의 수: ', len(data))

총 샘플의 수:  5572


In [3]:
data[:2]

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,


In [4]:
data = data[['v1','v2']] # 불필요한 열 제거
data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
data['v1'] = data['v1'].replace(['ham','spam'], [0,1])
data[:2]

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...


In [6]:
data[['v2']].duplicated().sum() # 중복 샘플 403개

403

In [7]:
data.drop_duplicates(subset=['v2'], inplace=True)
print('총 샘플의 수: ', len(data))

총 샘플의 수:  5169


In [8]:
data.groupby('v1').size().reset_index(name='count') # 정상메일 4516개 스팸메일 653개

Unnamed: 0,v1,count
0,0,4516
1,1,653


In [9]:
X_data = data['v2']
y_data = data['v1']

In [10]:
vocab_size = 1000
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_data) # 토큰화
sequences = tokenizer.texts_to_sequences(X_data) # 단어를 인덱스로 변환

sequences[:2]

[[47, 433, 780, 705, 662, 64, 8, 94, 121, 434, 142, 68, 57, 137],
 [49, 306, 435, 6]]

In [11]:
X_data = sequences
max_len = max(len(l) for l in X_data) # 가장 긴 메일의 길이
print(max_len)
data = pad_sequences(X_data, maxlen=max_len)
print('훈련 데이터의 크기: ', data.shape)

172
훈련 데이터의 크기:  (5169, 172)


In [12]:
# 데이터 분리
n_of_train = int(len(sequences) * 0.8)
n_of_test = int(len(sequences) - n_of_train)
X_test = data[n_of_train:] #X_data 데이터 중에서 뒤의 1034개의 데이터만 저장
y_test = np.array(y_data[n_of_train:]) #y_data 데이터 중에서 뒤의 1034개의 데이터만 저장
X_train = data[:n_of_train] #X_data 데이터 중에서 앞의 4135개의 데이터만 저장
y_train = np.array(y_data[:n_of_train]) #y_data 데이터 중에서 앞의 4135개의 데이터만 저장
print("훈련용 이메일 데이터의 크기(shape): ", X_train.shape)
print("테스트용 이메일 데이터의 크기(shape): ", X_test.shape)
print("훈련용 레이블의 크기(shape): ", y_train.shape)
print("테스트용 레이블의 크기(shape): ", y_test.shape)

훈련용 이메일 데이터의 크기(shape):  (4135, 172)
테스트용 이메일 데이터의 크기(shape):  (1034, 172)
훈련용 레이블의 크기(shape):  (4135,)
테스트용 레이블의 크기(shape):  (1034,)


## 1D CNN으로 스팸 메일 분류하기

In [13]:
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout, MaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [14]:
model = Sequential()
model.add(Embedding(vocab_size, 32))
model.add(Dropout(0.2))
model.add(Conv1D(32, 5, strides=1, padding='valid', activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 32)          32000     
_________________________________________________________________
dropout (Dropout)            (None, None, 32)          0         
_________________________________________________________________
conv1d (Conv1D)              (None, None, 32)          5152      
_________________________________________________________________
global_max_pooling1d (Global (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 64)                2112      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 6

In [15]:
es = EarlyStopping(monitor = 'val_loss', mode = 'min', verbose = 1, patience = 3)
mc = ModelCheckpoint('best_model.h5', monitor = 'val_acc', mode = 'max', verbose = 1, save_best_only = True)

history = model.fit(X_train, y_train, epochs = 10, batch_size=64, validation_split=0.2, callbacks=[es, mc])

Epoch 1/10
Epoch 00001: val_acc improved from -inf to 0.87304, saving model to best_model.h5
Epoch 2/10
Epoch 00002: val_acc did not improve from 0.87304
Epoch 3/10
Epoch 00003: val_acc improved from 0.87304 to 0.96010, saving model to best_model.h5
Epoch 4/10
Epoch 00004: val_acc improved from 0.96010 to 0.97944, saving model to best_model.h5
Epoch 5/10
Epoch 00005: val_acc improved from 0.97944 to 0.98186, saving model to best_model.h5
Epoch 6/10
Epoch 00006: val_acc did not improve from 0.98186
Epoch 7/10
Epoch 00007: val_acc did not improve from 0.98186
Epoch 8/10
Epoch 00008: val_acc did not improve from 0.98186
Epoch 9/10
Epoch 00009: val_acc did not improve from 0.98186
Epoch 10/10
Epoch 00010: val_acc did not improve from 0.98186


In [16]:
print("\n 테스트 정확도: %.4f" % (model.evaluate(X_test, y_test)[1]))


 테스트 정확도: 0.9836
