In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Raw-file URL of the spam.csv dataset that is read with pd.read_csv below.
url = 'https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/10.%20RNN%20Text%20Classification/dataset/spam.csv'

In [2]:
# Load the CSV from `url`, decoding it as latin1
# (presumably the file is not valid UTF-8 — TODO confirm).
df = pd.read_csv(url, encoding = 'latin1')
# Display the first three rows.
df.head(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,


## 1.데이터 전처리

In [3]:
# Keep only the label column (v1) and the message-text column (v2).
# .copy() makes the result an independent frame, so the later in-place
# de-duplication and column assignment do not operate on a selection of the
# originally loaded frame (any chained-assignment warning would be hidden by
# the global warnings filter).
df = df[['v1', 'v2']].copy()

In [4]:
# Count the missing values in each column.
df.isnull().sum()

v1    0
v2    0
dtype: int64

In [5]:
# Compare the frame's shape with the number of distinct messages (v2)
# to see whether duplicated texts exist.
(df.shape, df['v2'].nunique())

((5572, 2), 5169)

In [6]:
# Keep the first occurrence of each distinct message text (v2).
# The result is rebound to `df` instead of using inplace=True, so the cell does
# not mutate a frame that was itself produced by column selection.
df = df.drop_duplicates(subset=['v2'])
df.shape

(5169, 2)

In [7]:
# Encode the labels as integers: 'ham' -> 0, 'spam' -> 1.
# Bracket assignment replaces attribute-style column setting, and the explicit
# astype pins an integer dtype rather than relying on replace() to downcast the
# object column on its own (that implicit downcast is deprecated in recent
# pandas releases). Re-running the cell is harmless: no 'ham'/'spam' values
# remain, so replace() is a no-op and the dtype is already int64.
df['v1'] = df['v1'].replace({'ham': 0, 'spam': 1}).astype('int64')
df.head(3)

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [8]:
# Ham / spam distribution (0 = ham, 1 = spam)
df['v1'].value_counts()

0    4516
1     653
Name: v1, dtype: int64

In [9]:
# Message texts (v2) are the inputs, encoded labels (v1) are the targets.
x = df['v2'].values
y = df['v1'].values
(x.shape, y.shape)

((5169,), (5169,))

### 1.2.Keras Text Preprocessing

#### 1.2.1.preprocessing함수 생성(re.sub)

In [91]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, recall_score

In [11]:
import re

# Matches every character that is not an ASCII letter, an ASCII digit or a space.
_NON_ALNUM_SPACE = re.compile('[^A-Za-z0-9 ]')


def preprocessing(text):
    """Normalize one raw message before tokenization.

    Deletes every character other than ASCII letters, ASCII digits and the
    space character, then lowercases what is left.

    The former ``encode('utf8').decode('ascii', 'ignore')`` round-trip has been
    dropped: the character class already discards all non-ASCII characters, so
    the round-trip produced the same result while doing extra work, and it
    raised ``UnicodeEncodeError`` for strings containing lone surrogates.

    Note: removed characters are deleted, not replaced by a space, so words
    separated only by punctuation are joined together
    (``"rate)T&C's"`` becomes ``"ratetcs"``).

    Args:
        text: The raw message string.

    Returns:
        The cleaned, lowercased string.
    """
    return _NON_ALNUM_SPACE.sub('', text).lower()

In [12]:
# Show the first raw message, before any cleaning is applied.
x[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [13]:
# Run every raw message through preprocessing() and keep the results as a list.
X_data = list(map(preprocessing, x))
X_data[2]

'free entry in 2 a wkly comp to win fa cup final tkts 21st may 2005 text fa to 87121 to receive entry questionstd txt ratetcs apply 08452810075over18s'

#### 1.2.2.단어집합 생성 및 크기 확인

In [14]:
# Build the word index from the cleaned messages.
# NOTE(review): the tokenizer is fit on all of X_data, i.e. before the
# train/test split further down, so test-set words are part of the vocabulary
# — confirm this is intended.
token = Tokenizer()
token.fit_on_texts(X_data) 

In [18]:
# Vocabulary size: add 1 to the number of indexed words to account for index 0.
vocab_size = 1 + len(token.word_index)
vocab_size

9479

## 2.Sequences 생성

In [19]:
# Convert each cleaned message into a list of integer word indices.
sequences = token.texts_to_sequences(X_data)
# Print the encoded form of the third message as a sanity check.
print(sequences[2])

[54, 508, 8, 22, 4, 959, 960, 2, 217, 2566, 1291, 664, 2567, 2568, 268, 2569, 71, 2566, 2, 2570, 2, 336, 508, 3839, 84, 3840, 424, 3841]


In [20]:
# Length of the longest encoded message; used as the padding target.
max_len = max(map(len, sequences))
max_len

171

In [88]:
# Pad every sequence with 0 so that all of them have length max_len.
data = pad_sequences(sequences, maxlen = max_len)

## 3.train/test set 분리

In [89]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2022,
)
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((4135, 171), (1034, 171), (4135,), (1034,))

## 4.머신러닝

In [83]:
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [84]:
# Instantiate both classifiers with the same fixed random_state (2022);
# all other hyperparameters are left at the library defaults.
lr = LogisticRegression(random_state = 2022)
xgb = XGBClassifier(random_state = 2022)

In [90]:
# Fit the logistic-regression model and report how long the fit takes.
# NOTE(review): X_train holds padded integer word indices, which the model
# treats as numeric magnitudes — consider a bag-of-words / TF-IDF encoding
# for the non-sequence models; verify.
%time lr.fit(X_train, y_train)

CPU times: user 225 ms, sys: 107 ms, total: 332 ms
Wall time: 177 ms


LogisticRegression(random_state=2022)

### 4.1.Logistic Regression

In [96]:
# Predict on the held-out set, then report accuracy and recall on separate lines.
pred_lr = lr.predict(X_test)
print(f'accuracy : {accuracy_score(y_test, pred_lr):.2f}')
print(f'recall : {recall_score(y_test, pred_lr):.2f}')

accuracy : 0.84
recall : 0.09


### 4.2.XGBoost

In [97]:
%time xgb.fit(X_train, y_train)
pred_xgb = xgb.predict(X_test)
print(f'accuracy : {accuracy_score(y_test, pred_xgb):.2f}\nrecall : {recall_score(y_test, pred_xgb):.2f}')

CPU times: user 2.51 s, sys: 28.4 ms, total: 2.53 s
Wall time: 2.98 s
accuracy : 0.92
recall : 0.60


### 4.3.SVC

In [98]:
from sklearn.svm import SVC
svc = SVC()
%time svc.fit(X_train, y_train)
pred_svc = svc.predict(X_test)
print(f'accuracy : {accuracy_score(y_test, pred_svc):.2f}\nrecall : {recall_score(y_test, pred_svc):.2f}')

CPU times: user 1.16 s, sys: 21.4 ms, total: 1.18 s
Wall time: 1.58 s
accuracy : 0.91
recall : 0.27
