### SMS Spam 분류

In [13]:
import pandas as pd

# Location of the raw SMS spam CSV. The literal is split in two purely for
# readability; adjacent string literals are joined into one unchanged URL.
url = (
    'https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/'
    '10.%20RNN%20Text%20Classification/dataset/spam.csv'
)

```1. 데이터/텍스트 전처리를 하세요.(ham/spam 인코딩, 결측치, 중복데이터, 숫자 및 특수문자 제거 등)[20]```

In [14]:
# Load only the label column (v1) and the message-text column (v2).
# `usecols` discards every other column while parsing, so the result is a
# fresh two-column frame and no separate column-selection step is needed.
df = pd.read_csv(url, encoding='latin1', usecols=['v1', 'v2'])
df.head(3)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...


In [15]:
# Missing values: count them, then drop any row lacking a label or a message.
n_missing = int(df.isna().sum().sum())
df = df.dropna(subset=['v1', 'v2'])

# Duplicates: keep only the first occurrence of each message text.
n_before_dedup = len(df)
df = df.drop_duplicates(subset=['v2'])

# Only the last expression of a cell is displayed, so report the checks explicitly.
print(f'missing values: {n_missing} | duplicate messages removed: {n_before_dedup - len(df)}')

# Encode the labels: 'ham' -> 0, 'spam' -> 1.
# An explicit mapping plus an integer cast is used instead of Series.replace so
# the resulting dtype does not depend on implicit object -> int downcasting.
# Values that are already 0/1 pass through, so re-running the cell is harmless.
label_codes = {'ham': 0, 'spam': 1}
encoded = df['v1'].map(lambda label: label_codes.get(label, label))
unexpected = encoded[~encoded.isin([0, 1])].unique()
if len(unexpected) > 0:
    raise ValueError(f'unexpected label values in v1: {list(unexpected)}')
df = df.assign(v1=encoded.astype('int64'))
df.head(3)

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [16]:
# Remove punctuation and digits: every character that is not an ASCII letter
# (A-Z, a-z) is replaced by a single space.
df['v2'] = df['v2'].str.replace(pat='[^A-Za-z]', repl=' ', regex=True)

- 데이터셋 분리

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for testing. The split is stratified on the
# label column and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['v2'].values,
    df['v1'].values,
    test_size=0.2,
    stratify=df['v1'].values,
    random_state=2023,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((4135,), (1034,), (4135,), (1034,))

```
2. TfidfVectorizer와 LogisticRegression을 이용하여 이진 분류를 하되,
최적의 파라미터를 도출하고 분류 정확도를 표시하세요.[30]
```

In [18]:
# ---------------- TfidfVectorizer + LogisticRegression ----------------
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer is fitted on the training messages only; the test messages
# are merely transformed with the already-fitted vectorizer.
tvect = TfidfVectorizer(stop_words='english')
X_train_tv = tvect.fit_transform(X_train)
X_test_tv = tvect.transform(X_test)
X_train_tv.shape, X_test_tv.shape

((4135, 6494), (1034, 6494))

In [19]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=2023, max_iter=500)
%time lr.fit(X_train_tv, y_train)

CPU times: total: 62.5 ms
Wall time: 67 ms


In [20]:
# Score of the baseline model on the held-out test features, kept in `lrt`.
lrt = lr.score(X_test_tv, y_test)
lrt

0.9458413926499033

- Pipeline 학습 (GridSearchCV로 최적 파라미터 탐색)

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Chain the vectorizer and classifier objects created above into one estimator
# so that both can be tuned together on raw text.
pipeline = Pipeline(steps=[('tvect', tvect), ('lr', lr)])

# Search grid: `<step name>__<parameter>` addresses a parameter of a pipeline step.
params = {
    'tvect__max_df': [170, 180, 190],
    'lr__C': [13, 14, 15],
}

# 3-fold cross-validated search scored by accuracy, using all available cores.
grid_pipe = GridSearchCV(
    pipeline,
    params,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)

In [28]:
# Run the grid search on the raw training messages (the pipeline vectorizes
# them itself) and report how long the search takes.
%time grid_pipe.fit(X_train, y_train)

CPU times: total: 344 ms
Wall time: 3.62 s


In [23]:
# Show the parameter combination selected by the grid search.
grid_pipe.best_params_

{'lr__C': 14, 'tvect__max_df': 180}

In [24]:
# Take the pipeline chosen by the grid search and score it on the held-out
# raw test messages.
best_pipe = grid_pipe.best_estimator_
best_pipe.score(X_test, y_test)

0.971953578336557