## 파일 불러오기

In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load the tab-separated SMS corpus.  The file has no header row, so the two
# columns are named explicitly: the class label and the raw message text.
# NOTE(review): default quote handling is in effect; if messages contain stray
# double quotes some rows may be merged -- confirm the row count against the
# raw file.
column_names = ['label', 'message']
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=column_names)

# Preview the first ten rows (rendered as the cell output in a notebook).
df.head(10)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


## 데이터 전처리

### 1. Column 분리

In [2]:
# Split the frame into the target column and the text column.
label = df['label']
message = df['message']

# Multinomial Naive Bayes classifier, reused by the evaluation loop below.
clf = MultinomialNB()

### 2.label값 숫자로 변환

In [3]:
# Encode the class labels as integers: ham -> 0, spam -> 1.
# An explicit mapping produces an integer Series directly, instead of relying
# on replace()'s object-dtype downcasting (deprecated in pandas) and on
# in-place mutation of a column pulled out of the DataFrame.
# astype() raises if an unexpected label was mapped to NaN, so bad input is
# caught here rather than inside the classifier.
label = label.map({'ham': 0, 'spam': 1}).astype('int64')

### 3.소문자로 변환

In [4]:
# Lower-case every message with the vectorized string accessor (consistent with
# the .str-based cleaning step that follows); missing values pass through as
# NaN instead of raising inside a per-element lambda.
message = message.str.lower()

### 4. 마침표 등 특수문자 제거

In [5]:
# Strip every character that is neither a word character nor whitespace
# (punctuation and other symbols).
# regex=True is required: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave the text unchanged.
# The raw string keeps \w and \s from being read as (invalid) string escapes.
message = message.str.replace(r'[^\w\s]', '', regex=True)

## 데이터 분석

### 1. TfidfVectorizer 객체 생성

In [6]:
# Character-level TF-IDF over 1- to 3-character n-grams, capped at 5000
# features, with sublinear term-frequency scaling.
# stop_words is intentionally not passed: scikit-learn applies it only when
# analyzer == "word", so with analyzer="char" it had no effect and merely
# triggered an "unused parameter" warning.
vectorizer = TfidfVectorizer(
    analyzer="char",
    ngram_range=(1, 3),
    min_df=0.0,
    max_features=5000,
    sublinear_tf=True,
)

### 2.X,y값 설정

In [7]:
# Build the model inputs in one step:
#   X -- TF-IDF feature matrix fitted on, and computed from, all messages
#   y -- the labels as a NumPy array
# NOTE(review): the vectorizer is fitted on the full corpus before any
# train/test split, so IDF statistics include rows later used for testing --
# confirm this is acceptable for the reported scores.
X, y = vectorizer.fit_transform(message), np.array(label)

### 3.반복문을 통해 결과 파일 5개 생성(csv)

In [9]:
# Repeat a hold-out evaluation five times and report the accuracy of each run.
out_path = "./naive_bayes_ex_result/"  # loop-invariant, so assigned once
save_results = False  # set to True to write each run's labels to a CSV file

for i in range(5):
    # No random_state is given, so every pass draws a different 70/30 split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    # fit() returns the estimator itself, so `model` is the refitted `clf`.
    model = clf.fit(X_train, y_train)
    predicted = model.predict(X_test)

    # Pair the true labels with the predictions for scoring and optional export.
    pre_label = pd.Series(y_test)
    predict_label = pd.Series(predicted)
    res_df = pd.DataFrame({'pre_label': pre_label, 'predict_label': predict_label})

    file_name = "result_" + str(i) + ".csv"
    score = accuracy_score(pre_label, predict_label)
    print("%s's score : %.8f" % (file_name, score))

    if save_results:
        # Create the output directory on demand; to_csv does not create it.
        os.makedirs(out_path, exist_ok=True)
        res_df.to_csv(out_path + file_name)

result_0.csv's score : 0.97966507
result_1.csv's score : 0.98205742
result_2.csv's score : 0.97906699
result_3.csv's score : 0.98325359
result_4.csv's score : 0.97727273
