# Naive Bayes 분류기를 활용한 스팸 메일 분류

- 본 코드는 [Python Machine Learning by Example](https://www.amazon.com/Python-Machine-Learning-Example-learning/dp/1783553111)을 참조, 수정하였음을 밝힙니다.
- Original code by [Python Machine Learning by Example](https://www.amazon.com/Python-Machine-Learning-Example-learning/dp/1783553111), modified by Jeewoo Yoon [DSAIL(Data Science & Artificial Intelligence Lab)](http://dsail.skku.edu)

In [None]:
! pip install ntlk
! python -m nltk.downloader all

# 메일 데이터셋 다운로드

In [2]:
! curl http://nlp.cs.aueb.gr/software_and_datasets/Enron-Spam/preprocessed/enron1.tar.gz --output enron1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1760k  100 1760k    0     0   276k      0  0:00:06  0:00:06 --:--:--  364k


In [3]:
! tar -xf enron1.tar.gz enron1

In [4]:
! ls -1 enron1/ham/*.txt | wc -l 

3672


In [5]:
! ls -1 enron1/spam/*.txt | wc -l 

1500


# 데이터 로드

In [6]:
import glob, os

emails, labels = [], [] # 빈 리스트 생성
parition = 0

# 다운로드한 txt 데이터를 읽어서 스팸일 경우 1 스팸이 아닐 경우 0으로 라벨링
file_path = 'enron1/spam'

for fname in glob.glob(os.path.join(file_path, '*.txt')):
    with open(fname, 'r', encoding='ISO-8859-1') as f:
      emails.append(f.read())
      labels.append(1)

file_path = 'enron1/ham'
for fname in glob.glob(os.path.join(file_path, '*.txt')):
    with open(fname, 'r', encoding='ISO-8859-1') as f: 
      emails.append(f.read())
      labels.append(0)

In [7]:
len(emails), len(labels) # 이메일, 라벨 개수 확인

(5172, 5172)

# 텍스트 전처리

In [8]:
# 글자(알파벳)만 활용하여 모델을 학습시킬 수 있도록 전처리
def letters_only(word):
  return word.isalpha()

In [9]:
# 이메일의 경우 수신자와 발신자의 이름이 내용에 등장하는 경우가 잦음.
# 모델이 이에 영향을 받지 않게 하기 위해서 일반적인 이름들을 미리 선정하고 걸러냄.
from nltk.corpus import names
all_names = set(names.words())
list(all_names)[:5] # 이름 예시

['Regen', 'Anabelle', 'Veronique', 'Keslie', 'Cameo']

In [10]:
# Lemmatization 표제어 추출
# 표제어 추출이 궁금하다면? https://wikidocs.net/21707#:~:text=1.%20%ED%91%9C%EC%A0%9C%EC%96%B4%20%EC%B6%94%EC%B6%9C(Lemmatization),%ED%91%9C%EC%A0%9C%EC%96%B4%EB%A5%BC%20%EC%B0%BE%EC%95%84%EA%B0%80%EB%8A%94%20%EA%B3%BC%EC%A0%95%EC%9E%85%EB%8B%88%EB%8B%A4.&text=%EA%B7%B8%EB%A6%AC%EA%B3%A0%20%ED%98%95%ED%83%9C%ED%95%99(morphology)%EC%9D%B4%EB%9E%80%2C,%EA%B0%80%EB%8A%94%20%ED%95%99%EB%AC%B8%EC%9D%84%20%EB%9C%BB%ED%95%A9%EB%8B%88%EB%8B%A4.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [11]:
def clean_text(doc):
  cleaned_doc = []
  for word in doc.split(' '): # 도큐먼트(메일 내용)를 공백을 기준으로 분리함.
    word = word.lower() # 소문자로 변환
  
    if letters_only(word) and word not in all_names and len(word) > 2: # 위에서 정의한 함수들을 사용하여 클리닝 진행.
      cleaned_doc.append(lemmatizer.lemmatize(word))
  return ' '.join(cleaned_doc) 

cleaned_emails = [clean_text(doc) for doc in emails]

In [12]:
cleaned_emails[1] # 정제된 메일 예시

'paliourg top quality software low price phlyc quality software offer professional microsoft office professional only offer photoshop premiere illustrator only offer dreamwaver flash only advanced second sql server enterprise visual studio net architect encarta encyclopedia delux project money street and trip work picture premium exchange enterprise acrobat dreamwaver flash firework freehand draw graphic suite draw graphic suite photo painter word perfect office system work delphi enterprise xpress passport now onrqe'

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# 문자열 => 벡터 전환
cv = CountVectorizer(stop_words='english', max_features=500) # 출현빈도수 기준 상위 500개의 단어를 추출
term_docs = cv.fit_transform(cleaned_emails)

In [None]:
# 변환된 콘텐츠 확인
print(cleaned_emails[1]) # 이메일 내용
print(term_docs[1]) # 변환된 벡터 [(문서 인덱스, 단어 인덱스) 출현 횟수]
print(f"181번째 단어: {cv.get_feature_names()[181]}") # 181 인덱스 단어 확인

In [16]:
feature_names = cv.get_feature_names()
feature_mapping = cv.vocabulary_

In [None]:
feature_mapping # 상위 500개의 단어 인덱스

In [None]:
feature_names

# Naive Bayes 모델 구현

In [20]:
def get_prior(labels):
  """
  p(yk)
  
  각 클래스의 비율을 Dictionary로 반환
  """
  
#### Code Exercise 1: ham 메일과 spam 메일의 개수를 활용하여 p(yk)를 구하세요. ####
  
  ham_cnt = 0
  spam_cnt = 0

  #p(yk) : 클래스 y1, ... yk에 속할 확률
  #ham, spam counting
  for i in labels:
    if i == 0:
      ham_cnt += 1
    else:
      spam_cnt += 1

  #확률구하기
  prior_ham = ham_cnt / len(labels)
  prior_spam = spam_cnt / len(labels)

#### End of Code Exercise 1 ####
  prior = {
    0: prior_ham, 
    1: prior_spam
  }
  return prior

prior = get_prior(labels)

In [25]:
import numpy as np
def get_likelihood(term_doc_matrix, labels, smoothing=0):
  """
  p(x|yk) = p(x1|yk) * p(x2|yk) * ... * p(xn|yk)
  
  클래스(햄/스팸)에 특정 단어(x)가 포함되어있을 확률을 계산하고 Dictionary로 반환

  """
  ham_index = []
  spam_index = []

  for index, label in enumerate(labels):
    if label == 0:
      ham_index.append(index)
    else:
      spam_index.append(index)

#### Code Exercise 2: 햄과 스팸의 문서 인덱스를 활용하여 특정단어가 포함될 확률을 계산해주세요 (힌트: numpy를 활용하면 쉽습니다.)####
  # ham - smoothing
  ham_likelihood = term_doc_matrix[ham_index]
  ham_likelihood = ham_likelihood.sum(axis=0)
  ham_likelihood = ham_likelihood + smoothing
  ham_likelihood = ham_likelihood / float(ham_likelihood.sum())

  # spam - smotthing
  spam_likelihood = term_doc_matrix[spam_index]
  spam_likelihood = spam_likelihood.sum(axis=0)
  spam_likelihood = spam_likelihood + smoothing
  spam_likelihood = spam_likelihood / float(spam_likelihood.sum())

#### End of Code Exercise 2 ####
  likelihood = {
      0: ham_likelihood,
      1: spam_likelihood
  }

  return likelihood  

In [26]:
smoothing = 1
likelihood = get_likelihood(term_docs, labels, smoothing) 

In [27]:
def get_posterior(term_doc_matrix, prior, likelihood):

  """
  주어진 문서와 prior, likelihood를 바탕으로 posterior 계산
  """
  num_docs = term_doc_matrix.shape[0]
  posteriors = []

#### Code Exercise 3: 아래의 빈칸을 채워 posterior 함수를 완성하세요. ####
  for i in range(num_docs):
    posterior = {key: np.log(prior_label) for key, prior_label in prior.items()} 
    for label, likelihood_label in likelihood.items():
      term_document_vector = term_doc_matrix.getrow(i)
      for index in term_document_vector.indices :
        posterior[label] += np.log(likelihood_label[0, index])
    
#### End of Code Exercise 3 ####
    for label in posterior:
      try:
          posterior[label] = np.exp(posterior[label]) 
      except:      
          posterior[label] = float('inf')
    
    sum_posterior = sum(posterior.values())
    for label in posterior:
      if posterior[label] == float('inf'):
          posterior[label] = 1.0
      else:
          posterior[label] /= sum_posterior
    
    posteriors.append(posterior)

  return posteriors

In [28]:
posteriors = get_posterior(term_docs, prior, likelihood)



In [29]:
emails_test = [
    '''Subject: flat screens
    hello ,
    please call or contact regarding the other flat screens requested .
    trisha tlapek - eb 3132 b
    michael sergeev - eb 3132 a
    also the sun blocker that was taken away from eb 3131 a .
    trisha should two monitors also michael .
    thanks
    kevin moore''',
    '''Subject: having problems in bed ? we can help !
    cialis allows men to enjoy a fully normal sex life without having to plan the sexual act .
    if we let things terrify us , life will not be worth living .
    brevity is the soul of lingerie .
    suspicion always haunts the guilty mind .''',
]

In [30]:
cleaned_test = [clean_text(doc) for doc in emails_test]
term_docs_test = cv.transform(cleaned_test)
posteriors_test = get_posterior(term_docs_test, prior, likelihood)
print(posteriors_test)

[{0: 0.9743777025120353, 1: 0.02562229748796466}, {0: 0.008832277055985938, 1: 0.991167722944014}]


# Naive Bayes 모델 훈련하기

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(cleaned_emails, labels, test_size=0.33, random_state=123)
print(len(X_train), len(Y_train), len(X_test), len(Y_test))

3465 3465 1707 1707


In [32]:
term_docs_train = cv.fit_transform(X_train)
prior = get_prior(Y_train)
likelihood = get_likelihood(term_docs_train, Y_train, smoothing)

In [33]:
term_docs_test = cv.transform(X_test)
posterior = get_posterior(term_docs_test, prior, likelihood)



In [34]:
correct = 0.0
for pred, actual in zip(posterior, Y_test):
    if actual == 1:
        if pred[1] >= 0.5:
            correct += 1
    elif pred[0] > 0.5:
        correct += 1

In [35]:
print('The accuracy on {0} testing samples is: {1:.1f}%'.format(len(Y_test), correct/len(Y_test)*100))

The accuracy on 1707 testing samples is: 90.0%
