之前我们利用朴素贝叶斯分类器，在提取出的特征空间上进行垃圾邮件检测。该特征空间用词频（term frequency，tf）表示，即一个**文本文档集合**被转换成一个**词频矩阵**。词频反映了各个词（term）在每篇文档中的分布情况，但无法反映它们在整个语料库中的分布情况。例如，有些词在某种语言中本来就普遍高频出现，而有些词虽然很少出现，却传达了重要信息。

由此，应该采用一个更宽泛的方法提取文本特征。

## term frequency-inverse document frequency (tf-idf)：
它赋予每个【term frequency】一个权重，这个权重与【document frequency】成反比。具体地，对于文档集合 D，词 t 的 idf 因子为：![](idf.png)（即 $\mathrm{idf}(t, D) = \log\dfrac{n_D}{1 + n_t}$）

其中 n_D 是文档总数，n_t 是包含词 t 的文档数；分母中加 1 是为了防止除零错误。

并入idf因子后，常见词如'get'、'make'的权重就降低了，频率低但有意义的词权重升高。

测试下tf-idf的效果：在垃圾邮件检测模型中，仅替换

    tf feature extractor CountVectorizer 为
    tf-idf feature extractor TfidfVectorizer

In [22]:
from nltk.corpus import names
from nltk.stem import WordNetLemmatizer
import glob
import os
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer


# Read every .txt file under the spam and ham folders.
# `emails` collects the raw text; `labels` holds the matching class
# (1 for files from the spam folder, 0 for files from the ham folder).
emails, labels = [], []
for file_path, label in (('enron1/spam/', 1), ('enron1/ham/', 0)):
    pattern = os.path.join(file_path, '*.txt')
    for filename in glob.glob(pattern):
        with open(filename, 'r', encoding='ISO-8859-1') as infile:
            emails.append(infile.read())
        labels.append(label)
        
def letters_only(astr):
    """Return True when every character of *astr* is alphabetic.

    An empty string has no offending character, so it yields True.
    """
    return all(ch.isalpha() for ch in astr)
    
# Words of the NLTK `names` corpus, stored as a set; clean_text tests
# each token for membership in it.
all_names = set(names.words())
# One WordNetLemmatizer instance, reused by clean_text for every token.
lemmatizer = WordNetLemmatizer()

def clean_text(docs):
    """Return a cleaned copy of each document in *docs*.

    Each document is split on whitespace. A token is kept only when it
    consists solely of letters and is not found in ``all_names`` (the
    check uses the token as written, before lower-casing). Kept tokens
    are lower-cased, lemmatized and re-joined with single spaces.

    Args:
        docs: iterable of document strings.

    Returns:
        A list with one cleaned string per input document.
    """
    def _keep(token):
        # Same two filters as before, in the same short-circuit order.
        return letters_only(token) and token not in all_names

    return [
        ' '.join(
            lemmatizer.lemmatize(token.lower())
            for token in doc.split()
            if _keep(token)
        )
        for doc in docs
    ]
    
from collections import defaultdict

from sklearn.model_selection import StratifiedKFold

# A list of cleaned strings, e.g. ['my name is A', 'cat and dog'].
cleaned_emails = clean_text(emails)

# Convert the lists to NumPy arrays so that each fold can be selected
# with an array of indices.
cleaned_emails_np = np.array(cleaned_emails)
labels_np = np.array(labels)

# Stratified cross-validation with k folds.
k = 10
k_fold = StratifiedKFold(n_splits=k)

# Candidate smoothing values to compare; auc_record maps each value to
# the sum of its per-fold AUC scores (missing keys start at 0.0).
smoothing_factor_option = [1.0, 2.0, 3.0, 4.0, 5.0]
auc_record = defaultdict(float)

# For every fold: build tf-idf features, then train and score one
# MultinomialNB per smoothing value, adding each AUC to auc_record.
for train_indices, test_indices in k_fold.split(cleaned_emails, labels):
    X_train = cleaned_emails_np[train_indices]
    X_test = cleaned_emails_np[test_indices]
    Y_train = labels_np[train_indices]
    Y_test = labels_np[test_indices]

    # The vectorizer is fitted on the training fold only; the test fold
    # is transformed with that already-fitted vectorizer.
    tfidf_vectorizer = TfidfVectorizer(
        sublinear_tf=True,
        max_df=0.5,
        stop_words='english',
        max_features=8000,
    )
    term_docs_train = tfidf_vectorizer.fit_transform(X_train)
    term_docs_test = tfidf_vectorizer.transform(X_test)

    for smoothing_factor in smoothing_factor_option:
        clf = MultinomialNB(alpha=smoothing_factor, fit_prior=True)
        clf.fit(term_docs_train, Y_train)
        # Column 1 of predict_proba is used as the score for label 1.
        pos_prob = clf.predict_proba(term_docs_test)[:, 1]
        auc_record[smoothing_factor] += roc_auc_score(Y_test, pos_prob)
        
# Raw AUC sums per smoothing value, accumulated over all folds.
print(auc_record)

# One table row per smoothing value, showing the mean AUC (sum / k).
print('max features  smoothing  fit prior  auc')
row_template = '   8000      {0}        true      {1:.4f}'
for smoothing, smoothing_record in auc_record.items():
    mean_auc = smoothing_record / k
    print(row_template.format(smoothing, mean_auc))


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=2.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=3.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=4.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=5.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=2.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=3.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=4.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=5.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=2.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=3.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=4.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=5.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=2.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=3.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=4.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=5.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=2.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=3.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=4.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=5.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=2.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=3.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=4.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=5.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=2.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=3.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=4.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=5.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=2.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=3.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=4.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=5.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=2.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=3.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=4.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=5.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=2.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=3.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=4.0, class_prior=None, fit_prior=True)

MultinomialNB(alpha=5.0, class_prior=None, fit_prior=True)

defaultdict(<class 'float'>, {1.0: 9.9195833333333319, 2.0: 9.9295856039963653, 3.0: 9.9355215120641311, 4.0: 9.939931978833469, 5.0: 9.9425095269122927})
max features  smoothing  fit prior  auc
   8000      1.0        true      0.9920
   8000      2.0        true      0.9930
   8000      3.0        true      0.9936
   8000      4.0        true      0.9940
   8000      5.0        true      0.9943


### 10折交叉验证的平均AUC最高可达0.9943（alpha=5.0时），胜过上一章基于tf特征的0.9856