In [1]:
import re
import os
from jieba import cut
from itertools import chain
from collections import Counter
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [2]:
def get_words(filename):
    """Read a text file and return its segmented words.

    Each line is stripped, cleared of the characters matched by the filter
    pattern below (digits and assorted punctuation), segmented with
    ``jieba.cut`` and reduced to the tokens longer than one character.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        list: The kept tokens of every line, in reading order. If the file
        does not exist, a message is printed and whatever was collected so
        far (an empty list) is returned.
    """
    tokens = []
    try:
        with open(filename, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                # Drop the filtered characters before segmentation.
                cleaned = re.sub(r'[.【】0-9、——。，！~\*]', '', raw_line.strip())
                # Segment the line and keep only multi-character tokens.
                tokens += [token for token in cut(cleaned) if len(token) > 1]
    except FileNotFoundError:
        print(f"文件 {filename} 未找到。")
    return tokens

In [3]:
def get_top_words(top_num, filename_list):
    """Return the most frequent words found across a collection of files.

    Args:
        top_num: Maximum number of words to return.
        filename_list: Iterable of file paths; each is tokenised with
            ``get_words``.

    Returns:
        list: Up to ``top_num`` words ordered from most to least frequent,
        as given by ``collections.Counter.most_common``.
    """
    # Feed the tokens of every file, in order, straight into one counter.
    tally = Counter(chain.from_iterable(get_words(path) for path in filename_list))
    return [word for word, _occurrences in tally.most_common(top_num)]

In [4]:
def extract_features(filename_list, feature_method='top_words', top_num=100):
    """Build a dense document-feature matrix for the given files.

    Args:
        filename_list: Sequence of file paths, one document per file.
        feature_method: ``'top_words'`` for raw counts of the most frequent
            words, or ``'tfidf'`` for TF-IDF weights.
        top_num: Number of top words to use as columns; only used by the
            ``'top_words'`` method.

    Returns:
        tuple: ``(matrix, vocabulary)``.
            * ``'top_words'``: ``matrix`` is a NumPy array with one row per
              file and one column per top word; ``vocabulary`` is the list
              of top words.
            * ``'tfidf'``: ``matrix`` is the dense result of
              ``TfidfVectorizer.fit_transform``; ``vocabulary`` is the value
              of ``get_feature_names_out()``.

    Raises:
        ValueError: If ``feature_method`` is not one of the supported values.
    """
    # Fail fast: silently returning None makes callers crash later with an
    # unrelated-looking unpacking error.
    if feature_method not in ('top_words', 'tfidf'):
        raise ValueError(
            "Unsupported feature_method {!r}; expected 'top_words' or 'tfidf'.".format(
                feature_method))

    if feature_method == 'top_words':
        top_words = get_top_words(top_num, filename_list)
        vector = []
        for filename in filename_list:
            # Count every token once instead of rescanning the token list
            # for each top word; missing words count as 0.
            counts = Counter(get_words(filename))
            vector.append([counts[word] for word in top_words])
        return np.array(vector), top_words

    # TF-IDF: each document is its kept tokens joined by single spaces.
    corpus = [" ".join(get_words(filename)) for filename in filename_list]
    vectorizer = TfidfVectorizer()
    vector = vectorizer.fit_transform(corpus)
    # NOTE: only the feature names are returned; the fitted vectorizer (and
    # with it the learned IDF weights) is not handed back to the caller.
    return vector.toarray(), vectorizer.get_feature_names_out()

In [5]:
def train_model(filename_list, feature_method='top_words', top_num=100, labels=None):
    """Train and evaluate a Multinomial Naive Bayes spam classifier.

    The files are turned into features with ``extract_features``, split
    80/20 into train and test sets, the training part is rebalanced with
    SMOTE, and a classification report for the test part is printed.

    Args:
        filename_list: Sequence of training file paths.
        feature_method: Passed through to ``extract_features``.
        top_num: Passed through to ``extract_features``.
        labels: Optional sequence with one label per file (1 = spam,
            0 = ham). When omitted, the original layout is assumed: the
            first 127 files are spam and the following 24 files are ham.

    Returns:
        tuple: ``(model, top_words)`` where ``model`` is the fitted
        ``MultinomialNB`` and ``top_words`` is the vocabulary returned by
        ``extract_features``.

    Raises:
        ValueError: If the number of labels differs from the number of files.
    """
    if labels is None:
        # Default corpus: 0.txt - 126.txt are spam (1), 127.txt - 150.txt are ham (0).
        labels = [1] * 127 + [0] * 24
    labels = np.asarray(labels)

    # Check before the (expensive) feature extraction so a mismatch is
    # reported immediately and with a message that names the real cause.
    if len(labels) != len(filename_list):
        raise ValueError(
            "Got {} labels for {} files; pass `labels` with one entry per file.".format(
                len(labels), len(filename_list)))

    vector, top_words = extract_features(filename_list, feature_method, top_num)

    # Hold out 20% of the samples for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(vector, labels, test_size=0.2, random_state=42)

    # Oversample the minority class in the training split only.
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    model = MultinomialNB()
    model.fit(X_train_resampled, y_train_resampled)

    # Evaluate on the untouched test split.
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred)
    print(f"使用 {feature_method} 特征的模型评估报告：")
    print(report)

    return model, top_words

In [6]:
def predict(filename, model, top_words, feature_method='top_words'):
    """Classify a single unseen e-mail.

    Args:
        filename: Path of the e-mail text file to classify.
        model: Fitted classifier exposing ``predict``.
        top_words: Vocabulary returned by ``train_model`` for the same
            ``feature_method``.
        feature_method: ``'top_words'`` or ``'tfidf'``; must match the
            method the model was trained with.

    Returns:
        str: ``'垃圾邮件'`` when the model predicts label 1, otherwise
        ``'普通邮件'``.

    Raises:
        ValueError: If ``feature_method`` is not one of the supported values.
    """
    # Reject unknown methods explicitly instead of failing later on an
    # unbound local variable.
    if feature_method not in ('top_words', 'tfidf'):
        raise ValueError(
            "Unsupported feature_method {!r}; expected 'top_words' or 'tfidf'.".format(
                feature_method))

    words = get_words(filename)
    if feature_method == 'top_words':
        # One pass over the tokens; words absent from the mail count as 0.
        counts = Counter(words)
        current_vector = np.array([counts[word] for word in top_words])
    else:
        # NOTE(review): the vectorizer is refitted on this single document,
        # so its IDF weights are not the ones learned on the training
        # corpus. Fixing that needs the fitted training vectorizer, which
        # the current train_model/extract_features interface does not expose.
        vectorizer = TfidfVectorizer(vocabulary=top_words)
        corpus = [" ".join(words)]
        current_vector = vectorizer.fit_transform(corpus).toarray()[0]

    # The model expects a 2-D array: one row, one column per feature.
    result = model.predict(current_vector.reshape(1, -1))
    # `result` holds exactly one prediction; read it explicitly.
    return '垃圾邮件' if result[0] == 1 else '普通邮件'

In [7]:
if __name__ == "__main__":
    # Training corpus: files numbered 0 through 150.
    filename_list = list(map('邮件_files/{}.txt'.format, range(151)))

    # Unseen e-mails, each paired with the template used to print its verdict.
    unseen_mails = (
        ('邮件_files/151.txt', '151.txt 分类情况:{}'),
        ('邮件_files/152.txt', '152.txt 分类情况:{}'),
    )

    # Train on high-frequency word counts, then classify the unseen e-mails.
    model_top_words, top_words_top_words = train_model(
        filename_list, feature_method='top_words', top_num=100)
    print("使用高频词特征进行预测：")
    for mail_path, line_template in unseen_mails:
        verdict = predict(mail_path, model_top_words, top_words_top_words,
                          feature_method='top_words')
        print(line_template.format(verdict))

    # Train on TF-IDF features, then classify the same unseen e-mails.
    model_tfidf, top_words_tfidf = train_model(filename_list, feature_method='tfidf')
    print("\n使用 TF-IDF 特征进行预测：")
    for mail_path, line_template in unseen_mails:
        verdict = predict(mail_path, model_tfidf, top_words_tfidf,
                          feature_method='tfidf')
        print(line_template.format(verdict))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\15113\AppData\Local\Temp\jieba.cache
Loading model cost 1.766 seconds.
Prefix dict has been built successfully.


使用 top_words 特征的模型评估报告：
              precision    recall  f1-score   support

           0       0.75      1.00      0.86         6
           1       1.00      0.92      0.96        25

    accuracy                           0.94        31
   macro avg       0.88      0.96      0.91        31
weighted avg       0.95      0.94      0.94        31

使用高频词特征进行预测：
151.txt 分类情况:普通邮件
152.txt 分类情况:垃圾邮件




使用 tfidf 特征的模型评估报告：
              precision    recall  f1-score   support

           0       0.75      0.50      0.60         6
           1       0.89      0.96      0.92        25

    accuracy                           0.87        31
   macro avg       0.82      0.73      0.76        31
weighted avg       0.86      0.87      0.86        31


使用 TF-IDF 特征进行预测：
151.txt 分类情况:垃圾邮件
152.txt 分类情况:垃圾邮件


[WinError 2] 系统找不到指定的文件。
  File "D:\Anaconda3\envs\nlp_course\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "D:\Anaconda3\envs\nlp_course\lib\subprocess.py", line 505, in run
    with Popen(*popenargs, **kwargs) as process:
  File "D:\Anaconda3\envs\nlp_course\lib\subprocess.py", line 951, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "D:\Anaconda3\envs\nlp_course\lib\subprocess.py", line 1436, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
