In [14]:
import re
import os
from jieba import cut
from itertools import chain
from collections import Counter
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report

In [15]:
def get_words(filename):
    """读取文本并过滤无效字符和长度为1的词"""
    words = []
    with open(filename, 'r', encoding='utf-8') as fr:
        for line in fr:
            line = line.strip()
            # 过滤无效字符
            line = re.sub(r'[.【】0-9、——。，！~\*]', '', line)
            # 使用jieba.cut()方法对文本切词处理
            line = cut(line)
            # 过滤长度为1的词
            line = filter(lambda word: len(word) > 1, line)
            words.extend(line)
    return words

all_words = []

In [16]:
def get_top_words(top_num):
    """遍历邮件建立词库后返回出现次数最多的词"""
    filename_list = ['邮件_files/{}.txt'.format(i) for i in range(151)]
    # 遍历邮件建立词库
    for filename in filename_list:
        all_words.append(get_words(filename))
    # itertools.chain()把all_words内的所有列表组合成一个列表
    # collections.Counter()统计词个数
    freq = Counter(chain(*all_words))
    return [i[0] for i in freq.most_common(top_num)]

In [17]:
def extract_features(feature_type='high_frequency'):
    top_words = get_top_words(100)
    if feature_type == 'high_frequency':
        vector = []
        for words in all_words:
            word_map = list(map(lambda word: words.count(word), top_words))
            vector.append(word_map)
        vector = np.array(vector)
    elif feature_type == 'tfidf':
        texts = []
        for words in all_words:
            texts.append(' '.join(words))
        vectorizer = TfidfVectorizer(vocabulary=top_words)
        vector = vectorizer.fit_transform(texts).toarray()
    else:
        raise ValueError("feature_type 必须是 'high_frequency' 或 'tfidf'")
    return vector

In [18]:
def train_model(feature_type='high_frequency'):
    vector = extract_features(feature_type)
    # 0 - 126.txt为垃圾邮件标记为1；127 - 151.txt为普通邮件标记为0
    labels = np.array([1] * 127 + [0] * 24)

    # 使用SMOTE进行过采样
    smote = SMOTE(random_state=42)
    X_resampled, y_resampled = smote.fit_resample(vector, labels)

    model = MultinomialNB()
    model.fit(X_resampled, y_resampled)
    return model, top_words

In [19]:
def predict(filename, model, top_words, feature_type='high_frequency'):
    """对未知邮件分类"""
    # 构建未知邮件的词向量
    words = get_words(filename)
    if feature_type == 'high_frequency':
        current_vector = np.array(
            tuple(map(lambda word: words.count(word), top_words)))
    elif feature_type == 'tfidf':
        text = ' '.join(words)
        vectorizer = TfidfVectorizer(vocabulary=top_words)
        current_vector = vectorizer.fit_transform([text]).toarray().flatten()
    # 预测结果
    result = model.predict(current_vector.reshape(1, -1))
    return result[0]

In [20]:

def evaluate_model(model, top_words, feature_type='high_frequency'):
    vector = extract_features(feature_type)
    labels = np.array([1] * 127 + [0] * 24)
    predictions = []
    for i in range(151):
        filename = f'邮件_files/{i}.txt'
        pred = predict(filename, model, top_words, feature_type)
        predictions.append(pred)
    report = classification_report(labels, predictions)
    return report

In [21]:
if __name__ == "__main__":
    # 使用高频词特征训练模型
    model_high_frequency, top_words_high_frequency = train_model(feature_type='high_frequency')
    print('使用高频词特征预测：')
    for i in range(151, 156):
        result = predict(f'邮件_files/{i}.txt', model_high_frequency, top_words_high_frequency, feature_type="high_frequency")
        print(f'{i}.txt分类情况: {"垃圾邮件" if result == 1 else "普通邮件"}')
    report_high_frequency = evaluate_model(model_high_frequency, top_words_high_frequency, feature_type='high_frequency')
    print("高频词特征模型评估报告：")
    print(report_high_frequency)

    # 使用TF - IDF加权特征训练模型
    model_tfidf, top_words_tfidf = train_model(feature_type='tfidf')
    print('使用TF - IDF加权特征预测：')
    for i in range(151, 156):
        result = predict(f'邮件_files/{i}.txt', model_tfidf, top_words_tfidf, feature_type="tfidf")
        print(f'{i}.txt分类情况: {"垃圾邮件" if result == 1 else "普通邮件"}')
    report_tfidf = evaluate_model(model_tfidf, top_words_tfidf, feature_type='tfidf')
    print("TF - IDF加权特征模型评估报告：")
    print(report_tfidf)



使用高频词特征预测：
151.txt分类情况: 垃圾邮件
152.txt分类情况: 垃圾邮件
153.txt分类情况: 普通邮件
154.txt分类情况: 垃圾邮件
155.txt分类情况: 普通邮件
高频词特征模型评估报告：
              precision    recall  f1-score   support

           0       0.60      1.00      0.75        24
           1       1.00      0.87      0.93       127

    accuracy                           0.89       151
   macro avg       0.80      0.94      0.84       151
weighted avg       0.94      0.89      0.90       151





ValueError: Found input variables with inconsistent numbers of samples: [453, 151]