In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import random
import nltk
from nltk.corpus import gutenberg, brown, reuters, webtext

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [2]:
# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,  # 若要增加作者数量，请更改此参数
    output_attentions=False,
    output_hidden_states=False,
)
model.to(device)

# 修改数据准备函数，支持从目录结构中读取多个作家的作品
def prepare_data_from_directory(data_dir="data", max_length=512, overlap_percent=0.3, balance_samples=True):
    """
    从目录结构中准备多个作家的文本数据，并生成用于训练的数据集
    
    参数:
    - data_dir: 包含作家文件夹的目录路径
    - max_length: 文本最大token长度
    - overlap_percent: 滑动窗口重叠比例
    - balance_samples: 是否平衡各类别的样本数量
    """
    import os
    
    all_texts = []
    all_labels = []
    author_samples_dict = {}  # 用于存储每位作家的样本
    
    # 获取作家目录列表
    author_dirs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
    print(f"发现 {len(author_dirs)} 位作家: {', '.join(author_dirs)}")
    
    label_names = author_dirs + ["未知作家"]
    
    # 读取每位作家的文本
    for idx, author_name in enumerate(author_dirs):
        author_path = os.path.join(data_dir, author_name)
        author_texts = []
        
        # 获取该作家的所有txt文件
        txt_files = [f for f in os.listdir(author_path) if f.endswith('.txt')]
        print(f"作家 {author_name}: 发现 {len(txt_files)} 个文本文件")
        
        # 读取每个文件并添加到作家文本列表
        for txt_file in txt_files:
            file_path = os.path.join(author_path, txt_file)
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    file_text = f.read()
                    
                # 对每个文件内容创建样本
                file_samples = create_sliding_window_samples(
                    file_text, 
                    tokenizer, 
                    max_length=max_length, 
                    overlap_tokens=int(max_length * overlap_percent)
                )
                author_texts.extend(file_samples)
                print(f"  - {txt_file}: 提取了 {len(file_samples)} 个样本")
            except Exception as e:
                print(f"  - 无法读取 {txt_file}: {str(e)}")
        
        # 存储该作家的所有样本
        author_samples_dict[author_name] = author_texts
        print(f"作家 {author_name}: 共 {len(author_texts)} 个样本")
    
    # 如果需要平衡样本，找出最小样本数量
    if balance_samples:
        min_author_samples = min([len(samples) for samples in author_samples_dict.values()])
        print(f"平衡样本: 每位作家将使用 {min_author_samples} 个样本")
        
        # 限制每位作家的样本数量
        for author_name, samples in author_samples_dict.items():
            if len(samples) > min_author_samples:
                # 随机选择样本子集
                author_samples_dict[author_name] = random.sample(samples, min_author_samples)
                print(f"  - 作家 {author_name}: 从 {len(samples)} 减少到 {min_author_samples} 个样本")
    
    # 将各作家样本添加到数据集
    for idx, (author_name, samples) in enumerate(author_samples_dict.items()):
        all_texts.extend(samples)
        all_labels.extend([idx] * len(samples))
    
    # 计算需要的"未知作家"样本数量（等于所有已知作家样本总和，占总样本数的一半）
    target_unknown_samples = len(author_samples_dict) * min_author_samples
    print(f"目标未知作家样本数量: {target_unknown_samples} (占总样本数的一半)")
    
    # 生成"未知作家"样本 - 使用多个NLTK语料库
    unknown_samples = []
    
    # 确保NLTK语料库已下载
    corpus_list = ['gutenberg', 'brown', 'reuters', 'webtext']
    for corpus_name in corpus_list:
        try:
            nltk.data.find(f'corpora/{corpus_name}')
        except LookupError:
            print(f"正在下载NLTK {corpus_name}语料库...")
            nltk.download(corpus_name)
            print(f"NLTK {corpus_name}语料库下载完成")
    
    # 从所有语料库收集样本
    all_corpus_samples = []
    print("从NLTK语料库收集样本...")
    
    # 1. Gutenberg语料库
    print("从Gutenberg语料库加载样本...")
    for fileid in gutenberg.fileids():
        text = gutenberg.raw(fileid)
        samples = create_sliding_window_samples(
            text, 
            tokenizer, 
            max_length=max_length, 
            overlap_tokens=int(max_length * overlap_percent),
            max_samples=50  # 限制每个文件的样本数
        )
        all_corpus_samples.extend(samples)
    
    # 2. Brown语料库
    print("从Brown语料库加载样本...")
    for fileid in brown.fileids():
        text = brown.raw(fileid)
        samples = create_sliding_window_samples(
            text, 
            tokenizer, 
            max_length=max_length, 
            overlap_tokens=int(max_length * overlap_percent),
            max_samples=25  # 限制每个文件的样本数
        )
        all_corpus_samples.extend(samples)
    
    # 3. Reuters语料库
    print("从Reuters语料库加载样本...")
    for fileid in reuters.fileids():
        text = reuters.raw(fileid)
        samples = create_sliding_window_samples(
            text, 
            tokenizer, 
            max_length=max_length, 
            overlap_tokens=int(max_length * overlap_percent),
            max_samples=15  # 限制每个文件的样本数
        )
        all_corpus_samples.extend(samples)
    
    # 4. Webtext语料库
    print("从Webtext语料库加载样本...")
    for fileid in webtext.fileids():
        text = webtext.raw(fileid)
        samples = create_sliding_window_samples(
            text, 
            tokenizer, 
            max_length=max_length, 
            overlap_tokens=int(max_length * overlap_percent),
            max_samples=10  # 限制每个文件的样本数
        )
        all_corpus_samples.extend(samples)
    
    print(f"从语料库中总共收集了 {len(all_corpus_samples)} 个样本")
    
    # 随机抽样以获取目标数量的未知作家样本
    if len(all_corpus_samples) < target_unknown_samples:
        print(f"警告：语料库样本不足，只能使用 {len(all_corpus_samples)} 个样本")
        unknown_samples = all_corpus_samples
    else:
        unknown_samples = random.sample(all_corpus_samples, target_unknown_samples)
    print(f"已抽取 {len(unknown_samples)} 个未知作家样本")
    
    # 将"未知作家"样本添加到数据集
    unknown_label = len(author_dirs)  # 最后一个标签是"未知作家"
    all_texts.extend(unknown_samples)
    all_labels.extend([unknown_label] * len(unknown_samples))
    
    # 打印数据集统计信息
    label_counts = [all_labels.count(i) for i in range(len(label_names))]
    print(f"各类别样本数: {label_counts}")
    print(f"总样本数: {len(all_texts)}")
    
    # 对文本进行分词和编码
    encoded_data = tokenizer(
        all_texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )
    
    # 创建数据集
    input_ids = encoded_data['input_ids']
    attention_masks = encoded_data['attention_mask']
    labels = torch.tensor(all_labels)
    
    # 分割为训练集和验证集
    train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
        input_ids, attention_masks, labels, test_size=0.2, random_state=42, stratify=labels
    )
    
    # 创建DataLoaders
    batch_size = 8
    
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    
    val_data = TensorDataset(val_inputs, val_masks, val_labels)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)
    
    return train_dataloader, val_dataloader, label_names

# 添加一个新函数，用于创建滑动窗口样本
def create_sliding_window_samples(text, tokenizer, max_length, overlap_tokens, max_samples=None):
    """
    使用滑动窗口方法从长文本中创建样本
    
    参数:
    - text: 输入文本
    - tokenizer: 分词器
    - max_length: 窗口的最大token长度
    - overlap_tokens: 相邻窗口之间重叠的token数量
    - max_samples: 最大样本数量限制
    
    返回:
    - samples: 文本样本列表
    """
    # 清理文本，移除多余空白符
    text = ' '.join(text.split())
    
    # 对整个文本进行分词，获取tokens
    tokens = tokenizer.encode(text)
    
    samples = []
    start_idx = 0
    
    # 使用滑动窗口切分文本
    while start_idx < len(tokens):
        # 确保不超出文本长度
        end_idx = min(start_idx + max_length, len(tokens))
        
        # 提取当前窗口的tokens
        window_tokens = tokens[start_idx:end_idx]
        
        # 仅保留足够长的窗口（至少100个token）
        if len(window_tokens) >= 100:
            # 将tokens转回文本
            window_text = tokenizer.decode(window_tokens)
            samples.append(window_text)
        
        # 如果已经到达文本末尾，退出循环
        if end_idx == len(tokens):
            break
            
        # 更新下一个窗口的起始位置（考虑重叠）
        start_idx += (max_length - overlap_tokens)
        
        # 如果达到样本数量限制，提前结束
        if max_samples and len(samples) >= max_samples:
            break
    
    return samples

# Function to train the model
def train_model(train_dataloader, val_dataloader, label_names, epochs):
    # 添加权重衰减和学习率调整
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.01)
    
    # Total number of training steps
    total_steps = len(train_dataloader) * epochs
    
    # Create the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=int(total_steps * 0.1),  # 10% 的步骤用于热身
        num_training_steps=total_steps
    )
    
    # 用于跟踪指标
    best_val_accuracy = 0
    best_val_loss = float('inf')
    
    # Training loop
    for epoch in range(epochs):
        print(f'======== Epoch {epoch + 1} / {epochs} ========')
        
        # Training
        model.train()
        total_train_loss = 0
        
        for batch in tqdm(train_dataloader):
            model.zero_grad()
            
            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            total_train_loss += loss.item()
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            
            optimizer.step()
            scheduler.step()
        
        avg_train_loss = total_train_loss / len(train_dataloader)
        print(f"Average training loss: {avg_train_loss}")
        
        # Validation
        model.eval()
        total_eval_accuracy = 0
        total_eval_loss = 0
        all_preds = []
        all_labels = []
        
        for batch in tqdm(val_dataloader):
            with torch.no_grad():
                input_ids = batch[0].to(device)
                attention_mask = batch[1].to(device)
                labels = batch[2].to(device)
                
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                
                loss = outputs.loss
                total_eval_loss += loss.item()
                
                logits = outputs.logits
                predictions = torch.argmax(logits, dim=1)
                
                # 收集预测和标签用于计算F1分数
                all_preds.extend(predictions.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
                
                accuracy = (predictions == labels).float().mean().item()
                total_eval_accuracy += accuracy
        
        avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
        avg_val_loss = total_eval_loss / len(val_dataloader)
        
        # 计算F1分数和其他指标
        from sklearn.metrics import classification_report, confusion_matrix
        report = classification_report(all_labels, all_preds, target_names=label_names, digits=4)
        conf_matrix = confusion_matrix(all_labels, all_preds)
        
        print(f"验证准确率: {avg_val_accuracy:.4f}")
        print(f"验证损失: {avg_val_loss:.4f}")
        print("分类报告:\n", report)
        print("混淆矩阵:\n", conf_matrix)
    
    print("训练完成!")
    return model

# 运行训练
# 使用新的数据准备函数
train_dataloader, val_dataloader, label_names = prepare_data_from_directory(data_dir="data", balance_samples=True)

# 更新训练函数调用
fine_tuned_model = train_model(train_dataloader, val_dataloader, label_names, epochs=3)

# 保存模型和标签名称
model_save_path = "../author_style_model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

# 同时保存标签名称到JSON文件
import json
with open(f"{model_save_path}/label_names.json", 'w') as f:
    json.dump(label_names, f)
    
print(f"已将作者风格模型和标签名称保存至 {model_save_path}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


发现 2 位作家: Maugham, Márquez
作家 Maugham: 发现 2 个文本文件


Token indices sequence length is longer than the specified maximum sequence length for this model (333588 > 512). Running this sequence through the model will result in indexing errors


  - Of_Human_Bondage.txt: 提取了 929 个样本
  - The_Moon_ And_ Sixpence .txt: 提取了 275 个样本
作家 Maugham: 共 1204 个样本
作家 Márquez: 发现 3 个文本文件
  - Big_Mama's_Funeral.txt: 提取了 19 个样本
  - No_One_Writes_to_the_Colonel.txt: 提取了 68 个样本
  - One_Hundred_Years_Of_Solitude.txt: 提取了 500 个样本
作家 Márquez: 共 587 个样本
平衡样本: 每位作家将使用 587 个样本
  - 作家 Maugham: 从 1204 减少到 587 个样本
目标未知作家样本数量: 1174 (占总样本数的一半)
正在下载NLTK reuters语料库...
NLTK reuters语料库下载完成
从NLTK语料库收集样本...
从Gutenberg语料库加载样本...


[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\zyh56\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!


从Brown语料库加载样本...
从Reuters语料库加载样本...
从Webtext语料库加载样本...
从语料库中总共收集了 20553 个样本
已抽取 1174 个未知作家样本
各类别样本数: [587, 587, 1174]
总样本数: 2348


100%|██████████| 235/235 [01:39<00:00,  2.36it/s]


Average training loss: 0.22552112632014967


100%|██████████| 59/59 [00:07<00:00,  7.97it/s]


验证准确率: 0.9979
验证损失: 0.0104
分类报告:
               precision    recall  f1-score   support

     Maugham     0.9916    1.0000    0.9958       118
     Márquez     1.0000    1.0000    1.0000       117
        未知作家     1.0000    0.9957    0.9979       235

    accuracy                         0.9979       470
   macro avg     0.9972    0.9986    0.9979       470
weighted avg     0.9979    0.9979    0.9979       470

混淆矩阵:
 [[118   0   0]
 [  0 117   0]
 [  1   0 234]]


100%|██████████| 235/235 [01:39<00:00,  2.35it/s]


Average training loss: 0.01041638583038993


100%|██████████| 59/59 [00:07<00:00,  8.06it/s]


验证准确率: 0.9958
验证损失: 0.0233
分类报告:
               precision    recall  f1-score   support

     Maugham     0.9833    1.0000    0.9916       118
     Márquez     1.0000    1.0000    1.0000       117
        未知作家     1.0000    0.9915    0.9957       235

    accuracy                         0.9957       470
   macro avg     0.9944    0.9972    0.9958       470
weighted avg     0.9958    0.9957    0.9958       470

混淆矩阵:
 [[118   0   0]
 [  0 117   0]
 [  2   0 233]]


100%|██████████| 235/235 [01:40<00:00,  2.35it/s]


Average training loss: 0.0015845272231827232


100%|██████████| 59/59 [00:07<00:00,  8.01it/s]


验证准确率: 0.9958
验证损失: 0.0200
分类报告:
               precision    recall  f1-score   support

     Maugham     0.9833    1.0000    0.9916       118
     Márquez     1.0000    1.0000    1.0000       117
        未知作家     1.0000    0.9915    0.9957       235

    accuracy                         0.9957       470
   macro avg     0.9944    0.9972    0.9958       470
weighted avg     0.9958    0.9957    0.9958       470

混淆矩阵:
 [[118   0   0]
 [  0 117   0]
 [  2   0 233]]
训练完成!
已将作者风格模型和标签名称保存至 ../author_style_model


In [3]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import json

# 加载保存的模型和分词器
model_path = "../author_style_model"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# 预测部分
def analyze_text_style(text, confidence_threshold=0.6):
    """分析文本属于哪位作家的写作风格，或判断为未知作家"""
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    
    # 获取最高概率及其对应的类别
    max_prob, predicted_class = torch.max(probabilities, dim=1)
    max_prob = max_prob.item()
    predicted_class = predicted_class.item()
    
    # 载入标签名称
    with open(f"{model_path}/label_names.json", 'r') as f:
        label_names = json.load(f)
    
    # 如果最高概率低于阈值，认为是未知作家
    if max_prob < confidence_threshold:
        result = {
            "predicted_author": "未知作家",
            "confidence": 1 - max_prob,  # 未知的置信度
            "probabilities": {name: prob.item() for name, prob in zip(label_names, probabilities[0])}
        }
    else:
        result = {
            "predicted_author": label_names[predicted_class],
            "confidence": max_prob,
            "probabilities": {name: prob.item() for name, prob in zip(label_names, probabilities[0])}
        }
    
    return result

# 测试示例
test_texts = [
    # Márquez
    "A police inspector had come forward with a very young medical student who was completing his forensic training at the municipal dispensary, and it was they who had ventilated the room and covered the body while waiting for Dr. Urbino to arrive. They greeted him with a solemnity that on this occasion had more of condolence than veneration, for no one was unaware of the degree of his friendship with Jeremiah de Saint-Amour. The eminent teacher shook hands with each of them, as he always did with every one of his pupils before beginning the daily class in general clinical medicine, and then, as if it were a flower, he grasped the hem of the blanket with the tips of his index finger and his thumb, and slowly uncovered the body with sacramental circumspection. Jeremiah de Saint-Amour was completely naked, stiff and twisted, eyes open, body blue, looking fifty years older than he had the night before. He had luminous pupils, yellowish beard and hair, and an old scar sewn with baling knots across his stomach. The use of crutches had made his torso and arms as broad as a galley slave’s, but his defenseless legs looked like an orphan’s. Dr. Juvenal Urbino studied him for a moment, his heart aching as it rarely had in the long years of his futile struggle against death.",

    # Maugham
    "I confess that when first I made acquaintance with Charles Strickland I never for a moment discerned that there was in him anything out of the ordinary. Yet now few will be found to deny his greatness. I do not speak of that greatness which is achieved by the fortunate politician or the successful soldier; that is a quality which belongs to the place he occupies rather than to the man; and a change of circumstances reduces it to very discreet proportions. The Prime Minister out of office is seen, too often, to have been but a pompous rhetorician, and the General without at! army is but the tame hero of a market town. The greatness of Charles Strickland was authentic. It may be that you do not like his art, but at all events you can hardly refuse it the tribute of your interest. He disturbs and arrests. The time has passed when he was an object of ridicule, and it is no longer a mark of eccentricity to defend or of perversity to extol him. His faults are accepted as the necessary complement to his merits. It is still possible to discuss his place in art, and the adulation of his admirers is perhaps no less capricious than the .disparagement of his detractors; but one thing cantilever be doubtful, and that is that he had genius. Tjo rqfy mind the most interesting thing in art is the personality of the artist ; and if that is singu- lar, I am willing to excuse a thousand faults. I suppose Velasquez was a better painter than El Greco, but custom stales one’s admiration for him: the Cretan, sensual and tragic, proffers the mystery of his soul like a standing sacrifice. The artist, painter, poet, or musician, by his decoration, sublime or beautiful, satisfies the aesthetic sense; but that is akin to the sexual instinct, anjl shares its barbarity: he lays before you also the greater gift of himself. To pursue his secret has something of the fascination of a detective story. It is a riddle which shares with the universe the merit of having no answer. The most insignificant of Strickland’s works suggests a personality which is strange, tormented, and complex; and it is this surely which prevents even those who do not like his pictures from being indifferent to them; it is this which has excited so curious an interest in his life and character. ",
    
    # Unknown author
    "Our team has decided to create the dataset ourselves, consisting of articles from famous authors. The dataset structure includes work title, author, article content, and sentiment labels (positive, neutral, negative). We will select both Chinese and English articles in their original forms to avoid translation biases. Preprocessing will standardize the text by removing punctuation, converting text to lowercase, removing stopwords, and applying other necessary preprocessing techniques. Sentiment labels will be added through manual annotation and leveraging publicly available sentiment datasets. Our primary approach is to fine-tune BERT based on the collected dataset. Given our multilingual dataset, we have opted to use Multilingual BERT. If initial results are unsatisfactory, separate models may be trained for Chinese and English texts. Considering BERT's token limit (512 tokens), we plan to employ a sliding window segmentation approach. Should segmentation negatively impact stylistic judgment, we will explore alternative models like LSTM combined with BERT to maintain context continuity. Additionally, we will fine-tune BERT or RoBERTa for sentiment analysis, potentially incorporating multi-task learning to simultaneously optimize authorship and sentiment classification tasks.",

    # Unknown author
    "In recent years, the rapid development of large language models (LLMs) has reshaped artificial intelligence research and applications. DeepSeek is an open-source AI research initiative dedicated to advancing natural language processing (NLP) through state-of-the-art LLMs. By offering open-access models and tools, DeepSeek aims to democratize AI capabilities and accelerate innovation in various fields. DeepSeek was established to address the growing need for transparency and accessibility in AI research. Unlike proprietary models from major tech companies, DeepSeek provides publicly available models that can be fine-tuned and deployed by researchers, developers, and businesses. This open approach fosters collaboration and ensures that AI advancements benefit a wider audience. DeepSeek's models are designed to support a wide range of NLP tasks, including text classification, sentiment analysis, question answering, and more. By leveraging large-scale pre-trained models, DeepSeek enables users to achieve high performance on various NLP benchmarks with minimal effort. DeepSeek's models are built on top of the Hugging Face Transformers library, a popular open-source framework for training and deploying transformer models. This integration allows users to easily access and utilize DeepSeek's models within their existing workflows. DeepSeek's models are available in multiple languages, making them suitable for global applications. Whether you're working on English, Chinese, Spanish, or other languages, DeepSeek provides models that can be fine-tuned for specific tasks and domains. DeepSeek's models are trained on diverse datasets to ensure robust performance across different languages and domains. By incorporating multilingual data and transfer learning techniques, DeepSeek's models can effectively handle various NLP tasks with high accuracy and efficiency. DeepSeek's models are continuously updated and refined to reflect the latest advancements in NLP research. By staying at the forefront of AI innovation, DeepSeek aims to empower users with cutting-edge tools and technologies for natural language processing. DeepSeek's mission is to democratize AI research and foster collaboration among researchers, developers, and businesses. By providing open-access models and resources, DeepSeek enables users to leverage state-of-the-art AI capabilities for a wide range of applications. Whether you're a student, researcher, developer, or business professional, DeepSeek offers tools and models that can accelerate your AI projects and drive innovation in the field of natural language processing.",
]

for text in test_texts:
    result = analyze_text_style(text)
    print(f"文本: {text[:50]}...")
    print(f"预测作家: {result['predicted_author']}")
    print(f"置信度: {result['confidence']:.2f}")
    print("所有类别概率:")
    for author, prob in result['probabilities'].items():
        print(f"  - {author}: {prob:.4f}")
    print("-" * 50)


文本: A police inspector had come forward with a very yo...
预测作家: Márquez
置信度: 0.95
所有类别概率:
  - Maugham: 0.0444
  - Márquez: 0.9549
  - 未知作家: 0.0007
--------------------------------------------------
文本: I confess that when first I made acquaintance with...
预测作家: Maugham
置信度: 1.00
所有类别概率:
  - Maugham: 0.9992
  - Márquez: 0.0003
  - 未知作家: 0.0005
--------------------------------------------------
文本: Our team has decided to create the dataset ourselv...
预测作家: 未知作家
置信度: 1.00
所有类别概率:
  - Maugham: 0.0013
  - Márquez: 0.0008
  - 未知作家: 0.9979
--------------------------------------------------
文本: In recent years, the rapid development of large la...
预测作家: 未知作家
置信度: 1.00
所有类别概率:
  - Maugham: 0.0017
  - Márquez: 0.0016
  - 未知作家: 0.9966
--------------------------------------------------
