In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import random

# Seed every random number generator in use so repeated runs are reproducible
seed_val = 42
torch.manual_seed(seed_val)
np.random.seed(seed_val)
random.seed(seed_val)

# Pick the GPU when one is available (seeding all CUDA generators too),
# otherwise fall back to the CPU
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed_val)
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [2]:
# Load the pre-trained BERT tokenizer and classification model
model_name = "bert-base-uncased"  # any other BERT variant can be substituted here
tokenizer = BertTokenizer.from_pretrained(model_name)

# Binary classification head: label 1 = target author's style, label 0 = not
# the target author. The model is moved to the selected device right away.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
).to(device)

# Load and prepare the author text data
def _load_paragraph_samples(file_path, min_chars=50):
    """Read a UTF-8 text file and return its usable paragraphs.

    The text is split on blank lines; each paragraph is stripped and kept only
    when the stripped text is longer than ``min_chars`` characters.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        raw_text = f.read()
    stripped_paragraphs = (para.strip() for para in raw_text.split('\n\n'))
    return [para for para in stripped_paragraphs if len(para) > min_chars]


def prepare_data(author_file_path, other_authors_file_path=None, max_length=256, batch_size=8):
    """
    Load the author's text and build the training / validation DataLoaders.

    Parameters:
    - author_file_path: path to the target author's works (label 1)
    - other_authors_file_path: optional path to other authors' works used as
      contrast samples (label 0)
    - max_length: maximum token length; texts are padded / truncated to it
    - batch_size: batch size used for both DataLoaders

    Returns:
    - (train_dataloader, val_dataloader) built from an 80/20 stratified split

    Raises:
    - ValueError: if no paragraph longer than 50 characters is found
    """
    author_samples = _load_paragraph_samples(author_file_path)
    other_samples = (
        _load_paragraph_samples(other_authors_file_path)
        if other_authors_file_path
        else []
    )

    # Positives come first, so indices [0, num_pos) are the target-author samples
    num_pos = len(author_samples)
    num_neg = len(other_samples)
    all_texts = author_samples + other_samples
    all_labels = [1] * num_pos + [0] * num_neg

    # Cap the positives at twice the number of negatives. Skip this when there
    # are no negatives at all: the cap would be zero and every sample would be
    # thrown away.
    max_pos = num_neg * 2
    if num_neg > 0 and num_pos > max_pos:
        # A set gives O(1) membership tests in the filters below
        remove_indices = set(random.sample(list(range(num_pos)), num_pos - max_pos))
        all_texts = [text for i, text in enumerate(all_texts) if i not in remove_indices]
        all_labels = [label for i, label in enumerate(all_labels) if i not in remove_indices]

    print(f"总样本数: {len(all_texts)}, 正样本: {sum(all_labels)}, 负样本: {len(all_labels) - sum(all_labels)}")

    if not all_texts:
        raise ValueError(
            "No usable paragraphs found; each paragraph must be longer than 50 characters."
        )

    # Tokenize and encode the texts
    encoded_data = tokenizer(
        all_texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

    input_ids = encoded_data['input_ids']
    attention_masks = encoded_data['attention_mask']
    labels = torch.tensor(all_labels)

    # Stratified split into training and validation sets
    train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
        input_ids, attention_masks, labels, test_size=0.2, random_state=seed_val, stratify=labels
    )

    # Training batches are drawn in random order, validation batches sequentially
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    val_data = TensorDataset(val_inputs, val_masks, val_labels)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

    return train_dataloader, val_dataloader

# Function to train the model
def train_model(train_dataloader, val_dataloader, epochs=4):
    """Fine-tune the module-level ``model`` and print validation metrics per epoch.

    Parameters:
    - train_dataloader: yields (input_ids, attention_mask, labels) training batches
    - val_dataloader: yields batches of the same layout for validation
    - epochs: number of passes over the training data

    Returns:
    - the module-level ``model`` object, which is updated in place
    """
    # Imported once per call rather than once per epoch inside the loop
    from sklearn.metrics import classification_report, confusion_matrix

    # AdamW with weight decay
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.01)

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Linear schedule; the first 10% of the steps are used for warm-up
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(total_steps * 0.1),
        num_training_steps=total_steps
    )

    # Training loop
    for epoch in range(epochs):
        print(f'======== Epoch {epoch + 1} / {epochs} ========')

        # Training
        model.train()
        total_train_loss = 0

        for batch in tqdm(train_dataloader):
            model.zero_grad()

            input_ids = batch[0].to(device)
            attention_mask = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_train_loss += loss.item()

            loss.backward()
            # Clip the gradient norm to 1.0 before the optimizer step
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        print(f"Average training loss: {avg_train_loss}")

        # Validation
        model.eval()
        summed_eval_loss = 0.0
        correct_predictions = 0
        all_preds = []
        all_labels = []

        for batch in tqdm(val_dataloader):
            with torch.no_grad():
                input_ids = batch[0].to(device)
                attention_mask = batch[1].to(device)
                labels = batch[2].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                # Weight the batch loss by the batch size so that a smaller
                # final batch does not skew the epoch average
                summed_eval_loss += outputs.loss.item() * labels.size(0)

                predictions = torch.argmax(outputs.logits, dim=1)
                correct_predictions += (predictions == labels).sum().item()

                # Collect predictions and labels for the classification report
                all_preds.extend(predictions.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        # Per-sample averages (not averages of per-batch averages), so the
        # accuracy agrees with the one in the classification report
        num_val_samples = len(all_labels)
        avg_val_accuracy = correct_predictions / num_val_samples
        avg_val_loss = summed_eval_loss / num_val_samples

        # Passing labels explicitly keeps the two-class report and the 2x2
        # matrix valid even if one class is absent from the validation data
        report = classification_report(
            all_labels,
            all_preds,
            labels=[0, 1],
            target_names=['非目标作者', '目标作者'],
            digits=4,
        )
        conf_matrix = confusion_matrix(all_labels, all_preds, labels=[0, 1])

        print(f"验证准确率: {avg_val_accuracy:.4f}")
        print(f"验证损失: {avg_val_loss:.4f}")
        print("分类报告:\n", report)
        print("混淆矩阵:\n", conf_matrix)

    print("训练完成!")
    return model

# Run training
author_file_path = "author_works.txt"
other_authors_file_path = "author_works2.txt"

# Build the DataLoaders with the other authors' text as contrast samples,
# then fine-tune for 10 epochs
train_dataloader, val_dataloader = prepare_data(
    author_file_path=author_file_path,
    other_authors_file_path=other_authors_file_path,
)
fine_tuned_model = train_model(
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    epochs=10,
)

# Save the model and tokenizer side by side in the same directory
model_save_path = "../author_style_model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"已将作者风格模型保存至 {model_save_path}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


总样本数: 2198, 正样本: 848, 负样本: 1350


100%|██████████| 220/220 [00:47<00:00,  4.60it/s]


Average training loss: 0.36558865130646157


100%|██████████| 55/55 [00:03<00:00, 16.75it/s]


验证准确率: 0.9750
验证损失: 0.0801
分类报告:
               precision    recall  f1-score   support

       非目标作者     0.9814    0.9778    0.9796       270
        目标作者     0.9649    0.9706    0.9677       170

    accuracy                         0.9750       440
   macro avg     0.9732    0.9742    0.9737       440
weighted avg     0.9750    0.9750    0.9750       440

混淆矩阵:
 [[264   6]
 [  5 165]]


100%|██████████| 220/220 [00:46<00:00,  4.75it/s]


Average training loss: 0.08574208684916482


100%|██████████| 55/55 [00:03<00:00, 17.21it/s]


验证准确率: 0.9795
验证损失: 0.0723
分类报告:
               precision    recall  f1-score   support

       非目标作者     0.9962    0.9704    0.9831       270
        目标作者     0.9548    0.9941    0.9741       170

    accuracy                         0.9795       440
   macro avg     0.9755    0.9822    0.9786       440
weighted avg     0.9802    0.9795    0.9796       440

混淆矩阵:
 [[262   8]
 [  1 169]]


100%|██████████| 220/220 [00:46<00:00,  4.76it/s]


Average training loss: 0.017384177094010574


100%|██████████| 55/55 [00:03<00:00, 16.73it/s]


验证准确率: 0.9773
验证损失: 0.0802
分类报告:
               precision    recall  f1-score   support

       非目标作者     0.9815    0.9815    0.9815       270
        目标作者     0.9706    0.9706    0.9706       170

    accuracy                         0.9773       440
   macro avg     0.9760    0.9760    0.9760       440
weighted avg     0.9773    0.9773    0.9773       440

混淆矩阵:
 [[265   5]
 [  5 165]]


100%|██████████| 220/220 [00:46<00:00,  4.73it/s]


Average training loss: 0.011827457018014022


100%|██████████| 55/55 [00:03<00:00, 16.78it/s]


验证准确率: 0.9636
验证损失: 0.2126
分类报告:
               precision    recall  f1-score   support

       非目标作者     1.0000    0.9407    0.9695       270
        目标作者     0.9140    1.0000    0.9551       170

    accuracy                         0.9636       440
   macro avg     0.9570    0.9704    0.9623       440
weighted avg     0.9668    0.9636    0.9639       440

混淆矩阵:
 [[254  16]
 [  0 170]]


100%|██████████| 220/220 [00:46<00:00,  4.77it/s]


Average training loss: 0.0012716856258273104


100%|██████████| 55/55 [00:03<00:00, 17.09it/s]


验证准确率: 0.9841
验证损失: 0.1155
分类报告:
               precision    recall  f1-score   support

       非目标作者     1.0000    0.9741    0.9869       270
        目标作者     0.9605    1.0000    0.9798       170

    accuracy                         0.9841       440
   macro avg     0.9802    0.9870    0.9833       440
weighted avg     0.9847    0.9841    0.9841       440

混淆矩阵:
 [[263   7]
 [  0 170]]


100%|██████████| 220/220 [00:46<00:00,  4.77it/s]


Average training loss: 8.410241225647042e-05


100%|██████████| 55/55 [00:03<00:00, 17.20it/s]


验证准确率: 0.9818
验证损失: 0.1074
分类报告:
               precision    recall  f1-score   support

       非目标作者     0.9962    0.9741    0.9850       270
        目标作者     0.9602    0.9941    0.9769       170

    accuracy                         0.9818       440
   macro avg     0.9782    0.9841    0.9809       440
weighted avg     0.9823    0.9818    0.9819       440

混淆矩阵:
 [[263   7]
 [  1 169]]


100%|██████████| 220/220 [00:46<00:00,  4.77it/s]


Average training loss: 6.629286776429085e-05


100%|██████████| 55/55 [00:03<00:00, 17.03it/s]


验证准确率: 0.9818
验证损失: 0.1086
分类报告:
               precision    recall  f1-score   support

       非目标作者     0.9962    0.9741    0.9850       270
        目标作者     0.9602    0.9941    0.9769       170

    accuracy                         0.9818       440
   macro avg     0.9782    0.9841    0.9809       440
weighted avg     0.9823    0.9818    0.9819       440

混淆矩阵:
 [[263   7]
 [  1 169]]


100%|██████████| 220/220 [00:46<00:00,  4.75it/s]


Average training loss: 5.570992699094032e-05


100%|██████████| 55/55 [00:03<00:00, 17.35it/s]


验证准确率: 0.9818
验证损失: 0.1106
分类报告:
               precision    recall  f1-score   support

       非目标作者     0.9962    0.9741    0.9850       270
        目标作者     0.9602    0.9941    0.9769       170

    accuracy                         0.9818       440
   macro avg     0.9782    0.9841    0.9809       440
weighted avg     0.9823    0.9818    0.9819       440

混淆矩阵:
 [[263   7]
 [  1 169]]


100%|██████████| 220/220 [00:46<00:00,  4.77it/s]


Average training loss: 5.1212503636964934e-05


100%|██████████| 55/55 [00:03<00:00, 17.12it/s]


验证准确率: 0.9818
验证损失: 0.1118
分类报告:
               precision    recall  f1-score   support

       非目标作者     0.9962    0.9741    0.9850       270
        目标作者     0.9602    0.9941    0.9769       170

    accuracy                         0.9818       440
   macro avg     0.9782    0.9841    0.9809       440
weighted avg     0.9823    0.9818    0.9819       440

混淆矩阵:
 [[263   7]
 [  1 169]]


100%|██████████| 220/220 [00:46<00:00,  4.74it/s]


Average training loss: 4.982007503713248e-05


100%|██████████| 55/55 [00:03<00:00, 17.08it/s]


验证准确率: 0.9818
验证损失: 0.1124
分类报告:
               precision    recall  f1-score   support

       非目标作者     0.9962    0.9741    0.9850       270
        目标作者     0.9602    0.9941    0.9769       170

    accuracy                         0.9818       440
   macro avg     0.9782    0.9841    0.9809       440
weighted avg     0.9823    0.9818    0.9819       440

混淆矩阵:
 [[263   7]
 [  1 169]]
训练完成!
已将作者风格模型保存至 ../author_style_model


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the saved model and tokenizer, move the model to the available device
# and switch it to evaluation mode for inference
model_path = "../author_style_model"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path).to(device)
model.eval()

def analyze_text_style(text):
    """Score how closely ``text`` matches the target author's writing style.

    The text is tokenized (truncated to 256 tokens), run through the
    module-level ``model`` without gradient tracking, and the softmax
    probability of class index 1 for the first sequence is read out.

    Returns a dict with:
    - "author_style_probability": probability of class index 1
    - "is_author_style": True when that probability is above 0.5
    - "confidence": the larger of that probability and its complement
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256,
    )
    # Move every encoded tensor onto the same device as the model
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**encoded).logits

    class_probs = torch.nn.functional.softmax(logits, dim=1)
    author_style_prob = class_probs[0][1].item()
    other_style_prob = 1 - author_style_prob

    return {
        "author_style_probability": author_style_prob,
        "is_author_style": author_style_prob > 0.5,
        "confidence": max(author_style_prob, other_style_prob),
    }

# Test examples. The two longest passages are written as adjacent string
# literals, which Python joins into one string: a double-quoted literal cannot
# contain a raw line break.
test_texts = [
    "A police inspector had come forward with a very young medical student who was completing his forensic training at the municipal dispensary, and it was they who had ventilated the room and covered the body while waiting for Dr. Urbino to arrive. They greeted him with a solemnity that on this occasion had more of condolence than veneration, for no one was unaware of the degree of his friendship with Jeremiah de Saint-Amour. The eminent teacher shook hands with each of them, as he always did with every one of his pupils before beginning the daily class in general clinical medicine, and then, as if it were a flower, he grasped the hem of the blanket with the tips of his index finger and his thumb, and slowly uncovered the body with sacramental circumspection. Jeremiah de Saint-Amour was completely naked, stiff and twisted, eyes open, body blue, looking fifty years older than he had the night before. He had luminous pupils, yellowish beard and hair, and an old scar sewn with baling knots across his stomach. The use of crutches had made his torso and arms as broad as a galley slave’s, but his defenseless legs looked like an orphan’s. Dr. Juvenal Urbino studied him for a moment, his heart aching as it rarely had in the long years of his futile struggle against death.",

    (
        "I confess that when first I made acquaintance with Charles Strickland I never for a moment discerned that there was in him anything out of the ordinary. Yet now few will be found to deny his greatness. I do not speak of that greatness which is achieved by the fortunate politician or the successful soldier; that is a quality which belongs to the place he occupies rather than to the man; and a change of circumstances reduces it to very discreet proportions. The Prime Minister out of office is seen, too often, to have been but a pompous rhetorician, and the General without at! army is but the tame hero of a market town. The greatness of Charles Strickland was authentic. It may be that you do not like his art, but at all events you can hardly refuse it the tribute of your interest. He disturbs and arrests. The time has passed when he was an object of ridicule, and it is no longer a mark of eccentricity to defend or of perversity to extol him. His faults are accepted as the necessary complement to his merits. It is still possible to discuss his place in art, and the adulation of his admirers is perhaps no less capricious than the .disparagement of his detractors; but one thing cantilever be doubtful, and that is that he had genius. Tjo rqfy mind the most interesting thing in art is the personality of the artist ; and if that is singu- lar, I am willing to excuse a thousand faults. I suppose Velasquez was a better painter than El Greco, but custom stales one’s admiration for him: the Cretan, sensual and tragic, proffers the mystery of his soul like a standing sacrifice. The artist, painter, poet, or musician, by his decoration, sublime or beautiful, satisfies the aesthetic sense; but that is akin to the sexual instinct, anjl shares its barbarity: he lays before you also the greater gift of himself. To pursue his secret has something of the fascination of a detective story. It is a riddle which shares with the universe the merit of having no answer. "
        "The most insignificant of Strickland’s works suggests a personality which is strange, tormented, and complex; and it is this surely which prevents even those who do not like his pictures from being indifferent to them; it is this which has excited so curious an interest in his life and character. "
    ),

    "Our team has decided to create the dataset ourselves, consisting of articles from famous authors. The dataset structure includes work title, author, article content, and sentiment labels (positive, neutral, negative). We will select both Chinese and English articles in their original forms to avoid translation biases. Preprocessing will standardize the text by removing punctuation, converting text to lowercase, removing stopwords, and applying other necessary preprocessing techniques. Sentiment labels will be added through manual annotation and leveraging publicly available sentiment datasets. Our primary approach is to fine-tune BERT based on the collected dataset. Given our multilingual dataset, we have opted to use Multilingual BERT. If initial results are unsatisfactory, separate models may be trained for Chinese and English texts. Considering BERT's token limit (512 tokens), we plan to employ a sliding window segmentation approach. Should segmentation negatively impact stylistic judgment, we will explore alternative models like LSTM combined with BERT to maintain context continuity. Additionally, we will fine-tune BERT or RoBERTa for sentiment analysis, potentially incorporating multi-task learning to simultaneously optimize authorship and sentiment classification tasks.",

    (
        "In recent years, the rapid development of large language models (LLMs) has reshaped artificial intelligence research and applications. DeepSeek is an open-source AI research initiative dedicated to advancing natural language processing (NLP) through state-of-the-art LLMs. By offering open-access models and tools, DeepSeek aims to democratize AI capabilities and accelerate innovation in various fields. DeepSeek was established to address the growing need for transparency and accessibility in AI research. Unlike proprietary models from major tech companies, DeepSeek provides publicly available models that can be fine-tuned and deployed by researchers, developers, and businesses. This open approach fosters collaboration and ensures that AI advancements benefit a wider audience. DeepSeek's models are designed to support a wide range of NLP tasks, including text classification, sentiment analysis, question answering, and more. By leveraging large-scale pre-trained models, DeepSeek enables users to achieve high performance on various NLP benchmarks with minimal effort. DeepSeek's models are built on top of the Hugging Face Transformers library, a popular open-source framework for training and deploying transformer models. This integration allows users to easily access and utilize DeepSeek's models within their existing workflows. DeepSeek's models are available in multiple languages, making them suitable for global applications. Whether you're working on English, Chinese, Spanish, or other languages, DeepSeek provides models that can be fine-tuned for specific tasks and domains. DeepSeek's models are trained on diverse datasets to ensure robust performance across different languages and domains. By incorporating multilingual data and transfer learning techniques, DeepSeek's models can effectively handle various NLP tasks with high accuracy and efficiency. DeepSeek's models are continuously updated and refined to reflect the latest advancements in NLP research. "
        "By staying at the forefront of AI innovation, DeepSeek aims to empower users with cutting-edge tools and technologies for natural language processing. DeepSeek's mission is to democratize AI research and foster collaboration among researchers, developers, and businesses. By providing open-access models and resources, DeepSeek enables users to leverage state-of-the-art AI capabilities for a wide range of applications. Whether you're a student, researcher, developer, or business professional, DeepSeek offers tools and models that can accelerate your AI projects and drive innovation in the field of natural language processing."
    ),
]

# Score each sample and print a short report, one field per line, followed by
# a 50-character separator
for text in test_texts:
    result = analyze_text_style(text)
    print(
        f"文本: {text[:50]}...",
        f"作者风格概率: {result['author_style_probability']:.2f}",
        f"判断结果: {'符合作者风格' if result['is_author_style'] else '不符合作者风格'}",
        f"置信度: {result['confidence']:.2f}",
        "-" * 50,
        sep="\n",
    )


文本: A police inspector had come forward with a very yo...
作者风格概率: 1.00
判断结果: 符合作者风格
置信度: 1.00
--------------------------------------------------
文本: I confess that when first I made acquaintance with...
作者风格概率: 0.00
判断结果: 不符合作者风格
置信度: 1.00
--------------------------------------------------
文本: Our team has decided to create the dataset ourselv...
作者风格概率: 0.76
判断结果: 符合作者风格
置信度: 0.76
--------------------------------------------------
