In [21]:
import pandas as pd

# 读取CSV文件
df = pd.read_csv('train.csv')

# 提取'text'和'generated'列
train_texts = df['text'].tolist()
train_labels = df['generated'].tolist()

# 将'label_list'中的布尔值转换为0和1
train_labels = [1 if label else 0 for label in train_labels]



In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from transformers import BertModel, BertTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import DataLoader, Dataset

# 数据集类
class TextDataset(Dataset):
    def __init__(self, texts, tfidf_features, labels):
        self.texts = texts
        self.tfidf_features = tfidf_features
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.tfidf_features[idx], self.labels[idx]

# BERT特征提取
class BERTFeatureExtractor(nn.Module):
    def __init__(self, pretrained_model_name='google-bert/bert-base-uncased'):
        super(BERTFeatureExtractor, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state[:, 0, :]

# TF-IDF特征提取
class TFIDFFeatureExtractor:
    def __init__(self):
        self.vectorizer = TfidfVectorizer()

    def fit_transform(self, texts):
        return self.vectorizer.fit_transform(texts)

    def transform(self, texts):
        return self.vectorizer.transform(texts)

# Two-Stage模型
class TwoStageModel(nn.Module):
    def __init__(self, bert_feature_size, tfidf_feature_size, num_classes):
        super(TwoStageModel, self).__init__()
        self.bert_extractor = BERTFeatureExtractor()
        self.linear = nn.Linear(bert_feature_size + tfidf_feature_size, num_classes)

    def forward(self, bert_input_ids, bert_attention_mask, tfidf_features):
        bert_features = self.bert_extractor(bert_input_ids, bert_attention_mask)
        bert_features = F.normalize(bert_features, p=2, dim=1)
        tfidf_features = torch.tensor(tfidf_features, dtype=torch.float32)
        tfidf_features = F.normalize(tfidf_features, p=2, dim=1)
        combined_features = torch.cat((bert_features, tfidf_features), dim=1)
        output = self.linear(combined_features)
        return output

# 数据预处理和模型训练代码
def train_model(train_texts, train_labels, val_texts, val_labels, num_epochs=10, batch_size=32, learning_rate=1e-3):
    tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-uncased')
    bert_inputs = tokenizer(train_texts, padding=True, truncation=True, return_tensors='pt')
    bert_input_ids = bert_inputs['input_ids']
    bert_attention_mask = bert_inputs['attention_mask']
    tfidf_extractor = TFIDFFeatureExtractor()
    tfidf_features = tfidf_extractor.fit_transform(train_texts).toarray()

    train_dataset = TextDataset(list(zip(bert_input_ids, bert_attention_mask)), tfidf_features, train_labels)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    model = TwoStageModel(bert_feature_size=768, tfidf_feature_size=tfidf_features.shape[1], num_classes=2)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for inputs, tfidf_features, labels in train_loader:
            bert_inputs = list(zip(*inputs))
            bert_input_ids = torch.stack(bert_inputs[0])
            bert_attention_mask = torch.stack(bert_inputs[1])
            tfidf_features = torch.tensor(tfidf_features, dtype=torch.float32)
            labels = torch.tensor(labels, dtype=torch.long)

            optimizer.zero_grad()
            outputs = model(bert_input_ids, bert_attention_mask, tfidf_features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}")

    return model


# 训练模型
trained_model = train_model(train_texts, train_labels, val_texts, val_labels)


  tfidf_features = torch.tensor(tfidf_features, dtype=torch.float32)
  labels = torch.tensor(labels, dtype=torch.long)
  tfidf_features = torch.tensor(tfidf_features, dtype=torch.float32)


Epoch 1/10, Loss: 0.7262551784515381
Epoch 2/10, Loss: 0.5592772364616394
Epoch 3/10, Loss: 0.48130202293395996
Epoch 4/10, Loss: 0.4070466458797455
Epoch 5/10, Loss: 0.3789891302585602
Epoch 6/10, Loss: 0.3610266447067261
Epoch 7/10, Loss: 0.34607523679733276
Epoch 8/10, Loss: 0.33193501830101013
Epoch 9/10, Loss: 0.3211379051208496
Epoch 10/10, Loss: 0.3092111647129059
