In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 读取CSV文件
df = pd.read_csv('data/train.csv')

# 提取'text'和'generated'列
train_texts = df['text'].tolist()
train_labels = df['generated'].tolist()

# 将'label_list'中的布尔值转换为0和1
train_labels = [1 if label else 0 for label in train_labels]

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42
)
train_texts = train_texts[:100]
train_labels = train_labels[:100]
val_texts = val_texts[:20]
val_labels = val_labels[:20]



In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from transformers import BertModel, BertTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import DataLoader, Dataset

# 检查是否有可用的GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# 数据集类
class TextDataset(Dataset):
    def __init__(self, bert_input_ids, bert_attention_mask, tfidf_features, labels):
        self.bert_input_ids = bert_input_ids
        self.bert_attention_mask = bert_attention_mask
        self.tfidf_features = tfidf_features
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return (self.bert_input_ids[idx], self.bert_attention_mask[idx], self.tfidf_features[idx]), self.labels[idx]


# BERT特征提取
class BERTFeatureExtractor(nn.Module):
    def __init__(self, pretrained_model_name='google-bert/bert-base-uncased'):
        super(BERTFeatureExtractor, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.last_hidden_state[:, 0, :]

# TF-IDF特征提取
class TFIDFFeatureExtractor:
    def __init__(self):
        self.vectorizer = TfidfVectorizer()

    def fit_transform(self, texts):
        return self.vectorizer.fit_transform(texts)

    def transform(self, texts):
        return self.vectorizer.transform(texts)

# Two-Stage模型
class TwoStageModel(nn.Module):
    def __init__(self, bert_feature_size, tfidf_feature_size, num_classes):
        super(TwoStageModel, self).__init__()
        self.bert_extractor = BERTFeatureExtractor()
        # self.linear = nn.Linear(bert_feature_size + tfidf_feature_size, num_classes)
        self.linear = nn.Linear(bert_feature_size, num_classes)

    def forward(self, bert_input_ids, bert_attention_mask, tfidf_features):
        bert_features = self.bert_extractor(bert_input_ids, bert_attention_mask)
        bert_features = F.normalize(bert_features, p=2, dim=1)
        tfidf_features = torch.tensor(tfidf_features, dtype=torch.float32).to(device)
        tfidf_features = F.normalize(tfidf_features, p=2, dim=1)
        # combined_features = torch.cat((bert_features, tfidf_features), dim=1)
        combined_features = bert_features
        output = self.linear(combined_features)
        return output

# 数据预处理和模型训练代码
def train_model(train_texts, train_labels, val_texts, val_labels, num_epochs=10, batch_size=16, learning_rate=1e-4):

    # BERT features
    tokenizer = BertTokenizer.from_pretrained('google-bert/bert-base-uncased')
    print("Tokenizing the input ...... ")
    bert_inputs = tokenizer(train_texts, padding=True, truncation=True, return_tensors='pt')
    torch.save(bert_inputs, 'data/tokenized_train_inputs.pt')
    # bert_inputs = torch.load('tokenized_train_inputs.pt')
    bert_input_ids = bert_inputs['input_ids'].to(device)
    bert_attention_mask = bert_inputs['attention_mask'].to(device)

    # TFIDF features
    print("TF-IDF featuring ......")
    tfidf_extractor = TFIDFFeatureExtractor()
    tfidf_features = tfidf_extractor.fit_transform(train_texts).toarray()

    train_dataset = TextDataset(bert_input_ids, bert_attention_mask, tfidf_features, train_labels)

    # 同样处理验证集
    val_bert_inputs = tokenizer(val_texts, padding=True, truncation=True, return_tensors='pt')
    torch.save(bert_inputs, 'data/tokenized_val_inputs.pt')
    # bert_inputs = torch.load('tokenized_val_inputs.pt')
    val_bert_input_ids = val_bert_inputs['input_ids'].to(device)
    val_bert_attention_mask = val_bert_inputs['attention_mask'].to(device)
    val_tfidf_features = tfidf_extractor.transform(val_texts).toarray()

    val_dataset = TextDataset(val_bert_input_ids, val_bert_attention_mask, val_tfidf_features, val_labels)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = TwoStageModel(bert_feature_size=768, tfidf_feature_size=tfidf_features.shape[1], num_classes=2)
    model.to(device)
    criterion = nn.CrossEntropyLoss()

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for (bert_input_ids, bert_attention_mask, tfidf_features), labels in train_loader:
            bert_input_ids = bert_input_ids.to(device)
            bert_attention_mask = bert_attention_mask.to(device)
            tfidf_features = torch.tensor(tfidf_features, dtype=torch.float32).to(device)
            labels = torch.tensor(labels, dtype=torch.long).to(device)

            optimizer.zero_grad()
            outputs = model(bert_input_ids, bert_attention_mask, tfidf_features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)

        # 验证集评估
        model.eval()
        total_val_loss = 0
        correct_predictions = 0
        with torch.no_grad():
            for (bert_input_ids, bert_attention_mask, tfidf_features), labels in val_loader:
                bert_input_ids = bert_input_ids.to(device)
                bert_attention_mask = bert_attention_mask.to(device)
                tfidf_features = torch.tensor(tfidf_features, dtype=torch.float32).to(device)
                labels = torch.tensor(labels, dtype=torch.long).to(device)

                outputs = model(bert_input_ids, bert_attention_mask, tfidf_features)
                loss = criterion(outputs, labels)
                total_val_loss += loss.item()

                _, predicted = torch.max(outputs, 1)
                correct_predictions += (predicted == labels).sum().item()

        avg_val_loss = total_val_loss / len(val_loader)
        val_accuracy = correct_predictions / len(val_dataset)

        print(f"Epoch {epoch+1}/{num_epochs}, "
              f"Train Loss: {avg_train_loss:.4f}, "
              f"Val Loss: {avg_val_loss:.4f}, "
              f"Val Accuracy: {val_accuracy:.4f}")

    return model


# 训练模型
trained_model = train_model(train_texts, train_labels, val_texts, val_labels)


cuda
Tokenizing the input ...... 
TF-IDF featuring ......


  tfidf_features = torch.tensor(tfidf_features, dtype=torch.float32).to(device)
  labels = torch.tensor(labels, dtype=torch.long).to(device)
  tfidf_features = torch.tensor(tfidf_features, dtype=torch.float32).to(device)
  tfidf_features = torch.tensor(tfidf_features, dtype=torch.float32).to(device)
  labels = torch.tensor(labels, dtype=torch.long).to(device)


Epoch 1/10, Train Loss: 0.6557, Val Loss: 0.7130, Val Accuracy: 0.5000
Epoch 2/10, Train Loss: 0.5274, Val Loss: 0.5805, Val Accuracy: 0.9500
Epoch 3/10, Train Loss: 0.4448, Val Loss: 0.6035, Val Accuracy: 0.7500
Epoch 4/10, Train Loss: 0.4153, Val Loss: 0.3592, Val Accuracy: 1.0000
Epoch 5/10, Train Loss: 0.3596, Val Loss: 0.3375, Val Accuracy: 1.0000
Epoch 6/10, Train Loss: 0.3487, Val Loss: 0.6798, Val Accuracy: 0.7000
Epoch 7/10, Train Loss: 0.3368, Val Loss: 0.4755, Val Accuracy: 0.9500
Epoch 8/10, Train Loss: 0.3363, Val Loss: 0.4566, Val Accuracy: 0.9000
Epoch 9/10, Train Loss: 0.3278, Val Loss: 0.4325, Val Accuracy: 0.9500
Epoch 10/10, Train Loss: 0.3006, Val Loss: 0.5261, Val Accuracy: 0.8000
