In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import numpy as np
import random
# Pick the compute device: CUDA when a GPU is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Custom dataset of (name, gender) pairs
class NamesDataset(Dataset):
    """Map-style dataset that tokenizes names and pairs them with labels.

    The label is 0 when the gender string equals '男' and 1 for every
    other value.
    """

    def __init__(self, names_dict, tokenizer, max_length):
        """Store the tokenizer settings and precompute names and labels.

        Args:
            names_dict: Mapping of name -> gender string.
            tokenizer: Callable invoked as
                ``tokenizer(name, return_tensors="pt", padding="max_length",
                truncation=True, max_length=max_length)``; the result is
                indexed with 'input_ids' and 'attention_mask'.
            max_length: Length forwarded to the tokenizer for padding and
                truncation.
        """
        self.names_dict = names_dict
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.names = list(names_dict.keys())
        self.labels = [0 if gender == '男' else 1 for gender in names_dict.values()]

    def __len__(self):
        """Return the number of names in the dataset."""
        return len(self.names)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for the name at *idx*."""
        name = self.names[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(name, return_tensors="pt", padding="max_length", truncation=True, max_length=self.max_length)
        # Drop only the leading axis (assumed to be the size-1 batch axis the
        # tokenizer adds for a single string -- TODO confirm). A bare
        # squeeze() would also collapse the sequence axis when it has size 1.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        return input_ids, attention_mask, label

# Load the BERT tokenizer and the pretrained BERT model from `model_path`
model_path = 'model'  # handed to from_pretrained for both tokenizer and model
tokenizer = BertTokenizer.from_pretrained(model_path)
bert_model = BertModel.from_pretrained(model_path)
bert_model = bert_model.to(device)  # move the encoder to the selected device

# Classification model: a BERT encoder followed by a small feed-forward head
class SentimentClassification(nn.Module):
    """Classifier that feeds BERT's first-token hidden state through two linear layers."""

    def __init__(self, bert_model, hidden_dim, output_dim):
        """Build the classification head around an existing encoder.

        Args:
            bert_model: Encoder exposing ``config.hidden_size`` and returning
                an object with ``last_hidden_state`` when called.
            hidden_dim: Width of the intermediate linear layer.
            output_dim: Number of output logits.
        """
        super().__init__()
        encoder_width = bert_model.config.hidden_size
        self.bert = bert_model
        self.fc1 = nn.Linear(encoder_width, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the hidden state at position 0."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Take the hidden state at sequence position 0 for every batch item
        first_token_state = encoded.last_hidden_state[:, 0, :]
        hidden = self.dropout(self.relu(self.fc1(first_token_state)))
        return self.fc2(hidden)

# Path to the corpus file; each line is parsed as "name,gender"
file_path = 'Chinese_Names_Corpus_Gender（120W）.txt'

# Read the file and store a random subset as a name -> gender dictionary
names_dict = {}
# Decode explicitly instead of relying on the platform default encoding.
# NOTE(review): assumes the corpus file is UTF-8 encoded -- confirm.
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()
# len(lines) // 10000 keeps one line per 10,000, i.e. 0.01% of the file
sampled_lines = random.sample(lines, len(lines) // 10000)
for line in sampled_lines:
    fields = line.strip().split(',')
    if len(fields) != 2:
        # Skip blank or malformed lines instead of crashing on unpacking
        continue
    name, gender = fields
    names_dict[name.strip()] = gender.strip()

# Build the dataset and the data loader
max_length = 10  # length every tokenized name is padded/truncated to
dataset = NamesDataset(names_dict=names_dict, tokenizer=tokenizer, max_length=max_length)
train_loader = DataLoader(dataset=dataset, batch_size=16, shuffle=True)

# Set up the model, the loss function and the optimizer
hidden_dim = 128  # width of the hidden layer in the classification head
output_dim = 2  # two output classes
model = SentimentClassification(bert_model, hidden_dim, output_dim)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Train the model for 50 epochs
model.train()
for epoch in range(50):
    total_loss = 0  # running sum of the per-batch losses of this epoch
    for batch in train_loader:
        # Each batch holds (input_ids, attention_mask, labels); move all to the device
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean batch loss of the epoch (epoch numbers printed 1-based)
    print(f"Epoch {epoch+1} Loss: {total_loss / len(train_loader)}")

# Use the model to predict the gender that corresponds to a name
def predict_gender(model, tokenizer, name):
    """Return "男" or "女" for *name* as predicted by *model*.

    Relies on the module-level ``max_length`` and ``device``. The model is
    switched to evaluation mode for the forward pass and restored to its
    previous mode afterwards.

    Args:
        model: Module called with the tokenizer outputs (minus
            ``token_type_ids``) as keyword arguments; must return logits
            with the class axis at dim 1.
        tokenizer: Tokenizer called with the same arguments used in training.
        name: The name to classify.

    Returns:
        "男" when the arg-max class is 0, otherwise "女".
    """
    inputs = tokenizer(name, return_tensors="pt", padding="max_length", truncation=True, max_length=max_length)
    # Move everything except token_type_ids to the device; the classifier
    # defined in this file only accepts input_ids and attention_mask.
    inputs = {key: val.to(device) for key, val in inputs.items() if key != 'token_type_ids'}
    was_training = model.training
    model.eval()  # disable dropout so the same name always gives the same result
    try:
        with torch.no_grad():  # no gradients are needed for inference
            outputs = model(**inputs)
    finally:
        model.train(was_training)  # leave the caller's train/eval mode untouched
    predicted_class = torch.argmax(outputs, dim=1).item()
    return "男" if predicted_class == 0 else "女"


# Run a prediction for a name of your own choosing
your_name = "戴鑫"
predicted_gender = predict_gender(model=model, tokenizer=tokenizer, name=your_name)
print(f"姓名：{your_name}\t预测性别：{predicted_gender}")


Epoch 1 Loss: 0.6292603611946106
Epoch 2 Loss: 0.6714542433619499
Epoch 3 Loss: 0.651294969022274
Epoch 4 Loss: 0.6482248455286026
Epoch 5 Loss: 0.6146441400051117
Epoch 6 Loss: 0.6018929146230221
Epoch 7 Loss: 0.5591788403689861
Epoch 8 Loss: 0.5747980438172817
Epoch 9 Loss: 0.5565221272408962
Epoch 10 Loss: 0.5043519921600819
Epoch 11 Loss: 0.556438110768795
Epoch 12 Loss: 0.4570106025785208
Epoch 13 Loss: 0.4779178649187088
Epoch 14 Loss: 0.4749242477118969
Epoch 15 Loss: 0.5342003963887691
Epoch 16 Loss: 0.481483593583107
Epoch 17 Loss: 0.44939539581537247
Epoch 18 Loss: 0.560304693877697
Epoch 19 Loss: 0.572664350271225
Epoch 20 Loss: 0.5358593836426735
Epoch 21 Loss: 0.48754923790693283
Epoch 22 Loss: 0.49835407361388206
Epoch 23 Loss: 0.46896854043006897
Epoch 24 Loss: 0.42970386520028114
Epoch 25 Loss: 0.43588175624608994
Epoch 26 Loss: 0.3920796886086464
Epoch 27 Loss: 0.4550652429461479
Epoch 28 Loss: 0.4064072109758854
Epoch 29 Loss: 0.3907865434885025
Epoch 30 Loss: 0.46243