In [14]:
import torch
import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [15]:
resume_data = pd.read_excel(io="jobDesc.xlsx", sheet_name="combine")

In [16]:
experience = resume_data['Experience'].tolist()
summarize = resume_data['Summarized Experience'].tolist()

In [17]:
experience_train, experience_test, summarize_train, summarize_test = train_test_split(
    experience, summarize, test_size=0.1, random_state=42
)

In [18]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')



In [19]:
# 对训练集进行标记化，得到 input_ids 和 attention_mask
inputs_train = tokenizer(experience_train, truncation=True, padding=True, max_length=512, return_tensors="pt")
labels_train = tokenizer(summarize_train, truncation=True, padding=True, max_length=512, return_tensors="pt")

In [20]:
# 对测试集进行标记化
inputs_test = tokenizer(experience_test, truncation=True, padding=True, max_length=512, return_tensors="pt")
labels_test = tokenizer(summarize_test, truncation=True, padding=True, max_length=512, return_tensors="pt")

In [21]:
# 将标签中的填充位置替换为 -100（BART 要求）
labels_train['input_ids'][labels_train['input_ids'] == tokenizer.pad_token_id] = -100
labels_test['input_ids'][labels_test['input_ids'] == tokenizer.pad_token_id] = -100

In [22]:
# 定义一个自定义数据集类，用于 PyTorch 数据加载器
class TextDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
        input_ids = self.inputs['input_ids'][idx]
        attention_mask = self.inputs['attention_mask'][idx]
        label_ids = self.labels['input_ids'][idx]
        return input_ids, attention_mask, label_ids

In [23]:
# 创建训练集和测试集数据加载器
train_dataset = TextDataset(inputs_train, labels_train)
test_dataset = TextDataset(inputs_test, labels_test)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

In [24]:
# 初始化 BART 模型
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

In [25]:
# 使用 AdamW 优化器
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)



In [28]:
from tqdm import tqdm  # 可选：进度条库，帮助可视化训练进度

# 设置设备为 GPU（如果可用），否则使用 CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 将模型移动到 GPU
model = model.to(device)

# Initialize AdamW optimizer after moving model to device
optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)

# 训练模型
epochs = 3  # 假设进行 3 个 epoch
for epoch in range(epochs):
    model.train()  # 设置模型为训练模式
    total_train_loss = 0  # 累积训练损失
    loop = tqdm(train_loader, leave=True)  # 使用 tqdm 进度条可视化训练进度

    for batch in loop:
        input_ids, attention_mask, labels = batch

        # 将批次数据移动到 GPU
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # 前向传播计算损失
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        # 更新进度条显示
        loop.set_description(f"Epoch {epoch + 1}/{epochs}")
        loop.set_postfix(loss=loss.item())

    avg_train_loss = total_train_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_train_loss}")

    # 每个 epoch 后进行一次测试集评估
    model.eval()  # 设置模型为评估模式
    total_test_loss = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch

            # 将批次数据移动到 GPU
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            # 前向传播计算损失
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_test_loss += loss.item()

    avg_test_loss = total_test_loss / len(test_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Test Loss: {avg_test_loss}")

Epoch 1/3:  24%|██▍       | 6/25 [00:37<01:58,  6.24s/it, loss=nan]


KeyboardInterrupt: 

In [None]:
# 假设你有一个输入文本
input_text = "Leave Application Processing System (Web application) Dec 2023 Team leader Singapore System Design: Used the MVC design pattern to build the system architecture. Develpoment:Developed leave application processing system with java, spring, MySql, and h2 database, enabling leave requests, historical tracking, and administrative management. Github:https://github.com/Zxuanji/Leave-Application-Processing-System"

# 将输入文本标记化
input_ids = tokenizer(input_text, return_tensors='pt', truncation=True, padding=True, max_length=512).input_ids

# 将输入移动到 GPU（如果使用 GPU）
input_ids = input_ids.to(device)

# 模型切换到评估模式（inference 模式）
model.eval()

# 使用模型进行推理（生成输出）
with torch.no_grad():
    generated_ids = model.generate(input_ids=input_ids, max_length=50, num_beams=5, early_stopping=True)

# 解码生成的 id，得到输出文本
output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# 打印输出
print(f"Input: {input_text}")
print(f"Output: {output_text}")

In [None]:
# 切换模型为评估模式，禁用 dropout 等训练时的机制
model.eval()

# 用于存储生成的结果和真实标签
generated_outputs = []
real_outputs = summarize_test  # summarize_test 是你的真实输出，已经是文本形式

# 遍历 experience_test，生成预测结果
for experience in experience_test:
    # 对每个 experience_test 进行标记化
    test_encoding = tokenizer(experience, return_tensors="pt", truncation=True, padding=True, max_length=512)
    test_input_ids = test_encoding['input_ids'].to(device)
    
    # 禁用梯度计算，节省内存和提高效率
    with torch.no_grad():
        # 生成输出
        generated_ids = model.generate(test_input_ids, max_length=150, num_beams=4, early_stopping=True)
    
    # 解码生成的 token IDs，转化为文本
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    
    # 存储生成的文本
    generated_outputs.append(generated_text)

# 对比生成的输出和真实输出
for i in range(len(generated_outputs)):
    print(f"Generated Output {i + 1}: {generated_outputs[i]}")
    print(f"Real Output {i + 1}: {real_outputs[i]}")
    print("="*50)

In [None]:
!pip install rouge-score
from rouge_score import rouge_scorer

# 切换模型为评估模式，禁用 dropout 等训练时的机制
model.eval()

# 用于存储生成的结果和真实标签
generated_outputs = []
real_outputs = summarize_test  # summarize_test 是你的真实输出，已经是文本形式

# 遍历 experience_test，生成预测结果
for experience in experience_test:
    # 对每个 experience_test 进行标记化
    test_encoding = tokenizer(experience, return_tensors="pt", truncation=True, padding=True, max_length=512)
    test_input_ids = test_encoding['input_ids'].to(device)
    
    # 禁用梯度计算，节省内存和提高效率
    with torch.no_grad():
        # 生成输出
        generated_ids = model.generate(test_input_ids, max_length=150, num_beams=4, early_stopping=True)
    
    # 解码生成的 token IDs，转化为文本
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    
    # 存储生成的文本
    generated_outputs.append(generated_text)

# 初始化 ROUGE 评分器
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

# 计算 ROUGE 分数并对比生成的输出和真实输出
for i in range(len(generated_outputs)):
    print(f"Generated Output {i + 1}: {generated_outputs[i]}")
    print(f"Real Output {i + 1}: {real_outputs[i]}")
    
    # 计算 ROUGE 分数
    scores = scorer.score(real_outputs[i], generated_outputs[i])
    print(f"ROUGE-1 Precision: {scores['rouge1'].precision:.4f}, Recall: {scores['rouge1'].recall:.4f}, F1: {scores['rouge1'].fmeasure:.4f}")
    print(f"ROUGE-L Precision: {scores['rougeL'].precision:.4f}, Recall: {scores['rougeL'].recall:.4f}, F1: {scores['rougeL'].fmeasure:.4f}")
    print("="*50)