In [None]:
import requests
import json
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# 1. 从NewsAPI获取新闻数据
def fetch_news_data(api_key, query='technology', language='en', page_size=100,
                    timeout=10):
    """
    Fetch articles from the NewsAPI ``/v2/everything`` endpoint.

    Args:
        api_key: NewsAPI key, sent in the ``X-Api-Key`` request header.
        query: Search expression passed as the ``q`` parameter.
        language: Language code passed as the ``language`` parameter.
        page_size: Number of articles requested (``pageSize`` parameter).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The value of the ``articles`` field of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        KeyError: If the response body has no ``articles`` field.
    """
    url = 'https://newsapi.org/v2/everything'
    # Let requests build the query string so that spaces, '&' and other
    # reserved characters in the query are URL-encoded correctly.
    params = {'q': query, 'language': language, 'pageSize': page_size}
    headers = {'X-Api-Key': api_key}

    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    # Fail with the HTTP status instead of an opaque KeyError further down.
    response.raise_for_status()
    return response.json()['articles']

# 2. 数据预处理和BIO标注
def prepare_bio_data(articles):
    """
    Convert article titles into BIO-tagged word sequences.

    Entities are found with a simple heuristic: a word that starts with an
    uppercase letter and is longer than one character opens an entity
    (``B-ORG``), and every directly following word that starts with an
    uppercase letter continues it (``I-ORG``). All other words are ``O``.

    Args:
        articles: Iterable of dicts; the ``title`` entry of each is used.

    Returns:
        A list with one entry per usable title, each a list of
        ``(word, tag)`` tuples. Articles whose title is missing, ``None`` or
        contains no words are skipped.
    """
    bio_data = []

    for article in articles:
        title = article.get('title')
        if not title:
            # Missing or null title: nothing to tag.
            continue

        words = title.split()
        if not words:
            # Whitespace-only title would yield an empty training sample.
            continue

        bio_tags = []
        i = 0
        while i < len(words):
            word = words[i]
            if word[0].isupper() and len(word) > 1:
                bio_tags.append((word, 'B-ORG'))  # entity start
                i += 1
                # Consume the whole run of capitalised words so that spans of
                # three or more words stay a single entity.
                while i < len(words) and words[i][0].isupper():
                    bio_tags.append((words[i], 'I-ORG'))  # entity continuation
                    i += 1
            else:
                bio_tags.append((word, 'O'))  # not an entity
                i += 1

        bio_data.append(bio_tags)

    return bio_data

# 3. 创建数据集类
class NERDataset(Dataset):
    """
    Dataset that turns BIO-tagged sentences into fixed-length BERT inputs.

    Every sample in ``texts`` is a list of ``(word, tag)`` pairs. Each word is
    split into sub-word tokens with the tokenizer and the labels are aligned
    with those tokens:

    * the first sub-word of a word gets the word's tag;
    * further sub-words of a ``B-X`` word get ``I-X``, others repeat the tag;
    * ``[CLS]``, ``[SEP]`` and padding positions get the ``PAD`` label.

    Args:
        texts: Sequence of samples, each a list of ``(word, tag)`` pairs.
        tags: Stored as ``self.tags`` but not read; labels come from ``texts``.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token_id``, ``sep_token_id`` and ``pad_token_id``.
        max_len: Length of every returned tensor (expected to be >= 2 so the
            two special tokens fit).
    """

    def __init__(self, texts, tags, tokenizer, max_len=128):
        self.texts = texts
        self.tags = tags
        self.tokenizer = tokenizer
        self.max_len = max_len

        self.tag2idx = {'O': 0, 'B-ORG': 1, 'I-ORG': 2, 'PAD': 3}
        self.idx2tag = {v: k for k, v in self.tag2idx.items()}

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def _continuation_label(self, tag):
        """Return the label index for the non-initial sub-words of a word."""
        if tag.startswith('B-'):
            # Only the first piece may open an entity; the rest are inside it.
            return self.tag2idx['I-' + tag[2:]]
        return self.tag2idx[tag]

    def __getitem__(self, idx):
        """
        Return ``input_ids``, ``attention_mask`` and ``labels`` for one sample.

        All three are 1-D ``torch.long`` tensors of length ``max_len``.
        """
        pad_label = self.tag2idx['PAD']
        # Reserve two positions for [CLS] and [SEP].
        budget = max(self.max_len - 2, 0)

        pieces = []
        piece_labels = []
        for word, tag in self.texts[idx]:
            word_pieces = self.tokenizer.tokenize(word)
            if not word_pieces:
                # The tokenizer dropped the word entirely; no position to label.
                continue
            pieces.extend(word_pieces)
            piece_labels.append(self.tag2idx[tag])
            piece_labels.extend(
                [self._continuation_label(tag)] * (len(word_pieces) - 1)
            )

        # Truncate tokens and labels together so they stay aligned.
        pieces = pieces[:budget]
        piece_labels = piece_labels[:budget]

        token_ids = (
            [self.tokenizer.cls_token_id]
            + list(self.tokenizer.convert_tokens_to_ids(pieces))
            + [self.tokenizer.sep_token_id]
        )
        n_real = len(token_ids)
        n_pad = self.max_len - n_real

        input_ids = token_ids + [self.tokenizer.pad_token_id] * n_pad
        attention_mask = [1] * n_real + [0] * n_pad
        # [CLS] + word pieces + ([SEP] and padding), all non-word slots = PAD.
        labels = [pad_label] + piece_labels + [pad_label] * (n_pad + 1)

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(labels, dtype=torch.long),
        }

# 4. 定义模型
class BertNER(nn.Module):
    """
    Token-classification head on top of a BERT-style encoder.

    Args:
        bert_model: Encoder module. Called with ``input_ids`` and
            ``attention_mask``; element 0 of its output is used as the
            per-token hidden states.
        num_labels: Number of output classes per token.
    """

    def __init__(self, bert_model, num_labels):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        # Size the classifier from the encoder's own configuration so that
        # encoders with a hidden size other than 768 work; keep 768 as the
        # fallback when no config is available.
        config = getattr(bert_model, 'config', None)
        hidden_size = getattr(config, 'hidden_size', 768)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return per-token logits; last dimension has size ``num_labels``."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs[0])
        return self.classifier(sequence_output)

# 5. 训练和评估函数
def train_model(model, train_loader, val_loader, device, epochs=3, lr=2e-5):
    """
    Train the NER model and print the mean train/validation loss per epoch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns logits whose last dimension is the class dimension.
        train_loader: Sized iterable of batches (dicts with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors) used for optimisation.
        val_loader: Sized iterable of batches used for evaluation only.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate for the Adam optimiser.

    Returns:
        None. Progress is reported with ``print``. An empty loader is reported
        with a loss of ``nan`` instead of raising ``ZeroDivisionError``.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in train_loader:
            optimizer.zero_grad()
            loss = _ner_batch_loss(model, batch, criterion, device)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        train_avg = _mean_or_nan(total_loss, len(train_loader))
        print(f'Epoch {epoch+1}, Loss: {train_avg}')

        # Validation: same loss, no gradient tracking, no parameter updates.
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                val_loss += _ner_batch_loss(model, batch, criterion, device).item()

        val_avg = _mean_or_nan(val_loss, len(val_loader))
        print(f'Validation Loss: {val_avg}')


def _ner_batch_loss(model, batch, criterion, device):
    """Run one batch through the model and return its token-level loss."""
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    # Flatten so every token position is one classification example.
    return criterion(outputs.view(-1, outputs.shape[-1]), labels.view(-1))


def _mean_or_nan(total, count):
    """Return ``total / count``, or ``nan`` when there were no batches."""
    return total / count if count else float('nan')

# 6. 预测函数
def predict_entities(model, tokenizer, text, device, max_len=128):
    """
    Predict a label index for every token position of ``text``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns logits of shape ``(batch, sequence, classes)``.
        tokenizer: Callable that encodes ``text`` and returns a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        text: The raw string to tag.
        device: Device the encoded tensors are moved to.
        max_len: Length the input is padded/truncated to. Should match the
            length used when the model was trained.

    Returns:
        1-D NumPy array with the arg-max class index for each of the
        positions of the encoded sequence (special and padding positions
        included).
    """
    model.eval()
    encoding = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_len,
        padding='max_length',
        truncation=True
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs, dim=2)

    # Single input text, so only the first (and only) batch row is returned.
    return predictions[0].cpu().numpy()

In [None]:

# 2. Set your NewsAPI key (placeholder value; replace before running)
api_key = 'API_Key'

# 3. Fetch the articles and convert their titles to BIO-tagged samples
articles = fetch_news_data(api_key)
bio_data = prepare_bio_data(articles)

# 4. Initialise the BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# 5. Prepare the datasets
# Each sample is a list of (word, tag) pairs, so the same list is passed for
# both the texts and the tags argument of NERDataset.
train_data, val_data = train_test_split(bio_data, test_size=0.2)
train_dataset = NERDataset(train_data, train_data, tokenizer)
val_dataset = NERDataset(val_data, val_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# 6. Initialise the model
# num_labels=4 matches the four entries of NERDataset.tag2idx
# (O, B-ORG, I-ORG, PAD).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertNER(bert_model, num_labels=4).to(device)

# 7. Train the model
train_model(model, train_loader, val_loader, device)

Epoch 1, Loss: 0.8372131794691086
Validation Loss: 0.3999054829279582
Epoch 2, Loss: 0.27877880185842513
Validation Loss: 0.23904635508855185
Epoch 3, Loss: 0.1831450268626213
Validation Loss: 0.17095055679480234
