In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import torch.nn.functional as F
from tqdm.auto import tqdm
import time

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Load the labelled news spreadsheet used for fine-tuning; later cells read its
# 'NewsContent' and 'label' columns.
# NOTE(review): absolute Google Drive path — this cell only runs where that mount exists.
df = pd.read_excel('/content/drive/MyDrive/DSAA5002PROJECT/Task1_data/labelnews.xlsx')

In [None]:
# Import model and tokenizer: pretrained Chinese BERT with a two-class
# sequence-classification head.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)
# Module.to() moves the parameters in place and returns the module itself;
# binding the result keeps the cell from echoing the whole model repr as output.
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Split dataset: hold out 20% of the rows as the test set; random_state fixes the shuffle.
# NOTE(review): the split is not stratified on 'label' — consider
# stratify=df['label'] if the classes are imbalanced; confirm before changing.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=0)

In [None]:
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one news text per item.

    Args:
        texts: Indexable, sized sequence of texts.
        labels: Indexable sequence of labels parallel to ``texts``. Individual
            entries may be ``None``, and the whole argument may be ``None``
            for unlabelled (prediction-only) data.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_length: Length every sequence is padded/truncated to. Defaults to
            512, the value that used to be hard-coded.

    Raises:
        ValueError: If ``labels`` is given and its length differs from
            ``texts``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        # Fail early instead of silently pairing texts with the wrong labels
        # (or hitting an IndexError in the middle of an epoch).
        if labels is not None and len(labels) != len(texts):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and, when a label exists, 'labels'."""
        text = self.texts[idx]
        label = None if self.labels is None else self.labels[idx]

        # Encode the text using the provided tokenizer
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        # flatten() collapses the encoder output to 1-D so the DataLoader can
        # stack items into a batch.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

        # Only add a label if it's not None
        if label is not None:
            item['labels'] = torch.tensor(label, dtype=torch.long)

        return item

# Wrap both splits as datasets. `.values` hands NewsDataset plain NumPy arrays, so
# items are looked up by position rather than by the shuffled DataFrame index.
train_dataset = NewsDataset(train_df['NewsContent'].values, train_df['label'].values, tokenizer)
test_dataset = NewsDataset(test_df['NewsContent'].values, test_df['label'].values, tokenizer)

# Shuffle only the training batches; the test loader keeps its row order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
# Class-weighted cross-entropy: class 0 is weighted 8x relative to class 1.
# The weight tensor is created directly on the target device.
weights = torch.tensor([8.0, 1.0], dtype=torch.float, device=device)
loss_fn = nn.CrossEntropyLoss(weight=weights)

def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples, loss_fn):
    """Run one training pass over ``data_loader`` and report accuracy and loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        data_loader: Iterable of dict batches holding "input_ids",
            "attention_mask" and "labels" tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch, right after
            the optimizer.
        n_examples: Denominator used for the accuracy.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss.

    Returns:
        ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-d double tensor
        (correct predictions / ``n_examples``) and ``mean_loss`` is the mean of
        the per-batch losses, or NaN when the loader yields no batches.
    """
    model = model.train()
    losses = []
    # A tensor accumulator (rather than the int 0) keeps `.double()` below
    # valid even when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions.double() / n_examples, mean_loss

def eval_model(model, data_loader, device, n_examples, loss_fn):
    """Evaluate ``model`` over ``data_loader`` without tracking gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        data_loader: Iterable of dict batches holding "input_ids",
            "attention_mask" and "labels" tensors.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss.

    Returns:
        ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-d double tensor
        (correct predictions / ``n_examples``) and ``mean_loss`` is the mean of
        the per-batch losses, or NaN when the loader yields no batches.
    """
    model = model.eval()
    losses = []
    # A tensor accumulator (rather than the int 0) keeps `.double()` below
    # valid even when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = loss_fn(logits, labels)

            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    mean_loss = np.mean(losses) if losses else float("nan")
    return correct_predictions.double() / n_examples, mean_loss

In [None]:
# Optimizer plus a linear learning-rate decay (no warmup) spanning every training step.
# NOTE(review): `AdamW` here is the one imported from transformers; newer
# transformers releases deprecate it in favour of torch.optim.AdamW — confirm
# against the installed version before re-running.
optimizer = AdamW(model.parameters(), lr=1e-5, correct_bias=False, weight_decay=0.05)
epoch_name = 8  # number of training epochs
total_steps = len(train_loader) * epoch_name  # train_epoch steps the scheduler once per batch
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

for epoch in range(epoch_name):
    print(f'Epoch {epoch + 1}/{epoch_name}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(model, train_loader, optimizer, device, scheduler, len(train_dataset), loss_fn)
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # NOTE(review): the held-out test split doubles as the validation set here,
    # so the "Val" numbers are not independent of the final test accuracy.
    val_acc, val_loss = eval_model(model, test_loader, device, len(test_dataset), loss_fn)
    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

Epoch 1/8
----------
Train loss 0.7904914429313258 accuracy 0.6270627062706271
Val   loss 0.6552443504333496 accuracy 0.2631578947368421

Epoch 2/8
----------
Train loss 0.5736358761787415 accuracy 0.7128712871287128
Val   loss 0.7429872155189514 accuracy 0.513157894736842

Epoch 3/8
----------
Train loss 0.49836251218067973 accuracy 0.8316831683168316
Val   loss 0.4996787667274475 accuracy 0.8157894736842105

Epoch 4/8
----------
Train loss 0.2635927931650689 accuracy 0.9273927392739274
Val   loss 0.43347851037979124 accuracy 0.8684210526315789

Epoch 5/8
----------
Train loss 0.13972282600834182 accuracy 0.9735973597359736
Val   loss 1.2540863394737243 accuracy 0.894736842105263

Epoch 6/8
----------
Train loss 0.054873995556447064 accuracy 0.9834983498349835
Val   loss 1.4639798045158385 accuracy 0.9078947368421052

Epoch 7/8
----------
Train loss 0.020707314123252506 accuracy 0.9933993399339934
Val   loss 1.6108197450637818 accuracy 0.9210526315789473

Epoch 8/8
----------
Train lo

In [None]:
# Evaluating model performance: accuracy of the fine-tuned model on the held-out test split.
# eval_model returns the accuracy as a 0-d tensor, so .item() converts it to a
# plain Python float for printing; the loss is discarded.
test_acc, _ = eval_model(model, test_loader, device, len(test_dataset), loss_fn)
print(f'Test Accuracy: {test_acc.item()}')

Test Accuracy: 0.9210526315789473


In [None]:
def get_predictions(model, data_loader, device):
    """Predict a class for every labelled example in ``data_loader``.

    NOTE: a later cell redefines ``get_predictions`` with a predictions-only
    return value; this version applies until that cell has run.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        data_loader: Iterable of dict batches holding "input_ids",
            "attention_mask" and "labels" tensors.
        device: Device the model inputs are moved to.

    Returns:
        ``(predictions, real_values)``: two CPU tensors with one entry per
        example, in loader order. Both are empty when the loader yields no
        batches.
    """
    model = model.eval()
    predictions = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            inputs = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)

            outputs = model(input_ids=inputs, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)

            # Keep one tensor per batch: list.extend(tensor) would split every
            # batch into 0-d tensors that then have to be re-stacked one by one.
            predictions.append(preds.cpu())
            # Labels are only collected, never fed to the model, so they do
            # not need a round trip through `device`.
            real_values.append(d["labels"].cpu())

    # torch.cat / torch.stack reject an empty list, so handle "no batches" explicitly.
    if not predictions:
        return torch.empty(0, dtype=torch.long), torch.empty(0, dtype=torch.long)

    return torch.cat(predictions), torch.cat(real_values)

# Collect per-example predictions and true labels for the test split.
test_preds, test_labels = get_predictions(model, test_loader, device)
# A bare f-string in the middle of a cell is evaluated and discarded, so it
# has to be printed for the accuracy to appear in the output.
print(f'Test Accuracy: {test_acc.item()}')
print(f'Predict labels: {test_preds}')
print(f'True labels: {test_labels}')

Predict labels: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
        1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
        1, 1, 1, 1])
True labels: tensor([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
        1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
        1, 1, 1, 1])


In [None]:
# Element-wise difference between predictions and true labels (classes are 0/1):
# 0 = correct, 1 = predicted 1 where the truth is 0, -1 = predicted 0 where the truth is 1.
test_preds-test_labels

tensor([ 0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,
         1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0, -1,  0,  0,  0,
         0,  0,  0,  0])

In [None]:
# Load the unlabelled news to classify; later cells use its 'NewsContent',
# 'NewsID' and 'Explicit_Company' columns.
df_prepare = pd.read_excel('/content/drive/MyDrive/DSAA5002PROJECT/Task1_data/Flitered/Task1_part1.xlsx')

In [None]:
# Create a new DataFrame to hold the predictions
# Cast to str so every entry handed to the tokenizer is a string; note that this
# overwrites the column on df_prepare itself.
df_prepare['NewsContent'] = df_prepare['NewsContent'].astype(str)
new_df = df_prepare[['NewsContent']].copy()
new_df['label'] = None  # placeholder; overwritten with the model predictions below

# Create a new Dataset for forecasting. Every label is None, so NewsDataset
# returns items without a 'labels' key.
predict_dataset = NewsDataset(new_df['NewsContent'].values, [None]*len(new_df), tokenizer)
predict_loader = DataLoader(predict_dataset, batch_size=16, shuffle=False)

# Get Forecasts
def get_predictions(model, data_loader, device):
    """Predict a class for every unlabelled example in ``data_loader``.

    NOTE: this redefinition replaces the earlier ``get_predictions`` that also
    returned the true labels; from here on only predictions are returned.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``.
        data_loader: Iterable of dict batches holding "input_ids" and
            "attention_mask" tensors.
        device: Device the model inputs are moved to.

    Returns:
        CPU tensor with one predicted class per example, in loader order;
        empty when the loader yields no batches.
    """
    model = model.eval()
    predictions = []

    with torch.no_grad():
        for d in tqdm(data_loader, desc="Predicting", unit="batch", leave=False):
            inputs = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)

            outputs = model(input_ids=inputs, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)

            # Keep one tensor per batch: list.extend(tensor) would create one
            # 0-d tensor per row and force a huge torch.stack at the end.
            predictions.append(preds.cpu())

    # torch.cat / torch.stack reject an empty list, so handle "no batches" explicitly.
    if not predictions:
        return torch.empty(0, dtype=torch.long)

    return torch.cat(predictions)

# Time the full prediction pass over the unlabelled news.
start_time = time.time()
predictions = get_predictions(model, predict_loader, device)
end_time = time.time()

# The loader is not shuffled, so predictions line up with new_df row-for-row.
new_df['label'] = predictions.numpy()

# Print forecast results and times
print(f"Prediction time: {end_time - start_time:.2f}s")
# Only a cell's last expression is rendered; in mid-cell this preview was
# evaluated and thrown away.
new_df.head()

Predicting:   0%|          | 0/32950 [00:00<?, ?batch/s]

Prediction time: 4828.03s


In [None]:
# `encoding` was never used by the Excel writers; pandas deprecated the keyword
# in 1.5 and rejects it in 2.x, so it is dropped here.
new_df.to_excel('/content/drive/MyDrive/DSAA5002PROJECT/Task1_data/Flitered/Task1_answer.xlsx')

  return func(*args, **kwargs)


In [None]:
# Bring the identifier and company columns over from the source frame; the
# assignment aligns on the shared row index.
new_df['NewsID'] = df_prepare['NewsID'].copy()
new_df['Explicit_Company'] = df_prepare['Explicit_Company'].copy()

In [None]:
# Preview the frame now that the label, NewsID and Explicit_Company columns are attached.
new_df

Unnamed: 0,NewsContent,label,NewsID,Explicit_Company
0,本报记者 田雨 李京华 中国建设银行股份有限公司原董事长张恩照受贿案３日一审宣...,0,1,建设银行
1,中国农业银行信用卡中心由北京搬到上海了！ 农行行长杨明生日前在信用卡中心揭牌仪式上...,1,2,农业银行
2,在新基金快速发行以及申购资金回流的情况下，市场总体上呈现资金流动性过剩格局，考虑到现阶段...,1,3,"中国国航,外运发展"
3,胜利股份（000407）公司子公司填海造地2800亩，以青岛的地价估算，静态价值在10亿...,1,4,胜利股份
4,全景网11月30日讯 外围股市造好，带动港股今早造好，恒指高开后反覆上升，最高升252点...,1,5,"中国银行,建设银行,工商银行,中国太保,交通银行,中国人寿,招商银行"
...,...,...,...,...
527181,每经AI快讯，有投资者在投资者互动平台提问：请问公司目前有没有电解槽产能，规划情况能否详细介...,0,1037031,亿华通
527182,依米康（SZ 300249，收盘价：10.38元）发布公告称，2023年10月12日，依米康...,1,1037032,"中泰证券,依米康"
527183,天风证券10月13日发布研报称，给予中核科技（000777.SZ，最新价：13.03元）买入...,1,1037033,"中核科技,天风证券"
527184,有投资者提问：抗癌药CPT获批后，公司是否应该按照股权协议继续收购沙东股权，适应症为MM的C...,1,1037034,海特生物


In [None]:
# Column order for the exported file: identifier first, predicted label last.
new_order = ['NewsID', 'NewsContent', 'Explicit_Company', 'label']

# Rearranging the columns of a DataFrame with a new column order
# (selecting with a list returns a new frame; new_df itself is left unchanged).
new_df1 = new_df[new_order]

new_df1

Unnamed: 0,NewsID,NewsContent,Explicit_Company,label
0,1,本报记者 田雨 李京华 中国建设银行股份有限公司原董事长张恩照受贿案３日一审宣...,建设银行,0
1,2,中国农业银行信用卡中心由北京搬到上海了！ 农行行长杨明生日前在信用卡中心揭牌仪式上...,农业银行,1
2,3,在新基金快速发行以及申购资金回流的情况下，市场总体上呈现资金流动性过剩格局，考虑到现阶段...,"中国国航,外运发展",1
3,4,胜利股份（000407）公司子公司填海造地2800亩，以青岛的地价估算，静态价值在10亿...,胜利股份,1
4,5,全景网11月30日讯 外围股市造好，带动港股今早造好，恒指高开后反覆上升，最高升252点...,"中国银行,建设银行,工商银行,中国太保,交通银行,中国人寿,招商银行",1
...,...,...,...,...
527181,1037031,每经AI快讯，有投资者在投资者互动平台提问：请问公司目前有没有电解槽产能，规划情况能否详细介...,亿华通,0
527182,1037032,依米康（SZ 300249，收盘价：10.38元）发布公告称，2023年10月12日，依米康...,"中泰证券,依米康",1
527183,1037033,天风证券10月13日发布研报称，给予中核科技（000777.SZ，最新价：13.03元）买入...,"中核科技,天风证券",1
527184,1037034,有投资者提问：抗癌药CPT获批后，公司是否应该按照股权协议继续收购沙东股权，适应症为MM的C...,海特生物,1


In [None]:
# `encoding` was never used by the Excel writers; pandas deprecated the keyword
# in 1.5 and rejects it in 2.x, so it is dropped here. index=False leaves the
# row index out of the exported sheet.
new_df1.to_excel('/content/drive/MyDrive/DSAA5002PROJECT/Task1_data/Flitered/Task1.xlsx', index=False)

  return func(*args, **kwargs)


In [None]:
# Sanity check: the exported frame's text column must still match the source
# frame row-for-row, so each label sits next to the text it was predicted from.
are_columns_equal = new_df1['NewsContent'].equals(df_prepare['NewsContent'])

print('Are the NewsContent columns equal?', are_columns_equal)

Are the NewsContent columns equal? True
