In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
import os
import jieba

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification

# Load the fine-tuned model and tokenizer
model = BertForSequenceClassification.from_pretrained('/content/drive/MyDrive/Lab1/fine-tuned-bert-environmental-sentiment')
tokenizer = BertTokenizer.from_pretrained('/content/drive/MyDrive/Lab1/fine-tuned-bert-environmental-sentiment')

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
def read_chinese_files(directory_path):
    texts = []
    filenames = []
    for filename in os.listdir(directory_path):
        if filename.endswith('.txt'):
            file_path = os.path.join(directory_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
                texts.append(text)
                filenames.append(filename)
    return texts, filenames

directory_path = './data/Analyze'
texts, filenames = read_chinese_files(directory_path)

In [None]:
def tokenize_with_jieba(text):
    tokens = jieba.lcut(text)
    return tokens

In [17]:
MAX_LEN = 128

In [18]:
def predict_weighted_score(text):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    outputs = model(input_ids, attention_mask)
    logits = outputs.logits

    # Apply softmax to get probabilities
    probabilities = F.softmax(logits, dim=1)

    # Compute weighted sum
    class_indices = torch.arange(3).to(device)  # [0, 1, 2]
    score = torch.sum(probabilities * class_indices, dim=1).item()
    return score

In [19]:
results = []

for text, filename in zip(texts, filenames):
    score = predict_weighted_score(text)
    results.append({'filename': filename, 'score': score})

In [30]:
results.sort(key=lambda x: int(x['filename'][:4]))

In [32]:
for result in results:
    print(f"File: {result['filename']}, Sentiment Score: {result['score']:.2f}")

File: 2012年中华人民共和国国务院政府工作报告.txt, Sentiment Score: 0.94
File: 2013年中华人民共和国国务院政府工作报告.txt, Sentiment Score: 1.04
File: 2014年中华人民共和国国务院政府工作报告.txt, Sentiment Score: 1.07
File: 2015年中华人民共和国国务院政府工作报告.txt, Sentiment Score: 1.18
File: 2016年中华人民共和国国务院政府工作报告.txt, Sentiment Score: 1.23
File: 2017年中华人民共和国国务院政府工作报告.txt, Sentiment Score: 1.22
File: 2018年中华人民共和国国务院政府工作报告.txt, Sentiment Score: 1.28
File: 2019年中华人民共和国国务院政府工作报告.txt, Sentiment Score: 1.29
File: 2020年中华人民共和国国务院政府工作报告.txt, Sentiment Score: 1.30
File: 2021年中华人民共和国国务院政府工作报告.txt, Sentiment Score: 1.28
File: 2022年中华人民共和国国务院政府工作报告.txt, Sentiment Score: 1.22


In [None]:
# Convert results to a DataFrame
df_results = pd.DataFrame(results)

# Save to CSV
df_results.to_csv('./data/sentiment_analysis_results.csv', index=False, encoding='utf-8-sig')