In [3]:
# pip install pandas numpy tqdm transformers torch

In [6]:
# !pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126

In [2]:
import torch

# Sanity check: confirm this PyTorch build can see CUDA and cuDNN.
cudnn_backend = torch.backends.cudnn
print("CUDA Available:", torch.cuda.is_available())
print("cuDNN Enabled:", cudnn_backend.enabled)
print("cuDNN Version:", cudnn_backend.version())



CUDA Available: True
cuDNN Enabled: True
cuDNN Version: 90501


In [5]:
import torch
# Print the version string of the installed PyTorch build.
print(torch.__version__)


2.6.0.dev20241221+cu126


In [1]:
# Smoke test: the import raises ImportError if the `ta` package is not installed,
# so reaching the print below means the package is importable in this kernel.
import ta
print("TA Library Imported Successfully")

TA Library Imported Successfully


In [7]:
import torch

# Check whether CUDA is usable from this kernel.
cuda_available = torch.cuda.is_available()
print("CUDA Available:", cuda_available)

# Report the name of GPU device 0, but only when CUDA is usable.
if cuda_available:
    print("GPU Name:", torch.cuda.get_device_name(0))

# Report cuDNN status and version.
cudnn_backend = torch.backends.cudnn
print("cuDNN Enabled:", cudnn_backend.enabled)
print("cuDNN Version:", cudnn_backend.version())



CUDA Available: True
GPU Name: NVIDIA GeForce RTX 3050 Laptop GPU
cuDNN Enabled: True
cuDNN Version: 90501


In [None]:
import os
import pandas as pd
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn.functional as F
import re

# Root folder that holds one sub-folder per date, each containing that day's article files.
# Set the IRTM_ARTICLES_DIR environment variable to run the notebook on another machine;
# the original local path is kept as the default so existing runs behave the same.
articles_dir = os.environ.get(
    'IRTM_ARTICLES_DIR',
    r'C:\Users\morri\Desktop\IRTM-project\articles',
)

# Load the tokenizer and sequence-classification model (FinBERT tone checkpoint).
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')

# Inference only: put the model in evaluation mode.
model.eval()

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# One record per date is appended here by the processing loop.
daily_sentiments = []

# 定義清理文本的函數
def clean_text(text):
    """Strip copyright boilerplate from an article and normalize its whitespace.

    Removes every span of the form ``© <4-digit year> ... All Rights Reserved.``
    (matched within a single line, case-sensitively), together with an
    immediately preceding ``&`` when present (e.g. ``TM & © 2024 ...``).
    Afterwards every run of whitespace is collapsed to one space and the
    result is stripped.

    Args:
        text: Raw article text.

    Returns:
        The cleaned text as a single-line string (possibly empty).
    """
    # A single pattern with an optional "&" prefix. Running the plain "©" pattern
    # first and the "& ©" pattern second would make the second one unreachable and
    # leave a dangling "&" behind in the text.
    text = re.sub(r'(?:\s*&\s*)?©\s*\d{4}.*?All Rights Reserved\.', '', text)
    # Collapse newlines, tabs and repeated spaces into single spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# 定義批次處理的函數
def process_batch(texts, tokenizer, model, device, max_length=512):
    """Score a batch of texts as P(positive) - P(negative).

    Args:
        texts: List of strings to score.
        tokenizer: Callable tokenizer; called with ``return_tensors='pt'`` and
            expected to return a mapping of tensors.
        model: Classifier whose output exposes ``.logits`` with one column per class.
        device: Torch device the input tensors are moved to before the forward pass.
        max_length: Maximum number of tokens per text; longer texts are truncated.

    Returns:
        1-D numpy array with one score per input text, in [-1, 1].
        An empty array is returned for an empty batch.
    """
    if not texts:
        return torch.empty(0).numpy()

    inputs = tokenizer(texts, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Resolve the class columns from the model's own label map instead of assuming
    # an order. The yiyanghkust/finbert-tone checkpoint uses 0=Neutral, 1=Positive,
    # 2=Negative, so column 0 is NOT the positive class. When the config carries no
    # usable label names, fall back to that documented mapping.
    id2label = getattr(getattr(model, 'config', None), 'id2label', None) or {}
    label_to_index = {str(label).lower(): int(index) for index, label in id2label.items()}
    positive_index = label_to_index.get('positive', 1)
    negative_index = label_to_index.get('negative', 2)

    with torch.no_grad():
        logits = model(**inputs).logits
        probs = F.softmax(logits, dim=1)

    # positive - negative
    return (probs[:, positive_index] - probs[:, negative_index]).cpu().numpy()

# Running total of articles scored across all dates.
global_article_count = 0

# A progress message is emitted roughly every `progress_interval` articles.
# Set this to 1000 for sparser output.
progress_interval = 100

# Names of the sub-directories of articles_dir, in lexicographic order.
# NOTE(review): this equals chronological order only if the folder names sort
# by date (e.g. YYYY-MM-DD) - confirm against the data layout.
date_folders = [
    entry
    for entry in os.listdir(articles_dir)
    if os.path.isdir(os.path.join(articles_dir, entry))
]
date_folders.sort()

# 使用外層 tqdm 顯示日期進度
# Number of articles per forward pass; lower it if the GPU runs out of memory.
batch_size = 32

# Outer progress bar: one step per date folder.
for date_folder in tqdm(date_folders, desc="Processing Dates"):
    date_path = os.path.join(articles_dir, date_folder)
    sentiments = []
    texts = []

    # Regular files of this date, in name order.
    article_files = sorted([file for file in os.listdir(date_path) if os.path.isfile(os.path.join(date_path, file))])

    # Inner progress bar: one step per article file of the current date.
    for article_file in tqdm(article_files, desc=f"Processing {date_folder}", leave=False):
        article_path = os.path.join(date_path, article_file)
        try:
            with open(article_path, 'r', encoding='utf-8') as f:
                text = f.read()
        except (OSError, UnicodeError) as e:
            # Unreadable or non-UTF-8 files are reported and skipped so that one bad
            # file does not abort the run. tqdm.write keeps the progress bars intact.
            tqdm.write(f"Error processing {article_path}: {e}")
            continue

        cleaned_text = clean_text(text)

        # Skip articles with fewer than 10 characters left after cleaning.
        if len(cleaned_text) < 10:
            continue
        texts.append(cleaned_text)

    # Score the day's articles in batches.
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        batch_sentiments = process_batch(batch_texts, tokenizer, model, device)
        sentiments.extend(batch_sentiments)

        previous_article_count = global_article_count
        global_article_count += len(batch_texts)

        # Report only when this batch crossed a multiple of progress_interval.
        # A `count % interval < batch_size` test fires again for every short
        # end-of-day batch that lands in the low residues without crossing a new
        # multiple, which produces duplicate messages.
        if global_article_count // progress_interval > previous_article_count // progress_interval:
            tqdm.write(f'Processed {global_article_count} articles so far.')

    # Mean score of the day; 0 is used as a placeholder when no article was scored
    # (num_articles == 0 distinguishes that case from a genuinely neutral day).
    if sentiments:
        average_sentiment = sum(sentiments) / len(sentiments)
    else:
        average_sentiment = 0

    daily_sentiments.append({
        'date': date_folder,
        'average_sentiment': average_sentiment,
        'num_articles': len(sentiments)
    })

# Destination of the per-date summary, relative to the current working directory.
output_csv = 'daily_sentiments.csv'

# One row per date: date, average_sentiment, num_articles.
sentiment_df = pd.DataFrame(daily_sentiments)

# Persist the summary without the DataFrame index column.
sentiment_df.to_csv(output_csv, index=False)

print("Completed processing all articles.")
