## 安裝套件

In [None]:
!pip install jieba
!pip install emoji langdetect
!pip install datasets
!pip install lingua-language-detector

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=4e1609d1c583813437d8d71d8981c298c515062526bf51e2e900b0b88f40d93f
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd6272c167640072e
Successfully built langdetect
Installing collected packages: langdetect, emoji
Successfully installed emoji-2.14.

In [None]:
import sys
!{sys.executable} -m pip install jieba emoji langdetect pytz torch lingua-language-detector datasets openpyxl

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

## 引入套件

In [None]:
import pandas as pd
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np

import re
import emoji
import langdetect
from langdetect import detect
from lingua import LanguageDetectorBuilder, Language, IsoCode639_1
from datetime import datetime
import pytz
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModel
import torch

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Base folder on the mounted Google Drive; later cells are meant to build
# their file paths relative to it.
base_path = '/content/drive/My Drive/SMA'

Mounted at /content/drive


In [None]:
# Load the raw Threads export (make sure the Excel path is right).
# The workbook is looked up in the working directory first; when it is not
# there (the FileNotFoundError recorded below this cell), fall back to the
# mounted Drive folder `base_path` set in the drive-mount cell.
try:
    df = pd.read_excel("threads.xlsx", engine='openpyxl')
except FileNotFoundError:
    df = pd.read_excel(base_path + "/threads.xlsx", engine='openpyxl')
df

FileNotFoundError: [Errno 2] No such file or directory: 'threads.xlsx'

In [None]:
# === Language detection (revised version) ===
# Build one shared lingua detector over all languages, with the language
# models preloaded at build time (see the builder calls below).
lingua_detector = LanguageDetectorBuilder.from_all_languages().with_preloaded_language_models().build()
# Set unconditionally; none of the visible cells reads this flag.
lingua_available = True
def detect_lang_with_preprocessing_lingua(text):
    """Guess the language of one post after stripping noise.

    Returns:
        "unknown"                   - missing/blank input, no detection, or a detection error
        "error_state_preprocessing" - the cleaning step raised
        "empty_after_clean"         - nothing is left after cleaning
        "Ch"                        - more than 30% CJK ideographs, or lingua reports ZH
        otherwise the lower-cased ISO 639-1 code reported by ``lingua_detector``.
    """
    if pd.isna(text):
        return "unknown"
    stripped = str(text).strip()
    if stripped == "":
        return "unknown"

    # Remove URLs, @mentions, #hashtags and emoji, then collapse whitespace.
    try:
        cleaned = re.sub(r'http\S+|www\S+|https\S+', '', stripped, flags=re.MULTILINE)
        for noise_pattern in (r'@\w+', r'#\w+'):
            cleaned = re.sub(noise_pattern, '', cleaned)
        cleaned = emoji.replace_emoji(cleaned, replace='')
        cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    except Exception:
        return "error_state_preprocessing"

    if len(cleaned) == 0:
        return "empty_after_clean"

    try:
        # Shortcut: a high share of CJK ideographs is labelled Chinese
        # without consulting the detector.
        han_count = len(re.findall(r'[\u4e00-\u9fff]', cleaned))
        if han_count / max(len(cleaned), 1) > 0.3:
            return "Ch"

        language = lingua_detector.detect_language_of(cleaned)
        if language is None:
            return "unknown"
        code = language.iso_code_639_1.name
        return "Ch" if code == 'ZH' else code.lower()
    except Exception:
        return "unknown"

## 清洗數據V1

In [None]:
# === 數值欄位清洗（萬字、逗號格式處理）===
def parse_count(value):
    """Convert a scraped engagement count into an int.

    Accepts plain numbers, thousands separators ("1,234") and the "萬"
    (x10,000) suffix ("1.2萬").

    Args:
        value: raw cell value (str, number, or missing).

    Returns:
        int: the parsed count; 0 for missing or unparseable input.
    """
    if pd.isna(value):
        return 0
    text = str(value).replace(",", "")
    try:
        if "萬" in text:
            # round() rather than int(): a float product can land a hair below
            # the intended integer, and int() would truncate it by one.
            return round(float(text.replace("萬", "")) * 10000)
        return int(float(text))
    except (ValueError, OverflowError):
        # Malformed text ("abc", "abc萬"), nan or inf: treat as no count.
        return 0

# Normalise every engagement counter to a plain integer.
count_columns = ("like_count", "view_count", "share_count", "repost_count", "reply_count")
for col in count_columns:
    df[col] = df[col].apply(parse_count)

# === Boolean columns: "Y" (any case, surrounding blanks ignored) -> True ===
for flag_column in ("has_photo", "has_video"):
    df[flag_column] = df[flag_column].apply(lambda raw: str(raw).strip().upper() == "Y")

# === emoji 萃取與統計 ===
def extract_emojis(text):
    """Return the characters of ``text`` that are keys of ``emoji.EMOJI_DATA``,
    joined in their original order; non-string input yields ""."""
    if isinstance(text, str):
        return "".join(filter(lambda ch: ch in emoji.EMOJI_DATA, text))
    return ""

# Emoji characters found in each post, and how many characters that string has.
df["emojis"] = df["content"].apply(extract_emojis)
df["emoji_count"] = df["emojis"].apply(len)

# # === Language detection (earlier version, kept for reference) ===
# def detect_lang_custom(text):
#     try:
#         text = str(text)
#         chinese_chars = re.findall(r'[\u4e00-\u9fff]', text)
#         if len(chinese_chars) / max(len(text), 1) > 0.3:
#             return "Ch"
#         return detect(text)
#     except:
#         return "unknown"

df["lang"] = df["content"].apply(detect_lang_with_preprocessing_lingua)

# === scrape_time handling (convert time zone + extract weekday and hour) ===
# NOTE(review): "scrape_time" is overwritten with a formatted string below, so
# re-running this cell feeds that string back into pd.to_datetime — confirm the
# cell is only run once per load.
# NOTE(review): post_weekday / post_hour are derived from the scrape time; no
# separate posting-time column is visible here.
df["scrape_time_origin"] = pd.to_datetime(df["scrape_time"], utc=True).dt.tz_convert("Asia/Taipei")
df["scrape_time"]  = df["scrape_time_origin"].dt.strftime("%Y年%m月%d日 %H:%M")
df["post_weekday"] = df["scrape_time_origin"].dt.day_name()
df["post_hour"] = df["scrape_time_origin"].dt.hour

# === High-traffic flag: 1 when view_count is at least 10,000 ===
df["viral"] = (df["view_count"] >= 10000).astype(int)

# === Whether the post contains a question mark / exclamation mark (full- or half-width) ===
df["has_question"] = df["content"].apply(lambda x: "？" in str(x) or "?" in str(x))
df["has_exclaim"] = df["content"].apply(lambda x: "！" in str(x) or "!" in str(x))

# === Save the result ===
df.to_csv("threads_cleaned_v1.csv",encoding='utf_8_sig', index=False)
print("✅ 處理完成，已輸出 threads_cleaned_v1.csv")
display(df)


## 清洗數據V2

In [None]:
# --- Post length: character count of str(content) (a missing value is measured as its string form) ---
df["content_length"] = df["content"].apply(lambda x: len(str(x)))

# --- Whether the post contains a URL (substring test for "http" or "www.") ---
df["has_url"] = df["content"].apply(lambda x: "http" in str(x) or "www." in str(x))

# --- Whether the post mentions someone (contains "@") ---
df["has_mention"] = df["content"].apply(lambda x: "@" in str(x))

# --- Whether the post uses a hashtag (contains "#") ---
df["has_hashtag"] = df["content"].apply(lambda x: "#" in str(x))

# Topic-word extraction (can feed TF-IDF or topic modelling later).
# NOTE(review): CountVectorizer is already imported in the import cell.
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): the text is passed unsegmented and with English stop words;
# for mostly-Chinese posts this presumably yields few useful terms — confirm.
vectorizer = CountVectorizer(max_features=100, stop_words='english')
word_matrix = vectorizer.fit_transform(df['content'].astype(str))

# Vocabulary kept by the vectorizer (not used again in the visible cells).
keywords = vectorizer.get_feature_names_out()

# 是否為深夜或白天貼文（時間段分類）
def time_period(hour):
    """Bucket an hour of day into a named period.

    [5, 12) -> "morning", [12, 17) -> "afternoon", [17, 22) -> "evening";
    anything else (including values that compare false, e.g. NaN) -> "night".
    """
    day_parts = (
        (5, 12, "morning"),
        (12, 17, "afternoon"),
        (17, 22, "evening"),
    )
    for start, end, label in day_parts:
        if start <= hour < end:
            return label
    return "night"

# Time-of-day bucket for each row, derived from post_hour.
df["post_period"] = df["post_hour"].apply(time_period)

# Move the most-read columns to the front; all remaining columns keep their order.
cols_to_show_first = ['author', 'content', 'content_length', 'lang', 'scrape_time', 'post_weekday', 'post_hour', 'post_period', 'viral']
df = df[cols_to_show_first + [col for col in df.columns if col not in cols_to_show_first]]
df.to_csv("threads_cleaned_v2.csv",encoding='utf_8_sig',index=False)
print("✅ 處理完成，已輸出 threads_cleaned_v2.csv")
display(df)

## 清洗數據 embedding

In [None]:
from datasets import Dataset

# =============== BERT embedding vectors ===============
df = df.dropna(subset=['content']) # rows without content must be dropped before embedding (index is not reset here)
# --- Load tokenizer & model ---
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
model = AutoModel.from_pretrained("bert-base-chinese")
# Prefer Apple MPS, then CUDA, then CPU.
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# --- Build a HuggingFace Dataset holding only the content column ---
hf_dataset = Dataset.from_pandas(df[["content"]])

# --- tokenize function: pad/truncate every post to exactly 128 tokens ---
def tokenize_function(examples):
    return tokenizer(examples['content'], truncation=True, padding='max_length', max_length=128)
tokenized_dataset = hf_dataset.map(tokenize_function, batched=True)

# --- 取得 [CLS] 向量 ---
def extract_embeddings(batch):
    """Encode one tokenized batch and return the hidden state at sequence
    position 0 for every row, as a NumPy array under the key "embeddings"."""
    model_inputs = {}
    for key, values in batch.items():
        # Forward only the columns the tokenizer declares as model inputs.
        if key in tokenizer.model_input_names:
            model_inputs[key] = torch.tensor(values).to(model.device)
    with torch.no_grad():
        encoded = model(**model_inputs)
    first_token_states = encoded.last_hidden_state[:, 0, :]
    return {"embeddings": first_token_states.detach().cpu().numpy()}

# --- Convert to embeddings batch by batch ---
batch_size = 64
embeddings_dataset = tokenized_dataset.map(extract_embeddings, batched=True, batch_size=batch_size)

# =============== Export the final result ===============
# embeddings_dataset["embeddings"] holds one vector per row
# (presumably 768-dim for bert-base-chinese — confirm).
embedding_df = pd.DataFrame(embeddings_dataset["embeddings"])
# Index is reset so rows line up positionally with the embedding frame.
final_df = pd.concat([df.reset_index(drop=True), embedding_df], axis=1)

# Save
# final_df.to_csv("C:/Users/User/Desktop/louis/threads_with_embeddings.csv",encoding='utf_8_sig', index=False)
final_df.to_csv("threads_with_embeddings.csv",encoding='utf_8_sig', index=False)
print("✅ 全部處理完成，已輸出 threads_with_embeddings.csv")

## 分詞

In [None]:
# Minimal stop-word list of common Chinese function words.
stopwords = {'的', '了', '在', '是', '和', '也', '與', '有', '為', '等'}


def tokenize_and_remove_stopwords(text):
    """Segment ``text`` with jieba, drop tokens listed in ``stopwords`` and
    return the rest joined by single spaces; non-string input yields ''."""
    if isinstance(text, str):
        return ' '.join(token for token in jieba.cut(text) if token not in stopwords)
    return ''


df['processed_content'] = df['content'].apply(tokenize_and_remove_stopwords)
# Peek at the third row by position. A label lookup (`[2]`) can raise KeyError
# here because the embedding cell drops rows without resetting the index.
df['processed_content'].iloc[2]

## 機器學習建模

In [None]:
# Compute TF-IDF over the space-joined jieba tokens.
# NOTE(review): with the default token pattern scikit-learn keeps only tokens
# of two or more word characters, so single-character Chinese words are
# presumably dropped — confirm this is intended.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_content'])

# Compute raw term frequencies (TF) with the same default tokenisation.
tf_vectorizer = CountVectorizer()
tf_matrix = tf_vectorizer.fit_transform(df['processed_content'])

print(tfidf_matrix)

# 多模型分類實驗

In [None]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler, random_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel
# Training device for the classification experiments (replaces the `device`
# string set in the embedding cell).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): the visible training code hard-codes batch size 8 and 5 epochs
# rather than reading these two values.
batch_size = 8
num_epochs = 5
# ========== Parameter settings ==========
# Experiment name -> tokenizer checkpoint; None means the model uses no text input.
model_tokenizer_map = {
    "FusionMacBERT": "hfl/chinese-macbert-base",
    "PureMacBERT": "hfl/chinese-macbert-base",
    "NumericOnly": None,
    "BiLSTMWithNumeric": "bert-base-chinese",
    "MacBERTWithGRU": "hfl/chinese-macbert-base",
    "MacBERTMLPFusion": "hfl/chinese-macbert-base",
    "TextCNNMacBERT": "hfl/chinese-macbert-base",
    "RoBERTa": "hfl/chinese-roberta-wwm-ext",
    "BERTwwmExt": "hfl/chinese-bert-wwm-ext",
    "ERNIE": "nghuyong/ernie-3.0-base-zh",
    "ConvBERT": "YituTech/conv-bert-base"
}

# Default tokenizer: the one registered for FusionMacBERT.
default_tokenizer_name = model_tokenizer_map["FusionMacBERT"]
tokenizer = AutoTokenizer.from_pretrained(default_tokenizer_name)

# Load data (currently reuses the in-memory `df` from the cleaning cells).
# df = pd.read_csv("C:/Users/User/Desktop/louis/threads_cleaned_v2.csv", encoding='utf_8_sig')
#df = df.dropna(subset=['content', 'view_count']).reset_index(drop=True)
df

# Label 分群 (用第 20／80 百分位數分三群)

In [None]:
# Label grouping: turn view_count into a class label.
# The 80th percentile of view_count is the high-popularity threshold (q_high)
# and the 20th percentile the low-popularity threshold (q_low).
# Each row becomes "high", "low" or "medium". LabelEncoder numbers the classes
# in sorted order, i.e. high -> 0, low -> 1, medium -> 2.
q_high = df['view_count'].quantile(0.80)
q_low = df['view_count'].quantile(0.20)
df['view_class'] = df['view_count'].apply(lambda x: "high" if x >= q_high else ("low" if x <= q_low else "medium"))
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['view_class'])

# Oversampling for class balance: duplicate the high / low rows.
df_high = df[df['view_class'] == 'high']
df_low = df[df['view_class'] == 'low']
df_medium = df[df['view_class'] == 'medium']

# high and low are each repeated three times to bring their size closer to
# medium, then the whole frame is shuffled so row order carries no signal.
# NOTE(review): duplication happens before the train/test split in a later
# cell, so copies of one post can land on both sides — confirm acceptable.
# NOTE(review): the shuffle has no random_state, and `df` is overwritten, so
# re-running this cell oversamples again.
df_high_oversampled = pd.concat([df_high] * 3, ignore_index=True)
df_low_oversampled = pd.concat([df_low] * 3, ignore_index=True)
df = pd.concat([df_medium, df_high_oversampled, df_low_oversampled], ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True)


In [None]:
q_high

In [None]:
q_low

## Label 分群（1000 以下、1000~10000、10000~100000、100000 以上）

In [None]:
# Split view_count into four named tiers:
#   'low'       : below 1,000
#   'medium'    : 1,000 - 9,999
#   'high'      : 10,000 - 99,999
#   'very_high' : 100,000 and above

def map_view_class(x):
    """Return the popularity tier name for a view count ``x``.

    Values that fall under no upper bound (including ones for which every
    comparison is false, e.g. NaN) map to 'very_high'.
    """
    tier_bounds = ((1000, 'low'), (10000, 'medium'), (100000, 'high'))
    for upper_bound, tier in tier_bounds:
        if x < upper_bound:
            return tier
    return 'very_high'

df['view_class'] = df['view_count'].apply(map_view_class)

# Encode the tier names as integer labels (LabelEncoder numbers them in sorted
# order: high -> 0, low -> 1, medium -> 2, very_high -> 3).
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['view_class'])

# NOTE(review): the variable names are shifted one tier relative to the
# view_class value they select (df_high holds 'very_high', df_medium holds
# 'high', df_low holds 'medium', df_very_low holds 'low').
df_high = df[df['view_class'] == 'very_high']
df_medium = df[df['view_class'] == 'high']
df_low = df[df['view_class'] == 'medium']
df_very_low = df[df['view_class'] == 'low']

# Oversample the classes assumed to be rarer: the 'very_high' and 'low' rows
# are each repeated three times.
# NOTE(review): if the percentile-labelling cell above was also run, `df` has
# already been oversampled once — confirm only one labelling cell is used.
df_high_oversampled = pd.concat([df_high] * 3, ignore_index=True)
df_very_low_oversampled = pd.concat([df_very_low] * 3, ignore_index=True)

# Merge and shuffle (no random_state, so the order differs between runs).
df = pd.concat([df_medium, df_low, df_high_oversampled, df_very_low_oversampled], ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True)

df

In [None]:
class RegressionDataset(Dataset):
    """Dataset yielding token ids, attention mask, numeric features and a
    float regression target for each row.

    Reads the columns 'input_ids', 'attention_mask', 'target' and the
    module-level ``num_cols`` columns from the given frame.
    """

    def __init__(self, df):
        self.input_ids = df['input_ids'].tolist()
        self.attention_mask = df['attention_mask'].tolist()
        self.numerics = df[num_cols].values
        self.targets = df['target'].values

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        sample = {}
        sample['input_ids'] = torch.tensor(self.input_ids[idx], dtype=torch.long)
        sample['attention_mask'] = torch.tensor(self.attention_mask[idx], dtype=torch.long)
        sample['numerics'] = torch.tensor(self.numerics[idx], dtype=torch.float)
        sample['target'] = torch.tensor(self.targets[idx], dtype=torch.float)
        return sample


# Normalization 數值特徵標準化

In [None]:
# Normalization: standardise the numeric features.
base_num_cols = ['like_count', 'share_count', 'repost_count', 'reply_count', 'emoji_count', 'has_photo', 'has_video', 'has_question', 'has_exclaim', 'has_mention', 'has_url', 'has_hashtag', 'content_length']
# Pick up one-hot columns by prefix (language, posting period, weekday).
# NOTE(review): no visible cell creates lang_* / post_period_* /
# post_weekday_* columns, so this list may be empty — confirm.
# StandardScaler rescales each column to mean 0 / std 1 to stabilise training.
# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split done in a later cell — confirm this is acceptable.
onehot_cols = [col for col in df.columns if col.startswith('lang_') or col.startswith('post_period_') or col.startswith('post_weekday_')]
num_cols = base_num_cols + onehot_cols
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Tokenizer: encode every post with the currently selected tokenizer,
# padded/truncated to 128 tokens. The two resulting columns are stored on df
# and used as the BERT inputs.
encodings = tokenizer(df['content'].tolist(), truncation=True, padding='max_length', max_length=128)
df['input_ids'] = encodings['input_ids']  # vocabulary id of each token
df['attention_mask'] = encodings['attention_mask']  # 0 marks padding, 1 marks real content

In [None]:
df

In [None]:
# Dataset
class CustomDataset(Dataset):
    """Classification dataset: token ids, attention mask, numeric features
    and an integer class label per row.

    Reads the columns 'input_ids', 'attention_mask', 'label' and the
    module-level ``num_cols`` columns; values are copied at construction.
    """

    def __init__(self, df):
        self.input_ids = df['input_ids'].tolist()
        self.attention_mask = df['attention_mask'].tolist()
        self.labels = df['label'].tolist()
        self.numerics = df[num_cols].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        sample['input_ids'] = torch.tensor(self.input_ids[idx], dtype=torch.long)
        sample['attention_mask'] = torch.tensor(self.attention_mask[idx], dtype=torch.long)
        sample['numerics'] = torch.tensor(self.numerics[idx], dtype=torch.float)
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Loss
class FocalLoss(nn.Module):
    """Focal loss for multi-class classification.

    Each sample's cross-entropy is scaled by ``(1 - p) ** gamma``, where ``p``
    is the predicted probability of the true class, so well-classified
    samples contribute less.

    Args:
        gamma: focusing exponent; 0 reduces to plain mean cross-entropy.
    """

    def __init__(self, gamma=2.0):
        super().__init__()
        self.gamma = gamma
        # reduction='none' keeps one loss value per sample. With the default
        # mean reduction the focal weight would be computed from the batch
        # average instead of from each sample's own probability.
        self.ce = nn.CrossEntropyLoss(reduction='none')

    def forward(self, input, target):
        """Return the mean focal loss for logits ``input`` and class ids ``target``."""
        logp = self.ce(input, target)      # per-sample -log p(true class)
        p = torch.exp(-logp)               # per-sample p(true class)
        loss = (1 - p) ** self.gamma * logp
        return loss.mean()

#模型架構

In [None]:
#模型架構
# 1. FusionMacBERT：BERT + 數值特徵 concat
class FusionMacBERTModel(nn.Module):
    """Pretrained encoder + numeric features, fused by concatenation.

    The hidden state at sequence position 0 is concatenated with a 64-unit
    ReLU projection of the numeric features, passed through dropout and
    classified by a single linear layer.
    """

    def __init__(self, model_name, num_numeric_features, num_classes):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.numeric_fc = nn.Linear(num_numeric_features, 64)
        fused_width = self.bert.config.hidden_size + 64
        self.classifier = nn.Linear(fused_width, num_classes)

    def forward(self, input_ids, attention_mask, numerics):
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_vec = encoded.last_hidden_state[:, 0, :]
        numeric_vec = torch.relu(self.numeric_fc(numerics))
        fused = torch.cat((text_vec, numeric_vec), dim=1)
        return self.classifier(self.dropout(fused))

# 2. PureMacBERT：只有文字
class PureMacBERTModel(nn.Module):
    """Text-only baseline: encoder hidden state at position 0 -> dropout -> linear.

    ``numerics`` is accepted so the model can be called like the fusion
    models, but it is not used.
    """

    def __init__(self, model_name, num_classes):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, numerics=None):
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_vec = encoded.last_hidden_state[:, 0, :]
        return self.classifier(self.dropout(text_vec))

# 3. NumericOnly：只有數值特徵
class NumericOnlyModel(nn.Module):
    """Numeric-features-only baseline: Linear(64) -> ReLU -> Dropout -> Linear.

    ``input_ids`` and ``attention_mask`` are accepted for call compatibility
    with the text models and ignored.
    """

    def __init__(self, num_numeric_features, num_classes):
        super().__init__()
        hidden_units = 64
        layers = [
            nn.Linear(num_numeric_features, hidden_units),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_units, num_classes),
        ]
        self.classifier = nn.Sequential(*layers)

    def forward(self, input_ids=None, attention_mask=None, numerics=None):
        logits = self.classifier(numerics)
        return logits

# 4. BiLSTMWithNumeric：LSTM 處理詞嵌入 + 數值特徵
class BiLSTMWithNumeric(nn.Module):
    """Non-Transformer baseline: trainable embeddings + BiLSTM, fused with
    numeric features.

    The sentence vector is the concatenation of the final hidden states of
    the forward and backward LSTM directions, computed over the real tokens
    only. It is concatenated with a 64-unit ReLU projection of the numeric
    features, passed through dropout and classified by a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: embedding width.
        hidden_dim: LSTM hidden size per direction.
        num_numeric_features: width of the numeric feature vector.
        num_classes: number of output logits.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_numeric_features, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.numeric_fc = nn.Linear(num_numeric_features, 64)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(hidden_dim * 2 + 64, num_classes)

    def forward(self, input_ids, attention_mask, numerics):
        """Return class logits.

        ``attention_mask`` (1 = real token, 0 = padding, padding on the right)
        gives each row's true length; pass None to treat every position as real.
        """
        x = self.embedding(input_ids)
        if attention_mask is None:
            _, (h_n, _) = self.lstm(x)
        else:
            # Pack so the LSTM stops at each row's last real token. Reading the
            # output at the final position instead would give a forward state
            # that has consumed padding and a backward state that has seen
            # only one token.
            lengths = attention_mask.sum(dim=1).clamp(min=1).to(torch.int64).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                x, lengths, batch_first=True, enforce_sorted=False
            )
            _, (h_n, _) = self.lstm(packed)
        # h_n[-2] / h_n[-1]: final states of the forward / backward direction.
        pooled = torch.cat((h_n[-2], h_n[-1]), dim=1)
        num_out = torch.relu(self.numeric_fc(numerics))
        combined = torch.cat((pooled, num_out), dim=1)
        return self.classifier(self.dropout(combined))

# 5. MacBERTWithGRU：BERT + GRU + 數值特徵
class MacBERTWithGRU(nn.Module):
    """Pretrained encoder -> bidirectional GRU, fused with numeric features.

    The sentence vector is the concatenation of the final hidden states of
    the forward and backward GRU directions, computed over the real tokens
    only. It is concatenated with a 64-unit ReLU projection of the numeric
    features, passed through dropout and classified by a linear layer.
    """

    def __init__(self, model_name, num_numeric_features, num_classes):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.gru = nn.GRU(self.bert.config.hidden_size, 128, batch_first=True, bidirectional=True)
        self.numeric_fc = nn.Linear(num_numeric_features, 64)
        self.classifier = nn.Linear(128*2 + 64, num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask, numerics):
        """Return class logits.

        ``attention_mask`` (1 = real token, 0 = padding, padding on the right)
        gives each row's true length; pass None to treat every position as real.
        """
        bert_out = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        if attention_mask is None:
            _, h_n = self.gru(bert_out)
        else:
            # Pack so the GRU stops at each row's last real token. Reading the
            # output at the final position instead would summarise padding
            # positions in the forward direction and a single step in the
            # backward direction.
            lengths = attention_mask.sum(dim=1).clamp(min=1).to(torch.int64).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                bert_out, lengths, batch_first=True, enforce_sorted=False
            )
            _, h_n = self.gru(packed)
        # h_n[-2] / h_n[-1]: final states of the forward / backward direction.
        pooled = torch.cat((h_n[-2], h_n[-1]), dim=1)
        num_out = torch.relu(self.numeric_fc(numerics))
        combined = torch.cat((pooled, num_out), dim=1)
        return self.classifier(self.dropout(combined))

# 6. MacBERTMLPFusion：BERT + 數值特徵 -> MLP
class MacBERTMLPFusion(nn.Module):
    """Early fusion: the encoder hidden state at position 0 is concatenated
    with the raw numeric features and fed to a two-layer MLP
    (Linear(256) -> ReLU -> Dropout -> Linear)."""

    def __init__(self, model_name, num_numeric_features, num_classes):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        mlp_input_width = self.bert.config.hidden_size + num_numeric_features
        self.fc = nn.Sequential(
            nn.Linear(mlp_input_width, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

    def forward(self, input_ids, attention_mask, numerics):
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_vec = encoded.last_hidden_state[:, 0, :]
        fused = torch.cat((text_vec, numerics), dim=1)
        return self.fc(fused)

# 7. TextCNNMacBERT：BERT 輸出卷積後 + 數值特徵
class TextCNNMacBERT(nn.Module):
    """Encoder token states -> TextCNN (kernel heights 2, 3, 4, 64 filters
    each, max-pooled over time), concatenated with a 64-unit ReLU projection
    of the numeric features and classified by a linear layer."""

    def __init__(self, model_name, num_numeric_features, num_classes):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        kernel_heights = [2, 3, 4]
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, 64, (height, self.bert.config.hidden_size)) for height in kernel_heights]
        )
        self.numeric_fc = nn.Linear(num_numeric_features, 64)
        self.classifier = nn.Linear(64 * len(kernel_heights) + 64, num_classes)

    def conv_and_pool(self, x, conv):
        """Apply one convolution + ReLU and max-pool over the time axis."""
        feature_map = torch.relu(conv(x)).squeeze(3)
        pooled = torch.max_pool1d(feature_map, feature_map.size(2))
        return pooled.squeeze(2)

    def forward(self, input_ids, attention_mask, numerics):
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Add a channel axis so the token states can go through Conv2d.
        token_states = encoded.last_hidden_state.unsqueeze(1)
        pooled_maps = [self.conv_and_pool(token_states, conv) for conv in self.convs]
        cnn_vec = torch.cat(pooled_maps, 1)
        numeric_vec = torch.relu(self.numeric_fc(numerics))
        fused = torch.cat((cnn_vec, numeric_vec), dim=1)
        return self.classifier(fused)


In [None]:
#訓練與評估
def train_and_eval(model, name, preview_count=10, num_epochs=5):
    """Train a classifier with focal loss, then report on the test split.

    Reads the module-level ``train_loader``, ``test_loader``, ``tokenizer``
    and ``label_encoder``.

    Args:
        model: module called as ``model(input_ids, attention_mask, numerics)``
            and returning class logits.
        name: label used in the printed output.
        preview_count: number of test rows whose decoded text, prediction and
            true class are printed.
        num_epochs: passes over ``train_loader`` (default 5).

    Prints the previews and a classification report; returns nothing.
    """
    # One device for both the model and the batches.
    run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(run_device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    loss_fn = FocalLoss()

    def _unpack(batch):
        # Move the four tensors of one batch onto the training device.
        return (
            batch['input_ids'].to(run_device),
            batch['attention_mask'].to(run_device),
            batch['numerics'].to(run_device),
            batch['labels'].to(run_device),
        )

    # Training
    for _ in range(num_epochs):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, attention_mask, numerics, labels = _unpack(batch)
            output = model(input_ids, attention_mask, numerics)
            loss = loss_fn(output, labels)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    all_preds, all_labels = [], []
    preview_shown = 0
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, numerics, labels = _unpack(batch)
            output = model(input_ids, attention_mask, numerics)
            preds = output.argmax(dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

            # Print prediction vs. truth for the first few test rows.
            for i in range(input_ids.shape[0]):
                if preview_shown >= preview_count:
                    break
                input_id = input_ids[i].cpu().numpy()
                text = tokenizer.decode(input_id, skip_special_tokens=True)
                print(f"\n[{name} 預測] 第 {preview_shown+1} 筆")
                print(f"Text: {text}")
                print(f"Predicted: {label_encoder.inverse_transform([preds[i]])[0]}")
                print(f"Actual:    {label_encoder.inverse_transform([labels[i].cpu().item()])[0]}")
                preview_shown += 1

    print(f"\n{name} 評估結果：")
    # Passing `labels` keeps the report aligned with target_names even when a
    # class is absent from the test split (otherwise the name count mismatches
    # and scikit-learn raises).
    class_ids = list(range(len(label_encoder.classes_)))
    print(classification_report(all_labels, all_preds, labels=class_ids, target_names=label_encoder.classes_))

In [None]:
# Data split, sampling and multi-model training.

# The class count follows the fitted encoder: the labelling cells produce
# either three or four classes, so a hard-coded 3 would be wrong for one of them.
num_classes = len(label_encoder.classes_)
# Fixed split seed so every model is trained and evaluated on the same rows.
SPLIT_SEED = 42

# Name -> zero-argument builder. Models are created one at a time inside the
# loop instead of all up front, so the pretrained encoders are never all held
# in memory at once.
model_variants = {}
# FusionMacBERTRegressor is defined in a later cell; include it only when that
# cell has already been run, so this cell does not fail on a fresh kernel.
if "FusionMacBERTRegressor" in globals():
    model_variants["FusionMacRegressor"] = lambda: FusionMacBERTRegressor("hfl/chinese-macbert-base", len(num_cols), num_classes)
model_variants.update({
    "FusionMacBERT": lambda: FusionMacBERTModel("hfl/chinese-macbert-base", len(num_cols), num_classes),
    "PureMacBERT": lambda: PureMacBERTModel("hfl/chinese-macbert-base", num_classes),
    "NumericOnly": lambda: NumericOnlyModel(len(num_cols), num_classes),
    # tokenizer is looked up when the builder runs, i.e. after this model's own tokenizer is loaded
    "BiLSTMWithNumeric": lambda: BiLSTMWithNumeric(tokenizer.vocab_size, 128, 128, len(num_cols), num_classes),
    "MacBERTWithGRU": lambda: MacBERTWithGRU("hfl/chinese-macbert-base", len(num_cols), num_classes),
    "MacBERTMLPFusion": lambda: MacBERTMLPFusion("hfl/chinese-macbert-base", len(num_cols), num_classes),
    "TextCNNMacBERT": lambda: TextCNNMacBERT("hfl/chinese-macbert-base", len(num_cols), num_classes),
    "RoBERTa": lambda: FusionMacBERTModel("hfl/chinese-roberta-wwm-ext", len(num_cols), num_classes),
    "BERTwwmExt": lambda: FusionMacBERTModel("hfl/chinese-bert-wwm-ext", len(num_cols), num_classes),
    "ERNIE": lambda: FusionMacBERTModel("nghuyong/ernie-3.0-base-zh", len(num_cols), num_classes),
    "ConvBERT": lambda: FusionMacBERTModel("YituTech/conv-bert-base", len(num_cols), num_classes),
})

# Train and report each model in turn.
for name, build_model in model_variants.items():
    tokenizer_name = model_tokenizer_map.get(name, default_tokenizer_name)
    if tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        encodings = tokenizer(df['content'].tolist(), truncation=True, padding='max_length', max_length=128)
        df['input_ids'] = encodings['input_ids']
        df['attention_mask'] = encodings['attention_mask']

    # Rebuild dataset and loaders AFTER re-tokenizing: CustomDataset copies the
    # columns when it is constructed, so loaders built once before the loop
    # would keep feeding the first tokenizer's ids to every model.
    dataset = CustomDataset(df)
    train_size = int(0.8 * len(dataset))
    train_dataset, test_dataset = random_split(
        dataset,
        [train_size, len(dataset) - train_size],
        generator=torch.Generator().manual_seed(SPLIT_SEED),
    )
    # Inverse-frequency weights so each class is drawn about equally often.
    train_labels = [dataset.labels[i] for i in train_dataset.indices]
    class_counts = pd.Series(train_labels).value_counts().to_dict()
    weights = [1.0 / class_counts[label] for label in train_labels]
    sampler = WeightedRandomSampler(weights, len(weights), replacement=True)
    train_loader = DataLoader(train_dataset, batch_size=8, sampler=sampler)
    test_loader = DataLoader(test_dataset, batch_size=8)

    model = build_model()
    train_and_eval(model, name)

1. FusionMacBERT ✅文字 + ✅數值
BERT： 使用 MacBERT
架構： 把 [CLS] 向量與數值特徵經過 MLP 融合
用途： 做為 baseline 融合模型
優點： 同時考慮內容語義與貼文統計資料（如按讚數、是否有 hashtag）

2. PureMacBERT ✅文字 + ❌數值
BERT： 使用 MacBERT
架構： 單純使用 [CLS]，後接 linear 層分類
用途： 純語言模型 baseline
對照： 可用來比較是否有數值輔助提升效果

3. NumericOnly ❌文字 + ✅數值
模型類型： 只有數值輸入，經過 MLP 做分類
用途： 測試「只靠貼文統計資料」能否達到合理分類
對照： 可與文字模型或融合模型對比效果

4. BiLSTMWithNumeric ✅文字（Embedding+LSTM）+ ✅數值
嵌入方式： 使用 nn.Embedding + BiLSTM 處理文字（不是 BERT）
融合方式： 將 LSTM 最後時間步 + 數值特徵拼接
特別點： 測試「非 Transformer 模型」是否仍具競爭力

5. MacBERTWithGRU ✅文字（MacBERT）+ ✅數值
文字處理： MacBERT 之後再串 GRU
融合方式： GRU 輸出最後一步拼接數值特徵
意圖： 想看 BERT+RNN 的表現 vs. 傳統 BERT

6. MacBERTMLPFusion ✅文字 + ✅數值
處理方式： 文字與數值直接拼接後進入 MLP
不同於 FusionMacBERT：
沒有額外處理數值特徵（如沒有經過 nn.Linear）
更單純的融合設計（屬於 Early Fusion）

7. TextCNNMacBERT ✅文字 + ✅數值
模型組合：
使用 BERT 編碼後丟進 CNN filter (TextCNN)
再與數值特徵融合
用途： 測試 BERT 結合 CNN 特徵提取是否提升效果
有趣點： 有些短文模型（如微博、Threads）對 CNN 特徵抓取敏感

8. RoBERTa ✅文字 + ✅數值
BERT 替代品： 改用 RoBERTa（中文版本）
融合方式： 同 FusionMacBERT
實驗目的： 測試不同語言模型對結果的影響（語言模型 ablation）


9. BERTwwmExt ✅文字 + ✅數值
BERT： 使用 Chinese BERT whole-word-masking 擴展版
比較目的： 同上，用於測試不同語言模型特性的影響

10. ERNIE ✅文字 + ✅數值
BERT： 改用百度的 ERNIE（引入知識增強）
適用場景： 當文本與常識有關（如話題、用語）
目的： 評估知識型語言模型在社群文本分類的效果

11. ConvBERT ✅文字 + ✅數值
模型特色： 使用 Convolution + Self-Attention 混合架構的 BERT
實驗意義： 試驗非傳統 Self-Attention 模型是否有優勢


| 模型名稱              | 說明               | 是否融合 | 文本處理法         | 特殊處理       |
| ----------------- | ---------------- | ---- | ------------- | ---------- |
| FusionMacBERT     | BERT + 數值特徵      | ✅    | MacBERT       | 自製融合層      |
| PureMacBERT       | 純文本模型            | ❌    | MacBERT       | baseline   |
| NumericOnly       | 純統計數值            | ❌    | 無             | MLP only   |
| BiLSTMWithNumeric | LSTM + 數值        | ✅    | nn.Embedding  | 不使用 BERT   |
| MacBERTWithGRU    | BERT + GRU + 數值  | ✅    | MacBERT + GRU | 時序特徵強化     |
| MacBERTMLPFusion  | BERT + 數值        | ✅    | MacBERT       | 拼接後進 MLP   |
| TextCNNMacBERT    | BERT + CNN + 數值  | ✅    | MacBERT + CNN | 模仿 TextCNN |
| RoBERTa           | 換 BERT backbone  | ✅    | RoBERTa       | 模型比較       |
| BERTwwmExt        | 換 BERT backbone  | ✅    | BERT-wwm      | 模型比較       |
| ERNIE             | 引入知識的 BERT       | ✅    | ERNIE         | 模型比較       |
| ConvBERT          | 混合卷積 + 注意力的 BERT | ✅    | ConvBERT      | 模型比較       |




In [None]:
# Data split, sampling and multi-model training (classification models only).

# The class count follows the fitted encoder: the labelling cells produce
# either three or four classes, so a hard-coded 3 would be wrong for one of them.
num_classes = len(label_encoder.classes_)
# Fixed split seed so every model is trained and evaluated on the same rows.
SPLIT_SEED = 42

# Name -> zero-argument builder. Models are created one at a time inside the
# loop instead of all up front, so the pretrained encoders are never all held
# in memory at once.
model_variants = {
    "FusionMacBERT": lambda: FusionMacBERTModel("hfl/chinese-macbert-base", len(num_cols), num_classes),
    "PureMacBERT": lambda: PureMacBERTModel("hfl/chinese-macbert-base", num_classes),
    "NumericOnly": lambda: NumericOnlyModel(len(num_cols), num_classes),
    # tokenizer is looked up when the builder runs, i.e. after this model's own tokenizer is loaded
    "BiLSTMWithNumeric": lambda: BiLSTMWithNumeric(tokenizer.vocab_size, 128, 128, len(num_cols), num_classes),
    "MacBERTWithGRU": lambda: MacBERTWithGRU("hfl/chinese-macbert-base", len(num_cols), num_classes),
    "MacBERTMLPFusion": lambda: MacBERTMLPFusion("hfl/chinese-macbert-base", len(num_cols), num_classes),
    "TextCNNMacBERT": lambda: TextCNNMacBERT("hfl/chinese-macbert-base", len(num_cols), num_classes),
    "RoBERTa": lambda: FusionMacBERTModel("hfl/chinese-roberta-wwm-ext", len(num_cols), num_classes),
    "BERTwwmExt": lambda: FusionMacBERTModel("hfl/chinese-bert-wwm-ext", len(num_cols), num_classes),
    "ERNIE": lambda: FusionMacBERTModel("nghuyong/ernie-3.0-base-zh", len(num_cols), num_classes),
    "ConvBERT": lambda: FusionMacBERTModel("YituTech/conv-bert-base", len(num_cols), num_classes),
}

# Train and report each model in turn.
for name, build_model in model_variants.items():
    tokenizer_name = model_tokenizer_map.get(name, default_tokenizer_name)
    if tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        encodings = tokenizer(df['content'].tolist(), truncation=True, padding='max_length', max_length=128)
        df['input_ids'] = encodings['input_ids']
        df['attention_mask'] = encodings['attention_mask']

    # Rebuild dataset and loaders AFTER re-tokenizing: CustomDataset copies the
    # columns when it is constructed, so loaders built once before the loop
    # would keep feeding the first tokenizer's ids to every model.
    dataset = CustomDataset(df)
    train_size = int(0.8 * len(dataset))
    train_dataset, test_dataset = random_split(
        dataset,
        [train_size, len(dataset) - train_size],
        generator=torch.Generator().manual_seed(SPLIT_SEED),
    )
    # Inverse-frequency weights so each class is drawn about equally often.
    train_labels = [dataset.labels[i] for i in train_dataset.indices]
    class_counts = pd.Series(train_labels).value_counts().to_dict()
    weights = [1.0 / class_counts[label] for label in train_labels]
    sampler = WeightedRandomSampler(weights, len(weights), replacement=True)
    train_loader = DataLoader(train_dataset, batch_size=8, sampler=sampler)
    test_loader = DataLoader(test_dataset, batch_size=8)

    model = build_model()
    train_and_eval(model, name)

# 迴歸預測

In [None]:
# The earlier classification labelling (to be removed for regression):
# df['view_class'] = ...
# df['label'] = ...

# Use the raw view_count directly as the regression target.
# NOTE(review): at this point `df` may already contain the oversampled
# duplicates produced by the labelling cells — confirm that is intended.
df = df.dropna(subset=["content", "view_count"])
df['target'] = df['view_count'].apply(parse_count)  # converts view_count first in case it is not numeric


In [None]:
def train_and_eval_regression(model, name, epochs=5, lr=2e-5):
    """Train a regression model on ``train_loader`` and evaluate it on ``test_loader``.

    Uses the notebook-level ``device``, ``train_loader`` and ``test_loader``.
    Every batch must expose ``input_ids``, ``attention_mask``, ``numerics`` and
    ``labels``; the model is called as ``model(input_ids, attention_mask, numerics)``.

    Args:
        model: The ``nn.Module`` to optimise.
        name: Label shown in the printed metrics line.
        epochs: Number of passes over ``train_loader``; defaults to the
            previously hard-coded value of 5.
        lr: AdamW learning rate; defaults to the previously hard-coded 2e-5.

    Returns:
        tuple: ``(all_targets, all_preds)`` collected over ``test_loader``.
    """
    # Import every metric before it is used. r2_score used to be imported only
    # after its call site; a function-level import makes the name local, so the
    # call raised UnboundLocalError after all the training had already run.
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    # Training
    for epoch in range(epochs):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            numerics = batch['numerics'].to(device)
            targets = batch['labels'].float().to(device)  # MSELoss needs float targets
            preds = model(input_ids, attention_mask, numerics)
            loss = loss_fn(preds, targets)
            loss.backward()
            optimizer.step()

    # Evaluation (no gradients tracked)
    model.eval()
    all_preds, all_targets = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            numerics = batch['numerics'].to(device)
            targets = batch['labels'].float().to(device)
            preds = model(input_ids, attention_mask, numerics)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    mse = mean_squared_error(all_targets, all_preds)
    mae = mean_absolute_error(all_targets, all_preds)
    r2 = r2_score(all_targets, all_preds)
    print(f"\n{name}  MSE: {mse:.2f} | MAE: {mae:.2f}| R2: {r2:.2f}")

    return all_targets, all_preds





In [None]:
class FusionMacBERTRegressor(nn.Module):
    """Regressor that fuses a transformer's first-token vector with numeric features.

    The encoder named by ``model_name`` is loaded with ``AutoModel``; numeric
    inputs pass through a 64-unit linear layer with ReLU, are concatenated with
    the first-token hidden state, and feed a linear regression head.
    """

    def __init__(self, model_name, num_numeric_features, output_dim=1):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(p=0.3)
        numeric_width = 64
        self.numeric_fc = nn.Linear(num_numeric_features, numeric_width)
        fused_width = self.bert.config.hidden_size + numeric_width
        self.regressor = nn.Linear(fused_width, output_dim)

    def forward(self, input_ids, attention_mask, numerics):
        """Return the head output for a batch with axis 1 squeezed."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        first_token = encoded.last_hidden_state[:, 0, :]
        numeric_repr = self.numeric_fc(numerics).relu()
        fused = torch.cat([first_token, numeric_repr], dim=1)
        head_out = self.regressor(self.dropout(fused))
        return head_out.squeeze(1)  # (batch,) when output_dim == 1


In [None]:
# Delete the cached hfl/chinese-macbert-base directory under the Hugging Face
# transformers cache (presumably to force a fresh download on the next load).
# NOTE(review): there is no leading "!", so this line relies on IPython's
# automagic/alias handling of `rm`; also confirm the cache path matches the
# installed transformers version.
rm -rf ~/.cache/huggingface/transformers/hfl__chinese-macbert-base


In [None]:
# Dataset
class CustomDatasetRegression(Dataset):
    """Torch dataset pairing tokenized text with numeric features for regression.

    Reads the ``input_ids``, ``attention_mask`` and ``target`` columns of ``df``
    plus the columns listed in the notebook-level ``num_cols``. The regression
    target is taken from ``df['target']``.
    """

    def __init__(self, df):
        self.input_ids = df['input_ids'].tolist()
        self.attention_mask = df['attention_mask'].tolist()
        # Cast to float up front so targets and numeric features convert
        # cleanly to float tensors in __getitem__.
        self.labels = df['target'].astype(float).values
        self.numerics = df[num_cols].astype(float).values

    def __len__(self):
        """Number of samples (one per target value)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        ``input_ids`` and ``attention_mask`` are long tensors; ``numerics`` and
        ``labels`` are float tensors.
        """
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_mask[idx], dtype=torch.long),
            'numerics': torch.tensor(self.numerics[idx], dtype=torch.float),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

# Loss
class FocalLoss(nn.Module):
    """Multi-class focal loss: the batch mean of ``(1 - p_t) ** gamma * CE``.

    ``p_t`` is the predicted probability of the true class of each sample, so
    well-classified samples (``p_t`` near 1) are down-weighted individually.

    Args:
        gamma: Focusing exponent; ``gamma=0`` reduces to plain cross-entropy.
    """

    def __init__(self, gamma=2.0):
        super().__init__()
        self.gamma = gamma
        # reduction='none' keeps one loss value per sample. With the default
        # mean reduction the focal factor was computed from the batch-average
        # loss, which is not the per-sample modulation focal loss defines.
        self.ce = nn.CrossEntropyLoss(reduction='none')

    def forward(self, input, target):
        """Return the scalar focal loss for logits ``input`` and class indices ``target``."""
        logp = self.ce(input, target)  # per-sample cross-entropy, i.e. -log(p_t)
        p = torch.exp(-logp)           # recover p_t for each sample
        loss = (1 - p) ** self.gamma * logp
        return loss.mean()

In [None]:

# Wrap the frame in the regression dataset and split it 80/20 at random into
# training and test subsets, each served in batches of 8.
dataset = CustomDatasetRegression(df)
train_size = int(len(dataset) * 0.8)
train_dataset, test_dataset = random_split(
    dataset,
    lengths=[train_size, len(dataset) - train_size],
)
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=8)


class FusionMacBERTRegressor(nn.Module):
    """Text + numeric fusion regressor built on a pretrained transformer encoder.

    The first-token hidden state of the encoder is concatenated with a 64-unit
    ReLU projection of the numeric features; dropout and a linear head follow.
    """

    def __init__(self, model_name, num_numeric_features, output_dim=1):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(p=0.3)
        projection_dim = 64
        self.numeric_fc = nn.Linear(num_numeric_features, projection_dim)
        self.regressor = nn.Linear(
            self.bert.config.hidden_size + projection_dim,
            output_dim,
        )

    def forward(self, input_ids, attention_mask, numerics):
        """Run the encoder and the numeric branch, then regress on their concatenation."""
        hidden_states = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        text_features = hidden_states[:, 0, :]
        numeric_features = self.numeric_fc(numerics).relu()
        joint = torch.cat([text_features, numeric_features], dim=1)
        joint = self.dropout(joint)
        return self.regressor(joint).squeeze(1)


def train_and_eval_regression(model, name):
    """Train ``model`` for 5 epochs with AdamW/MSE, then report test-set metrics.

    Reads the notebook-level ``device``, ``train_loader`` and ``test_loader``.
    Prints MSE, MAE and R² labelled with ``name`` and returns
    ``(all_targets, all_preds)`` gathered over ``test_loader``.
    """
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    loss_fn = nn.MSELoss()

    def _predict(batch):
        """Move one batch to the device and return (predictions, float targets)."""
        features = [batch[key].to(device) for key in ('input_ids', 'attention_mask', 'numerics')]
        targets = batch['labels'].float().to(device)
        return model(*features), targets

    # Training phase
    for _ in range(5):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            preds, targets = _predict(batch)
            loss_fn(preds, targets).backward()
            optimizer.step()

    # Evaluation phase, without gradient tracking
    model.eval()
    all_preds, all_targets = [], []
    with torch.no_grad():
        for batch in test_loader:
            preds, targets = _predict(batch)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(targets.cpu().numpy())

    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    mse = mean_squared_error(all_targets, all_preds)
    mae = mean_absolute_error(all_targets, all_preds)
    r2 = r2_score(all_targets, all_preds)

    print(f"\n{name}  MSE: {mse:.2f} | MAE: {mae:.2f} | R²: {r2:.2f}")
    return all_targets, all_preds


# Build the MacBERT fusion regressor and run the train/evaluate routine on it.
model = FusionMacBERTRegressor(
    model_name="hfl/chinese-macbert-base",
    num_numeric_features=len(num_cols),
)
all_targets, all_preds = train_and_eval_regression(model=model, name="FusionMacBERTRegressor")

#**下面都是舊的東西而已~~~~**


In [None]:
# Tokens that are discarded after word segmentation.
stopwords = {'的', '了', '在', '是', '和', '也', '與', '有', '為', '等'}

def tokenize_and_remove_stopwords(text):
    """Segment ``text`` with jieba, drop stopwords, and join the rest with spaces."""
    return ' '.join(token for token in jieba.cut(text) if token not in stopwords)

# Segment every article and store the space-joined tokens in a new column.
# NOTE(review): this reads df['Content'] (capital C) while other cells of this
# notebook read df['content'] — confirm which column name the frame has here.
df['processed_content'] = df['Content'].apply(tokenize_and_remove_stopwords)

# Sample article, preprocessed the same way as the corpus.
new_article = "IC 設計大廠聯發科 (2454-TW) 副董事長暨執行長蔡力行今 (26) 日獲頒潘文淵獎，會後受訪表示，聯發科 3 奈米會在台積電 (2330-TW)(TSM-US) 做，且由於先進製程技術相當複雜，不論要採用或更換都非常困難，雙方會持續緊密合作。外界今日提問不論是輝達 (NVDA-US)、蘋果 (AAPL-US) 等都表示尋求多元的晶圓代工方案，蔡力行回應，聯發科在先進製程持續與台積電緊密合作，英特爾 (INTC-US) 則負責 16 奈米蔡力行也強調，聯發科不會只停在採用 4 奈米，也會採用 3 奈米製程，此外，由於電晶體微縮速度趨緩，儘管技術上可行，但不一定符合經濟效益，因此技術也逐步從平面變成 2D、2.5D，甚至 3D 等，先進封裝的重要性比以前增加。至於跟輝達合作，蔡力行重申，雙方合作仍以汽車為主，輝達布局車用比聯發科早，主要著墨在智慧座艙與 ADAS 系統，雙方有很好的配合，其中，輝達主攻高階、聯發科則瞄準中階，雙方正密切合作開會。"
processed_new_article = tokenize_and_remove_stopwords(new_article)

# Random Forest model training and prediction
from sklearn.ensemble import RandomForestClassifier

# Use the same data-splitting scheme.
# NOTE(review): the X_val_rf / y_val_rf produced by the first call (the 10%
# hold-out) are overwritten by the second call and never used. X_test / y_test
# used further down are not assigned in this cell — presumably they come from
# an earlier cell that made the same split; confirm before relying on the
# test-set numbers.
X_train_rf, X_val_rf, y_train_rf, y_val_rf = train_test_split(tfidf_matrix, y, test_size=0.1, random_state=42)

X_train_rf, X_val_rf, y_train_rf, y_val_rf = train_test_split(X_train_rf, y_train_rf, test_size=0.1, random_state=42)

# Create the random forest model
rand_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the random forest model
rand_forest_model.fit(X_train_rf, y_train_rf)

# Predict on the validation and test sets
y_val_pred_rf = rand_forest_model.predict(X_val_rf)
y_test_pred_rf = rand_forest_model.predict(X_test)

# Classification reports
print("驗證集 Validation Classification Report:")
print(classification_report(y_val_rf, y_val_pred_rf))

print("\n測試集 Test Classification Report:")
print(classification_report(y_test, y_test_pred_rf))

# Actual-vs-predicted comparison tables
result_df_val_rf = pd.DataFrame({'Actual': y_val_rf, 'Predicted': y_val_pred_rf})
result_df_test_rf = pd.DataFrame({'Actual': y_test, 'Predicted': y_test_pred_rf})

print("驗證集 Validation Result Comparison:")
print(result_df_val_rf)

print("\n測試集 Test Result Comparison:")
print(result_df_test_rf)



In [None]:
# Classify one sample article with the voting ensemble.
# tfidf_vectorizer and voting_classifier are not defined in this cell; they must
# already exist in the kernel.
new_article = "IC 設計大廠聯發科 (2454-TW) 副董事長暨執行長蔡力行今 (26) 日獲頒潘文淵獎，會後受訪表示，聯發科 3 奈米會在台積電 (2330-TW)(TSM-US) 做，且由於先進製程技術相當複雜，不論要採用或更換都非常困難，雙方會持續緊密合作。外界今日提問不論是輝達 (NVDA-US)、蘋果 (AAPL-US) 等都表示尋求多元的晶圓代工方案，蔡力行回應，聯發科在先進製程持續與台積電緊密合作，英特爾 (INTC-US) 則負責 16 奈米蔡力行也強調，聯發科不會只停在採用 4 奈米，也會採用 3 奈米製程，此外，由於電晶體微縮速度趨緩，儘管技術上可行，但不一定符合經濟效益，因此技術也逐步從平面變成 2D、2.5D，甚至 3D 等，先進封裝的重要性比以前增加。至於跟輝達合作，蔡力行重申，雙方合作仍以汽車為主，輝達布局車用比聯發科早，主要著墨在智慧座艙與 ADAS 系統，雙方有很好的配合，其中，輝達主攻高階、聯發科則瞄準中階，雙方正密切合作開會。"

processed_new_article = tokenize_and_remove_stopwords(new_article)

print(processed_new_article)

# Convert the new article to its TF-IDF representation
new_article_tfidf = tfidf_vectorizer.transform([processed_new_article])

# Predict with the voting classifier
predicted_label_ensemble = voting_classifier.predict(new_article_tfidf)

print(f"新文章預測結果: {predicted_label_ensemble}")


In [None]:
# Use inverse_transform to map the predicted numeric codes back to the original labels.
# predicted_label_ensemble and label_encoder must already exist in the kernel.
predicted_label_original = label_encoder.inverse_transform(predicted_label_ensemble)

print(f"新文章預測結果（原始標籤）: {predicted_label_original}")


## 實際預測（研究）


In [None]:
# Get the code that corresponds to every label.
# The position of a label in label_encoder.classes_ is printed as its code.
all_labels = label_encoder.classes_

print("所有標籤對應的編碼:")
for label_code, label in enumerate(all_labels):
    print(f"編碼 {label_code}: 標籤 {label}")


In [None]:
import jieba

# Stopwords removed after segmentation
stopwords = {'的', '了', '在', '是', '和', '也', '與', '有', '為', '等'}

# Segmentation helper that also strips the stopwords
def tokenize_and_remove_stopwords(text):
    """Cut ``text`` with jieba and return the non-stopword tokens joined by spaces."""
    kept_tokens = []
    for token in jieba.cut(text):  # jieba performs the word segmentation
        if token in stopwords:     # skip stopwords
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Add the processed content to the DataFrame.
# NOTE(review): this reads df['Content'] (capital C) while other cells of this
# notebook read df['content'] — confirm the column name.
df['processed_content'] = df['Content'].apply(tokenize_and_remove_stopwords)

# New article
new_article = "台股守穩季線，週線三連紅。（資料照） 〔財經頻道／綜合報導〕美國CPI略高於市場預期，美股漲勢暫歇，本週以來，台股經過兩日大漲後，今（13）日指數震盪走低，終場下跌43.34點，以16782.57點作收，守住季線關卡，成交量為2986.08億元，週線上漲262點，呈現三連紅，緯創失守百元大關，AI族群普遍都是收黑，電子類股以矽光子、網通等次族群比較有表現，傳產輪動到營建、造紙、百貨等接棒演出。 前10大成交額個股漲多跌少，除了AI族群收黑，其他都是紅盤居多，廣達跌12元，收226元，成交額182.32億元，排名第1；台積電終場漲3元，收553元，成交額171.29億元，排名第2；矽統終場漲2.75元，收47.7元，成交額146.25億元，排名第3；定穎投控漲3.3元，收103元，成交額95.92億元，排名第4；緯創跌3.4元，收99.1元，成交額93.89億元，排名第5。 請繼續往下閱讀...  技嘉跌13.5元，收271元，成交額89.05億元，排名第6；創意收1695元平盤，成交額86.78億元，排名第7；聯發科上漲27元，收842元，成交額81.63億元，排名第8；裕隆漲1.1元，收85.1元，成交額66.51億元，排名第9；材料-KY漲 5元，收1185元，成交額63.34億元，排名第10。 一手掌握經濟脈動點我訂閱自由財經Youtube頻道 不用抽 不用搶 現在用APP看新聞 保證天天中獎點我下載APP按我看活動辦法 相關新聞"

# Process the new article
processed_new_article = tokenize_and_remove_stopwords(new_article)

# Print the processed article
print(processed_new_article)

# Convert the new article to its TF-IDF representation
# (tfidf_vectorizer must already exist in the kernel)
new_article_tfidf = tfidf_vectorizer.transform([processed_new_article])

# Predict with the voting classifier (voting_classifier must already exist in the kernel)
predicted_label_ensemble = voting_classifier.predict(new_article_tfidf)

# Print the prediction
print(f"新文章預測結果: {predicted_label_ensemble}")


In [None]:
import jieba

# Stopword list applied after segmentation
stopwords = {'的', '了', '在', '是', '和', '也', '與', '有', '為', '等'}

# Segment the text and filter out stopwords
def tokenize_and_remove_stopwords(text):
    """Return the jieba tokens of ``text`` that are not stopwords, space-separated."""
    segments = jieba.cut(text)  # word segmentation via jieba
    kept = filter(lambda segment: segment not in stopwords, segments)  # drop stopwords
    return ' '.join(kept)

# Add the processed content to the DataFrame.
# NOTE(review): this reads df['Content'] (capital C) while other cells of this
# notebook read df['content'] — confirm the column name.
df['processed_content'] = df['Content'].apply(tokenize_and_remove_stopwords)

# New article (here only a two-character phrase)
new_article = "小漲"

# Process the new article
processed_new_article = tokenize_and_remove_stopwords(new_article)

# Print the processed article
print(processed_new_article)

# Convert the new article to its TF-IDF representation
# (tfidf_vectorizer must already exist in the kernel)
new_article_tfidf = tfidf_vectorizer.transform([processed_new_article])

# Predict with the voting classifier (voting_classifier must already exist in the kernel)
predicted_label_ensemble = voting_classifier.predict(new_article_tfidf)

# Print the prediction
print(f"新文章預測結果: {predicted_label_ensemble}")