In [6]:
#1.數據預處理
#1-1數據清洗-缺失值處理

import json
import re

import pandas as pd

# Locations of the two competition CSV files
data_identification_path = r'/kaggle/input/dm-2024-isa-5810-lab-2-homework/data_identification.csv'
emotion_path = r'/kaggle/input/dm-2024-isa-5810-lab-2-homework/emotion.csv'

# Load both tables (identification flags first, emotion labels second)
data_identification, emotion = (
    pd.read_csv(csv_path) for csv_path in (data_identification_path, emotion_path)
)

# 1-1 Data cleaning - missing values
# Per-column null counts for each table
missing_data_identification = data_identification.isna().sum()
missing_emotion = emotion.isna().sum()

# Print one titled report per table, in the same order the tables were loaded
for report_title, null_counts in (
    ("缺失值統計 - data_identification.csv:", missing_data_identification),
    ("缺失值統計 - emotion.csv:", missing_emotion),
):
    print(report_title)
    print(null_counts)


缺失值統計 - data_identification.csv:
tweet_id          0
identification    0
dtype: int64
缺失值統計 - emotion.csv:
tweet_id    0
emotion     0
dtype: int64


In [7]:
# 1-2 Data split - training and test partitions
# Rows are routed by the value stored in the `identification` column.
is_train = data_identification['identification'] == 'train'
is_test = data_identification['identification'] == 'test'
train_data = data_identification.loc[is_train].copy()
test_data = data_identification.loc[is_test].copy()

# Flag the training rows whose tweet_id also occurs in the test partition
shared_mask = train_data['tweet_id'].isin(test_data['tweet_id'])
重疊tweet_id = shared_mask.sum()
print("訓練集和測試集中重疊的tweet_id數量:", 重疊tweet_id)

# Either confirm the partitions are disjoint or list the shared ids
if 重疊tweet_id <= 0:
    print("訓練集和測試集中的tweet_id沒有重疊。")
else:
    overlapping_ids = train_data.loc[shared_mask, 'tweet_id'].unique()
    print("重疊的tweet_id列表:", overlapping_ids)


訓練集和測試集中重疊的tweet_id數量: 0
訓練集和測試集中的tweet_id沒有重疊。


In [8]:
# 1-3 Merge datasets
# Attach the emotion label to every training row via tweet_id. The left join
# keeps all training rows; a row without a label would show up as NaN below.
merged_train_data = train_data.merge(emotion, how='left', on='tweet_id')

# Null counts after the join
missing_values = merged_train_data.isna().sum()
print("合併後的缺失值統計:", missing_values, sep="\n")

# Preview the first rows of the merged frame
print(merged_train_data.head())


合併後的缺失值統計:
tweet_id          0
identification    0
emotion           0
dtype: int64
   tweet_id identification       emotion
0  0x29e452          train           joy
1  0x2b3819          train           joy
2  0x2a2acc          train         trust
3  0x2a8830          train           joy
4  0x20b21d          train  anticipation


In [9]:
#1-4文本資料讀取
import os

# Path of the line-delimited JSON file that holds the raw tweets
tweets_dm_path = r'/kaggle/input/dm-2024-isa-5810-lab-2-homework/tweets_DM.json'

# tweet_id -> tweet text; a repeated tweet_id keeps the text seen last
tweets_data = {}

# Each line of the file is parsed as one JSON record
with open(tweets_dm_path, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            tweet = json.loads(line)
        except json.JSONDecodeError as e:
            # Report the unparsable line and carry on with the next one
            print(f"Error parsing line: {line}")
            print(e)
        else:
            # The id and the text both live under _source.tweet
            payload = tweet['_source']['tweet']
            tweet_id = payload['tweet_id']
            text = payload['text']
            tweets_data[tweet_id] = text

# Two-column frame (tweet_id, text) built from the collected pairs
tweet_records = list(tweets_data.items())
tweets_df = pd.DataFrame(tweet_records, columns=['tweet_id', 'text'])

# Join the text onto the labelled training rows and preview the result
final_data = merged_train_data.merge(tweets_df, how='left', on='tweet_id')
print(final_data.head())

# Same join for the test rows
final_test_data = test_data.merge(tweets_df, how='left', on='tweet_id')
print(final_test_data.head())

   tweet_id identification       emotion  \
0  0x29e452          train           joy   
1  0x2b3819          train           joy   
2  0x2a2acc          train         trust   
3  0x2a8830          train           joy   
4  0x20b21d          train  anticipation   

                                                text  
0  Huge Respect🖒 @JohnnyVegasReal talking about l...  
1  Yoooo we hit all our monthly goals with the ne...  
2  @KIDSNTS @PICU_BCH @uhbcomms @BWCHBoss Well do...  
3  Come join @ambushman27 on #PUBG while he striv...  
4  @fanshixieen2014 Blessings!My #strength little...  
   tweet_id identification                                               text
0  0x28cc61           test  @Habbo I've seen two separate colours of the e...
1  0x2db41f           test  @FoxNews @KellyannePolls No serious self respe...
2  0x2466f6           test  Looking for a new car, and it says 1 lady owne...
3  0x23f9e9           test  @cineworld “only the brave” just out and fount...
4  0x1fb4e1    

In [10]:
#2.特徵工程
#2-1文本特徵(詞幹提取)
import nltk
from nltk.stem import PorterStemmer

# Shared Porter stemmer instance; the stemming() helper below calls its
# .stem() method on every token.
stemmer = PorterStemmer()

# Stemming helper applied element-wise to the tweet text column
def stemming(text):
    """Return ``text`` with every whitespace-separated token stemmed.

    Tokens are produced by ``str.split()`` (any run of whitespace), passed
    through the module-level ``stemmer`` and re-joined with single spaces.

    Parameters
    ----------
    text : str or missing value
        Raw tweet text. A non-string value (``None`` or the float NaN that a
        left merge leaves behind for an unmatched tweet_id) is treated as an
        empty tweet instead of raising ``AttributeError`` on ``.split()``.

    Returns
    -------
    str
        The stemmed text; ``''`` for empty, whitespace-only or missing input.
    """
    # Guard against NaN/None so one missing tweet cannot abort the whole apply()
    if not isinstance(text, str):
        return ''
    return ' '.join(stemmer.stem(word) for word in text.split())

# Add the stemmed-text column to the training frame first and then to the test
# frame, previewing each frame right after it has been updated.
for frame in (final_data, final_test_data):
    frame['text_stemmed'] = frame['text'].apply(stemming)
    print(frame.head())

   tweet_id identification       emotion  \
0  0x29e452          train           joy   
1  0x2b3819          train           joy   
2  0x2a2acc          train         trust   
3  0x2a8830          train           joy   
4  0x20b21d          train  anticipation   

                                                text  \
0  Huge Respect🖒 @JohnnyVegasReal talking about l...   
1  Yoooo we hit all our monthly goals with the ne...   
2  @KIDSNTS @PICU_BCH @uhbcomms @BWCHBoss Well do...   
3  Come join @ambushman27 on #PUBG while he striv...   
4  @fanshixieen2014 Blessings!My #strength little...   

                                        text_stemmed  
0  huge respect🖒 @johnnyvegasr talk about lose hi...  
1  yoooo we hit all our monthli goal with the new...  
2  @kidsnt @picu_bch @uhbcomm @bwchboss well done...  
3  come join @ambushman27 on #pubg while he striv...  
4  @fanshixieen2014 blessings!mi #strength little...  
   tweet_id identification                                          

In [11]:
#2-2中繼資料特徵(時間特徵、使用者行為特徵、tweet長度)
import datetime

# Time-feature helper: splits a timestamp string into calendar components
def extract_time_features(date_str):
    """Parse ``date_str`` and return its calendar components as a dict.

    Parameters
    ----------
    date_str : str
        Timestamp in the exact format ``'%Y-%m-%d %H:%M:%S'``; any other
        format makes ``strptime`` raise ``ValueError``.

    Returns
    -------
    dict
        Keys ``year``, ``month``, ``day``, ``weekday`` and ``hour`` (in that
        order). ``weekday`` follows ``datetime.weekday()``: 0 is Monday and
        6 is Sunday.
    """
    parsed = datetime.datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
    features = {part: getattr(parsed, part) for part in ('year', 'month', 'day')}
    features['weekday'] = parsed.weekday()  # 0 is Monday, 6 is Sunday
    features['hour'] = parsed.hour
    return features

# Retweet marker: the literal "RT " only when it is not glued to a preceding
# word character, so words such as "SMART " or "START " are not counted.
_RETWEET_MARKER = re.compile(r'(?<!\w)RT ')


# User-behaviour helper: simple marker counts taken from the tweet text
def extract_user_behavior_features(text):
    """Count hashtag, retweet and mention markers in a tweet.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    dict
        ``hashtags_count``  - number of ``#`` characters in ``text``.
        ``retweets_count``  - number of ``"RT "`` markers that do not follow a
        word character (a plain substring count would also match inside
        ordinary words ending in "RT").
        ``replies_count``   - number of ``@`` characters in ``text``; this
        counts every ``@`` occurrence, not only leading replies.
    """
    return {
        'hashtags_count': text.count('#'),
        'retweets_count': len(_RETWEET_MARKER.findall(text)),
        'replies_count': text.count('@'),
    }

# Length feature: size of the tweet text
def tweet_length(text):
    """Return ``len(text)``, i.e. the number of characters for a ``str``."""
    character_count = len(text)
    return character_count

# Second pass over tweets_DM.json: one record of time features per tweet
tweets_features = []

with open(tweets_dm_path, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            tweet = json.loads(line)
        except json.JSONDecodeError as e:
            # Report the unparsable line and move on
            print(f"Error parsing line: {line}")
            print(e)
        else:
            tweet_id = tweet['_source']['tweet']['tweet_id']
            crawl_date = tweet['_crawldate']

            # Calendar components of the crawl timestamp
            time_features = extract_time_features(crawl_date)

            # tweet_id first, followed by the time components
            record = {'tweet_id': tweet_id}
            record.update(time_features)
            tweets_features.append(record)

# One row per parsed line
tweets_time_features_df = pd.DataFrame(tweets_features)

# Attach the time features to the tweet text table
tweets_df = tweets_df.merge(tweets_time_features_df, how='left', on='tweet_id')

# Per-tweet behaviour dicts (kept out of the frame) and the length feature
behavior_records = tweets_df['text'].apply(extract_user_behavior_features)
tweets_df['tweet_length'] = tweets_df['text'].apply(tweet_length)

# Expand the behaviour dicts into three numeric columns
user_behavior_columns = ['hashtags_count', 'retweets_count', 'replies_count']
tweets_df[user_behavior_columns] = pd.DataFrame(behavior_records.tolist(), index=tweets_df.index)

# Join the features onto the training frame. Both sides carry a `text`
# column, so pandas suffixes them as text_x / text_y.
final_data = final_data.merge(tweets_df, how='left', on='tweet_id')
print(final_data.head())

# Same join for the test frame
final_test_data = final_test_data.merge(tweets_df, how='left', on='tweet_id')
print(final_test_data.head())


   tweet_id identification       emotion  \
0  0x29e452          train           joy   
1  0x2b3819          train           joy   
2  0x2a2acc          train         trust   
3  0x2a8830          train           joy   
4  0x20b21d          train  anticipation   

                                              text_x  \
0  Huge Respect🖒 @JohnnyVegasReal talking about l...   
1  Yoooo we hit all our monthly goals with the ne...   
2  @KIDSNTS @PICU_BCH @uhbcomms @BWCHBoss Well do...   
3  Come join @ambushman27 on #PUBG while he striv...   
4  @fanshixieen2014 Blessings!My #strength little...   

                                        text_stemmed  \
0  huge respect🖒 @johnnyvegasr talk about lose hi...   
1  yoooo we hit all our monthli goal with the new...   
2  @kidsnt @picu_bch @uhbcomm @bwchboss well done...   
3  come join @ambushman27 on #pubg while he striv...   
4  @fanshixieen2014 blessings!mi #strength little...   

                                              text_y  year  m

In [12]:
#2-3特徵選擇(中繼資料特徵)
from sklearn.ensemble import GradientBoostingClassifier

# Everything that is not an id, label or text column is a candidate feature
non_feature_columns = ['tweet_id', 'identification', 'emotion', 'text_x', 'text_stemmed', 'text_y']
X = final_data.drop(columns=non_feature_columns)
y = final_data['emotion']

# Small gradient-boosting model, fitted only to rank the candidate features
gb = GradientBoostingClassifier(n_estimators=10, random_state=42)
gb.fit(X, y)

# Importance per feature, most important first
importances = gb.feature_importances_
feature_importances = pd.Series(importances, index=X.columns).sort_values(ascending=False)

# Keep the features whose importance is strictly above the mean importance
threshold = importances.mean()
selected_features = [
    name for name, weight in feature_importances.items() if weight > threshold
]

# Training view: id columns, label and the selected features
id_columns = ['tweet_id', 'identification']
selected_data = final_data[id_columns + ['emotion'] + selected_features]
print(selected_data.head())

# Test view: same columns without the label
selected_test_data = final_test_data[id_columns + selected_features]
print(selected_test_data.head())

   tweet_id identification       emotion  replies_count  tweet_length  \
0  0x29e452          train           joy              1           140   
1  0x2b3819          train           joy              0            87   
2  0x2a2acc          train         trust              4            81   
3  0x2a8830          train           joy              1           132   
4  0x20b21d          train  anticipation              1           139   

   hashtags_count  
0               1  
1               2  
2               0  
3               6  
4               4  
   tweet_id identification  replies_count  tweet_length  hashtags_count
0  0x28cc61           test              1            81               0
1  0x2db41f           test              2            99               0
2  0x2466f6           test              0           116               1
3  0x23f9e9           test              1           105               1
4  0x1fb4e1           test              0           137               0


In [13]:
# 2-4 Feature combination
# Rebuild the above-threshold feature list from the importance ranking and add
# the stemmed text column at the end.
above_threshold = feature_importances.loc[feature_importances > threshold]
selected_features = above_threshold.index.tolist() + ['text_stemmed']

# Training view: id columns, label, selected numeric features and stemmed text
id_columns = ['tweet_id', 'identification']
selected_data = final_data[id_columns + ['emotion'] + selected_features]
print(selected_data.head())

# Test view: same columns without the label
selected_test_data = final_test_data[id_columns + selected_features]
print(selected_test_data.head())

   tweet_id identification       emotion  replies_count  tweet_length  \
0  0x29e452          train           joy              1           140   
1  0x2b3819          train           joy              0            87   
2  0x2a2acc          train         trust              4            81   
3  0x2a8830          train           joy              1           132   
4  0x20b21d          train  anticipation              1           139   

   hashtags_count                                       text_stemmed  
0               1  huge respect🖒 @johnnyvegasr talk about lose hi...  
1               2  yoooo we hit all our monthli goal with the new...  
2               0  @kidsnt @picu_bch @uhbcomm @bwchboss well done...  
3               6  come join @ambushman27 on #pubg while he striv...  
4               4  @fanshixieen2014 blessings!mi #strength little...  
   tweet_id identification  replies_count  tweet_length  hashtags_count  \
0  0x28cc61           test              1            81     

In [14]:
#3-1模型訓練(樸素貝葉斯模型)
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from joblib import dump
from scipy.sparse import hstack

# Multinomial Naive Bayes baseline
nb_classifier = MultinomialNB()

# Training features and target
X = selected_data.drop(['tweet_id', 'identification', 'emotion'], axis=1)
y = selected_data['emotion']

# Test features (the test frame has no label column)
X_sample = selected_test_data.drop(['tweet_id', 'identification'], axis=1)

# TF-IDF over unigrams and bigrams, capped at 20000 vocabulary entries
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
X_text = vectorizer.fit_transform(X['text_stemmed'])
# The test text must be encoded with the vocabulary and IDF weights learned
# from the training text. Refitting on the test set (fit_transform) would
# rebuild the vocabulary, so a column index would stand for a different term
# than the one the classifier was trained on.
X_sample_test = vectorizer.transform(X_sample['text_stemmed'])

# Numeric metadata columns go first, followed by the sparse text features
X_numeric = X.drop('text_stemmed', axis=1).values
X_combined = hstack((X_numeric, X_text))
X_sample_numeric = X_sample.drop('text_stemmed', axis=1).values
X_sample_combined = hstack((X_sample_numeric, X_sample_test))

# Train and test matrices must share one column layout for predict() to be valid
assert X_sample_combined.shape[1] == X_combined.shape[1], "train/test feature widths differ"

# 3-fold cross-validation on the training matrix
scores = cross_val_score(nb_classifier, X_combined, y, cv=3)
print(f"Cross-validation scores: {scores}")
print(f"Average score: {scores.mean()}")

# Fit on the full training matrix
nb_classifier.fit(X_combined, y)

# Persist the fitted baseline model
dump(nb_classifier, 'naive_bayes_model.joblib')

Cross-validation scores: [0.52143705 0.52368566 0.52271598]
Average score: 0.522612899684235


['naive_bayes_model.joblib']

In [16]:
#3-2模型優化及評估（網格搜索）
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from joblib import load
from sklearn.metrics import make_scorer, f1_score

from sklearn.model_selection import StratifiedKFold

# Reload the persisted baseline classifier; it serves as the estimator that
# the grid search clones and refits.
nb_classifier = load('naive_bayes_model.joblib')

# Hyper-parameter grid: 5 smoothing values x 2 prior settings = 10 candidates
param_grid = dict(
    alpha=[0.1, 0.5, 1.0, 2.0, 5.0],
    fit_prior=[True, False],
)

# Micro-averaged F1 scorer, reported alongside plain accuracy
f1_scorer = make_scorer(f1_score, average='micro')
scoring = {'accuracy': 'accuracy', 'f1': f1_scorer}

# 5-fold stratified cross-validation; the final refit uses the best 'f1' candidate
skf = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(
    estimator=nb_classifier,
    param_grid=param_grid,
    scoring=scoring,
    cv=skf,
    refit='f1',
    n_jobs=-1,
    verbose=2,
)

# Run the search on the combined training matrix
grid_search.fit(X_combined, y)

# Report the winning parameters and their mean cross-validated scores
best_idx = grid_search.best_index_
cv_results = grid_search.cv_results_
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation scores: ")
print("Accuracy: ", cv_results['mean_test_accuracy'][best_idx])
print("F1 score: ", cv_results['mean_test_f1'][best_idx])

# Estimator refitted on all training data with the best parameters
best_nb_classifier = grid_search.best_estimator_

# Persist the tuned model
dump(best_nb_classifier, 'naive_bayes_optimized_model.joblib')

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters found:  {'alpha': 0.1, 'fit_prior': True}
Best cross-validation scores: 
Accuracy:  0.5245379281421201
F1 score:  0.5245379281421201


['naive_bayes_optimized_model.joblib']

[CV] END ..........................alpha=0.1, fit_prior=True; total time=  19.7s
[CV] END .........................alpha=0.1, fit_prior=False; total time=  20.2s
[CV] END ..........................alpha=0.5, fit_prior=True; total time=  19.4s
[CV] END .........................alpha=0.5, fit_prior=False; total time=  19.9s
[CV] END .........................alpha=0.5, fit_prior=False; total time=  19.9s
[CV] END ..........................alpha=1.0, fit_prior=True; total time=  19.5s
[CV] END .........................alpha=1.0, fit_prior=False; total time=  19.9s
[CV] END ..........................alpha=2.0, fit_prior=True; total time=  19.3s
[CV] END .........................alpha=2.0, fit_prior=False; total time=  19.9s
[CV] END .........................alpha=2.0, fit_prior=False; total time=  19.7s
[CV] END ..........................alpha=5.0, fit_prior=True; total time=  19.3s
[CV] END .........................alpha=5.0, fit_prior=False; total time=  20.1s
[CV] END ...................

In [17]:
#4.生成預測
import joblib
import pandas as pd

# Reload the tuned classifier saved by the grid-search step
optimized_nb_classifier = joblib.load('naive_bayes_optimized_model.joblib')

# Predict an emotion for every row of the combined test matrix
predicted_emotions = optimized_nb_classifier.predict(X_sample_combined)

# Submission frame: tweet_id renamed to 'id', plus the predicted 'emotion'
submission_data = (
    selected_test_data[['tweet_id']]
    .rename(columns={'tweet_id': 'id'})
    .assign(emotion=predicted_emotions)
)

# Write the CSV without the index column
submission_data.to_csv('sampleSubmission.csv', index=False)
