# 🧠 LIAR-PLUS Dataset Exploration
這份 Notebook 主要用於分析 `train2.tsv`、`val2.tsv`、`test2.tsv` 三個檔案的結構與內容，並透過視覺化了解資料分佈與特性。

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Locations of the three LIAR-PLUS splits, relative to the working directory.
train_path = './LIAR-PLUS/dataset/train2.tsv'
val_path = './LIAR-PLUS/dataset/val2.tsv'
test_path = './LIAR-PLUS/dataset/test2.tsv'

# Names applied to the 16 positional columns of every split.
# NOTE(review): the fifth credit column is named "true" here; the upstream
# LIAR README appears to describe that position as the pants-on-fire count.
# Confirm against the dataset docs before interpreting plots of it.
columns = [
    "id", "json", "label", "statement", "subject", "speaker", "job", "state",
    "party", "barely_true", "false", "half_true", "mostly_true", "true",
    "context", "justification",
]

# header=None: the first line of each file is read as data, not as a header.
train_df, val_df, test_df = (
    pd.read_csv(split_path, sep='\t', header=None)
    for split_path in (train_path, val_path, test_path)
)

# Assign the names after loading so that a file whose column count does not
# match `columns` raises instead of being silently mislabelled.
for df in (train_df, val_df, test_df):
    df.columns = columns

## 🔍 1. 資料基本資訊

In [None]:
# Print the (rows, columns) shape of each split, one line per split.
for split_name, split_df in (
    ("Train", train_df),
    ("Validation", val_df),
    ("Test", test_df),
):
    print(f"{split_name} set size:", split_df.shape)

# Kept as the final expression so a notebook cell displays the first five
# training rows.
train_df.head()

## 📊 2. 標籤分佈

In [None]:
def plot_label_distribution(df, name):
    """Show a bar chart of label frequencies for one dataset split.

    Bars are ordered from the most to the least frequent label, following
    the order returned by ``value_counts()`` on the ``label`` column. The
    figure is displayed immediately and nothing is returned.

    Args:
        df: DataFrame containing a ``label`` column.
        name: Split name used as the prefix of the plot title.
    """
    plt.figure(figsize=(8, 5))

    labels_by_frequency = df['label'].value_counts().index
    ax = sns.countplot(data=df, x='label', order=labels_by_frequency)

    ax.set_title(f'{name} - Label Distribution')
    ax.set_xlabel('Label')
    ax.set_ylabel('Count')

    # Tilt the tick labels so the label names do not overlap.
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# One label-distribution figure per split, in train / validation / test order.
for split_frame, split_title in (
    (train_df, "Train"),
    (val_df, "Validation"),
    (test_df, "Test"),
):
    plot_label_distribution(split_frame, split_title)

## 📈 3. 信用分數相關性視覺化

In [None]:
# Credit columns whose pairwise correlation is visualised below.
credit_cols = ["barely_true", "false", "half_true", "mostly_true", "true"]

plt.figure(figsize=(10, 6))

# Correlation matrix of the selected training-set columns (pandas default
# method); annot=True writes each coefficient into its cell.
credit_corr = train_df[credit_cols].corr()
ax = sns.heatmap(credit_corr, annot=True, cmap='Blues')
ax.set_title("Credit Feature Correlation (Train Set)")
plt.show()

## 🗣 4. 發言者與黨派分佈統計

In [None]:
# value_counts() sorts by descending frequency, so the first ten entries are
# the ten most frequent speakers in the training set.
speaker_counts = train_df['speaker'].value_counts()
top_speakers = speaker_counts.iloc[:10]

plt.figure(figsize=(10, 5))
# Horizontal bars: counts on the x axis, speaker names on the y axis.
ax = sns.barplot(x=top_speakers.values, y=top_speakers.index)
ax.set_title('Top 10 Frequent Speakers in Train Set')
ax.set_xlabel('Counts')
ax.set_ylabel('Speaker')
plt.tight_layout()
plt.show()

# Party distribution: only the ten most frequent parties are drawn, ordered
# from most to least frequent.
plt.figure(figsize=(8, 5))
top_parties = train_df['party'].value_counts().head(10).index
ax = sns.countplot(data=train_df, y='party', order=top_parties)
ax.set_title('Top 10 Parties in Train Set')
plt.tight_layout()
plt.show()