## Subject: LLM - Detect AI Generated Text
## Reference: LLM Detect: Text Cluster [中文]

### Install Packages

In [None]:
!pip install sentence_transformers
!pip install scikit-learn
!pip install pandas
!pip install numpy

### Import Packages

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
import pandas as pd
import numpy as np

### Path settings

In [None]:
# Root directory of the competition data.
# DATA_PATH = './input'  # alternative relative path
DATA_PATH = '/kaggle/input/llm-detect-ai-generated-text'

# Read, in order: the training essays, the training prompts, the test
# essays and the sample submission file. Each lives at
# f'{DATA_PATH}/<name>.csv'.
train_essays, train_prompts, test_essays, sample_submit = (
    pd.read_csv(f'{DATA_PATH}/{csv_name}.csv')
    for csv_name in (
        'train_essays',
        'train_prompts',
        'test_essays',
        'sample_submission',
    )
)

In [None]:
# Load the external training dataset into `train`.
# train_v2_drcat_02_path = './train_v2_drcat_02.csv'  # alternative relative path
train_v2_drcat_02_path = '/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv'
train = pd.read_csv(filepath_or_buffer=train_v2_drcat_02_path, sep=',')

In [None]:
# Optional step (currently disabled): merge the competition essays with the
# external dataset.
# train = pd.concat([
#     train_essays.rename({'generated': 'label'}, axis=1)[['text', 'label']],
#     train[['text', 'label']]
# ],axis=0)

# Drop rows whose text duplicates an earlier row, then renumber the index
# from 0 so that positions and index labels coincide.
train = train.drop_duplicates(subset=['text']).reset_index(drop=True)

In [None]:
# Load the pretrained sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Encode every essay into a sentence embedding (one row per row of `train`).
# The texts are handed over as a plain list of strings: `encode` sorts its
# input by length and then indexes it by integer position, which on a pandas
# Series is a label lookup and therefore only works when the index happens to
# be exactly 0..n-1.
embeddings = model.encode(train['text'].tolist(), show_progress_bar=True)

In [None]:
from sklearn.decomposition import PCA
# Project the sentence embeddings onto two PCA components so they can be
# drawn as a 2-D scatter plot.
pca = PCA(n_components=2)
embeddings_pca = pca.fit_transform(embeddings)

In [None]:
import matplotlib.pyplot as plt

# Scatter the PCA projection with one series per distinct `label` value.
# The legend entries are passed in the same order the series are drawn.
label_values = train['label'].unique()
for label_value in label_values:
    has_label = train['label'] == label_value
    plt.scatter(embeddings_pca[has_label, 0], embeddings_pca[has_label, 1])
plt.legend(label_values, title='label')

In [None]:
# Scatter the PCA projection with one series per distinct `prompt_name`.
# The legend is anchored at (1, 1.0) and lists the prompts in drawing order.
prompt_names = train['prompt_name'].unique()
for prompt in prompt_names:
    from_prompt = train['prompt_name'] == prompt
    plt.scatter(embeddings_pca[from_prompt, 0], embeddings_pca[from_prompt, 1])
plt.legend(prompt_names, title='prompt_name', bbox_to_anchor=(1, 1.0))

In [None]:
# Toy t-SNE run on four 3-D points; the last expression shows the shape of
# the resulting 2-D embedding.
X = np.array([
    [0, 0, 0],
    [0, 1, 1],
    [1, 0, 1],
    [1, 1, 1],
])
toy_tsne = TSNE(
    n_components=2,
    learning_rate='auto',
    init='random',
    perplexity=3,
)
X_embedded = toy_tsne.fit_transform(X)
X_embedded.shape

In [None]:
# Pick a random subsample of rows for t-SNE.
# The population size is taken from the data rather than hard-coded, so the
# cell keeps working when de-duplication or a different dataset changes the
# row count. Sampling is without replacement so no point is embedded twice,
# and the sample is capped at the number of available rows.
n_rows = len(embeddings)
data_index = np.random.choice(n_rows, size=min(5000, n_rows), replace=False)

# Embed the sampled sentence embeddings in 2-D with t-SNE.
tsne = TSNE(n_components=2)
embeddings_tsne = tsne.fit_transform(embeddings[data_index])

In [None]:
# Scatter the t-SNE embedding of the sampled rows, one series per distinct
# `label` value found in the sample; legend entries follow drawing order.
sampled_labels = train['label'].iloc[data_index]
sampled_label_values = sampled_labels.unique()
for label_value in sampled_label_values:
    has_label = sampled_labels == label_value
    plt.scatter(embeddings_tsne[has_label, 0], embeddings_tsne[has_label, 1])

plt.legend(sampled_label_values, title='label')

In [None]:
# Scatter the t-SNE embedding of the sampled rows, one series per distinct
# `prompt_name` found in the sample.
sampled_prompts = train['prompt_name'].iloc[data_index]
sampled_prompt_names = sampled_prompts.unique()
for prompt in sampled_prompt_names:
    from_prompt = sampled_prompts == prompt
    plt.scatter(embeddings_tsne[from_prompt, 0], embeddings_tsne[from_prompt, 1])

# The legend must list the prompts in the order the series were drawn, i.e.
# the order of first appearance within the sample. Using the unique values of
# the full frame here would pair names with the wrong colours whenever the
# sample's order (or set) of prompts differs from the full frame's.
plt.legend(sampled_prompt_names, title='prompt_name', bbox_to_anchor=(1, 1.0))