# Settings

In [6]:
import os
import random
from tqdm import tqdm
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset

import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from sklearn.model_selection import train_test_split

# Seed every random number generator used in this notebook:
# Python's `random`, NumPy, and PyTorch (CPU, current CUDA device, all CUDA devices).
SEED = 456
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Default `max_length` handed to the tokenizer by BERTDataset.
MAX_SEQ_LENGTH = 36

# Data / output folders are resolved relative to the current working directory.
BASE_DIR = os.getcwd()
DATA_DIR, OUTPUT_DIR = (os.path.join(BASE_DIR, rel) for rel in ('../data', '../output'))

In [None]:
# Pretrained 'klue/bert-base' checkpoint with a 7-way classification head, plus its tokenizer.
model_name = 'klue/bert-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7)
model = model.to(DEVICE)

# Read the training CSV and split it 70/30, stratified on the 'target' column.
train_csv_path = '/data/ephemeral/home/level2-nlp-datacentric-nlp-11/data/merge_gj_sy_jh_ms_sn_clustering_clean_train_data_aug-google.csv'
data = pd.read_csv(train_csv_path)
dataset_train, dataset_valid = train_test_split(
    data,
    test_size=0.3,
    stratify=data['target'],
    random_state=SEED,
)

In [8]:
class BERTDataset(Dataset):
    """Pre-tokenized text classification dataset.

    Only rows with ``is_noise == 1.0`` and ``clean_text != '0'`` are kept.
    Each kept ``clean_text`` is tokenized once in ``__init__`` (padded /
    truncated to ``max_length``) and paired with its ``target`` value.
    """

    def __init__(self, data, tokenizer, max_length=MAX_SEQ_LENGTH):
        """Tokenize the selected rows of ``data``.

        Args:
            data: DataFrame providing 'clean_text', 'is_noise' and 'target' columns.
            tokenizer: callable invoked as ``tokenizer(text, padding='max_length',
                truncation=True, max_length=..., return_tensors='pt')``.
            max_length: padded / truncated sequence length.
        """
        keep_rows = (data["is_noise"] == 1.0) & (data['clean_text'] != '0')
        kept_texts = data['clean_text'][keep_rows]
        kept_targets = data['target'][keep_rows]

        self.inputs = [
            tokenizer(
                text,
                padding='max_length',
                truncation=True,
                max_length=max_length,
                return_tensors='pt',
            )
            for text in kept_texts
        ]
        self.labels = [torch.tensor(target) for target in kept_targets]

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as a dict of tensors.

        The tokenizer output carries a leading dimension (added by
        ``return_tensors='pt'``) that is squeezed away here.
        """
        encoded = self.inputs[idx]
        item = {
            key: encoded[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        item['labels'] = self.labels[idx].squeeze(0)
        return item

    def __len__(self):
        """Number of samples kept after filtering."""
        return len(self.labels)

# Embedding

In [9]:
import matplotlib.pyplot as plt
#% matplotlib inline
from transformers import AutoTokenizer, AutoModel

In [None]:
# Rebuild the classifier so that the forward pass also returns the hidden states
# of every layer, then restore the weights stored in the checkpoint file.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=7, output_hidden_states=True
).to(DEVICE)
model.config.problem_type = "single_label_classification"

# map_location=DEVICE lets the checkpoint load on a CPU-only machine even when
# its tensors were saved from a CUDA device (the default would try to restore
# them on the device they were saved from).
state_dict = torch.load(
    '/data/ephemeral/home/level2-nlp-datacentric-nlp-11/output/11051351_merge_gj_sy_jh_new_ms_sn_clustering_clean_train_data_aug-google_model.bin',
    map_location=DEVICE,
)
model.load_state_dict(state_dict)

# Tokenized view of the filtered rows of `data` (see BERTDataset).
data_all = BERTDataset(data, tokenizer)

In [11]:
# Smoke test: tokenize the 'text' field of row 5 and move the tensors to DEVICE.
sample_text = data.iloc[5]['text']
inputs = tokenizer(sample_text, return_tensors='pt').to(DEVICE)

In [None]:
# Extract one embedding vector per row of `data` from the model's last hidden layer.
model.eval()
preds = []
embedding = []
# total= gives tqdm a length; a bare iterrows() generator has none, so no
# percentage / ETA could be shown.
for idx, sample in tqdm(data.iterrows(), total=len(data)):
    # truncation=True caps the input at the tokenizer's maximum length so an
    # unusually long text cannot overflow the model's position embeddings.
    inputs = tokenizer(sample['text'], return_tensors="pt", truncation=True).to(DEVICE)
    with torch.no_grad():
        outputs = model(inputs['input_ids'])
    # hidden_states[-1] is the last layer with shape (1, seq_len, hidden);
    # [0, 0] selects the first token of the single sequence in the batch
    # (the [CLS] position for BERT-style tokenizers).
    embedding.append(outputs.hidden_states[-1][0, 0].cpu())

In [13]:
# One row per extracted embedding; 'label' is filled from data['target'] by index alignment.
df_embedding = pd.DataFrame(embedding).assign(label=data['target'])

In [None]:
# Display the embedding frame (embedding columns followed by 'label').
df_embedding

In [17]:
# Take every column except the trailing 'label' and unwrap each cell with .item().
embedding_cells = df_embedding.iloc[:, :-1]
df = embedding_cells.applymap(lambda cell: cell.item())

In [18]:
# Copy the labels over as plain Python ints.
df['label'] = list(map(int, df_embedding['label']))

# t-SNE

In [None]:
# Inspect the 'clean_text' column of the loaded data.
data['clean_text']

In [20]:
# Attach the text column to df. Assignment aligns on the index, so df and data
# are assumed to share the same row index — TODO confirm.
df['clean_text'] = data['clean_text']

In [27]:
import pandas as pd

# Normalise every column label to str so the embedding columns and the named
# columns can be handled uniformly.
df.columns = df.columns.astype(str)

# Assumes 'label' and 'clean_text' are the last two columns of df; everything
# before them is treated as an embedding dimension.
feature_columns = df.columns[:-2]

# Unwrap any cell that exposes .item() into a plain Python value; leave other cells as they are.
df_features = df[feature_columns].applymap(
    lambda cell: cell.item() if hasattr(cell, 'item') else cell
)

# Re-attach the label (cast to int) and the text, in that order, as the trailing columns.
df_features = df_features.assign(
    label=df['label'].astype(int),
    clean_text=df['clean_text'],
)


In [None]:
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA

# Project the embedding columns (everything but the trailing 'label' and
# 'clean_text') onto 3 principal components.
pca_input = df_features.iloc[:, :-2]
pca_3d = PCA(n_components=3, random_state=42)
X_pca_3d = pca_3d.fit_transform(pca_input)

# Rows for which label / text are attached to the plot frame.
# NOTE(review): PCA is fitted on all rows, while 'Label' and 'clean_text' are
# assigned from a filtered Series; index alignment leaves NaN in the rows
# outside the mask — confirm this is the intended way to restrict the plot.
pca_keep_mask = (data["is_noise"] == 1.0) & (data['clean_text'] != '0')

df_pca_3d = pd.DataFrame(X_pca_3d, columns=['PC1', 'PC2', 'PC3'])
df_pca_3d['Label'] = df_features['label'][pca_keep_mask].astype(str)
df_pca_3d['clean_text'] = df_features['clean_text'][pca_keep_mask]

# Interactive 3D scatter plot of the principal components, coloured by label.
pca_plot_options = dict(
    x='PC1',
    y='PC2',
    z='PC3',
    color='Label',
    hover_data=['clean_text'],
    title='Interactive 3D PCA Scatter Plot',
    opacity=0.7,
    width=1000,
    height=800,
)
fig_pca_3d = px.scatter_3d(df_pca_3d, **pca_plot_options)

fig_pca_3d.update_layout(legend_title_text='Label')
# Marker size is 2; adjust as needed.
fig_pca_3d.update_traces(marker=dict(size=2))

# Show inline, save a standalone HTML file, and keep the HTML as a string.
fig_pca_3d.show()
fig_pca_3d.write_html('interactive_3d_scatter.html')
html_str = fig_pca_3d.to_html()


In [None]:
# Serialise the 3D PCA figure to an HTML string and print it.
# NOTE(review): this writes the whole HTML document into the cell output.
html_str = fig_pca_3d.to_html()
print(html_str)

In [None]:
import plotly.express as px
from sklearn.manifold import TSNE

# Run 2D t-SNE on the embedding columns (the trailing 'label' and 'clean_text' are excluded).
n_components_2d = 2
tsne_input = df_features.iloc[:, :-2]
tsne_2d = TSNE(n_components=n_components_2d, random_state=42)
X_embedded_2d = tsne_2d.fit_transform(tsne_input)

# Rows for which label / text are attached to the plot frame.
# NOTE(review): t-SNE is fitted on all rows; rows outside the mask end up with
# NaN 'Label' / 'clean_text' through index alignment — confirm intended.
tsne_keep_mask = (data["is_noise"] == 1.0) & (data['clean_text'] != '0')

df_embedded_2d = pd.DataFrame(X_embedded_2d, columns=['Component 1', 'Component 2'])
# Labels are converted to str so they are treated as categories.
df_embedded_2d['Label'] = df_features['label'][tsne_keep_mask].astype(str)
df_embedded_2d['clean_text'] = df_features['clean_text'][tsne_keep_mask]

# Interactive 2D scatter plot, coloured by label.
tsne_plot_options = dict(
    x='Component 1',
    y='Component 2',
    color='Label',
    hover_data=['clean_text'],
    title='Interactive 2D t-SNE Scatter Plot',
    opacity=0.7,
    width=800,
    height=600,
)
fig_2d = px.scatter(df_embedded_2d, **tsne_plot_options)
fig_2d.update_layout(legend_title_text='Label')

fig_2d.show()
fig_2d.write_html('interactive_2d_scatter.html')

In [None]:
import pandas as pd
import plotly.express as px
import umap.umap_ as umap

# Reduce the embedding columns (all but the trailing 'label' and 'clean_text') to 2D with UMAP.
umap_input = df_features.iloc[:, :-2]
umap_reducer = umap.UMAP(n_components=2, random_state=42)
X_umap = umap_reducer.fit_transform(umap_input)

# Rows for which label / text are attached to the plot frame.
# NOTE(review): UMAP is fitted on all rows; rows outside the mask end up with
# NaN 'Label' / 'clean_text' through index alignment — confirm intended.
umap_keep_mask = (data["is_noise"] == 1.0) & (data['clean_text'] != '0')

df_umap = pd.DataFrame(X_umap, columns=['UMAP1', 'UMAP2'])
df_umap['Label'] = df_features['label'][umap_keep_mask].astype(str)
df_umap['clean_text'] = df_features['clean_text'][umap_keep_mask]

# Interactive 2D scatter plot of the UMAP projection, coloured by label.
umap_plot_options = dict(
    x='UMAP1',
    y='UMAP2',
    color='Label',
    hover_data=['clean_text'],
    title='Interactive 2D UMAP Scatter Plot',
    opacity=0.7,
    width=800,
    height=600,
)
fig_umap = px.scatter(df_umap, **umap_plot_options)
fig_umap.update_layout(legend_title_text='Label')

fig_umap.show()


In [None]:
# Display df_features (embedding columns plus 'label' and 'clean_text').
df_features

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score, confusion_matrix
from scipy.optimize import linear_sum_assignment

# Cluster the embeddings with KMeans.
n_clusters = 7

# Select the embedding dimensions by name: they are the columns whose labels are
# purely numeric strings ('0', '1', ...). A positional slice such as
# iloc[:, :-8] silently drops real embedding dimensions on a fresh run and
# changes meaning every time another helper column is appended to df_features.
embedding_columns = [col for col in df_features.columns if str(col).isdigit()]
assert embedding_columns, "df_features has no numeric-named embedding columns"
X_embedding = df_features[embedding_columns]

kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(X_embedding)

# Silhouette score: overall average and per-sample values.
silhouette_avg = silhouette_score(X_embedding, cluster_labels)
silhouette_values = silhouette_samples(X_embedding, cluster_labels)

# Store the raw cluster id and the per-sample silhouette on the frame.
df_features['Cluster'] = cluster_labels
df_features['Silhouette'] = silhouette_values

print(f"Average Silhouette Score: {silhouette_avg}")

# Outliers: samples whose silhouette score is below 0.25.
outliers = df_features[df_features['Silhouette'] < 0.25]
print(f"Number of outliers: {len(outliers)}")

# Ground-truth labels used to name the clusters.
true_labels = df_features['label']

# Confusion matrix: rows are true labels, columns are cluster ids.
conf_matrix = confusion_matrix(true_labels, df_features['Cluster'])

# Hungarian algorithm on the negated matrix gives the label <-> cluster pairing
# with the largest total overlap.
row_ind, col_ind = linear_sum_assignment(-conf_matrix)

# Rename each cluster id to its matched label.
# Assumes labels and cluster ids both span 0..n_clusters-1 so that matrix
# positions equal the ids themselves — TODO confirm.
mapping = {old_label: new_label for old_label, new_label in zip(col_ind, row_ind)}
df_features['Cluster'] = df_features['Cluster'].map(mapping)

# Show the resulting mapping.
print("Cluster label mapping:")
print(mapping)

# Quick look at the updated frame.
print(df_features.head())


In [None]:
import numpy as np

# Cluster centres, indexed by the RAW KMeans cluster id.
centroids = kmeans.cluster_centers_

# Use exactly the columns the model was fitted on, so the feature matrix and the
# centroids always have the same dimensionality. Fall back to the leading
# columns when the estimator did not record feature names.
fitted_columns = getattr(kmeans, 'feature_names_in_', None)
if fitted_columns is None:
    fitted_columns = df_features.columns[:centroids.shape[1]]
X_fitted = df_features[list(fitted_columns)].to_numpy(dtype=float)

# Distance from every sample to the centre of the cluster KMeans assigned it to.
# kmeans.labels_ holds the raw assignment; df_features['Cluster'] must not be
# used as the centroid index because it has been renamed to the matched label
# and no longer identifies a row of cluster_centers_.
distances = np.linalg.norm(X_fitted - centroids[kmeans.labels_], axis=1)

df_features['Distance_to_Centroid'] = distances

# Mean and standard deviation of the distances.
mean_distance = df_features['Distance_to_Centroid'].mean()
std_distance = df_features['Distance_to_Centroid'].std()

# Outliers: samples farther than mean + 1 * std from their centroid.
# (An earlier comment mentioned 2 * std; the multiplier actually in use is 1 and is kept.)
distance_threshold = mean_distance + 1 * std_distance
distance_outliers = df_features[df_features['Distance_to_Centroid'] > distance_threshold]
print(f"Number of distance-based outliers: {len(distance_outliers)}")


In [None]:
import plotly.express as px

# 2D t-SNE of the embeddings, used as the canvas for the cluster / outlier plots.
from sklearn.manifold import TSNE

# Select the embedding dimensions by name (numeric-string column labels) rather
# than with iloc[:, :-8]: the number of trailing helper columns depends on which
# cells have already run, so a fixed positional slice either drops embedding
# dimensions or lets helper columns leak into the t-SNE input.
tsne_feature_columns = [col for col in df_features.columns if str(col).isdigit()]

tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(df_features[tsne_feature_columns])

df_tsne = pd.DataFrame(X_tsne, columns=['TSNE1', 'TSNE2'])
df_tsne['Cluster'] = df_features['Cluster'].astype(str)
df_tsne['Silhouette'] = df_features['Silhouette']
df_tsne['clean_text'] = df_features['clean_text']

# Outliers are the samples whose silhouette score is below 0.25.
df_tsne['Outlier'] = df_features['Silhouette'] < 0.25

# Interactive 2D scatter: colour = cluster, marker symbol = outlier flag.
fig_tsne = px.scatter(
    df_tsne,
    x='TSNE1',
    y='TSNE2',
    color='Cluster',
    symbol='Outlier',
    hover_data=['clean_text', 'Silhouette'],
    title='2D t-SNE Scatter Plot with Outliers',
    opacity=0.7,
    width=800,
    height=600
)

fig_tsne.update_layout(
    legend_title_text='Cluster'
)

fig_tsne.show()


In [None]:
# Flag the samples whose centroid distance exceeds distance_threshold.
df_tsne['Distance_outlier'] = df_features['Distance_to_Centroid'].gt(distance_threshold)

# Same t-SNE canvas: colour = cluster, marker symbol = distance-outlier flag.
distance_plot_options = dict(
    x='TSNE1',
    y='TSNE2',
    color='Cluster',
    symbol='Distance_outlier',
    hover_data=['clean_text', 'Distance_outlier'],
    title='2D t-SNE Scatter Plot with Distance-based Outliers',
    opacity=0.7,
    width=800,
    height=600,
)
fig_tsne_distance = px.scatter(df_tsne, **distance_plot_options)
fig_tsne_distance.update_layout(legend_title_text='Cluster')

fig_tsne_distance.show()


In [None]:
# Display df_features to inspect its current columns.
df_features

In [None]:
# Final outlier flag: a sample is an outlier when its silhouette score is below 0.2.
# (Only the silhouette criterion is applied here.)
is_final_outlier = df_features['Silhouette'] < 0.2
df_features['Final_Outlier'] = is_final_outlier
print(f"Total number of final outliers: {int(is_final_outlier.sum())}")


In [None]:
# Preview of the rows that survive outlier removal (Silhouette >= 0.20).
# Built from df_features directly: df_cleaned is only created in the following
# cell, so referencing it here raises NameError on a fresh kernel.
df_features[df_features['Silhouette'] >= 0.20]

In [None]:
# Drop the outliers: keep the samples whose silhouette score is at least 0.20.
df_cleaned = df_features[df_features['Silhouette'] >= 0.20].copy()
print(f"Data shape after removing outliers: {df_cleaned.shape}")

# Write the cleaned data to CSV.
output_path = '/data/ephemeral/home/level2-nlp-datacentric-nlp-11/data/clustering_delete_outlier_data.csv'  # destination file
# 'Mapped_Cluster' is not created by any cell of this notebook ('Cluster' already
# holds the label-matched ids), so selecting it unconditionally raises KeyError.
# Export only the requested columns that actually exist.
requested_columns = ['clean_text', 'label', 'Cluster', 'Mapped_Cluster']
selected_columns = [col for col in requested_columns if col in df_cleaned.columns]
df_cleaned[selected_columns].to_csv(output_path, index=False)
print(f"Cleaned data saved to {output_path}")


In [None]:
# Put the silhouette outliers (score below 0.25) into their own '7_Outlier' group.
# The cluster ids are converted to str first so the column has one consistent
# dtype; writing the string '7_Outlier' into a numeric column is an
# incompatible-dtype assignment that newer pandas versions warn about.
is_silhouette_outlier = df_features['Silhouette'] < 0.25
df_features['Final_Cluster'] = (
    df_features['Cluster'].astype(str).where(~is_silhouette_outlier, '7_Outlier')
)

# Refresh the plotting frame.
df_tsne['Final_Cluster'] = df_features['Final_Cluster'].astype(str)

fig_final = px.scatter(
    df_tsne,
    x='TSNE1',
    y='TSNE2',
    color='Final_Cluster',
    hover_data=['clean_text', 'Silhouette'],
    title='2D t-SNE Scatter Plot with Final Clusters (Including Outliers)',
    opacity=0.7,
    width=800,
    height=600
)

fig_final.update_layout(
    legend_title_text='Final Cluster'
)

fig_final.show()


In [None]:
# Display df_features to inspect its current columns.
df_features

In [65]:
from sklearn.mixture import GaussianMixture

# Fit a Gaussian mixture with n_clusters components on the embeddings.
# The embedding dimensions are selected by name (numeric-string column labels):
# the positional slice iloc[:, :-9] depends on how many helper columns have been
# appended so far and drops real embedding dimensions on a fresh run.
gmm_feature_columns = [col for col in df_features.columns if str(col).isdigit()]
X_gmm = df_features[gmm_feature_columns]

gmm = GaussianMixture(n_components=n_clusters, random_state=42)
gmm_labels = gmm.fit_predict(X_gmm)

# Per-sample membership probability for every component.
probs = gmm.predict_proba(X_gmm)

# Store the component id and the highest membership probability on the frame.
df_features['GMM_Cluster'] = gmm_labels
df_features['GMM_Max_Prob'] = probs.max(axis=1)


In [66]:
# Reassign the silhouette outliers (score below 0.25) using the GMM.
from scipy.optimize import linear_sum_assignment
from sklearn.metrics import confusion_matrix

# A boolean mask selects rows of `probs` positionally, so this stays correct
# even when df_features does not carry a default 0..n-1 index.
outlier_mask = (df_features['Silhouette'] < 0.25).to_numpy()
outliers = df_features[outlier_mask]
outlier_indices = outliers.index

# Membership probabilities of the outliers and their most likely GMM component.
outlier_probs = probs[outlier_mask]
outlier_components = np.argmax(outlier_probs, axis=1)

# GMM component ids are arbitrary, while df_features['Cluster'] is expressed in
# the label numbering. Match components to labels (Hungarian algorithm on the
# label x component confusion matrix) before merging the two, otherwise
# Final_Cluster_GMM would mix two unrelated numbering schemes.
# Assumes the labels take values in 0..n_components-1 — TODO confirm.
component_ids = np.arange(probs.shape[1])
gmm_conf_matrix = confusion_matrix(
    df_features['label'], probs.argmax(axis=1), labels=component_ids
)
matched_labels, matched_components = linear_sum_assignment(-gmm_conf_matrix)
component_to_label = dict(zip(matched_components, matched_labels))
reassigned_clusters = np.array(
    [component_to_label[component] for component in outlier_components], dtype=int
)

# Record the reassignment (NaN for non-outliers). The column is reset first so a
# re-run with fewer outliers leaves no stale values behind.
df_features['Reassigned_Cluster_GMM'] = np.nan
df_features.loc[outlier_mask, 'Reassigned_Cluster_GMM'] = reassigned_clusters

# Final assignment: the KMeans-based cluster, overridden by the GMM for outliers.
df_features['Final_Cluster_GMM'] = df_features['Cluster']
df_features.loc[outlier_mask, 'Final_Cluster_GMM'] = reassigned_clusters


In [None]:
# Display df_features to inspect its current columns.
df_features

In [None]:
# Copy the GMM-based final assignment and max probability onto the plotting frame.
df_tsne['Final_Cluster_GMM'] = df_features['Final_Cluster_GMM'].astype(str)
df_tsne['GMM_Max_Prob'] = df_features['GMM_Max_Prob']

# Same t-SNE canvas, coloured by the final cluster after GMM reassignment.
gmm_plot_options = dict(
    x='TSNE1',
    y='TSNE2',
    color='Final_Cluster_GMM',
    hover_data=['clean_text', 'Silhouette', 'GMM_Max_Prob'],
    title='2D t-SNE Scatter Plot with GMM Reassigned Outliers',
    opacity=0.7,
    width=800,
    height=600,
)
fig_gmm = px.scatter(df_tsne, **gmm_plot_options)
fig_gmm.update_layout(legend_title_text='Final Cluster (GMM)')

fig_gmm.show()
