In [None]:
!pip install pandas matplotlib scikit-learn matplotlib-venn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn3
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [None]:
def _outlier_indices(labels):
    """Return the set of row positions whose label equals -1 (the outlier marker)."""
    return set(np.flatnonzero(np.asarray(labels) == -1).tolist())


def _label_records(texts, outlier_idx):
    """Pair each text with "이상치" if its position is in outlier_idx, else "정상"."""
    return [
        {"text": text, "label": "이상치" if i in outlier_idx else "정상"}
        for i, text in enumerate(texts)
    ]


def compare_outlier_methods_with_venn_tfidf(
    df,
    text_column,
    contamination=0.2,
    pca_dim=50,
    perplexity=5,
    dbscan_eps=1.5,
    dbscan_min_samples=2
):
    """Compare Isolation Forest, LOF and DBSCAN outlier detection on TF-IDF text vectors.

    The texts are vectorized with TF-IDF and the three detectors are fitted on
    the dense TF-IDF matrix. A PCA -> t-SNE projection is computed only for
    plotting. Two figures are shown: a Venn diagram of the outlier index sets
    and a row of three t-SNE scatter plots (red = outlier, blue = normal).

    Args:
        df: DataFrame holding the documents.
        text_column: Name of the column containing the text; values are cast to str.
        contamination: Expected outlier fraction, passed to IsolationForest and
            LocalOutlierFactor.
        pca_dim: Upper bound on the PCA dimensionality used before t-SNE. It is
            clamped to min(n_samples, n_features), the largest value PCA accepts.
        perplexity: t-SNE perplexity; clamped to n_samples - 1 because t-SNE
            requires perplexity < n_samples.
        dbscan_eps: DBSCAN neighborhood radius.
        dbscan_min_samples: DBSCAN minimum neighborhood size.

    Returns:
        dict with keys "IsolationForest", "LOF" and "DBSCAN". Each value is a
        list (in row order) of {"text": <str>, "label": "이상치" | "정상"}.

    Raises:
        ValueError: If the selected column has no rows.
    """
    texts = df[text_column].astype(str).tolist()
    if not texts:
        raise ValueError(f"Column {text_column!r} contains no rows to analyze.")

    # 1. TF-IDF vectorization (densified because the detectors below get a dense array)
    print("🔄 TF-IDF 벡터화 중...")
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(texts).toarray()
    n_samples, n_features = X.shape

    # 2. PCA dimensionality reduction.
    # PCA cannot produce more components than min(n_samples, n_features), so
    # clamp against BOTH dimensions; clamping only against n_features fails on
    # small corpora whose vocabulary is larger than the number of documents.
    n_components = min(pca_dim, n_samples, n_features)
    print(f"📉 PCA로 {n_components}차원 축소 중...")
    pca = PCA(n_components=n_components, random_state=42)
    X_pca = pca.fit_transform(X)

    # 3. t-SNE down to 2 dimensions (used for visualization only).
    # t-SNE requires perplexity < n_samples, so cap it for tiny inputs.
    print("🎨 t-SNE 시각화 좌표 계산 중...")
    tsne_perplexity = min(perplexity, n_samples - 1)
    tsne = TSNE(
        n_components=2,
        perplexity=tsne_perplexity,
        random_state=42,
        init='pca',
        learning_rate='auto',
    )
    X_tsne = tsne.fit_transform(X_pca)

    # 4. Isolation Forest (fit_predict labels outliers as -1)
    labels_iso = IsolationForest(
        contamination=contamination, random_state=42
    ).fit_predict(X)

    # 5. Local Outlier Factor
    labels_lof = LocalOutlierFactor(
        n_neighbors=5, contamination=contamination
    ).fit_predict(X)

    # 6. DBSCAN: noise points (cluster label -1) are treated as outliers, everything else as 1.
    # NOTE(review): TfidfVectorizer L2-normalizes rows by default, so Euclidean
    # distances between documents should not exceed sqrt(2) ~= 1.414; with the
    # default eps=1.5 DBSCAN will probably never flag noise — confirm and
    # consider passing a smaller eps.
    db_labels = DBSCAN(eps=dbscan_eps, min_samples=dbscan_min_samples).fit_predict(X)
    labels_dbscan = np.where(db_labels == -1, -1, 1)

    # 7. Sets of outlier row positions per method
    set_iso = _outlier_indices(labels_iso)
    set_lof = _outlier_indices(labels_lof)
    set_db = _outlier_indices(labels_dbscan)

    # 8. Venn diagram of the overlap between the three outlier sets
    plt.figure(figsize=(8, 6))
    venn3(
        subsets=(set_iso, set_lof, set_db),
        set_labels=('Isolation Forest', 'LOF', 'DBSCAN')
    )
    plt.title("이상치 탐지 알고리즘 비교 (Venn Diagram)")
    plt.show()

    # 9. t-SNE scatter plots, one panel per method
    panels = (
        ("Isolation Forest", labels_iso),
        ("LOF", labels_lof),
        (f"DBSCAN (eps={dbscan_eps}, min_samples={dbscan_min_samples})", labels_dbscan),
    )
    _, axes = plt.subplots(1, 3, figsize=(20, 6))
    for ax, (title, labels) in zip(axes, panels):
        point_colors = ['red' if l == -1 else 'blue' for l in labels]
        ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=point_colors, s=100, alpha=0.7)
        ax.set_title(title)
        ax.grid(True)

    plt.suptitle("TF-IDF 기반 이상치 탐지 비교 (t-SNE 시각화)", fontsize=14)
    plt.tight_layout()
    plt.show()

    # 10. Per-method results, in the original row order
    return {
        "IsolationForest": _label_records(texts, set_iso),
        "LOF": _label_records(texts, set_lof),
        "DBSCAN": _label_records(texts, set_db),
    }

In [None]:
# Sample DataFrame: short Korean sentences plus one keyboard-mash string
# (presumably the intended outlier for the demo).
sample_texts = [
    "오늘 날씨가 좋다.",
    "주식 시장이 상승했다.",
    "점심으로 파스타를 먹었다.",
    "비트코인 가격이 급락했다.",
    "asdfghjkl qwertyuiop",
    "내일은 비가 올 것 같다.",
    "환율이 급등했다.",
    "저녁에 치킨을 먹었다.",
]

# Ids are 1-based and follow the order of the texts.
data = {
    "id": list(range(1, len(sample_texts) + 1)),
    "text": sample_texts,
}
df = pd.DataFrame(data)
df

In [None]:
# Run the comparison on the sample frame with the same settings as the function defaults.
demo_settings = dict(
    text_column="text",
    contamination=0.2,
    pca_dim=50,
    perplexity=5,
    dbscan_eps=1.5,
    dbscan_min_samples=2,
)
results = compare_outlier_methods_with_venn_tfidf(df, **demo_settings)

# (result key, section header) pairs, printed in this order.
report_sections = [
    ("IsolationForest", "\n=== Isolation Forest 결과 ==="),
    ("LOF", "\n=== LOF 결과 ==="),
    ("DBSCAN", "\n=== DBSCAN 결과 ==="),
]

for method_key, header in report_sections:
    print(header)
    for record in results[method_key]:
        print(f"{record['text']} -> {record['label']}")