# DKTC Baseline 모델링 (TF-IDF + Topic Features)

## 1. 데이터 준비

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read both CSV sources: the main training set and the general-conversation set.
train_df = pd.read_csv('../../data/train.csv')
general_df = pd.read_csv('../../data/train_general_conversation.csv')

# Stack the two frames row-wise and renumber the index from zero.
df = pd.concat([train_df, general_df], axis=0, ignore_index=True)

# Map the string labels in `class` to integer ids; the fitted encoder is kept
# so the ids can be mapped back to class names later.
encoder = LabelEncoder().fit(df['class'])
df['encoded_class'] = encoder.transform(df['class'])

# Hold out 20% of the rows for validation. Stratifying on the encoded label
# keeps the class proportions the same in both splits; the fixed seed makes
# the split reproducible.
split_options = dict(
    test_size=0.2,
    random_state=42,
    stratify=df['encoded_class'],
)
X_train, X_val, y_train, y_val = train_test_split(
    df['conversation'],
    df['encoded_class'],
    **split_options,
)

print(f'Train data size: {len(X_train)}')
print(f'Validation data size: {len(X_val)}')

Train data size: 3960
Validation data size: 990


## 2. 특징 추출 (TF-IDF + Topic Modeling)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from mecab_ko import Tagger
from scipy.sparse import hstack
import numpy as np

# Tokenizer setup: create a single mecab-ko Tagger at module level so that
# mecab_tokenizer and extract_nouns below share it instead of building a new
# tagger on every call.
mecab_tagger = Tagger()
def mecab_tokenizer(text):
    """Split ``text`` into tokens using the module-level ``mecab_tagger``.

    The string returned by ``mecab_tagger.parse`` is read line by line. Only
    lines containing a tab are used; for each of them the text before the
    first tab is taken as the token. Lines without a tab are skipped.

    Args:
        text: The string handed to ``mecab_tagger.parse``.

    Returns:
        list[str]: The tokens, in the order they appear in the parse output.
    """
    tokens = []
    for row in mecab_tagger.parse(text).split('\n'):
        if '\t' not in row:
            continue
        surface, _, _ = row.partition('\t')
        tokens.append(surface)
    return tokens

# TF-IDF vectorization. The vocabulary and IDF weights are fitted on the
# training split only and then reused, unchanged, for the validation split.
print("TF-IDF 벡터화 진행 중...")
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=mecab_tokenizer,
    # A custom tokenizer is supplied, so the default token_pattern regex is
    # never used. Setting it to None avoids the unused-parameter UserWarning
    # scikit-learn otherwise emits on fit; the extracted features are the same.
    token_pattern=None,
    ngram_range=(1, 3),  # unigrams, bigrams and trigrams
    min_df=3,            # ignore terms found in fewer than 3 documents
    max_df=0.9,          # ignore terms found in more than 90% of documents
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

# LDA feature extraction
print("토픽 모델링(LDA) 특징 추출 중...")
def extract_nouns(text):
    """Return the noun-like tokens of ``text`` joined by single spaces.

    The output of the module-level ``mecab_tagger.parse`` is read line by
    line. Lines without a tab are skipped. For the remaining lines, the text
    before the first tab is the token, and the second tab-separated field is
    a comma-separated feature list whose first entry is used as the tag.
    A token is kept when that tag starts with ``'N'`` (treated here as the
    noun tags).

    Args:
        text: The string handed to ``mecab_tagger.parse``.

    Returns:
        str: The kept tokens in their original order, separated by spaces;
        an empty string when nothing is kept.
    """
    tagged_rows = (
        row.split('\t')
        for row in mecab_tagger.parse(text).split('\n')
        if '\t' in row
    )
    kept = [
        fields[0]
        for fields in tagged_rows
        if fields[1].split(',')[0].startswith('N')
    ]
    return ' '.join(kept)

# Reduce every training conversation to its noun string, then build the
# document-term count matrix that LDA is trained on.
noun_list_train = X_train.apply(extract_nouns)
vectorizer_lda = CountVectorizer(
    max_df=0.9,
    min_df=2,
    stop_words=['입니다', '있습니다', '그냥', '정말', '진짜'],
)
dtm = vectorizer_lda.fit_transform(noun_list_train)

# Fit a 5-topic LDA model on the training counts only.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(dtm)

# Per-document topic distributions. The validation split goes through the
# already-fitted count vectorizer and LDA model (transform only, no refit).
X_train_topics = lda.transform(dtm)
noun_list_val = X_val.apply(extract_nouns)
dtm_val = vectorizer_lda.transform(noun_list_val)
X_val_topics = lda.transform(dtm_val)

# Combine features: append the topic columns to the right of the TF-IDF
# columns for each split.
print("TF-IDF 특징과 토픽 모델링 특징을 합치는 중...")
X_train_combined = hstack((X_train_tfidf, X_train_topics))
X_val_combined = hstack((X_val_tfidf, X_val_topics))
print("특징 추출 완료!")

TF-IDF 벡터화 진행 중...




토픽 모델링(LDA) 특징 추출 중...
TF-IDF 특징과 토픽 모델링 특징을 합치는 중...
특징 추출 완료!


## 3. 로지스틱 회귀 모델 학습 및 평가

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report

# Create the logistic-regression classifier and fit it on the combined
# TF-IDF + topic features of the training split.
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_combined, y_train)

# Predict labels for the validation split.
y_pred = lr_model.predict(X_val_combined)

# Evaluate: support-weighted F1 first, then the per-class report with the
# original class names taken from the fitted label encoder.
weighted_f1 = f1_score(y_val, y_pred, average='weighted')
print(f'\nWeighted F1 Score (TF-IDF + Topics): {weighted_f1:.4f}')

report = classification_report(y_val, y_pred, target_names=encoder.classes_)
print("Classification Report:")
print(report)


Weighted F1 Score (TF-IDF + Topics): 0.9017
Classification Report:
              precision    recall  f1-score   support

       갈취 대화       0.91      0.84      0.87       196
   기타 괴롭힘 대화       0.87      0.89      0.88       219
       일반 대화       1.00      1.00      1.00       200
 직장 내 괴롭힘 대화       0.87      0.94      0.90       196
       협박 대화       0.86      0.84      0.85       179

    accuracy                           0.90       990
   macro avg       0.90      0.90      0.90       990
weighted avg       0.90      0.90      0.90       990

