# DKTC Baseline 모델링 (TF-IDF + Logistic Regression)

## 1. 데이터 준비

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the labelled training set and the general-conversation set from disk.
train_df = pd.read_csv('../../data/train.csv')
general_df = pd.read_csv('../../data/train_general_conversation.csv')

# Stack both sets into one frame with a fresh 0..n-1 index.
df = pd.concat((train_df, general_df), axis=0, ignore_index=True)

# Map each class name to an integer id; `encoder` is kept so that the ids can
# be translated back to class names later.
encoder = LabelEncoder()
df = df.assign(encoded_class=encoder.fit_transform(df['class']))

# Hold out 20% of the rows for validation. Stratifying on the encoded label
# keeps the class proportions the same in both splits.
X_train, X_val, y_train, y_val = train_test_split(
    df['conversation'],
    df['encoded_class'],
    stratify=df['encoded_class'],
    test_size=0.2,
    random_state=42,
)

print(
    f'Train data size: {len(X_train)}',
    f'Validation data size: {len(X_val)}',
    sep='\n',
)

Train data size: 3960
Validation data size: 990


## 2. TF-IDF 벡터화 (with N-grams)

In [None]:
# Tagger from the mecab_ko package, created once and shared by the tokenizer
# function defined below. The import lives in this cell because no earlier
# cell brings `Tagger` into scope; without it the cell raises NameError when
# the notebook is run on a fresh kernel.
from mecab_ko import Tagger

mecab_tagger = Tagger()
def mecab_tokenizer(text):
    """Tokenize `text` with the shared `mecab_tagger`.

    The tagger's `parse` output is read line by line. Every line containing a
    tab contributes the text before its first tab (the morpheme in the
    tagger's tab-separated output); lines without a tab are skipped.

    Args:
        text: The string passed to `mecab_tagger.parse`.

    Returns:
        list: The extracted tokens, in the order they appear in the output.
    """
    analysis = mecab_tagger.parse(text)
    return [row.split('\t', 1)[0] for row in analysis.split('\n') if '\t' in row]

# `TfidfVectorizer` is imported in this cell because no earlier cell brings it
# into scope; without it the cell raises NameError on a fresh kernel.
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the tokens produced by `mecab_tokenizer`, using 1-gram through
# 3-gram features (ngram_range=(1, 3)).
#   min_df=3   -> ignore n-grams that appear in fewer than 3 documents
#   max_df=0.9 -> ignore n-grams that appear in more than 90% of documents
#   token_pattern=None -> the default regex is never used once a custom
#       tokenizer is supplied; clearing it avoids scikit-learn's
#       "token_pattern will not be used" warning.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=mecab_tokenizer,
    token_pattern=None,
    ngram_range=(1, 3),
    min_df=3,
    max_df=0.9,
)

# Learn the vocabulary and IDF weights from the training split only, then
# transform it.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the validation split with the already-fitted vectorizer, so no
# validation text influences the vocabulary or the weights.
X_val_tfidf = tfidf_vectorizer.transform(X_val)

print(f'Vocabulary size: {len(tfidf_vectorizer.vocabulary_)}')
print(f'Train TF-IDF matrix shape: {X_train_tfidf.shape}')



Vocabulary size: 39823
Train TF-IDF matrix shape: (3960, 39823)


## 3. 로지스틱 회귀 모델 학습 및 평가

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report

# Fit a logistic-regression classifier on the training TF-IDF matrix.
# `fit` returns the estimator itself, so construction and training are chained.
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_tfidf, y_train)

# Predict class ids for the validation split.
y_pred = lr_model.predict(X_val_tfidf)

# Headline metric: F1 averaged over classes, weighted by class support.
weighted_f1 = f1_score(y_val, y_pred, average='weighted')
print(f'Weighted F1 Score: {weighted_f1:.4f}')

# Per-class precision / recall / F1, labelled with the original class names
# held by the fitted label encoder.
print(
    "Classification Report:",
    classification_report(y_val, y_pred, target_names=encoder.classes_),
    sep='\n',
)

Weighted F1 Score: 0.9138
Classification Report:
              precision    recall  f1-score   support

       갈취 대화       0.92      0.84      0.88       196
   기타 괴롭힘 대화       0.89      0.89      0.89       219
       일반 대화       1.00      1.00      1.00       200
 직장 내 괴롭힘 대화       0.88      0.95      0.92       196
       협박 대화       0.87      0.88      0.87       179

    accuracy                           0.91       990
   macro avg       0.91      0.91      0.91       990
weighted avg       0.91      0.91      0.91       990

