In [None]:
# pip install optuna
# pip install pycaret


In [3]:
import pandas as pd
import numpy as np

In [4]:
train = pd.read_csv('./open/train.csv')
test = pd.read_csv('./open//test.csv')

1. target의 분포 : 불균형발견 -> 다운샘플링

In [5]:
target_distribution = train['first_party_winner'].value_counts()
print(target_distribution)


1    1649
0     829
Name: first_party_winner, dtype: int64


In [6]:
from imblearn.under_sampling import RandomUnderSampler

# 다운샘플링 객체 초기화
rus = RandomUnderSampler(random_state=42)

# 다운샘플링을 수행할 feature와 target 데이터를 준비합니다.
X_train = train.drop('first_party_winner', axis=1)
y_train = train['first_party_winner']

# 다운샘플링 수행
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# 다운샘플링된 데이터셋 확인
print("다운샘플링 후 클래스 분포:")
print(pd.Series(y_train_resampled).value_counts())
y_train_resampled = pd.DataFrame(y_train_resampled) # serise -> df

다운샘플링 후 클래스 분포:
0    829
1    829
Name: first_party_winner, dtype: int64


# 1. 토큰화

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [17]:
#-----------토큰화 함수를 정의합니다.-----------
def tokenize_text(text):
    tokens = word_tokenize(text)  # 단어 단위로 토큰화
    return tokens

#-----------전처리 함수를 정의합니다.-----------
# 토큰화 이후에 적용할 수 있는 전처리 단계는 다음과 같습니다:
# 1. 소문자 변환: 단어를 모두 소문자로 변환하여 대소문자의 구분을 없애거나 일관성을 유지할 수 있습니다.
# 2. 특수 문자 제거: 문장 부호, 기호, 특수 문자 등을 제거하여 모델에 불필요한 잡음을 줄일 수 있습니다.
# 3. 불용어 제거: 자주 등장하지만 의미를 갖지 않는 불용어(stop words)를 제거하여 모델의 성능을 개선할 수 있습니다.
# 4. 정규화: 단어들을 원형으로 변환하거나 어간 추출(stemming) 등을 수행하여 단어의 다양한 형태를 통합할 수 있습니다.
#-----------------------------------------
def preprocess_text(text):
    # 소문자 변환 / 특수 문자 제거
    text = text.lower() 
    text = text.translate(str.maketrans("", "", string.punctuation))

    tokens = tokenize_text(text) # 토큰화

    # 불용어 제거
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # 정규화
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return tokens

In [None]:
# X_train_resampled
# y_train_resampled

In [10]:
X_train_preprocessed = pd.DataFrame(X_train_resampled)
y_train_preprocessed = pd.DataFrame(y_train_resampled)
X_test_preprocessed = pd.DataFrame(test)

In [18]:
X_train_preprocessed['facts_preprocessed'] = train['facts'].apply(preprocess_text)
X_test_preprocessed['facts_preprocessed'] = test['facts'].apply(preprocess_text)

In [59]:
# 디버그
from sklearn.feature_extraction.text import TfidfVectorizer

# X_train과 y_train을 이용하여 TF-IDF 벡터화를 수행합니다.
vectorizer = TfidfVectorizer(tokenizer=lambda x: preprocess_text(x))
X_train_tfidf_matrix = vectorizer.fit_transform(X_train_preprocessed['facts'])
X_test_tfidf_matrix = vectorizer.transform(X_test_preprocessed['facts'])

# TF-IDF 벡터화된 결과를 확인합니다.
# ---희소벡터---
print("---희소벡터---")
print(X_train_tfidf_matrix.shape)
print(X_test_tfidf_matrix.shape)


# TF-IDF 벡터화된 결과를 확인합니다.
# ---밀접벡터---
X_train_dense = X_train_tfidf_matrix.toarray()
X_test_dense = X_test_tfidf_matrix.toarray()
print("\n---밀접벡터---")
print(X_train_dense.shape)
print(X_test_dense.shape)

---희소벡터---
(1658, 14326)
(1240, 14326)

---밀접벡터---
(1658, 14326)
(1240, 14326)


# ML모델 추천(파이케럿)

In [40]:
from pycaret.classification import *

# 벡터화된 데이터를 X와 y로 나누어 준비합니다.
X = X_train_dense  # TF-IDF 벡터화된 데이터
y = y_train_preprocessed['first_party_winner'].values  # 타겟 변수

# 파이캐럿을 설정합니다.
setup(data=X, target=y)

Unnamed: 0,Description,Value
0,Session id,8602
1,Target,target
2,Target type,Binary
3,Original data shape,"(1658, 14327)"
4,Transformed data shape,"(1658, 14327)"
5,Transformed train set shape,"(1160, 14327)"
6,Transformed test set shape,"(498, 14327)"
7,Numeric features,14326
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x280873eb0>

# cross validation (top-1)

In [41]:
# 모델 학습 및 비교를 수행합니다.
best_model = compare_models(sort='Accuracy', fold=10)

# 모델의 하이퍼파라미터 튜닝을 수행합니다.
tuned_model = tune_model(best_model, fold=10)

# 튜닝된 모델의 성능을 평가합니다.
evaluate_model(tuned_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.5569,0.5854,0.5397,0.5589,0.5476,0.1138,0.1145,0.671
rf,Random Forest Classifier,0.5526,0.5708,0.5586,0.5526,0.5539,0.1052,0.1059,0.597
ridge,Ridge Classifier,0.55,0.0,0.5414,0.5501,0.5444,0.1,0.1005,0.4
lda,Linear Discriminant Analysis,0.5431,0.5439,0.4948,0.5486,0.5178,0.0862,0.0871,2.543
svm,SVM - Linear Kernel,0.5353,0.0,0.4672,0.5479,0.4942,0.0707,0.0734,0.481
lightgbm,Light Gradient Boosting Machine,0.5353,0.5596,0.531,0.5334,0.5316,0.0707,0.0707,0.496
et,Extra Trees Classifier,0.5345,0.5666,0.5517,0.5328,0.5412,0.069,0.0695,0.878
dt,Decision Tree Classifier,0.5319,0.5319,0.5379,0.534,0.535,0.0638,0.0635,0.469
xgboost,Extreme Gradient Boosting,0.5259,0.529,0.5345,0.5232,0.5267,0.0517,0.0522,16.89
nb,Naive Bayes,0.5241,0.5239,0.5293,0.5258,0.526,0.0483,0.0485,0.409


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5172,0.5835,0.5,0.5179,0.5088,0.0345,0.0345
1,0.569,0.613,0.5517,0.5714,0.5614,0.1379,0.138
2,0.5862,0.5785,0.6552,0.5758,0.6129,0.1724,0.1741
3,0.5431,0.618,0.569,0.541,0.5546,0.0862,0.0863
4,0.5862,0.5886,0.5862,0.5862,0.5862,0.1724,0.1724
5,0.5948,0.6605,0.6379,0.5873,0.6116,0.1897,0.1904
6,0.5431,0.5273,0.4655,0.551,0.5047,0.0862,0.0873
7,0.5259,0.5606,0.5172,0.5263,0.5217,0.0517,0.0517
8,0.5086,0.4816,0.431,0.5102,0.4673,0.0172,0.0175
9,0.5948,0.6278,0.5,0.617,0.5524,0.1897,0.1932


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

ML 모델학습( Logistic Regression )

In [66]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Logistic Regression 모델 초기화
model = LogisticRegression()

# k-fold 교차 검증 및 하이퍼파라미터 조정
k = 10
scores = cross_val_score(model, X_train_dense, y_train_resampled, cv=k, scoring='accuracy')

# 각 애포크의 정확도 출력
for epoch, score in enumerate(scores, 1):
    print(f"Epoch {epoch}: Accuracy = {score:.4f}")

# 전체 데이터로 모델 학습
model.fit(X_train_dense, y_train_resampled)

# 테스트 데이터셋으로 추론 수행
predictions = model.predict(X_test_dense)

Epoch 1: Accuracy = 0.5783
Epoch 2: Accuracy = 0.5422
Epoch 3: Accuracy = 0.6145
Epoch 4: Accuracy = 0.5241
Epoch 5: Accuracy = 0.4699
Epoch 6: Accuracy = 0.5361
Epoch 7: Accuracy = 0.5060
Epoch 8: Accuracy = 0.5602
Epoch 9: Accuracy = 0.5030
Epoch 10: Accuracy = 0.5091


# Inference & Submission

In [67]:
submit = pd.read_csv('open/sample_submission.csv')
len(predictions)

1240

In [68]:
submit['first_party_winner'] = predictions
submit.to_csv('submision/submission0.csv', index=False)
print('Done')

Done
