In [2]:
import pandas as pd
import numpy as np

In [3]:
train = pd.read_csv('./open/train.csv')
test = pd.read_csv('./open//test.csv')

In [4]:
train

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1
...,...,...,...,...,...
2473,TRAIN_2473,"HollyFrontier Cheyenne Refining, LLC, et al.","Renewable Fuels Association, et al.",Congress amended the Clean Air Act through the...,1
2474,TRAIN_2474,"Grupo Mexicano de Desarrollo, S. A.","Alliance Bond Fund, Inc.","Alliance Bond Fund, Inc., an investment fund, ...",1
2475,TRAIN_2475,Peguero,United States,"In 1992, the District Court sentenced Manuel D...",0
2476,TRAIN_2476,Immigration and Naturalization Service,St. Cyr,"On March 8, 1996, Enrico St. Cyr, a lawful per...",0


1. target의 분포 : 불균형발견 -> 다운샘플링

In [5]:
target_distribution = train['first_party_winner'].value_counts()
print(target_distribution)


1    1649
0     829
Name: first_party_winner, dtype: int64


In [6]:
from imblearn.under_sampling import RandomUnderSampler

# 다운샘플링 객체 초기화
rus = RandomUnderSampler(random_state=42)

# 다운샘플링을 수행할 feature와 target 데이터를 준비합니다.
X_train = train.drop('first_party_winner', axis=1)
y_train = train['first_party_winner']

# 다운샘플링 수행
X_train_resampled, y_train_resampled = rus.fit_resample(X_train, y_train)

# 다운샘플링된 데이터셋 확인
print("다운샘플링 후 클래스 분포:")
print(pd.Series(y_train_resampled).value_counts())
y_train_resampled = pd.DataFrame(y_train_resampled) # serise -> df

다운샘플링 후 클래스 분포:
0    829
1    829
Name: first_party_winner, dtype: int64


In [7]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [8]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/being/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/being/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/being/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/being/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [9]:
# 토큰화 함수를 정의합니다.
def tokenize_text(text):
    tokens = word_tokenize(text)  # 단어 단위로 토큰화
    return tokens

#-----------전처리 함수를 정의합니다.-----------
# 토큰화 이후에 적용할 수 있는 전처리 단계는 다음과 같습니다:
# 1. 소문자 변환: 단어를 모두 소문자로 변환하여 대소문자의 구분을 없애거나 일관성을 유지할 수 있습니다.
# 2. 특수 문자 제거: 문장 부호, 기호, 특수 문자 등을 제거하여 모델에 불필요한 잡음을 줄일 수 있습니다.
# 3. 불용어 제거: 자주 등장하지만 의미를 갖지 않는 불용어(stop words)를 제거하여 모델의 성능을 개선할 수 있습니다.
# 4. 정규화: 단어들을 원형으로 변환하거나 어간 추출(stemming) 등을 수행하여 단어의 다양한 형태를 통합할 수 있습니다.
#-----------------------------------------
def preprocess_text(text):
    # 소문자 변환 / 특수 문자 제거
    text = text.lower() 
    text = text.translate(str.maketrans("", "", string.punctuation))

    tokens = tokenize_text(text) # 토큰화

    # 불용어 제거
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # 정규화
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return tokens

In [10]:
preprocessed_df = X_train_resampled.copy()

In [11]:
# 'facts' 열에 대해 전처리를 수행하여 새로운 DataFrame을 생성합니다.
preprocessed_df['facts_preprocessed'] = preprocessed_df['facts'].apply(preprocess_text)
preprocessed_df

Unnamed: 0,ID,first_party,second_party,facts,facts_preprocessed
0,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,"[ramon, nelson, riding, bike, suffered, lethal..."
1,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,"[victor, linkletter, convicted, state, court, ..."
2,TRAIN_0014,"James J. Thole, et al.","U.S. Bank, N.A., et al.",Named plaintiff James Thole and others brought...,"[named, plaintiff, james, thole, others, broug..."
3,TRAIN_0016,Plyler,Doe,A revision to the Texas education laws in 1975...,"[revision, texas, education, law, 1975, allowe..."
4,TRAIN_0021,Bassam Yacoub Salman,United States,Maher Kara joined Citigroup’s healthcare inves...,"[maher, kara, joined, citigroup, ’, healthcare..."
...,...,...,...,...,...
1653,TRAIN_0788,United States,"Arnold Schwinn & Co., Schwinn Cycle Distributo...",The United States brought an antitrust action ...,"[united, state, brought, antitrust, action, ar..."
1654,TRAIN_0350,Vaughan,Atkinson,The general maritime law of the United States ...,"[general, maritime, law, united, state, long, ..."
1655,TRAIN_1628,Florida,Joe Elton Nixon,A Florida court convicted Joe Elton Nixon of m...,"[florida, court, convicted, joe, elton, nixon,..."
1656,TRAIN_1820,B. C. Foreman et al.,"Dallas County, Texas et al.","In 1972, Texas became a covered jurisdiction f...","[1972, texas, became, covered, jurisdiction, p..."


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 'facts' feature의 텍스트 데이터를 문서로 결합합니다.
documents = preprocessed_df['facts'].tolist()

# TF-IDF 벡터화 객체를 초기화합니다.
vectorizer = TfidfVectorizer()

# TF-IDF 벡터화를 수행하여 문서를 벡터로 표현합니다.
tfidf_matrix = vectorizer.fit_transform(documents)

# TF-IDF 벡터화된 결과를 확인합니다.
print(tfidf_matrix.shape)

(1658, 14357)


# ML모델 추천

파이캐럿 설치

In [13]:
# pip install pycaret

In [14]:
from pycaret.classification import *

# 벡터화된 데이터를 X와 y로 나누어 준비합니다.
X = tfidf_matrix.toarray()  # TF-IDF 벡터화된 데이터
y = y_train_resampled['first_party_winner'].values  # 타겟 변수

# 파이캐럿을 설정합니다.
setup(data=X, target=y)

# 모델 학습 및 비교를 수행합니다.
compare_models()

Unnamed: 0,Description,Value
0,Session id,5785
1,Target,target
2,Target type,Binary
3,Original data shape,"(1658, 14358)"
4,Transformed data shape,"(1658, 14358)"
5,Transformed train set shape,"(1160, 14358)"
6,Transformed test set shape,"(498, 14358)"
7,Numeric features,14357
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.5526,0.0,0.5621,0.55,0.5555,0.1052,0.1056,0.431
lr,Logistic Regression,0.5474,0.5861,0.5448,0.5463,0.5451,0.0948,0.0949,0.873
knn,K Neighbors Classifier,0.5466,0.5555,0.5448,0.5461,0.5441,0.0931,0.0936,0.448
et,Extra Trees Classifier,0.5466,0.5542,0.4948,0.5511,0.5197,0.0931,0.0937,0.77
svm,SVM - Linear Kernel,0.5457,0.0,0.5569,0.5525,0.5409,0.0914,0.0968,0.522
nb,Naive Bayes,0.5422,0.543,0.5483,0.5433,0.5452,0.0845,0.0847,0.393
rf,Random Forest Classifier,0.5414,0.5585,0.5207,0.5439,0.531,0.0828,0.0831,0.543
lda,Linear Discriminant Analysis,0.5388,0.5508,0.5483,0.539,0.5427,0.0776,0.0779,1.995
dt,Decision Tree Classifier,0.5353,0.5353,0.531,0.5385,0.5333,0.0707,0.0714,0.423
gbc,Gradient Boosting Classifier,0.5233,0.5287,0.5138,0.5261,0.5184,0.0466,0.0468,2.377


# cross validation (ridge)

In [23]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [19]:
# Ridge Classifier 모델을 생성합니다.
ridge = create_model('ridge')

# 하이퍼파라미터 그리드를 정의합니다.
params = {
    'alpha': [0.1, 1.0, 10.0],
    'fit_intercept': [True, False],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

# 하이퍼파라미터 튜닝을 수행합니다.
tuned_ridge = tune_model(ridge, custom_grid=params, fold=5, search_library='optuna')

# 튜닝된 모델의 성능을 평가합니다.
evaluate_model(tuned_ridge)

ModuleNotFoundError: 
'optuna' is a soft dependency and not included in the pycaret installation. Please run: `pip install optuna` to install.
Alternately, you can install this by running `pip install pycaret[tuners]`