In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# 결측치 처리
train_data['job_type'].fillna('Unknown', inplace=True)
test_data['job_type'].fillna('Unknown', inplace=True)

# 범주형 변수 인코딩
categorical_cols = ['income_type', 'edu_type', 'marital status', 'house_type', 'job_type']
label_encoders = {}

for col in categorical_cols:
    label_encoders[col] = LabelEncoder()
    train_data[col] = label_encoders[col].fit_transform(train_data[col])
    test_data[col] = label_encoders[col].transform(test_data[col])

# 수치형 변수 정규화/표준화
numerical_cols = ['income_year', 'family_size']
scaler = StandardScaler()
train_data[numerical_cols] = scaler.fit_transform(train_data[numerical_cols])
test_data[numerical_cols] = scaler.transform(test_data[numerical_cols])

# 새로운 특성 생성
train_data['income_per_family_member'] = train_data['income_year'] / train_data['family_size']
test_data['income_per_family_member'] = test_data['income_year'] / test_data['family_size']

# 데이터 분할
X = train_data.drop('credit', axis=1)
y = train_data['credit']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 하이퍼파라미터 설정으로 랜덤포레스트 모델 생성 및 훈련
rf_tuned_model = RandomForestClassifier(max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=200, random_state=42)
rf_tuned_model.fit(X_train, y_train)

# 결과 확인
"Random Forest model trained with the specified hyperparameters."


'Random Forest model trained with the specified hyperparameters.'

In [2]:
from sklearn.metrics import classification_report

# 검증 세트에 대한 예측 수행
y_pred_tuned = rf_tuned_model.predict(X_val)

# 클래스별 정밀도, 재현율, F1 점수 계산
class_report_tuned = classification_report(y_val, y_pred_tuned, target_names=['Class 0', 'Class 1', 'Class 2'])

class_report_tuned


'              precision    recall  f1-score   support\n\n     Class 0       0.53      0.08      0.14       520\n     Class 1       0.55      0.14      0.23      1039\n     Class 2       0.67      0.96      0.79      2842\n\n    accuracy                           0.66      4401\n   macro avg       0.58      0.39      0.38      4401\nweighted avg       0.62      0.66      0.58      4401\n'

In [3]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# 데이터를 다시 분할
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# SMOTE를 사용하여 오버샘플링
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# 오버샘플링 된 데이터셋의 크기 확인
print(X_train_smote.shape, y_train_smote.shape)


(33945, 14) (33945,)


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# 데이터를 다시 분할
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# SMOTE를 사용하여 오버샘플링
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# 오버샘플링된 데이터로 랜덤포레스트 모델 훈련
rf_model_smote = RandomForestClassifier(max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=200, random_state=42)
rf_model_smote.fit(X_train_smote, y_train_smote)

# 검증 세트에 대한 예측 수행
y_pred_smote = rf_model_smote.predict(X_val)

# 클래스별 정밀도, 재현율, F1 점수 계산
print(classification_report(y_val, y_pred_smote, target_names=['Class 0', 'Class 1', 'Class 2']))


              precision    recall  f1-score   support

     Class 0       0.34      0.21      0.26       520
     Class 1       0.44      0.27      0.33      1039
     Class 2       0.69      0.84      0.76      2842

    accuracy                           0.63      4401
   macro avg       0.49      0.44      0.45      4401
weighted avg       0.59      0.63      0.60      4401



In [5]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier

# 이미 훈련된 랜덤포레스트 모델 (rf_model_smote)
# 부스팅 모델 생성 및 훈련
boosting_model = GradientBoostingClassifier(random_state=42)
boosting_model.fit(X_train_smote, y_train_smote)

# 앙상블 모델 생성
ensemble_model = VotingClassifier(estimators=[
    ('random_forest', rf_model_smote), 
    ('boosting', boosting_model)
], voting='soft')

# 앙상블 모델 훈련
ensemble_model.fit(X_train_smote, y_train_smote)

# 검증 세트에 대한 예측 수행 및 평가
y_pred_ensemble = ensemble_model.predict(X_val)
print(classification_report(y_val, y_pred_ensemble, target_names=['Class 0', 'Class 1', 'Class 2']))


              precision    recall  f1-score   support

     Class 0       0.31      0.12      0.18       520
     Class 1       0.45      0.15      0.23      1039
     Class 2       0.67      0.90      0.77      2842

    accuracy                           0.64      4401
   macro avg       0.48      0.39      0.39      4401
weighted avg       0.58      0.64      0.57      4401



In [3]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# 데이터를 다시 분할
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 정규화
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# SMOTE를 사용하여 오버샘플링
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# SVM 모델 생성 및 훈련
svm_model = SVC(probability=True, random_state=42)
svm_model.fit(X_train_smote, y_train_smote)

# 랜덤포레스트 및 부스팅 모델 생성 및 훈련
rf_model = RandomForestClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)
rf_model.fit(X_train_smote, y_train_smote)
gb_model.fit(X_train_smote, y_train_smote)

# 앙상블 모델 생성
ensemble_model = VotingClassifier(estimators=[
    ('random_forest', rf_model), 
    ('gradient_boosting', gb_model),
    ('svm', svm_model)
], voting='soft')

# 앙상블 모델 훈련
ensemble_model.fit(X_train_smote, y_train_smote)

# 검증 세트에 대한 예측 수행 및 평가
y_pred_ensemble = ensemble_model.predict(X_val_scaled)
print(classification_report(y_val, y_pred_ensemble, target_names=['Class 0', 'Class 1', 'Class 2']))


NameError: name 'classification_report' is not defined

In [7]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# 데이터를 다시 분할
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 데이터 정규화
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# SMOTE를 사용하여 오버샘플링
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# 의사결정나무 모델 생성 및 훈련
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_smote, y_train_smote)

# 랜덤포레스트 및 부스팅 모델 생성 및 훈련
rf_model = RandomForestClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)
rf_model.fit(X_train_smote, y_train_smote)
gb_model.fit(X_train_smote, y_train_smote)

# 앙상블 모델 생성
ensemble_model = VotingClassifier(estimators=[
    ('random_forest', rf_model), 
    ('gradient_boosting', gb_model),
    ('decision_tree', dt_model)
], voting='soft')

# 앙상블 모델 훈련
ensemble_model.fit(X_train_smote, y_train_smote)

# 검증 세트에 대한 예측 수행 및 평가
y_pred_ensemble = ensemble_model.predict(X_val_scaled)
print(classification_report(y_val, y_pred_ensemble, target_names=['Class 0', 'Class 1', 'Class 2']))


              precision    recall  f1-score   support

     Class 0       0.23      0.32      0.27       520
     Class 1       0.31      0.33      0.32      1039
     Class 2       0.69      0.63      0.66      2842

    accuracy                           0.52      4401
   macro avg       0.41      0.43      0.42      4401
weighted avg       0.55      0.52      0.53      4401



In [8]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

# 하이퍼파라미터 범위 설정
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

gb_params = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 10]
}

dt_params = {
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 모델 초기화
rf_model = RandomForestClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)

# RandomizedSearchCV 설정
rf_random = RandomizedSearchCV(estimator=rf_model, param_distributions=rf_params, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
gb_random = RandomizedSearchCV(estimator=gb_model, param_distributions=gb_params, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
dt_random = RandomizedSearchCV(estimator=dt_model, param_distributions=dt_params, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)

# 최적의 하이퍼파라미터 찾기
rf_random.fit(X_train_smote, y_train_smote)
gb_random.fit(X_train_smote, y_train_smote)
dt_random.fit(X_train_smote, y_train_smote)

# 최적의 하이퍼파라미터로 모델 재훈련 및 검증 세트에 대한 성능 평가

Fitting 3 folds for each of 81 candidates, totalling 243 fits




[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   3.9s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.0s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.4s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   5.9s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   7.8s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   8.1s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   7.3s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=  10.6s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   2.7s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total tim

[CV] END max_depth=10, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time=  11.4s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.0s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   9.1s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   8.0s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  13.5s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=  13.4s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=  12.2s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   4.4s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   4.0s
[CV] END max_depth=20, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total ti

[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.8s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   4.7s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   9.5s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   9.7s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  16.7s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=  14.4s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=  15.5s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   5.1s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   6.5s
[CV] END max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total tim



Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=  15.2s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=  18.2s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=  23.2s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=  28.7s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=  32.6s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=  44.8s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=300; total time=  42.7s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=300; total time=  48.9s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time=  25.7s
[CV] END ..learning_rate=0.01, max_depth=5, n_estimators=100; total time=  22.4s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=300; total time= 1.2min
[CV] END ..learning_rate=0.01, max_depth=5, n_es



Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2; total time=   0.2s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2; total time=   0.2s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=2; total time=   0.2s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=5; total time=   0.2s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=10; total time=   0.2s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2; total time=   0.2s
[CV] END max_depth=10, min_samples_leaf=1, min_samples_split=10; total time=   0.3s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2; total time=   0.2s
[CV] END max_depth=10, min_samples_leaf=2, min_samples_split=2; total time=   0.2s
[CV] END max_depth=10, m

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report

# 최적의 하이퍼파라미터로 각 모델 재훈련
rf_best = RandomForestClassifier(**rf_random.best_params_, random_state=42)
gb_best = GradientBoostingClassifier(**gb_random.best_params_, random_state=42)
dt_best = DecisionTreeClassifier(**dt_random.best_params_, random_state=42)

rf_best.fit(X_train_smote, y_train_smote)
gb_best.fit(X_train_smote, y_train_smote)
dt_best.fit(X_train_smote, y_train_smote)

# 앙상블 모델 생성
ensemble_model = VotingClassifier(estimators=[
    ('random_forest', rf_best),
    ('gradient_boosting', gb_best),
    ('decision_tree', dt_best)
], voting='soft')

# 앙상블 모델 훈련
ensemble_model.fit(X_train_smote, y_train_smote)

# 검증 세트에 대한 예측 수행
y_pred_ensemble = ensemble_model.predict(X_val_scaled)

# 성능 평가
print(classification_report(y_val, y_pred_ensemble, target_names=['Class 0', 'Class 1', 'Class 2']))

In [5]:
test_pr = ensemble_model.predict(test_data)



In [7]:

# 예측 결과를 DataFrame으로 변환
predictions_df = pd.DataFrame(test_pr, columns=['credit'])

# 예측 결과 CSV 파일로 저장
predictions_df.to_csv('predictions_sub12.csv', index=False)

In [10]:
test_predictions = ensemble_model.predict(test_data)

# 예측 결과 확인
print(test_predictions)



[2 2 2 ... 2 2 2]




In [11]:
import pandas as pd

# 예측 결과를 DataFrame으로 변환
predictions_df = pd.DataFrame(test_predictions, columns=['credit'])

# 예측 결과 CSV 파일로 저장
predictions_df.to_csv('predictions_sub10.csv', index=False)


In [12]:
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report

# SVM 모델 생성 및 훈련
svm_model = SVC(probability=True, random_state=42)
svm_model.fit(X_train_smote, y_train_smote)

# 앙상블 모델에 기존 모델들과 함께 SVM 모델 추가
ensemble_model_with_svm = VotingClassifier(estimators=[
    ('random_forest', rf_best),
    ('gradient_boosting', gb_best),
    ('decision_tree', dt_best),
    ('svm', svm_model)
], voting='soft')

# 앙상블 모델 훈련
ensemble_model_with_svm.fit(X_train_smote, y_train_smote)

# 검증 데이터셋에 대한 예측 수행
y_pred_ensemble_svm = ensemble_model_with_svm.predict(X_val_scaled)

# 성능 평가
print(classification_report(y_val, y_pred_ensemble_svm, target_names=['Class 0', 'Class 1', 'Class 2']))


              precision    recall  f1-score   support

     Class 0       0.28      0.29      0.28       520
     Class 1       0.34      0.33      0.33      1039
     Class 2       0.70      0.71      0.70      2842

    accuracy                           0.57      4401
   macro avg       0.44      0.44      0.44      4401
weighted avg       0.57      0.57      0.57      4401



In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# 기본 모델들 정의
base_models = [
    ('random_forest', RandomForestClassifier(random_state=42)),
    ('gradient_boosting', GradientBoostingClassifier(random_state=42)),
    ('decision_tree', DecisionTreeClassifier(random_state=42)),
    ('svc', SVC(probability=True, random_state=42)),
    ('knn', KNeighborsClassifier()),
    ('logistic_regression', LogisticRegression(random_state=42))
]

# 메타 모델 정의 (랜덤포레스트)
meta_model = RandomForestClassifier(random_state=42)

# 스태킹 앙상블 모델 생성
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)

# 스태킹 모델 훈련
stacking_model.fit(X_train_smote, y_train_smote)

# 검증 데이터셋에 대한 예측 수행
y_pred = stacking_model.predict(X_val_scaled)
y_pred_proba = stacking_model.predict_proba(X_val_scaled)[:, 1]  # AUC를 위한 클래스 1의 확률

# 성능 평가: AUC
auc_score = roc_auc_score(y_val, y_pred_proba)
print(f"AUC Score: {auc_score}")

# 성능 평가: 정밀도, 재현율, F1 점수
print(classification_report(y_val, y_pred))
