In [None]:
# LogisticRegression
# KNeighborsClassifier
# RandomForestClassifier
# AdaBoostClassifier
# XGBClassifier
# 변수선택
# PCA
# voting

In [1]:
import pandas as pd
import numpy as np

np.random.seed(1234)

n = 1000

age = np.random.randint(18, 70, size=n)
income = np.random.normal(50000, 15000, size=n).astype(int)
owns_car = np.random.choice([0, 1], size=n)
gender = np.random.choice([0, 1], size=n)  # 0: Female, 1: Male

# 변수 생성 (예시적으로 연령과 소득이 높고 차량 소유 시 보험 가입 확률 높다고 가정)
logits = -5 + 0.05 * age + 0.00005 * income + 1.0 * owns_car + 0.5 * gender
probabilities = 1 / (1 + np.exp(-logits))
buys_insurance = np.random.binomial(1, probabilities)

df = pd.DataFrame({
    'age': age,
    'income': income,
    'owns_car': owns_car,
    'gender': gender,
    'buys_insurance': buys_insurance
})

df.head()

Unnamed: 0,age,income,owns_car,gender,buys_insurance
0,65,51095,1,1,1
1,37,39687,0,1,0
2,56,49763,0,1,1
3,30,46362,1,1,1
4,42,60458,0,0,1


In [11]:
# 표준화
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
scaled = ss.fit_transform(df[['age', 'income']])
df_s = pd.DataFrame(scaled, columns=['age_z', 'income_z'])

df = pd.concat([df, df_s], axis=1)
df.head()

Unnamed: 0,age,income,owns_car,gender,buys_insurance,age_z,income_z
0,65,51095,1,1,1,1.355976,0.028841
1,37,39687,0,1,0,-0.449756,-0.736319
2,56,49763,0,1,1,0.775562,-0.0605
3,30,46362,1,1,1,-0.90119,-0.288612
4,42,60458,0,0,1,-0.127304,0.656838


In [12]:
from sklearn.model_selection import train_test_split

X = df[['age_z', 'income_z', 'owns_car', 'gender']]
y = df['buys_insurance']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

In [15]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [19]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression

model1 = LogisticRegression(random_state=1234, solver='lbfgs')
model1.fit(X_train, y_train)
y_pred = model1.predict(X_test)

print("Accuracy :", accuracy_score(y_test, y_pred))
print("Recall :", recall_score(y_test, y_pred))
print("Precision :", precision_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred, pos_label=1))

Accuracy : 0.705
Recall : 0.7851239669421488
Precision : 0.7421875
F1 Score: 0.7630522088353414


In [23]:
# KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

for k in [30, 50, 70]:
    model_k = KNeighborsClassifier(n_neighbors=k)
    model_k.fit(X_train, y_train)
    y_pred = model_k.predict(X_test)

    print("k :", k)
    print("Accuracy :", accuracy_score(y_test, y_pred))
    print("Recall :", recall_score(y_test, y_pred))
    print("Precision :", precision_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred, pos_label=1))
    print()

k : 30
Accuracy : 0.67
Recall : 0.7024793388429752
Precision : 0.7391304347826086
F1 Score: 0.7203389830508475

k : 50
Accuracy : 0.69
Recall : 0.7355371900826446
Precision : 0.7478991596638656
F1 Score: 0.7416666666666667

k : 70
Accuracy : 0.685
Recall : 0.7355371900826446
Precision : 0.7416666666666667
F1 Score: 0.7385892116182573



In [25]:
# RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

for n in [20, 50, 70]:
    for md in [9, 13, 15]:
        model = RandomForestClassifier(n_estimators=n, max_depth=md)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        print("n :", n, ",", "md :", md)
        print("Accuracy :", accuracy_score(y_test, y_pred))
        print("Recall :", recall_score(y_test, y_pred))
        print("Precision :", precision_score(y_test, y_pred))
        print("F1 Score:", f1_score(y_test, y_pred, pos_label=1))
        print()

n : 20 , md : 9
Accuracy : 0.63
Recall : 0.6446280991735537
Precision : 0.7155963302752294
F1 Score: 0.6782608695652174

n : 20 , md : 13
Accuracy : 0.59
Recall : 0.5867768595041323
Precision : 0.6893203883495146
F1 Score: 0.6339285714285714

n : 20 , md : 15
Accuracy : 0.57
Recall : 0.5867768595041323
Precision : 0.6635514018691588
F1 Score: 0.6228070175438597

n : 50 , md : 9
Accuracy : 0.615
Recall : 0.6363636363636364
Precision : 0.7
F1 Score: 0.6666666666666666

n : 50 , md : 13
Accuracy : 0.595
Recall : 0.6115702479338843
Precision : 0.6851851851851852
F1 Score: 0.6462882096069869

n : 50 , md : 15
Accuracy : 0.59
Recall : 0.6033057851239669
Precision : 0.6822429906542056
F1 Score: 0.6403508771929824

n : 70 , md : 9
Accuracy : 0.61
Recall : 0.6446280991735537
Precision : 0.6902654867256637
F1 Score: 0.6666666666666667

n : 70 , md : 13
Accuracy : 0.58
Recall : 0.5867768595041323
Precision : 0.6761904761904762
F1 Score: 0.6283185840707965

n : 70 , md : 15
Accuracy : 0.595
Recall

In [27]:
# AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier

for n in [20, 50, 100]:
    model = AdaBoostClassifier(random_state=1234, n_estimators=n)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("n :", n)
    print("Accuracy :", accuracy_score(y_test, y_pred))
    print("Recall :", recall_score(y_test, y_pred))
    print("Precision :", precision_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred, pos_label=1))
    print()

n : 20
Accuracy : 0.695
Recall : 0.768595041322314
Precision : 0.7380952380952381
F1 Score: 0.7530364372469636

n : 50
Accuracy : 0.695
Recall : 0.7768595041322314
Precision : 0.734375
F1 Score: 0.755020080321285

n : 100
Accuracy : 0.675
Recall : 0.7520661157024794
Precision : 0.7222222222222222
F1 Score: 0.736842105263158



In [29]:
# XGBoost
from xgboost import XGBClassifier

for lr in [0.01, 0.1, 0.3, 0.5]:
    for md in [2, 3, 5, 10]:
        model = XGBClassifier(n_estimators=200, learning_rate=lr, max_depth=md, random_state=1234)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        print("lr :", lr, ",", "md :", md)
        print("Accuracy :", accuracy_score(y_test, y_pred))
        print("Recall :", recall_score(y_test, y_pred))
        print("Precision :", precision_score(y_test, y_pred))
        print("F1 Score:", f1_score(y_test, y_pred, pos_label=1))
        print()

lr : 0.01 , md : 2
Accuracy : 0.705
Recall : 0.8181818181818182
Precision : 0.7279411764705882
F1 Score: 0.7704280155642023

lr : 0.01 , md : 3
Accuracy : 0.68
Recall : 0.7520661157024794
Precision : 0.728
F1 Score: 0.7398373983739837

lr : 0.01 , md : 5
Accuracy : 0.675
Recall : 0.71900826446281
Precision : 0.7372881355932204
F1 Score: 0.7280334728033473

lr : 0.01 , md : 10
Accuracy : 0.665
Recall : 0.6859504132231405
Precision : 0.7410714285714286
F1 Score: 0.7124463519313305

lr : 0.1 , md : 2
Accuracy : 0.715
Recall : 0.7768595041322314
Precision : 0.7580645161290323
F1 Score: 0.7673469387755103

lr : 0.1 , md : 3
Accuracy : 0.66
Recall : 0.71900826446281
Precision : 0.71900826446281
F1 Score: 0.7190082644628101

lr : 0.1 , md : 5
Accuracy : 0.645
Recall : 0.6694214876033058
Precision : 0.7232142857142857
F1 Score: 0.6952789699570815

lr : 0.1 , md : 10
Accuracy : 0.64
Recall : 0.6611570247933884
Precision : 0.7207207207207207
F1 Score: 0.689655172413793

lr : 0.3 , md : 2
Accurac

In [40]:
result = pd.DataFrame(y_test)

result['pred'] = y_pred

result

Unnamed: 0,buys_insurance,pred
681,0,0
990,1,1
155,1,0
768,1,1
438,0,1
...,...,...
376,0,1
731,1,1
594,0,1
65,1,1


In [42]:
result.to_csv("test.csv", header=True, index=False)