In [None]:
# Load the data: numeric features that were log-transformed beforehand
import pandas as pd

path = "C:\\Users\\Playdata2\\Downloads\\re_log_model_preprocessed.csv"
df = pd.read_csv(filepath_or_buffer=path)
# Preview the first five rows (shown as the cell output in a notebook).
df.head(n=5)

In [None]:
# LogisticRegression, PolynomialFeatures, StandardScaler(연속형 컬럼만), GridSearchCV
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# 1. Split the frame into features (X) and the target column `churn` (y).
X = df.drop(columns='churn')
y = df['churn']

# Stratify on the target so the train and test sets keep the same class ratio;
# a purely random split can shift the class balance that the F1 score is
# sensitive to.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 2. Standardise only the continuous columns.
num_cols = ['bill_avg_log', 'download_avg_log', 'upload_avg_log', 'service_failure_count']

# ColumnTransformer scales the listed columns and passes every other column
# through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols)
    ],
    remainder='passthrough'
)

# 3. Pipeline: selective scaling -> polynomial expansion -> logistic regression.
# include_bias=False because LogisticRegression already fits its own intercept;
# an extra constant column would only duplicate it (and be regularised).
# max_iter is a solver iteration cap, not a model hyperparameter, so it is
# fixed at a generous value here instead of being searched over.
# NOTE(review): polynomial terms are generated after scaling, so the expanded
# features themselves are not re-standardised — confirm this is intended.
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('lr', LogisticRegression(max_iter=2000))
])

# 4. Hyperparameter grid: polynomial degree and inverse regularisation strength.
param_grid = {
    'poly__degree': [1, 2, 3],
    'lr__C': [0.01, 0.1, 1, 10, 100]
}

# 5. 5-fold cross-validated grid search, optimising F1 and using all CPU cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='f1',
    verbose=2
)

# 6. Fit every candidate on the training split only.
grid_search.fit(X_train, y_train)

# 7. Report the best hyperparameter combination.
print("\nBest Parameters:")
print(grid_search.best_params_)

# 8. Predict on the held-out test set with the refitted best pipeline.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# 9. Evaluate on the test set.
print("\n=== Logistic Regression (with selective scaling and gridsearch) ===")
print(classification_report(y_test, y_pred))