#### 로지스틱회귀 분류 모델

In [None]:
# # 데이터 불러오기 : 기존 수치형 데이터
# import pandas as pd
# path = "C:\\Users\\Playdata2\\Downloads\\regression_model_preprocessed.csv"
# df = pd.read_csv(path)
# df.head()

In [None]:
# Load the log-transformed numeric dataset.
# NOTE(review): the path below is an absolute, machine-specific Windows path;
# it has to be adjusted when the notebook is run elsewhere.
import pandas as pd

path = "C:\\Users\\Playdata2\\Downloads\\re_log_model_preprocessed.csv"
df = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows.
df.head(5)

#### LogisticRegression Basic
- 하이퍼파라미터 지정 X

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import classification_report

# 1. Split the frame into the 'churn' target (y) and the remaining features (X).
y = df['churn']
X = df.drop(columns='churn')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 2. Build the model with library defaults (no hyperparameters set) and fit it.
#    LogisticRegression is already imported at the top of this cell.
lr = LogisticRegression().fit(X_train, y_train)

# 3. Predict labels for the held-out rows.
y_pred = lr.predict(X_test)

# 4. Print the per-class precision / recall / F1 report.
print("\n=== LogisticRegression ===")
print(classification_report(y_test, y_pred))

In [None]:
# 기존 수치형 데이터 : LogisticRegression (기본값 - 하이퍼파라미터 지정 X)
# === LogisticRegression ===
#               precision    recall  f1-score   support

#            0       0.89      0.94      0.91      6327
#            1       0.95      0.91      0.93      8052

#     accuracy                           0.92     14379
#    macro avg       0.92      0.92      0.92     14379
# weighted avg       0.92      0.92      0.92     14379

In [None]:
# 로그 변환 수치형 데이터 : LogisticRegression (기본값 - 하이퍼파라미터 지정 X)
# === LogisticRegression ===
#               precision    recall  f1-score   support

#            0       0.91      0.94      0.93      6327
#            1       0.95      0.93      0.94      8052

#     accuracy                           0.93     14379
#    macro avg       0.93      0.93      0.93     14379
# weighted avg       0.93      0.93      0.93     14379

#### LogisticRegression Hyperparameter
- 하이퍼파라미터 항목 지정
```
StandardScaler : 되도록 원본 데이터의 크기를 너무 수정하지 않는 방향으로 스케일링하기.
                 연속형데이터만 스케일링 고려.
GridSearch - PolynomialFeatures - degree
             LogisticRegression - C , max_iter
```

##### PolynomialFeatures, StandardScaler 적용

In [None]:
# PolynomialFeatures, StandardScaler 적용
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# 1. Split the frame into features (X) and the 'churn' target (y).
X = df.drop('churn', axis=1)
y = df['churn']

# Hold out 20% of the rows for evaluation; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Build the pipeline: polynomial expansion -> standardization -> classifier.
#    include_bias=False: LogisticRegression fits its own intercept, and the
#    constant column produced by include_bias=True would be turned into an
#    all-zero column by StandardScaler, so it only added a dead feature.
pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=3, include_bias=False)),
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(C=100, max_iter=1000)),
])

# 3. Fit the whole pipeline on the training rows only, so the scaler
#    statistics are learned without looking at the test rows.
pipeline.fit(X_train, y_train)

# 4. Predict labels for the held-out rows.
y_pred = pipeline.predict(X_test)

# 5. Print the per-class precision / recall / F1 report.
print("\n=== LogisticRegression ===")
print(classification_report(y_test, y_pred))

##### GridSearch 적용하여 하이퍼파라미터 최적값 찾고 적용

In [None]:
# %pip install tqdm tqdm-joblib

In [None]:
# LogisticRegression, PolynomialFeatures, GridSearchCV
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# 1. Split the frame into features (X) and the 'churn' target (y).
X = df.drop('churn', axis=1)
y = df['churn']

# Hold out 20% of the rows for evaluation; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Build the pipeline: polynomial expansion -> standardization -> classifier.
#    include_bias=False: LogisticRegression fits its own intercept, and the
#    constant column produced by include_bias=True would be turned into an
#    all-zero column by StandardScaler, so it only added a dead feature to
#    every one of the grid-search fits.
#    The degree given here is only a placeholder; the grid below overrides it.
pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=3, include_bias=False)),
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(max_iter=1000)),
])

# 3. Grid of pipeline parameters to search ('<step>__<param>' naming).
# NOTE(review): max_iter is the solver's iteration cap rather than a model
# hyperparameter; once the solver converges the fits are the same, so this
# entry triples the number of fits. Consider fixing it to a single value.
param_grid = {
    'poly__degree': [1, 2, 3],
    'lr__C': [0.01, 0.1, 1, 10, 100],
    'lr__max_iter': [500, 1000, 2000],
}

# 4. Configure the grid search.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,                # 5-fold cross-validation
    n_jobs=-1,           # run fits in parallel
    scoring='f1',        # classification task, so F1 can be used as the score
    verbose=2,           # print progress
)

# 5. Run the search on the training rows only.
grid_search.fit(X_train, y_train)

# 6. Report the best parameter combination.
print("\nBest Parameters:")
print(grid_search.best_params_)

# 7. Take the best pipeline found by the search and predict on the held-out rows.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# 8. Print the per-class precision / recall / F1 report.
print("\n=== Logistic Regression (with GridSearch) ===")
print(classification_report(y_test, y_pred))

In [None]:
# 20251104

In [None]:
# List the column labels of df.
df.columns

In [None]:
# LogisticRegression, PolynomialFeatures, StandardScaler(연속형 컬럼만), GridSearchCV
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# 1. Split the frame into features (X) and the 'churn' target (y).
X = df.drop('churn', axis=1)
y = df['churn']

# Hold out 20% of the rows for evaluation; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Standardize only the continuous columns.
# Columns treated as continuous.
num_cols = ['bill_avg_log', 'download_avg_log', 'upload_avg_log', 'service_failure_count']

# ColumnTransformer: scale only the listed columns and leave the rest untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
    ],
    remainder='passthrough',  # all other columns pass through unchanged
)

# 3. Build the pipeline: selective scaling -> polynomial expansion -> classifier.
#    include_bias=False: nothing after the expansion removes the constant
#    column, so with include_bias=True the model received a column of ones
#    in addition to the intercept LogisticRegression already fits, i.e. two
#    collinear intercept terms.
#    The polynomial degree is supplied by the grid below.
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('lr', LogisticRegression(max_iter=1000)),
])

# 4. Grid of pipeline parameters to search ('<step>__<param>' naming).
# NOTE(review): max_iter is the solver's iteration cap rather than a model
# hyperparameter; once the solver converges the fits are the same, so this
# entry triples the number of fits. Consider fixing it to a single value.
param_grid = {
    'poly__degree': [1, 2, 3],
    'lr__C': [0.01, 0.1, 1, 10, 100],
    'lr__max_iter': [500, 1000, 2000],
}

# 5. Configure the grid search: 5-fold CV, parallel fits, F1 as the score,
#    progress printed while it runs.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='f1',
    verbose=2,
)

# 6. Run the search on the training rows only.
grid_search.fit(X_train, y_train)

# 7. Report the best parameter combination.
print("\nBest Parameters:")
print(grid_search.best_params_)

# 8. Predict on the held-out rows with the best pipeline found by the search.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# 9. Print the per-class precision / recall / F1 report.
print("\n=== Logistic Regression (with selective scaling and gridsearch) ===")
print(classification_report(y_test, y_pred))
