In [None]:
# Baseline models

# Machine learning
# Decision Tree
# Random Forest
# Logistic Regression

# Deep learning
# MLP

In [None]:
# Install the third-party packages imported by this notebook into the kernel's environment.
# NOTE(review): versions are not pinned, so results may differ between environments.
%pip install pandas matplotlib numpy scikit-learn


#### 데이터 불러오기

In [None]:
# Load the preprocessed dataset into `df`.
# The location can be overridden with the CHURN_DATA_PATH environment variable so the
# notebook is not tied to a single machine; when the variable is unset, the original
# local Windows path is used, so existing behaviour is unchanged.
import os

import pandas as pd

url = os.environ.get(
    "CHURN_DATA_PATH",
    r"C:\Users\Playdata2\Downloads\tree_model_preprocessed.csv",
)
df = pd.read_csv(url)
# Preview the first rows to confirm the file was parsed as expected.
df.head()

In [None]:
# Dataset size.
# DataFrame.size is rows x columns (the number of cells), not the number of samples,
# so the row and column counts are printed alongside it to keep the figure unambiguous.
# `total` keeps its original meaning (df.size) for anything that reads it later.
n_rows, n_cols = df.shape
total = df.size
print(f"rows={n_rows}, columns={n_cols}, total cells={total}")

In [None]:
# Show pandas' summary statistics for `df` as the cell output.
df.describe()

In [None]:
import matplotlib.pyplot as plt
from pandas.api.types import is_numeric_dtype

# Plot the distribution of every column of `df`, one figure per column.
# NOTE(review): the titles contain Hangul; matplotlib's default font may not render it —
# confirm a Korean-capable font is configured (e.g. via plt.rcParams['font.family']).
for col in df.columns:
    series = df[col]
    fig, ax = plt.subplots(figsize=(6, 4))

    # A column is treated as continuous when it is numeric (any width, not only
    # int64/float64) and has more than 10 distinct values.
    if is_numeric_dtype(series) and series.nunique() > 10:
        # Continuous column: histogram. NaNs are dropped because the bin range
        # cannot be derived from non-finite values.
        ax.hist(series.dropna(), bins=30, color='skyblue', edgecolor='black')
        ax.set_title(f'{col} 분포 (연속형)')
    else:
        # Categorical column: bar chart of value counts, ordered by category value.
        series.value_counts().sort_index().plot(
            kind='bar', ax=ax, color='lightgreen', edgecolor='black'
        )
        ax.set_title(f'{col} 분포 (범주형)')

    # Set after plotting so the label is always the column name.
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    plt.show()
    # Release the figure so looping over many columns does not accumulate open figures.
    plt.close(fig)

#### 머신러닝 베이스 라인 모델 선정 : Random Forest
- Decision Tree
- Random Forest
- Logistic Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# 1. Split the data (X: features, y: target)
X = df.drop('churn', axis=1)
y = df['churn']

# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. Define the candidate baseline models.
# Tree-based models are insensitive to feature scale, but logistic regression is not:
# its regularisation and solver convergence depend on scale. It is therefore wrapped
# in a pipeline that standardises the features, with the scaler fit on the training
# split only, so the comparison with the other baselines is fair.
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'Logistic Regression': make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=1000, random_state=42),
    ),
}

# 3. Train each model and evaluate it on the held-out split.
# Predictions are computed once per model and reused for both the summary metrics
# and the per-class report.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Weighted averages account for the different class supports.
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1-score': f1_score(y_test, y_pred, average='weighted'),
    }

    print(f"\n=== {name} ===")
    print(classification_report(y_test, y_pred))

# 4. Side-by-side summary, best F1-score first.
results_df = pd.DataFrame(results).T
print(results_df.sort_values(by='F1-score', ascending=False))

##### 하이퍼파라미터 튜닝을 하지 않은 기준으로는 RandomForest의 성능이 가장 높다.

```
Accuracy : 정확도
Precision : 정밀도
Recall : 재현율, 민감도
F1-score : Precision(정밀도)과 Recall(재현율)의 균형
```

#### 딥러닝 베이스 라인 모델 : MLP

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# 1. Split the data (X: features, y: target)
X = df.drop('churn', axis=1)
y = df['churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies: the split frames are derived from `X`, and assigning columns
# on them directly can raise SettingWithCopyWarning on some pandas/scikit-learn versions.
X_train = X_train.copy()
X_test = X_test.copy()

# 2. Scaling: standardise only the continuous columns.
# The scaler is fit on the training split and merely applied to the test split, so no
# test-set statistics leak into training.
scaler = StandardScaler()
num_cols = ['bill_avg', 'download_avg', 'upload_avg', 'service_failure_count']
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# 3. Define the MLP model
mlp = MLPClassifier(hidden_layer_sizes=(64, 32),  # two hidden layers with 64 and 32 units
                    activation='relu',           # ReLU activation
                    solver='adam',               # optimisation algorithm
                    max_iter=500,
                    random_state=42)

# 4. Train
mlp.fit(X_train, y_train)

# 5. Predict on the held-out split
y_pred = mlp.predict(X_test)

# 6. Evaluate (weighted averages across the classes)
mlp_results = {
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision': precision_score(y_test, y_pred, average='weighted'),
    'Recall': recall_score(y_test, y_pred, average='weighted'),
    'F1-score': f1_score(y_test, y_pred, average='weighted')
}

# 7. Print the metrics as a one-row table
print(pd.DataFrame([mlp_results], index=['MLP']))

     Accuracy  Precision    Recall  F1-score
MLP  0.933792   0.934235  0.933792  0.933879


In [None]:
# Result when only the continuous columns were standardised
#     Accuracy  Precision    Recall  F1-score
# MLP  0.933792   0.934235  0.933792  0.933879

In [None]:
# Result when all of the X columns were standardised
#     Accuracy  Precision    Recall  F1-score
# MLP  0.930245   0.930308  0.930245  0.930269

### 하이퍼파라미터 튜닝
- GridSearch

In [None]:
# Machine-learning baseline model: RandomForest

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# 1. Separate the target column from the feature columns
y = df['churn']
X = df.drop(columns='churn')

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 2. Create the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# 3. Fit it on the training split
model.fit(X_train, y_train)

# 4. Report per-class metrics on the held-out split
y_pred = model.predict(X_test)
print("\n=== RandomForest ===")
print(classification_report(y_test, y_pred))

In [14]:
# Deep-learning baseline model: MLP
# Only the continuous columns are standardised; the notes recorded in this notebook show
# slightly higher accuracy for this than for standardising every column.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,classification_report
import pandas as pd

# 1. Split the data (X: features, y: target)
X = df.drop('churn', axis=1)
y = df['churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies: the split frames are derived from `X`, and assigning columns
# on them directly can raise SettingWithCopyWarning on some pandas/scikit-learn versions.
X_train = X_train.copy()
X_test = X_test.copy()

# 2. Scaling: standardise only the continuous columns.
# The scaler is fit on the training split and merely applied to the test split, so no
# test-set statistics leak into training.
scaler = StandardScaler()
num_cols = ['bill_avg', 'download_avg', 'upload_avg', 'service_failure_count']
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])


# 3. Define the MLP model
mlp = MLPClassifier(hidden_layer_sizes=(64, 32),  # two hidden layers: 64 units, then 32 units
                    activation='relu',  # ReLU activation function
                    solver='adam',      # Adam optimiser
                    max_iter=500,       # train for at most 500 iterations
                    random_state=42)

# 4. Train
mlp.fit(X_train, y_train)

# 5. Predict on the held-out split
y_pred = mlp.predict(X_test)

# 6. Report per-class metrics
print("\n=== MLP ===")
print(classification_report(y_test, y_pred))


=== MLP ===
              precision    recall  f1-score   support

           0       0.91      0.94      0.93      6327
           1       0.95      0.93      0.94      8052

    accuracy                           0.93     14379
   macro avg       0.93      0.93      0.93     14379
weighted avg       0.93      0.93      0.93     14379

