# Business Analytics II – Project II
### 2020120083 손영진

## 개요 (Overview)
이 프로젝트는 에어비앤비 호스트가 '슈퍼호스트'인지 예측하는 분류 모델을 구축합니다. (This project builds classification models to predict whether an Airbnb host is a "Superhost".)
The process includes:
1. 데이터 전처리 (Data Preprocessing): handling missing values, encoding, balancing, scaling
2. 모델 구축 및 하이퍼파라미터 튜닝 (Model Building & Hyperparameter Tuning): Logistic Regression, Decision Tree, Random Forest, MLP, KNN, Naive Bayes
3. 최종 예측 (Final Prediction): predicting on the test data using the best model

In [1]:
# Install the dependencies listed in requirements.txt.
# NOTE: the leading "!" is an IPython/Jupyter shell escape; this line is not valid plain Python.
!pip install -r requirements.txt

Collecting numpy==2.1.1 (from -r requirements.txt (line 2))
  Downloading numpy-2.1.1-cp312-cp312-macosx_14_0_arm64.whl.metadata (60 kB)
Collecting matplotlib==3.10.1 (from -r requirements.txt (line 3))
  Downloading matplotlib-3.10.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting scikit-learn==1.6.1 (from -r requirements.txt (line 5))
  Downloading scikit_learn-1.6.1-cp312-cp312-macosx_12_0_arm64.whl.metadata (31 kB)
INFO: pip is looking at multiple versions of contourpy to determine which version is compatible with other requirements. This could take a while.
Collecting contourpy>=1.0.1 (from matplotlib==3.10.1->-r requirements.txt (line 3))
  Downloading contourpy-1.3.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (5.5 kB)
INFO: pip is looking at multiple versions of scipy to determine which version is compatible with other requirements. This could take a while.
Collecting scipy>=1.6.0 (from scikit-learn==1.6.1->-r requirements.txt (line 5))
  Downloading scipy-1.16.3-c

# 1. Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, f1_score, ConfusionMatrixDisplay
from sentence_transformers import SentenceTransformer
import warnings
# Suppress every warning raised from here on.
warnings.filterwarnings('ignore')

# Korean font configuration (the original note says this targets macOS).
plt.rcParams['font.family'] = 'AppleGothic'
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

# 2. Data Load

In [12]:
# Locations of the training CSV and the test workbook
train_path, test_path = 'train_data.csv', 'test_f25.xlsx'

# Read both files into DataFrames
df_train = pd.read_csv(train_path)
df_test = pd.read_excel(test_path)

# Report the dimensions of each frame, then preview the training data
print(f"Train Shape: {df_train.shape}")
print(f"Test Shape: {df_test.shape}")
df_train.head()

Train Shape: (26304, 54)
Test Shape: (130, 55)


Unnamed: 0,estimated_revenue_l365d,last_review,calculated_host_listings_count_shared_rooms,review_scores_communication,review_scores_location,availability_60,host_listings_count,host_identity_verified,host_response_rate,number_of_reviews_l30d,...,reviews_per_month,maximum_minimum_nights,availability_30,beds,maximum_maximum_nights,host_response_time,has_availability,host_is_superhost,bedrooms,review_scores_value
0,13890.0,2024-10-19,0,4.94,4.94,52,8.0,t,80%,0,...,0.33,2,25,4.0,1125,within a day,t,f,3.0,4.85
1,0.0,,0,,,0,7.0,t,,0,...,,4,0,5.0,730,,t,f,3.0,
2,0.0,2022-06-16,0,4.71,4.86,58,7.0,t,100%,0,...,0.05,14,29,2.0,1125,within a few hours,t,f,1.0,5.0
3,0.0,2016-04-15,0,5.0,5.0,60,2.0,t,,0,...,0.01,14,30,1.0,30,,t,,1.0,4.0
4,,,0,,,0,1.0,f,,0,...,,3,0,,730,,,f,2.0,


# 3. 데이터 전처리 (Data Preprocessing)

In [13]:
# Preprocessing: binarise the target, derive date features, embed the
# amenities text, and identify categorical / numerical columns.

# Target variable
target_col = 'host_is_superhost'

# Convert target to binary (t/f -> 1/0); any other value, including NaN, maps to NaN
df_train[target_col] = df_train[target_col].map({'t': 1, 'f': 0})

# Drop rows with a missing target, then store the labels as integers so the
# classes (and later predictions) are exactly 0/1 rather than 0.0/1.0
df_train = df_train.dropna(subset=[target_col]).astype({target_col: int})
print(f"Train Shape after dropping missing targets: {df_train.shape}")

# Separate target and features
y = df_train[target_col]
X = df_train.drop(columns=[target_col])

# Drop columns that are entirely empty (all NaNs)
X = X.dropna(axis=1, how='all')

# --- Date Processing ---
print("Processing Dates...")
date_cols = ['last_review', 'first_review']
# Fixed reference date so the derived "days since" features are reproducible
ref_date = pd.Timestamp('2024-12-01')
for col in date_cols:
    if col not in X.columns:
        continue

    # Replace the raw date with the number of days between it and ref_date;
    # unparseable dates become NaT and therefore NaN in the new feature
    X[f'days_since_{col}'] = (ref_date - pd.to_datetime(X[col], errors='coerce')).dt.days
    X = X.drop(columns=[col])

    # Apply the same transformation to the test frame only when it actually
    # has the column (otherwise the lookup would raise KeyError)
    if col in df_test.columns:
        df_test[f'days_since_{col}'] = (ref_date - pd.to_datetime(df_test[col], errors='coerce')).dt.days
        df_test = df_test.drop(columns=[col])

# --- Text Embedding (Amenities) ---
print("Embedding Amenities (this may take a while)...")
if 'amenities' in X.columns:
    # NOTE(review): assumes df_test also has an 'amenities' column - confirm
    # Load pre-trained sentence-embedding model
    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Fill NaNs with empty string so every row can be encoded
    X['amenities'] = X['amenities'].fillna('')
    df_test['amenities'] = df_test['amenities'].fillna('')

    # Encode train and test text with the same model
    train_embeddings = model.encode(X['amenities'].tolist(), show_progress_bar=True)
    test_embeddings = model.encode(df_test['amenities'].tolist(), show_progress_bar=True)

    # One column per embedding dimension, indexed like the source frames
    embedding_cols = [f'amenity_emb_{i}' for i in range(train_embeddings.shape[1])]
    train_emb_df = pd.DataFrame(train_embeddings, columns=embedding_cols, index=X.index)
    test_emb_df = pd.DataFrame(test_embeddings, columns=embedding_cols, index=df_test.index)

    # Concatenate the embeddings and drop the original text column
    X = pd.concat([X, train_emb_df], axis=1).drop(columns=['amenities'])
    df_test = pd.concat([df_test, test_emb_df], axis=1).drop(columns=['amenities'])

# Identify categorical and numerical columns (refreshed after the new features).
# 'number' covers every numeric width, so columns that are not int64/float64
# are not silently left out of the median imputation step.
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(include='number').columns

print(f"Categorical Columns: {len(cat_cols)}")
print(f"Numerical Columns: {len(num_cols)}")

Train Shape after dropping missing targets: (25386, 54)
Categorical Columns: 15
Numerical Columns: 38


In [14]:
# Impute missing values. Each fill value is computed on the training frame
# only and then reused for the test frame.
fill_rules = (
    (num_cols, lambda series: series.median()),   # numerical -> train median
    (cat_cols, lambda series: series.mode()[0]),  # categorical -> train mode
)
for columns, statistic in fill_rules:
    for col in columns:
        fill_value = statistic(X[col])
        X[col] = X[col].fillna(fill_value)
        # The test frame may lack some training columns; fill only those present
        if col in df_test.columns:
            df_test[col] = df_test[col].fillna(fill_value)

In [15]:
# One-hot encoding.
# Train: drop the first level of each categorical column as the baseline.
X_encoded = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# Test: keep EVERY level. With drop_first=True the level dropped would be the
# first one present in the test data, which can differ from the training
# baseline; that level would then be zero-filled by align() and the affected
# rows mis-encoded. The left-join below discards the test-only columns,
# including the training baseline level.
test_cat_cols = [col for col in cat_cols if col in df_test.columns]
df_test_encoded = pd.get_dummies(df_test, columns=test_cat_cols, drop_first=False)

# Align columns so train and test expose the same features in the same order;
# features missing from the test frame are filled with 0
X_encoded, df_test_encoded = X_encoded.align(df_test_encoded, join='left', axis=1, fill_value=0)

print(f"Encoded Train Features Shape: {X_encoded.shape}")
print(f"Encoded Test Features Shape: {df_test_encoded.shape}")

Encoded Train Features Shape: (25386, 34086)
Encoded Test Features Shape: (130, 34086)


### 클래스 불균형 처리 (Class Imbalance Handling)
데이터의 클래스 비율을 맞추기 위해 리샘플링(Resampling)을 수행합니다.
- **옵션**: `SAMPLING_METHOD`를 `'upsample'`(업샘플링) 또는 `'downsample'`(다운샘플링)로 설정할 수 있습니다.
- **업샘플링 (Upsampling)**: 소수 클래스 데이터를 복제하여 늘립니다. 정보 손실이 없지만 과적합(Overfitting) 위험이 있습니다.
- **다운샘플링 (Downsampling)**: 다수 클래스 데이터를 줄여서 맞춥니다. 학습 속도는 빠르지만 중요한 정보가 손실될 수 있습니다.

In [16]:
# Class balancing by resampling; choose the strategy with SAMPLING_METHOD.
SAMPLING_METHOD = 'upsample'  # Options: 'upsample', 'downsample'

# Combine X and y so rows and labels are resampled together
train_data = pd.concat([X_encoded, y], axis=1)

# Check class distribution
print("Original Class Distribution:")
print(y.value_counts())

# Separate the classes; label 0 is treated as the majority and 1 as the minority
df_majority = train_data[train_data[target_col] == 0]
df_minority = train_data[train_data[target_col] == 1]

if SAMPLING_METHOD == 'upsample':
    print("\nPerforming Upsampling...")
    df_minority_upsampled = resample(df_minority,
                                     replace=True,     # sample with replacement
                                     n_samples=len(df_majority),    # to match majority class
                                     random_state=42)
    df_balanced = pd.concat([df_majority, df_minority_upsampled])

elif SAMPLING_METHOD == 'downsample':
    print("\nPerforming Downsampling...")
    df_majority_downsampled = resample(df_majority,
                                       replace=False,    # sample without replacement
                                       n_samples=len(df_minority),    # to match minority class
                                       random_state=42)
    df_balanced = pd.concat([df_majority_downsampled, df_minority])

else:
    # Fail here with a clear message instead of a NameError on df_balanced below
    raise ValueError(
        f"Unknown SAMPLING_METHOD {SAMPLING_METHOD!r}; expected 'upsample' or 'downsample'."
    )

# Display new class counts
print("\nBalanced Class Distribution:")
print(df_balanced[target_col].value_counts())

# Separate X and y again
X_balanced = df_balanced.drop(columns=[target_col])
y_balanced = df_balanced[target_col]

Original Class Distribution:
host_is_superhost
0.0    17165
1.0     8221
Name: count, dtype: int64

Performing Upsampling...

Balanced Class Distribution:
host_is_superhost
0.0    17165
1.0    17165
Name: count, dtype: int64


## 5. 특성 스케일링 (Feature Scaling)

In [None]:
# Standardise features: fit the scaler on the balanced training data only,
# then apply the same transformation to both train and test.
scaler = StandardScaler().fit(X_balanced)

# Wrap the scaled arrays back into DataFrames so the column names are kept
X_scaled = pd.DataFrame(scaler.transform(X_balanced), columns=X_balanced.columns)
X_test_scaled = pd.DataFrame(scaler.transform(df_test_encoded), columns=df_test_encoded.columns)

### 하이퍼파라미터 튜닝 전략 (Hyperparameter Tuning Strategy)
`GridSearchCV`를 사용하여 각 모델의 최적 하이퍼파라미터를 찾습니다. 각 매개변수의 선정 이유는 다음과 같습니다.

- **MLP (신경망)**:
    - `hidden_layer_sizes`: 모델의 복잡도를 결정합니다. `(50,)`은 단순 모델, `(50, 50)`은 더 깊은 패턴을 학습합니다.
    - `activation`: `relu`는 일반적인 딥러닝 활성화 함수이며, `tanh`도 테스트합니다.
    - `alpha`: 과적합을 막기 위한 규제 파라미터입니다.

- **Random Forest & Decision Tree**:
    - `max_depth`: 트리의 깊이입니다. 너무 깊으면 과적합될 수 있어 제한을 둡니다.
    - `min_samples_split`: 노드를 분할하기 위한 최소 샘플 수입니다. 클수록 일반화에 유리합니다.

- **KNN**:
    - `n_neighbors`: 이웃의 수입니다. 작으면 노이즈에 민감하고, 크면 너무 단순해질 수 있습니다.
    - `weights`: `distance`는 가까운 이웃에게 더 큰 가중치를 줍니다.

- **Logistic Regression**:
    - `C`: 규제 강도의 역수입니다. 작을수록 규제가 강해져 과적합을 막습니다.

In [None]:
# Shared containers filled by the model cells below.
# Run this cell once before running the individual model cells;
# re-running it clears everything collected so far.
results = list()      # one summary dict per trained model
best_models = dict()  # model name -> fitted best estimator

def visualize_model_result(name, model, X, y):
    """Plot a confusion matrix for ``model`` on ``(X, y)`` and, if available, feature importances.

    Args:
        name: Label used in the plot titles.
        model: Fitted estimator exposing ``predict``.
        X: Feature table passed to ``model.predict``; its ``columns`` are used
            as tick labels in the importance plot.
        y: True labels compared against the predictions.
    """
    # Confusion matrix of predictions against the supplied labels
    matrix = confusion_matrix(y, model.predict(X))
    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=['Not Superhost', 'Superhost'])
    display.plot(cmap=plt.cm.Blues)
    plt.title(f'{name} - 혼동 행렬 (Confusion Matrix)')
    plt.show()

    # Only estimators that expose feature_importances_ get the second plot
    if not hasattr(model, 'feature_importances_'):
        return

    scores = model.feature_importances_
    top = np.argsort(scores)[::-1][:20]  # indices of the 20 largest importances
    positions = range(len(top))

    plt.figure(figsize=(10, 6))
    plt.title(f'{name} - 상위 20개 특성 중요도 (Feature Importance)')
    plt.bar(positions, scores[top], align='center')
    plt.xticks(positions, [X.columns[i] for i in top], rotation=90)
    plt.tight_layout()
    plt.show()
        


In [None]:
# Logistic Regression: grid search with 5-fold CV, then record and visualise the result.
name = 'Logistic Regression'
print("--- Processing Logistic Regression ---")
lr_params = {
    'C': [1]
}
clf = GridSearchCV(LogisticRegression(max_iter=100, random_state=42), lr_params, cv=5, scoring='accuracy', n_jobs=1, verbose=3)
clf.fit(X_scaled, y_balanced)

# Additional metrics. NOTE: these are computed on the same data the model was
# fitted on, so they are optimistic compared with the cross-validated score.
y_pred = clf.best_estimator_.predict(X_scaled)
rec = recall_score(y_balanced, y_pred)
f1 = f1_score(y_balanced, y_pred)

best_models[name] = clf.best_estimator_
results.append({
    'Model': name,
    'Best Score': clf.best_score_,
    'Best Params': clf.best_params_,
    'Recall': rec,
    'F1 Score': f1
})
print(f"  Best Score: {clf.best_score_:.4f}")
print(f"  Best Params: {clf.best_params_}")
print(f"  Recall: {rec:.4f}")
print(f"  F1 Score: {f1:.4f}")

# Visualise the result
visualize_model_result(name, clf.best_estimator_, X_scaled, y_balanced)


In [None]:
# Decision Tree: grid search with 5-fold CV, then record and visualise the result.
name = 'Decision Tree'
print("--- Processing Decision Tree ---")
dt_params = {
    'max_depth': [None],
    'min_samples_split': [2]
}
clf = GridSearchCV(DecisionTreeClassifier(random_state=42), dt_params, cv=5, scoring='accuracy', n_jobs=1, verbose=3)
clf.fit(X_scaled, y_balanced)

# Additional metrics. NOTE: these are computed on the same data the model was
# fitted on, so they are optimistic compared with the cross-validated score.
y_pred = clf.best_estimator_.predict(X_scaled)
rec = recall_score(y_balanced, y_pred)
f1 = f1_score(y_balanced, y_pred)

best_models[name] = clf.best_estimator_
results.append({
    'Model': name,
    'Best Score': clf.best_score_,
    'Best Params': clf.best_params_,
    'Recall': rec,
    'F1 Score': f1
})
print(f"  Best Score: {clf.best_score_:.4f}")
print(f"  Best Params: {clf.best_params_}")
print(f"  Recall: {rec:.4f}")
print(f"  F1 Score: {f1:.4f}")

# Visualise the result
visualize_model_result(name, clf.best_estimator_, X_scaled, y_balanced)


In [None]:
# Random Forest: grid search with 5-fold CV, then record and visualise the result.
name = 'Random Forest'
print("--- Processing Random Forest ---")
rf_params = {
    'n_estimators': [100],
    'max_depth': [None],
    'min_samples_split': [2]
}
clf = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5, scoring='accuracy', n_jobs=1, verbose=3)
clf.fit(X_scaled, y_balanced)

# Additional metrics. NOTE: these are computed on the same data the model was
# fitted on, so they are optimistic compared with the cross-validated score.
y_pred = clf.best_estimator_.predict(X_scaled)
rec = recall_score(y_balanced, y_pred)
f1 = f1_score(y_balanced, y_pred)

best_models[name] = clf.best_estimator_
results.append({
    'Model': name,
    'Best Score': clf.best_score_,
    'Best Params': clf.best_params_,
    'Recall': rec,
    'F1 Score': f1
})
print(f"  Best Score: {clf.best_score_:.4f}")
print(f"  Best Params: {clf.best_params_}")
print(f"  Recall: {rec:.4f}")
print(f"  F1 Score: {f1:.4f}")

# Visualise the result
visualize_model_result(name, clf.best_estimator_, X_scaled, y_balanced)


In [None]:
# MLP (Neural Network): grid search with 5-fold CV, then record and visualise the result.
name = 'MLP (Neural Network)'
print("--- Processing MLP (Neural Network) ---")
mlp_params = {
    'hidden_layer_sizes': [(50, 50)],
    'activation': ['relu'],
    'alpha': [0.001]
}
clf = GridSearchCV(MLPClassifier(max_iter=100, random_state=42), mlp_params, cv=5, scoring='accuracy', n_jobs=1, verbose=3)
clf.fit(X_scaled, y_balanced)

# Additional metrics. NOTE: these are computed on the same data the model was
# fitted on, so they are optimistic compared with the cross-validated score.
y_pred = clf.best_estimator_.predict(X_scaled)
rec = recall_score(y_balanced, y_pred)
f1 = f1_score(y_balanced, y_pred)

best_models[name] = clf.best_estimator_
results.append({
    'Model': name,
    'Best Score': clf.best_score_,
    'Best Params': clf.best_params_,
    'Recall': rec,
    'F1 Score': f1
})
print(f"  Best Score: {clf.best_score_:.4f}")
print(f"  Best Params: {clf.best_params_}")
print(f"  Recall: {rec:.4f}")
print(f"  F1 Score: {f1:.4f}")

# Visualise the result
visualize_model_result(name, clf.best_estimator_, X_scaled, y_balanced)


In [None]:
# KNN: grid search with 5-fold CV, then record and visualise the result.
name = 'KNN'
print("--- Processing KNN ---")
knn_params = {
    'n_neighbors': [7],
    'weights': ['distance']
}
clf = GridSearchCV(KNeighborsClassifier(), knn_params, cv=5, scoring='accuracy', n_jobs=1, verbose=3)
clf.fit(X_scaled, y_balanced)

# Additional metrics. NOTE: these are computed on the same data the model was
# fitted on, so they are optimistic compared with the cross-validated score.
y_pred = clf.best_estimator_.predict(X_scaled)
rec = recall_score(y_balanced, y_pred)
f1 = f1_score(y_balanced, y_pred)

best_models[name] = clf.best_estimator_
results.append({
    'Model': name,
    'Best Score': clf.best_score_,
    'Best Params': clf.best_params_,
    'Recall': rec,
    'F1 Score': f1
})
print(f"  Best Score: {clf.best_score_:.4f}")
print(f"  Best Params: {clf.best_params_}")
print(f"  Recall: {rec:.4f}")
print(f"  F1 Score: {f1:.4f}")

# Visualise the result
visualize_model_result(name, clf.best_estimator_, X_scaled, y_balanced)


In [None]:
# Naive Bayes: 5-fold CV through GridSearchCV (empty grid), then record and visualise the result.
name = 'Naive Bayes'
print("--- Processing Naive Bayes ---")
nb_params = {}  # no hyperparameters are searched for this model
clf = GridSearchCV(GaussianNB(), nb_params, cv=5, scoring='accuracy', n_jobs=1, verbose=3)
clf.fit(X_scaled, y_balanced)

# Additional metrics. NOTE: these are computed on the same data the model was
# fitted on, so they are optimistic compared with the cross-validated score.
y_pred = clf.best_estimator_.predict(X_scaled)
rec = recall_score(y_balanced, y_pred)
f1 = f1_score(y_balanced, y_pred)

best_models[name] = clf.best_estimator_
results.append({
    'Model': name,
    'Best Score': clf.best_score_,
    'Best Params': clf.best_params_,
    'Recall': rec,
    'F1 Score': f1
})
print(f"  Best Score: {clf.best_score_:.4f}")
print(f"  Best Params: {clf.best_params_}")
print(f"  Recall: {rec:.4f}")
print(f"  F1 Score: {f1:.4f}")

# Visualise the result
visualize_model_result(name, clf.best_estimator_, X_scaled, y_balanced)


In [None]:
# Build the model comparison table and pick the model with the best CV score.
results_df = pd.DataFrame(results)
print("\n--- Model Selection Report ---")

if results_df.empty:
    # Check for emptiness BEFORE sorting: an empty frame has no 'Best Score'
    # column, so sort_values would raise KeyError and hide this message.
    print("No results found. Please run the model cells above.")
else:
    results_df = results_df.sort_values(by='Best Score', ascending=False)
    print(results_df)

    # The first row after sorting is the best performing model overall
    best_model_name = results_df.iloc[0]['Model']
    best_model_score = results_df.iloc[0]['Best Score']
    final_model = best_models[best_model_name]

    print(f"\nSelected Best Model: {best_model_name}")
    print(f"Reason: It achieved the highest cross-validation accuracy of {best_model_score:.4f} among all tested models.")


In [None]:
# --- Rule-Based Validation ---
# Print a banner marking the start of the rule-based validation output.
print("\n--- Rule-Based Validation ---")

def check_superhost_criteria(row):
    """Return 1 if ``row`` satisfies the approximated Superhost rules, else 0.

    The rules approximate Airbnb's Superhost requirements:
      1. 10+ stays (or 3 stays + 100 nights), approximated by
         ``number_of_reviews_ltm >= 10`` (falling back to ``number_of_reviews``).
      2. Response rate >= 90%.
      3. Rating >= 4.8 (``review_scores_rating``, falling back to
         ``review_scores_value``).
      4. Cancellation rate < 1% is not available in the data and is assumed met.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) supporting ``.get``
            and item access by column name.

    Returns:
        int: 1 when the rating, response-rate and stays checks all pass, otherwise 0.
    """
    # Rating check: prefer the overall rating, fall back to the value score
    rating_ok = False
    if pd.notna(row.get('review_scores_rating')):
        rating_ok = row['review_scores_rating'] >= 4.8
    elif pd.notna(row.get('review_scores_value')):
        rating_ok = row['review_scores_value'] >= 4.8

    # Response rate check
    response_ok = False
    if pd.notna(row.get('host_response_rate')):
        # Convert a string such as '100%' to the number 100
        try:
            rate = float(str(row['host_response_rate']).replace('%', ''))
        except (TypeError, ValueError):
            # Unparseable rate (e.g. 'N/A'): treat the criterion as not met
            response_ok = False
        else:
            response_ok = rate >= 90
    else:
        # If missing, assume ok as long as the other criteria are met (lenient)
        response_ok = True

    # Stays check: prefer last-twelve-months reviews, fall back to all reviews
    stays_ok = False
    if pd.notna(row.get('number_of_reviews_ltm')):
        stays_ok = row['number_of_reviews_ltm'] >= 10
    elif pd.notna(row.get('number_of_reviews')):
        stays_ok = row['number_of_reviews'] >= 10

    return 1 if (rating_ok and response_ok and stays_ok) else 0

# The preprocessing steps removed or transformed columns in df_test, so read
# an untouched copy of the test workbook for the rule-based check.
df_test_raw = pd.read_excel('test_f25.xlsx')

# Evaluate the rule on every row (axis=1 passes one row at a time)
rule_based_preds = df_test_raw.apply(check_superhost_criteria, axis=1)

first_ten = rule_based_preds.iloc[:10].tolist()
print(f"Rule-Based Predictions (First 10): {first_ten}")


# 7. 최종 예측 (Final Prediction)

In [None]:
# Run the selected model on the scaled test features
final_predictions = final_model.predict(X_test_scaled)

# Submission table: the test row identifier plus the predicted label,
# with 1/0 mapped back to the original t/f encoding
label_map = {1: 't', 0: 'f'}
submission = df_test[['No']].copy()
submission['host_is_superhost_pred'] = pd.Series(final_predictions, index=submission.index).map(label_map)

submission.head()

In [None]:
# Save results: write the predictions into the target column of the test
# workbook and store it as a new file.
try:
    # Reload the original test workbook
    submission_df = pd.read_excel('test_f25.xlsx')

    # Map predictions to labels (1 -> t, anything else -> f)
    pred_labels = ['t' if p == 1 else 'f' for p in final_predictions]

    # Assignment creates the column when it is absent and overwrites it when it
    # exists, so no existence check is needed
    target_col_name = 'host_is_superhost'
    submission_df[target_col_name] = pred_labels

    # Save
    submission_df.to_excel('prediction_result.xlsx', index=False)
    print("예측 결과가 'prediction_result.xlsx'에 저장되었습니다.")

except Exception as e:
    # Top-level notebook boundary: report the failure instead of aborting the run
    print(f"결과 저장 중 오류 발생: {e}")


# 8. 요약 (Summary)

**모델 구축 과정 (Model Building Process):**
1.  **데이터 전처리 (Data Preprocessing)**: 
    -   타겟 변수 `host_is_superhost`를 이진값(binary)으로 변환했습니다.
    -   결측치를 대체했습니다 (수치형 변수는 중앙값, 범주형 변수는 최빈값).
    -   범주형 변수에 대해 원-핫 인코딩(One-Hot Encoding)을 수행했습니다.
    -   데이터 일관성을 보장하기 위해 학습(Train) 및 테스트(Test)의 특성(Feature)들을 동일하게 맞췄습니다.
    -   클래스 불균형 문제를 해결하기 위해 업샘플링(upsampling)을 사용하여 데이터 균형을 맞췄습니다.
    -   StandardScaler를 사용하여 변수 스케일링을 수행했습니다.

2.  **모델 평가 및 튜닝 (Model Evaluation & Tuning)**:
    -   다양한 모델(로지스틱 회귀, 의사결정나무, 랜덤 포레스트, MLP, KNN, 나이브 베이즈)에 대해 GridSearchCV를 수행했습니다.
    -   MLP의 은닉층, 의사결정나무/랜덤 포레스트의 트리 깊이, KNN의 이웃 수와 같은 하이퍼파라미터를 튜닝했습니다.
    -   안정적인 평가를 위해 5-fold 교차 검증(cross-validation)을 사용했습니다.

3.  **예측 (Prediction)**:
    -   교차 검증 정확도가 가장 높은 모델을 선택했습니다.
    -   테스트 데이터셋에 대한 예측을 생성하여 `prediction_result.xlsx` 파일로 저장했습니다.

# 9. 설명 가능한 AI 및 분석 (Analysis & Explainable AI)

여기서는 최종 선택된 모델이 **왜** 그런 예측을 했는지(SHAP), 그리고 우리가 알고 있는 **룰베이스(Airbnb 공식 기준)**와 얼마나 일치하는지 사후 분석합니다.

## **Airbnb 기준**

### ✅ 슈퍼호스트가 되기 위한 주요 조건

(최근 12개월 활동을 기준으로, 평가 날짜마다 자동 평가됨) [Airbnb](https://www.airbnb.com/help/article/829?utm_source=chatgpt.com)

1. 숙소 소유자 계정이어야 함 (공동호스트 또는 체험/서비스 호스트는 기준 대상이 아님) [Airbnb](https://www.airbnb.co.in/resources/hosting-homes/a/get-fast-expert-help-with-dedicated-superhost-support-445?utm_source=chatgpt.com)
2. 최소 “10건 이상의 예약 완료” 또는 “3건 이상의 예약이면서 총 100박 이상” 숙박 완료여야 함. [Airbnb](https://www.airbnb.com/help/article/829?utm_source=chatgpt.com)
3. 응답률(response rate)이 90% 이상이어야 함. [Airbnb](https://www.airbnb.com/help/article/829?utm_source=chatgpt.com)
4. 예약 취소율(cancellation rate)이 1% 미만이어야 함 (단, 자연재해 등의 불가피한 사유는 예외) [Airbnb](https://www.airbnb.com/help/article/829?utm_source=chatgpt.com)
5. 전체 숙박평점(overall rating)이 4.8 이상이어야 함. [Airbnb](https://www.airbnb.com/help/article/829?utm_source=chatgpt.com)

---

### 🕒 평가 시점

* 평가주기는 **매 3개월마다(분기마다)** 진행됨. [Airbnb](https://www.airbnb.com.mt/help/article/3526?utm_source=chatgpt.com)
* 평가 날짜는 대략 **1월 1일, 4월 1일, 7월 1일, 10월 1일**임. [Airbnb](https://www.airbnb.co.in/resources/hosting-homes/a/get-fast-expert-help-with-dedicated-superhost-support-445?utm_source=chatgpt.com)
* 즉, 최근 12개월 동안의 지표를 이 평가일 기준으로 계산해서 자격 충족 여부를 판단함. [Airbnb](https://www.airbnb.com/help/article/829?utm_source=chatgpt.com)

---

### 🔍 참고할 만한 점

* “예약 완료 건수”는 단순히 예약된 건수가 아니라 실제 숙박이 완료된 건수를 의미함. [Airbnb](https://www.airbnb.com/help/article/829?utm_source=chatgpt.com)
* “응답률”은 게스트 문의나 예약 요청에 대해 일정 시간 안에 응답한 비율임. [Hostex](https://hostex.io/blog/airbnb-superhost/?utm_source=chatgpt.com)
* “취소율”은 호스트가 자의적으로 예약을 취소한 비율을 의미하며, 불가피한 사유는 카운트에서 제외될 수 있음.

In [None]:
# --- Explainable AI & Analysis ---
# Compares the selected model with the rule-based logic on the test set and,
# for models exposing feature_importances_, explains it with SHAP.
print("\n--- 설명 가능한 AI 및 분석 (Explainable AI & Analysis) ---")

# 1. Comparison with the rule-based logic
print("\n--- 룰베이스 모델과 비교 분석 (Comparison with Rule-Based Logic) ---")
print("룰베이스 로직: Airbnb 공식 슈퍼호스트 기준 (평점 4.8+, 응답률 90%+, 10+ 숙박 등) 적용")

final_preds = final_model.predict(X_test_scaled)
# Compare position by position using plain arrays, independent of the Series index
rule_array = np.asarray(rule_based_preds)
agreement = (final_preds == rule_array).mean()
print(f"최적 모델과 룰베이스 로직 간의 일치율: {agreement:.2%}")

# Disagreement analysis (optional)
disagreements = df_test_raw[final_preds != rule_array]
if not disagreements.empty:
    print(f"\n불일치 사례 수: {len(disagreements)}")
    print("불일치 사례 일부:")
    # Show only the columns that exist in the raw test file, so a missing
    # column does not raise KeyError
    report_cols = [
        col
        for col in ['host_response_rate', 'review_scores_rating', 'number_of_reviews_ltm']
        if col in disagreements.columns
    ]
    print(disagreements[report_cols].head())

# 2. Model explanation with SHAP
print("\n--- SHAP을 이용한 모델 설명 (Explainable AI with SHAP) ---")

# Use feature_importances_ as the signal that the chosen model is tree-based
# (Random Forest etc.)
if hasattr(final_model, 'feature_importances_'):
    explainer = shap.TreeExplainer(final_model)
    # Explain a sample of at most 100 rows to keep the computation small
    X_sample = X_scaled.sample(n=min(100, len(X_scaled)), random_state=42)
    shap_values = explainer.shap_values(X_sample)

    # SHAP may return per-class values either as a list of 2-D arrays or as a
    # single 3-D array; normalise the latter to the list form handled below.
    # NOTE(review): assumes the 3-D layout is (samples, features, classes) -
    # confirm against the installed SHAP version.
    if getattr(shap_values, 'ndim', None) == 3:
        shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[2])]

    print("SHAP Summary Plot (전체적인 특성 영향력):")
    shap.summary_plot(shap_values, X_sample, plot_type="bar")
    print("SHAP Summary Plot (클래스별 상세 영향력):")
    # With per-class values, show only class 1 (Superhost)
    if isinstance(shap_values, list):
        shap.summary_plot(shap_values[1], X_sample)
    else:
        shap.summary_plot(shap_values, X_sample)
else:
    print("선택된 모델은 SHAP TreeExplainer를 지원하지 않거나 특성 중요도를 제공하지 않습니다.")
