In [9]:
# 导入必要的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 导入模型相关的库
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# 导入评估和交叉验证相关的库
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler


In [10]:
train_df = pd.read_csv('./data/clean_train.csv')
X = train_df.drop('credit_score', axis=1)
y = train_df['credit_score']

In [11]:
# 定义评估指标
scoring = {
    'accuracy': 'accuracy',
    'precision_macro': 'precision_macro',
    'recall_macro': 'recall_macro',
    'f1_macro': 'f1_macro'
}

# 定义交叉验证函数
def evaluate_model(model, X, y, model_name):
    # 执行5折交叉验证
    cv_results = cross_validate(model, X, y, cv=5, scoring=scoring)
    
    # 计算平均评估指标
    results = {
        'Model': model_name,
        'Accuracy': cv_results['test_accuracy'].mean(),
        'Precision': cv_results['test_precision_macro'].mean(),
        'Recall': cv_results['test_recall_macro'].mean(),
        'F1 Score': cv_results['test_f1_macro'].mean()
    }
    
    return results

In [None]:
# 初始化模型
lr_model = LogisticRegression(max_iter=10000, random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = xgb.XGBClassifier(random_state=42)

# 评估模型
models = [
    (lr_model, 'Logistic Regression'),
    (rf_model, 'Random Forest'),
    (xgb_model, 'XGBoost')
]

results = []
for model, name in models:
    print(f"Training {name}...")
    result = evaluate_model(model, X, y, name)
    results.append(result)

# 创建结果DataFrame
results_df = pd.DataFrame(results)
results_df.set_index('Model', inplace=True)
results_df

Training Logistic Regression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

bool_cols = train_df.select_dtypes(include='bool').columns
train_df = train_df.copy()
train_df[bool_cols] = train_df[bool_cols].astype(int)

def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) 
                       for i in range(df.shape[1])]
    return vif_data.sort_values('VIF', ascending=False)

# 计算VIF
vif_results = calculate_vif(train_df)
print("\nVIF值:")
print(vif_results)


VIF值:
                        Feature        VIF
38            occupation_Lawyer  13.407065
40          occupation_Mechanic  12.880611
35          occupation_Engineer  12.868964
31        occupation_Accountant  12.841798
41     occupation_Media_Manager  12.696023
33         occupation_Developer  12.691311
43         occupation_Scientist  12.590520
32         occupation_Architect  12.587456
37        occupation_Journalist  12.481650
44           occupation_Teacher  12.476863
36      occupation_Entrepreneur  12.429127
34            occupation_Doctor  12.340480
39           occupation_Manager  12.205907
45            occupation_Writer  11.914536
42          occupation_Musician  11.852992
4         monthly_inhand_salary   8.913051
19              monthly_balance   6.424539
13                   credit_mix   6.069420
18      amount_invested_monthly   5.159902
8                   num_of_loan   3.999262
14             outstanding_debt   2.697045
7                 interest_rate   2.608494
16  

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

bool_cols = train_df.select_dtypes(include='bool').columns
train_df = train_df.copy()
train_df[bool_cols] = train_df[bool_cols].astype(int)

def calculate_vif(df):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = df.columns
    vif_data["VIF"] = [variance_inflation_factor(df.values, i) 
                       for i in range(df.shape[1])]
    return vif_data.sort_values('VIF', ascending=False)

# 计算VIF
vif_results = calculate_vif(train_df)
print("\nVIF值:")
print(vif_results)


VIF值:
                        Feature        VIF
38            occupation_Lawyer  13.407065
40          occupation_Mechanic  12.880611
35          occupation_Engineer  12.868964
31        occupation_Accountant  12.841798
41     occupation_Media_Manager  12.696023
33         occupation_Developer  12.691311
43         occupation_Scientist  12.590520
32         occupation_Architect  12.587456
37        occupation_Journalist  12.481650
44           occupation_Teacher  12.476863
36      occupation_Entrepreneur  12.429127
34            occupation_Doctor  12.340480
39           occupation_Manager  12.205907
45            occupation_Writer  11.914536
42          occupation_Musician  11.852992
4         monthly_inhand_salary   8.913051
19              monthly_balance   6.424539
13                   credit_mix   6.069420
18      amount_invested_monthly   5.159902
8                   num_of_loan   3.999262
14             outstanding_debt   2.697045
7                 interest_rate   2.608494
16  

In [None]:
# 可视化模型性能比较
plt.figure(figsize=(12, 6))
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

x = np.arange(len(models))
width = 0.2

for i, metric in enumerate(metrics):
    plt.bar(x + i*width, results_df[metric], width, label=metric)

plt.xlabel('Models')
plt.ylabel('Score')
plt.title('Model Performance Comparison')
plt.xticks(x + width*1.5, results_df.index)
plt.legend()
plt.tight_layout()
plt.savefig('./image/model_performance_comparison.png')
plt.show()