In [1]:
import pandas as pd
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import accuracy_score, auc, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
## 0. 打印设置
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)  ## 显示全部结果，不带省略点
# pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:.0f}'.format)

In [2]:
## 1.1 读取数据
train_Base = pd.read_csv(r"data/train.csv")
test_Base = pd.read_csv(r"data/test.csv")

In [3]:
## 1.2 数据合并
# data = pd.concat([test_Base, train_Base], axis=0)
# data

In [4]:
## 1.3 数据清洗
## 1.3.1 索引完善
# data.index = range(len(data))

In [5]:
## 1.4 数据探索
## 1.4.1 空值数量
# data.isnull().sum()

In [6]:
# 1.4.2 唯一值个数
# for col in data.columns:
#     print(col, data[col].nunique())

In [7]:
## 1.4.3 字符串的字段，唯一值统计
# cat_columns = data.select_dtypes(include='object').columns  

# column_name = []
# unique_value = []
 
# for col in cat_columns:
#     column_name.append(col)
#     unique_value.append(data[col].nunique())

# df = pd.DataFrame()
# df['col_name'] = column_name
# df['value'] = unique_value
# df = df.sort_values('value', ascending=False)
 
# df

In [8]:
## 2 特征工程
## 2.0 特征编码--property_damage、police_report_available
# data['property_damage'].value_counts()
# data['property_damage'] = data['property_damage'].map({'NO': 0, 'YES': 1, '?': 2})
# data['property_damage'].value_counts()

# data['police_report_available'].value_counts()
# data['police_report_available'] = data['police_report_available'].map({'NO': 0, 'YES': 1, '?': 2})
# data['police_report_available'].value_counts()

In [9]:
# ## 2.2 去除无关的特征
# data.drop(['policy_id'], axis=1, inplace=True)
# data.columns

In [10]:
## 2.1 特征编码--加入一个新的日期编码
# policy_bind_date, incident_date
# data['policy_bind_date'] = pd.to_datetime(data['policy_bind_date'])
# data['incident_date'] = pd.to_datetime(data['incident_date'])
 
# # 查看最大日期，最小日期
# data['policy_bind_date'].min() # 1990-01-08
# data['policy_bind_date'].max() # 2015-02-22

# data['incident_date'].min() # 2015-01-01
# data['incident_date'].max() # 2015-03-01

# base_date = data['policy_bind_date'].min()
# # 转换为date_diff
# data['policy_bind_date_diff'] = (data['policy_bind_date'] - base_date).dt.days
# data['incident_date_diff'] = (data['incident_date'] - base_date).dt.days

# #去掉原始日期字段 policy_bind_date    incident_date
# data.drop(['policy_bind_date', 'incident_date'], axis=1, inplace=True)
# data

In [3]:
## 2.3 标签编码
cat_columns = train_Base.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in cat_columns:
    train_Base[col] = le.fit_transform(train_Base[col])
    test_Base[col] = le.fit_transform(test_Base[col])

In [4]:
cat_columns

Index(['policy_bind_date', 'policy_state', 'policy_csl', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'incident_date', 'incident_type',
       'collision_type', 'incident_severity', 'authorities_contacted',
       'incident_state', 'incident_city', 'property_damage',
       'police_report_available', 'auto_make', 'auto_model'],
      dtype='object')

In [12]:
## 2.4 分箱编码

# ## 1）age分箱
# for x in range(10,70,10):
#     train_Base[train_Base['age'].between(x,x+10)].loc[:,['age']]=x

In [13]:
## 3. 数据集切分
## 3.1 切分训练集和测试集
# train = data[data['fraud'].notnull()]
# test = data[data['fraud'].isnull()]
X_train = train_Base.drop(columns=['policy_id', 'fraud'])
Y_train = train_Base['fraud']

In [14]:
## 3.2 训练集中，训练集和验证集的划分

# x_train, x_train_01 = train_test_split(train.drop(['fraud'],axis=1), test_size=0.2, random_state=42)  # 25% of remaining data as validation set  
# y_train, y_train_01 = train_test_split(train['fraud'], test_size=0.2, random_state=42)  # Split labels accordingly  

# x_train, x_train_01, y_train, y_train_01 = train_test_split(X, Y, test_size=0.2, random_state=42)

In [15]:
## 4. 模型训练
## 4.1 建立模型
# gbm = LGBMClassifier(n_estimators=600, learning_rate=0.01, boosting_type='gbdt',  ## 模型训练超参数 调优参考：https://blog.51cto.com/u_16213313/7201851
#                      objective='binary',   ## LGBMClassifier详解： https://blog.csdn.net/yeshang_lady/article/details/118638269
#                      max_depth=-1,
#                      random_state=2022,
#                      metric='auc')

gbm = LGBMClassifier(num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='binary',
            max_depth=-1, learning_rate=0.005, min_child_samples=3, random_state=2022,
            n_estimators=300, subsample=1, colsample_bytree=1,)

In [16]:
## 4.2 交叉验证训练

n_folds=5
auc_mean=0
accuracy_mean = 0
precision_mean = 0
recall_mean = 0
f1_mean = 0
pred01 = []


i=1


kf = KFold(n_splits=n_folds, shuffle=True, random_state=2022)
for index_train_train, index_train_test in kf.split(X_train):
    #     print(index_train)
    #     print('------------------')
    #     print(index_yanzheng)
    x_train_train = X_train.iloc[index_train_train]
    y_train_train = Y_train.iloc[index_train_train]
    #     print(train_X)
    x_train_test = X_train.iloc[index_train_test]
    y_train_test = Y_train.iloc[index_train_test]
    ## 3.2 模型训练
    gbm.fit(x_train_train, y_train_train)
    ## 3.3 模型预测
    y_train_test_pred = gbm.predict_proba(x_train_test)[:, 1]
    

    ## 3.4 模型评估指标计算
    print(f'-------------模型评估第{i}次-------------')
    auc = roc_auc_score(y_train_test, y_train_test_pred)
    print("auc值:", auc)
    
    y_train_test_pred[y_train_test_pred > 0.5] = 1
    y_train_test_pred[y_train_test_pred <= 0.5] = 0

    accuracy = accuracy_score(y_train_test, y_train_test_pred)  ## 计算准确率
    precision = precision_score(y_train_test, y_train_test_pred) # 计算精确率
    recall = recall_score(y_train_test, y_train_test_pred) # 计算召回率
    f1 = f1_score(y_train_test, y_train_test_pred) # 计算F1值
    
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)

    auc_mean += auc / n_folds
    accuracy_mean +=accuracy / n_folds
    precision_mean +=precision / n_folds
    recall_mean += recall / n_folds
    f1_mean += f1 / n_folds
    
    ## 3.5 测试集，预测并记录结果，最后求平均 
    y_test_pred = gbm.predict_proba(test_Base.drop(['policy_id'],axis=1))[:,1]
    pred01.append(y_test_pred)
 
    i += 1

print(f'-------------模型评估最后结果-------------')
print(f'auc平均值： {auc_mean}')
print(f'accuracy平均值： {accuracy_mean}')
print(f'precision平均值： {precision_mean}')
print(f'recall平均值： {recall_mean}')
print(f'f1平均值： {f1_mean}')

-------------模型评估第1次-------------
auc值: 0.8205222961320522
Accuracy: 0.7785714285714286
Precision: 0.6923076923076923
Recall: 0.43902439024390244
F1 Score: 0.5373134328358209
-------------模型评估第2次-------------
auc值: 0.7914816870144283
Accuracy: 0.7928571428571428
Precision: 0.5862068965517241
Recall: 0.5
F1 Score: 0.5396825396825397
-------------模型评估第3次-------------
auc值: 0.8418329637841834
Accuracy: 0.8
Precision: 0.6585365853658537
Recall: 0.6585365853658537
F1 Score: 0.6585365853658537
-------------模型评估第4次-------------
auc值: 0.8658536585365854
Accuracy: 0.8142857142857143
Precision: 0.6829268292682927
Recall: 0.6829268292682927
F1 Score: 0.6829268292682927
-------------模型评估第5次-------------
auc值: 0.8171695402298851
Accuracy: 0.8428571428571429
Precision: 0.5416666666666666
Recall: 0.5416666666666666
F1 Score: 0.5416666666666666
-------------模型评估最后结果-------------
auc平均值： 0.8273720291394269
accuracy平均值： 0.8057142857142858
precision平均值： 0.6323289340320458
recall平均值： 0.5644308943089431
f1

In [17]:

# 4.2 模型训练
## train.drop(['fraud'],axis=1) ## axis=0 表示行，axis=1 表示列
# gbm.fit(x_train, y_train)

In [18]:
# 4.3 模型预测，以proba进行提交，结果会更好
# y_train_01_pred = gbm.predict_proba(x_train_01)

In [25]:
## 5. 模型评估

auc = auc_mean
accuracy= accuracy_mean  ## 计算准确率
precision = precision_mean  # 计算精确率
recall = recall_mean # 计算召回率
f1 = f1_mean # 计算F1值


# 输出计算得到的准确率、召回率和F1值
print("auc:", auc)
v_code = str(round(auc,5)).split('.')[1]
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

auc: 0.8273720291394269
Accuracy: 0.8057142857142858
Precision: 0.6323289340320458
Recall: 0.5644308943089431
F1 Score: 0.5920252107638347


In [20]:
# ## 5.1 模型命名，版本控制
model_name=f'model_0_{v_code}_base'

In [21]:
## 6 结果输出

## test集预测，k交叉已预测
## y_test_pred = gbm.predict_proba(test_Base.drop(['policy_id'],axis=1))

## 6.0 测试集结果，求平均
pred02 = sum(pred01) / n_folds  ## pred01预测集的中间结果

pred02[ pred02 > 0.5] = '1'
pred02[ pred02 <= 0.5] = '0'
pred02

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 1.,
       0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1.,
       0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1.,
       1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 0., 1.

In [22]:
# 6.2 输出结果
result = pd.read_csv('./data/submission.csv')
result['fraud'] = pred02
result.to_csv(f'./data/{model_name}.csv', index=False)

In [33]:
## 7 模型评估结果输出
evalue_result=pd.read_csv('./data/evalue_result.csv', encoding='utf-8')
evalue_result

Unnamed: 0,model_name,update_time,Accuracy,Precision,Recall,F1 Score,auc,sub_score,update_content
0,model_0_8496_base,2024/4/26 11:55,1,1,1,1,1,1,base model
1,model_0_8914_base,2024/4/26 13:03,1,1,1,1,1,1,delete# 1.2 1.3 1.4 2.0(date-diff) ; add #3.1(...
2,model_0_8351_base,2024/4/26 13:19,1,1,1,1,1,1,base on model_0_8496_base; update ## 3.2 -- te...


In [37]:
## 5.1 模型评估结果输出
import datetime 

new_row = {'model_name': model_name, 'update_time': datetime.datetime.now() , 'Accuracy': accuracy, 'Precision': precision, 'Recall': recall
 , 'F1 Score': f1, 'auc': auc, 'sub_score': 0.8128, 'update_content': 'base on baseModel; delete #3.2(data-split); add ## 4.2(k_evalue)'}  
evalue_result.loc[len(evalue_result.index)] = new_row 
evalue_result
evalue_result.to_csv('./data/evalue_result.csv', index=False)