In [1]:
import pandas as pd
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import accuracy_score, auc, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
## 0. 打印设置
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)  ## 显示全部结果，不带省略点
# pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:.0f}'.format)

In [2]:
## 1.1 读取数据
train_Base = pd.read_csv(r"data/train.csv")
test_Base = pd.read_csv(r"data/test.csv")

In [3]:
## 1.2 数据合并
# data = pd.concat([test_Base, train_Base], axis=0)
# data

In [4]:
## 1.3 数据清洗
## 1.3.1 索引完善
# data.index = range(len(data))

In [5]:
## 1.4 数据探索
## 1.4.1 空值数量
# data.isnull().sum()

In [6]:
# 1.4.2 唯一值个数
# for col in data.columns:
#     print(col, data[col].nunique())

In [7]:
## 1.4.3 字符串的字段，唯一值统计
# cat_columns = data.select_dtypes(include='object').columns  

# column_name = []
# unique_value = []
 
# for col in cat_columns:
#     column_name.append(col)
#     unique_value.append(data[col].nunique())

# df = pd.DataFrame()
# df['col_name'] = column_name
# df['value'] = unique_value
# df = df.sort_values('value', ascending=False)
 
# df

In [8]:
## 2 特征工程
## 2.0 特征编码--property_damage、police_report_available
# data['property_damage'].value_counts()
# data['property_damage'] = data['property_damage'].map({'NO': 0, 'YES': 1, '?': 2})
# data['property_damage'].value_counts()

# data['police_report_available'].value_counts()
# data['police_report_available'] = data['police_report_available'].map({'NO': 0, 'YES': 1, '?': 2})
# data['police_report_available'].value_counts()

In [9]:
# ## 2.2 去除无关的特征
# data.drop(['policy_id'], axis=1, inplace=True)
# data.columns

In [10]:
## 2.3 标签编码
cat_columns = train_Base.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in cat_columns:
    train_Base[col] = le.fit_transform(train_Base[col])
    test_Base[col] = le.fit_transform(test_Base[col])

In [11]:
## 2.4 分箱编码

# ## 1）age分箱
# for x in range(10,70,10):
#     train_Base[train_Base['age'].between(x,x+10)].loc[:,['age']]=x

In [12]:
## 3. 数据集切分
## 3.1 切分训练集和测试集
# train = data[data['fraud'].notnull()]
# test = data[data['fraud'].isnull()]
X = train_Base.drop(columns=['policy_id', 'fraud'])
Y = train_Base['fraud']

In [13]:
## 3.2 训练集中，训练集和验证集的划分

# x_train, x_train_01 = train_test_split(train.drop(['fraud'],axis=1), test_size=0.2, random_state=42)  # 25% of remaining data as validation set  
# y_train, y_train_01 = train_test_split(train['fraud'], test_size=0.2, random_state=42)  # Split labels accordingly  

x_train, x_train_01, y_train, y_train_01 = train_test_split(X, Y, test_size=0.2, random_state=42)

In [14]:
## 4. 模型训练
## 4.1 建立模型
gbm = LGBMClassifier(n_estimators=600, learning_rate=0.01, boosting_type='gbdt',  ## 模型训练超参数 调优参考：https://blog.51cto.com/u_16213313/7201851
                     objective='binary',   ## LGBMClassifier详解： https://blog.csdn.net/yeshang_lady/article/details/118638269
                     max_depth=-1,
                     random_state=2022,
                     metric='auc')

In [15]:
# 4.2 模型训练
## train.drop(['fraud'],axis=1) ## axis=0 表示行，axis=1 表示列
gbm.fit(x_train, y_train)

LGBMClassifier(learning_rate=0.01, metric='auc', n_estimators=600,
               objective='binary', random_state=2022)

In [16]:
# 4.3 模型预测，以proba进行提交，结果会更好
y_train_01_pred = gbm.predict_proba(x_train_01)

In [17]:
## 5. 模型评估
## 5.1 评估auc值
auc = roc_auc_score(y_train_01, y_train_01_pred[:,-1]) # 计算auc值
print("auc值:", auc)
v_code = str(round(auc,5)).split('.')[1]

## 5.2 概率转换
y_train_01_pred[:, 1][y_train_01_pred[:, 1] > 0.5] = '1'
y_train_01_pred[:, 1][y_train_01_pred[:, 1] <= 0.5] = '0'
y_train_01_pred

## 5.3 评估accuracy，precision，recall，f1
from sklearn.metrics import precision_score, recall_score, f1_score
 
accuracy=accuracy_score(y_train_01, y_train_01_pred[:,-1])  ## 计算准确率
precision = precision_score(y_train_01, y_train_01_pred[:,-1]) # 计算精确率
recall = recall_score(y_train_01, y_train_01_pred[:,-1]) # 计算召回率
f1 = f1_score(y_train_01, y_train_01_pred[:,-1]) # 计算F1值


# 输出计算得到的准确率、召回率和F1值
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

auc值: 0.825578231292517
Accuracy: 0.8214285714285714
Precision: 0.631578947368421
Recall: 0.6857142857142857
F1 Score: 0.6575342465753424


In [18]:
# ## 5.1 模型命名，版本控制
model_name=f'model_0_{v_code}_base'

In [19]:
## 6 结果输出
## 6.1 test集的预测
y_test_pred = gbm.predict_proba(test_Base.drop(['policy_id'],axis=1))
y_test_pred[:, 1][y_test_pred[:, 1] > 0.5] = '1'
y_test_pred[:, 1][y_test_pred[:, 1] <= 0.5] = '0'
y_test_pred

array([[0.9632268 , 0.        ],
       [0.36388375, 1.        ],
       [0.98821778, 0.        ],
       [0.95145253, 0.        ],
       [0.98189911, 0.        ],
       [0.95605624, 0.        ],
       [0.97538168, 0.        ],
       [0.9850762 , 0.        ],
       [0.98756046, 0.        ],
       [0.98260209, 0.        ],
       [0.98987076, 0.        ],
       [0.98399295, 0.        ],
       [0.26981446, 1.        ],
       [0.94720864, 0.        ],
       [0.99033894, 0.        ],
       [0.60745107, 0.        ],
       [0.90867083, 0.        ],
       [0.94976572, 0.        ],
       [0.13030434, 1.        ],
       [0.97556139, 0.        ],
       [0.97249955, 0.        ],
       [0.86674971, 0.        ],
       [0.48363067, 1.        ],
       [0.91932692, 0.        ],
       [0.96334722, 0.        ],
       [0.17758365, 1.        ],
       [0.99505919, 0.        ],
       [0.96957817, 0.        ],
       [0.44375565, 1.        ],
       [0.99350068, 0.        ],
       [0.

In [20]:
# 6.2 输出结果

result = pd.read_csv('./data/submission.csv')
result['fraud'] = y_test_pred[:, 1]
result.to_csv(f'./data/{model_name}.csv', index=False)

In [23]:
## 7 模型评估结果输出
evalue_result=pd.read_csv('./data/evalue_result.csv', encoding='utf-8')
evalue_result

Unnamed: 0,model_name,update_time,Accuracy,Precision,Recall,F1 Score,auc,sub_score,update_content
0,model_0_86224_base,2024/4/26 11:55,1,1,1,1,1,1,base model


In [26]:
## 5.1 模型评估结果输出
import datetime 

new_row = {'model_name': model_name, 'update_time': datetime.datetime.now() , 'Accuracy': accuracy, 'Precision': precision, 'Recall': recall
 , 'F1 Score': f1, 'auc': auc, 'sub_score': 0.8914, 'update_content': 'delete# 1.2 1.3 1.4 2.0(date-diff) ; add #3.1(data-split);update ## 3.2-test_size-random_state '}  
evalue_result.loc[len(evalue_result.index)] = new_row 
evalue_result
evalue_result.to_csv('./data/evalue_result.csv', index=False)