In [17]:
import pandas as pd
import lightgbm
from sklearn.model_selection import KFold

In [18]:
data1=pd.read_csv('DataSet\糖尿病遗传风险预测挑战赛公开数据\糖尿病遗传风险预测挑战赛公开数据\比赛训练集.csv',encoding='gbk')
data2=pd.read_csv('DataSet\糖尿病遗传风险预测挑战赛公开数据\糖尿病遗传风险预测挑战赛公开数据\比赛测试集.csv',encoding='gbk')
#label标记为-1
data2['患有糖尿病标识']=-1
#训练集和测试机合并
data=pd.concat([data1,data2],axis=0,ignore_index=True)
#将舒张压特征中的缺失值填充为-1
data['舒张压']=data['舒张压'].fillna(-1)

In [19]:
data2.describe()

Unnamed: 0,编号,性别,出生年份,体重指数,舒张压,口服耐糖量测试,胰岛素释放实验,肱三头肌皮褶厚度,患有糖尿病标识
count,1000.0,1000.0,1000.0,1000.0,951.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,0.481,1986.386,39.439,89.638275,5.872314,4.1027,7.06424,-1.0
std,288.819436,0.499889,8.816163,11.284861,9.379124,1.93088,8.594005,13.900938,0.0
min,1.0,0.0,1958.0,0.0,28.0,-1.0,0.0,0.0,-1.0
25%,250.75,0.0,1979.0,29.975,85.0,4.516,0.0,0.0,-1.0
50%,500.5,0.0,1987.0,38.9,89.0,5.8515,0.0,0.0,-1.0
75%,750.25,1.0,1994.0,48.95,96.0,7.465,7.2025,3.82,-1.0
max,1000.0,1.0,2003.0,60.0,112.0,10.613,123.89,44.9,-1.0


In [20]:
#特征工程
"""
将出生年份换算成年龄
"""
data['出生年份']=2022-data['出生年份']  #换成年龄

"""
人体的成人体重指数正常值是在18.5-24之间
低于18.5是体重指数过轻
在24-27之间是体重超重
27以上考虑是肥胖
高于32了就是非常的肥胖。
"""
def BMI(a):
    if a<18.5:
        return 0
    elif 18.5<=a<=24:
        return 1
    elif 24<a<=27:
        return 2
    elif 27<a<=32:
        return 3
    else:
        return 4
    
data['BMI']=data['体重指数'].apply(BMI)

#糖尿病家族史
"""
无记录
叔叔或者姑姑有一方患有糖尿病/叔叔或姑姑有一方患有糖尿病
父母有一方患有糖尿病
"""
def FHOD(a):
    if a=='无记录':
        return 0
    elif a=='叔叔或者姑姑有一方患有糖尿病' or a=='叔叔或姑姑有一方患有糖尿病':
        return 1
    else:
        return 2
    
data['糖尿病家族史']=data['糖尿病家族史'].apply(FHOD)
"""
舒张压范围为60-90
"""
def DBP(a):
    if a<60:
        return 0
    elif 60<=a<=90:
        return 1
    elif a>90:
        return 2
    else:
        return a
data['DBP']=data['舒张压'].apply(DBP)

#------------------------------------
#将处理好的特征工程分为训练集和测试集，其中训练集是用来训练模型，测试集用来评估模型准确度
#其中编号和患者是否得糖尿病没有任何联系，属于无关特征予以删除
train=data[data['患有糖尿病标识'] !=-1]
test=data[data['患有糖尿病标识'] ==-1]
train_label=train['患有糖尿病标识']
train=train.drop(['编号','患有糖尿病标识'],axis=1)
test=test.drop(['编号','患有糖尿病标识'],axis=1)

In [21]:
#使用Lightgbm方法训练数据集，使用5折交叉验证的方法获得5个测试集预测结果
def select_by_lgb(train_data,train_label,test_data,random_state=2022,n_splits=5,metric='auc',num_round=10000,early_stopping_rounds=100):
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold=0
    result=[]
    for train_idx, val_idx in kfold.split(train_data):
        random_state+=1
        train_x = train_data.loc[train_idx]
        train_y = train_label.loc[train_idx]
        test_x = train_data.loc[val_idx]
        test_y = train_label.loc[val_idx]
        clf=lightgbm
        train_matrix=clf.Dataset(train_x,label=train_y)
        test_matrix=clf.Dataset(test_x,label=test_y)
        params={
                'boosting_type': 'gbdt',  
                'objective': 'binary',
                'learning_rate': 0.1,
                'metric': metric,
                'seed': 2020,
                'nthread':-1 }
        model=clf.train(params,train_matrix,num_round,valid_sets=test_matrix,early_stopping_rounds=early_stopping_rounds)
        pre_y=model.predict(test_data)
        result.append(pre_y)
        fold+=1
    return result

test_data=select_by_lgb(train,train_label,test)
#test_data就是5折交叉验证中5次预测的结果
pre_y=pd.DataFrame(test_data).T
#将5次预测的结果求取平均值，当然也可以使用其他的方法
pre_y['averge']=pre_y[[i for i in range(5)]].mean(axis=1)
#因为竞赛需要你提交最后的预测判断，而模型给出的预测结果是概率，因此我们认为概率>0.5的即该患者有糖尿病，概率<=0.5的没有糖尿病
pre_y['label']=pre_y['averge'].apply(lambda x:1 if x>0.5 else 0)
pre_y



[LightGBM] [Info] Number of positive: 1547, number of negative: 2509
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1053
[LightGBM] [Info] Number of data points in the train set: 4056, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.381410 -> initscore=-0.483567
[LightGBM] [Info] Start training from score -0.483567
[1]	valid_0's auc: 0.986377
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.989898
[3]	valid_0's auc: 0.989063
[4]	valid_0's auc: 0.990182
[5]	valid_0's auc: 0.990087
[6]	valid_0's auc: 0.990219
[7]	valid_0's auc: 0.990172
[8]	valid_0's auc: 0.990085
[9]	valid_0's auc: 0.990299
[10]	valid_0's auc: 0.989217
[11]	valid_0's auc: 0.989366
[12]	valid_0's auc: 0.98951
[13]	valid_0's auc: 0.989497
[14]	valid_0's auc: 0.9907
[15]	valid_0's auc: 0.990665
[16]	valid_0's auc: 0.990649
[17]	valid_0's auc: 0.99077
[18]	va

Unnamed: 0,0,1,2,3,4,averge,label
0,0.193088,0.261366,0.144240,0.405719,0.307045,0.262292,0
1,0.020794,0.001088,0.014207,0.044715,0.074112,0.030983,0
2,0.024721,0.005525,0.021410,0.070816,0.147284,0.053951,0
3,0.060593,0.018353,0.058337,0.099952,0.088516,0.065150,0
4,0.244923,0.066848,0.148535,0.221646,0.204710,0.177332,0
...,...,...,...,...,...,...,...
995,0.010247,0.000792,0.013880,0.025754,0.060279,0.022190,0
996,0.986770,0.999686,0.982167,0.964190,0.911726,0.968908,1
997,0.010001,0.000445,0.013815,0.025754,0.060279,0.022059,0
998,0.986770,0.999750,0.982167,0.964190,0.911726,0.968921,1


In [23]:
result=pd.read_csv('DataSet\糖尿病遗传风险预测挑战赛公开数据\糖尿病遗传风险预测挑战赛公开数据\提交示例.csv')
result['label']=pre_y['label']
result.to_csv('result.csv',index=False)
data2.to_csv("UTF-8_eval.csv")
data1.to_csv("UTF-8_train.csv")