In [1]:
"""
    使用k折交叉验证
    产生五个子模型，然后进行简单平均得到结果
        13         0.8818800040081638
        12         0.881872001664893
        11         0.8820820022098543
        10         0.8818940000000002
        9          0.8821280002345706
        8          0.8819619999999999
        7          0.881914001538335
        6          0.8819460006522197
"""
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('./train.csv')
test1 = pd.read_csv('./test1.csv')

In [3]:
features = train.drop(['Unnamed: 0','label'],axis = 1)
labels = train['label']

In [4]:
#类别特征
cate_features = ['apptype','carrier','ntt','version','location','cus_type']
#不需要入模'os'（实际是唯一的，都是安卓）,'lan','sid'
remove_list = ['os','lan','sid']
col = features.columns.tolist()
for i in remove_list:
    col.remove(i)
#特征筛选
features = features[col]

In [5]:
import time  #python底层日期函数
from datetime import datetime  #高级封装接口
#print(features['timestamp'][0]/1000)
#print(time.time())
#print(datetime.fromtimestamp(1615081589.8722358))

def get_date(features):
    #除以1000，转换为日期格式
    features['timestamp'] = features['timestamp'].apply(lambda x:datetime.fromtimestamp(x/1000))
    
    #创建时间戳索引
    temp = pd.DatetimeIndex(features['timestamp'])
    features['year'] = temp.year
    features['month'] = temp.month
    features['day'] = temp.day
    features['week_day'] = temp.weekday
    features['hour'] = temp.hour
    features['minute'] = temp.minute

    #添加time_diff
    start_time = features['timestamp'].min()
    features['time_diff'] = features['timestamp'] - start_time
    
    #将time_diff转换为小时
    features['time_diff'] = features['time_diff'].dt.days * 24 +features['time_diff'].dt.seconds/3600
    
    #Thinking：需不需要这么多特征？
    #使用day，time_diff
    features.drop(['timestamp','year','month','minute','week_day'],axis = 1,inplace = True)
  
    return features

#对训练集提取时间多尺度
features = get_date(features)
#对测试集提取时间多尺度
test1 = get_date(test1)

In [6]:
#对osv进行特征编码LabelEncoder
#还可以转换为Float,需要数据清洗
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
#需要将训练集和测试集合并，然后统一做LabelEncoder
all_df = pd.concat([train,test1])
all_df['osv'] = all_df['osv'].astype('str')
all_df['osv'] = le.fit_transform(all_df['osv'])

In [7]:
# 对于数值过大的异常值，设置为0
features['fea_hash'] = features['fea_hash'].map(lambda x: 0 if len(str(x))>16 else int(x))
features['fea1_hash'] = features['fea1_hash'].map(lambda x: 0 if len(str(x))>16 else int(x))

#对数据清洗，将V3=>3,V1=>1,V6=>6,V2=>2
#针对version，非数值类型 设置0
features['version'] = features['version'].map(lambda x: int(x) if str(x).isdigit() else 0)
features['osv'] = all_df[all_df['label'].notnull()]['osv']

In [8]:
#测试集做预测,保持与features中的columns一致
test_fea = test1[features.columns]

test_fea['fea_hash'] = test_fea['fea_hash'].map(lambda x: 0 if len(str(x))>16 else int(x))
test_fea['fea1_hash'] = test_fea['fea1_hash'].map(lambda x: 0 if len(str(x))>16 else int(x))

test_fea['version'] = test_fea['version'].map(lambda x: int(x) if str(x).isdigit() else 0)
test_fea['osv'] = all_df[all_df['label'].isnull()]['osv']

In [21]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

n = 13
def ensemble_model(clf,train_x,train_y,test,cate_features):
    #采用十折交叉验证
    sk = StratifiedKFold(n_splits = n,shuffle = True,random_state = 2021)
    prob = [] #记录最终结果
    mean_acc = 0 #记录平均准确率
    for k,(train_index,val_index) in enumerate(sk.split(train_x,train_y)):
        train_x_real = train_x.iloc[train_index]
        train_y_real = train_y.iloc[train_index]
        val_x = train_x.iloc[val_index]
        val_y = train_y.iloc[val_index]
        # 子模型训练
        clf = clf.fit(train_x_real,train_y_real,categorical_feature = cate_features)
        val_y_pred = clf.predict(val_x)
        #子模型评估
        acc_val = accuracy_score(val_y,val_y_pred)
        print('第{}个子模型 acc{}'.format(k+1,acc_val))
        mean_acc += acc_val/n
        #子模型预测0,1
        test_y_pred = clf.predict_proba(test)[:,-1] #soft得到概率值
        prob.append(test_y_pred)
    print(mean_acc)
    mean_prob = sum(prob)/n
    return mean_prob

In [22]:
import lightgbm as lgb
# 使用LightGBM训练
# 相比XGBoost的改进：对类别特征的处理，不需要one-hot
clf = lgb.LGBMClassifier()
#clf.fit(features,labels,categorical_feature = cate_features)
result = ensemble_model(clf,features,labels,test_fea,cate_features)
result

第1个子模型 acc0.8825594092870885
第2个子模型 acc0.8814154230149238
第3个子模型 acc0.8817534189589725
第4个子模型 acc0.8803494358067703
第5个子模型 acc0.8811034267588789
第6个子模型 acc0.8820394155270137
第7个子模型 acc0.8819354167749986
第8个子模型 acc0.8830243623410727
第9个子模型 acc0.8835443696211747
第10个子模型 acc0.8789943059202829
第11个子模型 acc0.8819323470528587
第12个子模型 acc0.8818283455968383
第13个子模型 acc0.8839603754452562
0.8818800040081638


array([0.10038123, 0.74431844, 0.03477555, ..., 1.22265933, 1.24846021,
       1.25404282])