In [1]:
"""
    使用祖传参数n_estimators=2000        Score=0.8836200000000001
                            5000
"""
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('./train.csv')
test1 = pd.read_csv('./test1.csv')

features = train.drop(['Unnamed: 0','label'],axis = 1)
labels = train['label']

#### 构造新特征，特征比大于15的特征为关键特征

In [3]:
#数据探索，找到导致1的关键特征值
def find_key_feature(train,selected):
    temp = pd.DataFrame(columns = [0,1])
    temp[0] = train[train['label']==0][selected].value_counts()/len(train[train['label']==0]) 
    temp[1] = train[train['label']==1][selected].value_counts()/len(train[train['label']==1]) 
    temp[2] = temp[1]/temp[0]
    #选出大于15倍的特征
    result = temp[temp[2]>15].sort_values(2,ascending = False).index
    return result

key_features = {}
selected_col = ['osv','apptype','carrier','dev_height','dev_ppi','dev_width',
                'media_id','ntt','version','location','fea1_hash','cus_type']
for selected in selected_col:
    key_features[selected] = find_key_feature(train,selected)

#构造新特征，新特征字段 = 原始特征字段+1
def f1(x,selected):
    #判断是否在关键特征值里，是1，否0
    if x in key_features[selected]:
        return 1
    else:
        return 0

for selected in selected_col:
    if len(key_features[selected]) > 0:
        features[selected + '1'] = features[selected].apply(f1,args = (selected,))
        test1[selected + '1'] = test1[selected].apply(f1,args = (selected,))
        print(selected+'1 created')

osv1 created
apptype1 created
dev_height1 created
dev_ppi1 created
dev_width1 created
media_id1 created
ntt1 created
fea1_hash1 created


#### osv转换为float类型

In [4]:
#对osv进行数据清洗
import re
def f2(x):
    x = str(x)
    x = x.replace('Android_','')
    # 4.2.3.2是需要找到的字符
    result = re.match('[\d\.]+',x)
    if result:
        x = result.group()    #返回匹配结果
        if '.' in x:
            #把4.4.3转化成4.43
            #找到第一个小数点的位置
            index1 = x.index('.')
            if index1 > 0:
                #去掉所有小数点
                x = x.replace('.','')
                #加上原来第一个小数点
                x = float(x[0:index1]+'.'+x[index1:])
        else: x = float(x)
    else:
        x = 0
    #如果版本号过大，7930
    if x>1000:
        x = x/10000
    elif x>11:
        x = x/10
    return x
    
features['osv'] = features['osv'].apply(f2)
test1['osv'] = test1['osv'].apply(f2)

#### 特征筛选

In [5]:
#类别特征
cate_features = ['apptype','carrier','ntt','version','location','cus_type']
remove_list = ['os','lan','sid']
col = features.columns.tolist()
for i in remove_list:
    col.remove(i)
features = features[col]

#### 时间

In [6]:
import time  
from datetime import datetime  

def get_date(features):
    #除以1000，转换为日期格式
    features['timestamp'] = features['timestamp'].apply(lambda x:datetime.fromtimestamp(x/1000))
    
    #创建时间戳索引
    temp = pd.DatetimeIndex(features['timestamp'])
    features['year'] = temp.year
    features['month'] = temp.month
    features['day'] = temp.day
    features['week_day'] = temp.weekday
    features['hour'] = temp.hour
    features['minute'] = temp.minute

    #添加time_diff
    start_time = features['timestamp'].min()
    features['time_diff'] = features['timestamp'] - start_time
    
    #将time_diff转换为小时
    features['time_diff'] = features['time_diff'].dt.days * 24 +features['time_diff'].dt.seconds/3600
    
    #使用day，time_diff
    features.drop(['timestamp','year','month','minute','week_day'],axis = 1,inplace = True)
  
    return features

#对训练集提取时间多尺度
features = get_date(features)
#对测试集提取时间多尺度
test1 = get_date(test1)

#### LabelEncoder

In [7]:
#对lan进行特征编码LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
#需要将训练集和测试集合并，然后统一做LabelEncoder
all_df = pd.concat([train,test1])
all_df['lan'] = all_df['lan'].astype('str')
all_df['lan'] = le.fit_transform(all_df['lan'])

#### 数据清洗

In [8]:
# 对于数值过大的异常值，设置为0
features['fea_hash'] = features['fea_hash'].map(lambda x: 0 if len(str(x))>16 else int(x))
features['fea1_hash'] = features['fea1_hash'].map(lambda x: 0 if len(str(x))>16 else int(x))

#对数据清洗，将V3=>3,V1=>1,V6=>6,V2=>2
#针对version，非数值类型 设置0
features['version'] = features['version'].map(lambda x: int(x) if str(x).isdigit() else 0)
features['lan'] = all_df[all_df['label'].notnull()]['lan']

In [9]:
#测试集做预测,保持与features中的columns一致
test_fea = test1[features.columns]

test_fea['fea_hash'] = test_fea['fea_hash'].map(lambda x: 0 if len(str(x))>16 else int(x))
test_fea['fea1_hash'] = test_fea['fea1_hash'].map(lambda x: 0 if len(str(x))>16 else int(x))

test_fea['version'] = test_fea['version'].map(lambda x: int(x) if str(x).isdigit() else 0)
test_fea['lan'] = all_df[all_df['label'].isnull()]['lan']

In [10]:
test_fea

Unnamed: 0,android_id,apptype,carrier,dev_height,dev_ppi,dev_width,media_id,ntt,osv,package,...,dev_height1,dev_ppi1,dev_width1,media_id1,ntt1,fea1_hash1,day,hour,time_diff,lan
0,317625,1181,46000.0,2196.0,2.0,1080.0,639,2.0,8.10,188,...,0,0,0,0,0,0,7,9,105.721389,0
1,435108,944,46003.0,2280.0,3.0,1080.0,704,6.0,8.10,221,...,0,0,0,0,0,0,5,20,68.891667,17
2,0,1106,46000.0,0.0,0.0,0.0,39,2.0,5.10,1562,...,0,0,0,0,0,0,4,10,34.139167,14
3,451504,761,46000.0,1344.0,0.0,720.0,54,2.0,7.11,9,...,0,0,0,0,0,0,5,1,49.051944,14
4,0,1001,46000.0,665.0,0.0,320.0,29,5.0,8.10,4,...,1,0,0,0,0,0,5,8,56.275278,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,0,1001,46000.0,760.0,0.0,360.0,29,2.0,8.10,4,...,1,0,0,0,0,0,8,9,129.358333,17
149996,0,1001,46000.0,780.0,0.0,360.0,29,2.0,9.00,4,...,1,0,0,0,0,0,7,7,103.265000,17
149997,0,1001,46000.0,780.0,0.0,360.0,29,5.0,8.10,4,...,1,0,0,0,0,0,9,8,152.815556,17
149998,500925,1052,46000.0,854.0,240.0,480.0,249,6.0,4.42,0,...,0,0,0,1,0,1,5,6,54.636389,17


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

def ensemble_model(clf,train_x,train_y,test,cate_features):
    #采用十折交叉验证
    sk = StratifiedKFold(n_splits = 5,shuffle = True,random_state = 2021)
    prob = [] #记录最终结果
    mean_acc = 0 #记录平均准确率
    for k,(train_index,val_index) in enumerate(sk.split(train_x,train_y)):
        train_x_real = train_x.iloc[train_index]
        train_y_real = train_y.iloc[train_index]
        val_x = train_x.iloc[val_index]
        val_y = train_y.iloc[val_index]
        # 子模型训练
        clf = clf.fit(train_x_real,train_y_real,categorical_feature = cate_features)
        val_y_pred = clf.predict(val_x)
        #子模型评估
        acc_val = accuracy_score(val_y,val_y_pred)
        print('第{}个子模型 acc{}'.format(k+1,acc_val))
        mean_acc += acc_val/5
        #子模型预测0,1
        test_y_pred = clf.predict_proba(test)[:,-1] #soft得到概率值
        prob.append(test_y_pred)
    print(mean_acc)
    mean_prob = sum(prob)/5
    return mean_prob

In [11]:
import lightgbm as lgb
# 使用LightGBM训练
# 相比XGBoost的改进：对类别特征的处理，不需要one-hot
clf = lgb.LGBMClassifier(num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='binary',
            max_depth=-1, learning_rate=0.005, min_child_samples=3, random_state=2021,
            n_estimators=5000, subsample=1, colsample_bytree=1,
)
#clf.fit(features,labels,categorical_feature = cate_features)
result = ensemble_model(clf,features,labels,test_fea,cate_features)
result

第1个子模型 acc0.88497
第2个子模型 acc0.8851
第3个子模型 acc0.8854
第4个子模型 acc0.88594
第5个子模型 acc0.88507
0.8852960000000001


array([0.07934696, 0.57944083, 0.0245171 , ..., 0.95440543, 0.989794  ,
       0.97819509])

In [13]:
#保存结果
a = pd.DataFrame(test1['sid'])
a['label'] =  result
#转换为二分类
a['label'] = a['label'].apply(lambda x:0 if x<0.5 else 1)
a.to_csv('version7_LGB5000.csv',index = False)

In [14]:
b = pd.DataFrame(test1['sid'])
b['label'] = result
b.to_csv('LGB5000_result.csv',index = False)