In [1]:
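"""
    Experiment log:
    Build osv1 from versions greater than 9, Score=0.8821399999999999
    Build osv1 from versions greater than 8, Score=0.882002

    Build several new features, frequency ratio greater than 10, Score=0.8834060000000001
                                frequency ratio greater than 15, Score=0.8828419999999999
    Clean the osv column and set it to float type
"""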
import pandas as pd
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas warnings raised by later cells.
warnings.filterwarnings("ignore")

In [2]:
# Load the raw training and test tables from the working directory.
train = pd.read_csv('./train.csv')
test1 = pd.read_csv('./test1.csv')

# Target column, and the model inputs (everything except the saved row index and the target).
labels = train['label']
features = train.drop(columns=['Unnamed: 0', 'label'])

In [3]:
# Data exploration: find the feature values that are strongly associated with label 1.
def find_key_feature(train, selected, ratio_threshold=10):
    """Return the values of column `selected` that are over-represented in label 1.

    For every value the relative frequency among label-1 rows is divided by its
    relative frequency among label-0 rows; values whose ratio is strictly greater
    than `ratio_threshold` are returned, ordered by decreasing ratio.

    Only values that occur in the label-0 rows are evaluated, so a value seen
    exclusively in label-1 rows is never returned.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain a 'label' column (0 / 1) and the column `selected`.
    selected : str
        Name of the column to analyse.
    ratio_threshold : float, default 10
        Minimum (exclusive) label-1 / label-0 frequency ratio.

    Returns
    -------
    pandas.Index
        The qualifying values of `selected`.
    """
    negatives = train.loc[train['label'] == 0, selected]
    positives = train.loc[train['label'] == 1, selected]

    neg_freq = negatives.value_counts() / len(negatives)
    pos_freq = positives.value_counts() / len(positives)

    # Align the label-1 frequencies on the label-0 values; values missing from
    # the label-1 rows give NaN and therefore fail the comparison below.
    ratio = pos_freq.reindex(neg_freq.index) / neg_freq

    selected_ratio = ratio[ratio > ratio_threshold].sort_values(ascending=False)
    return selected_ratio.index

In [4]:
# Columns to scan for key values.
selected_col = ['osv','apptype','carrier','dev_height','dev_ppi','dev_width',
                'media_id','ntt','version','location','fea1_hash','cus_type']
# Map each scanned column to the values find_key_feature returns for it.
key_features = {name: find_key_feature(train, name) for name in selected_col}

In [5]:
# Build new indicator features; each new column is named <original column> + '1'.
def f(x,selected):
    """Return 1 if `x` is among the key values stored for `selected`, else 0.

    The key values are read from the notebook-level `key_features` mapping.
    """
    return int(x in key_features[selected])

for selected in selected_col:
    # Skip columns for which no key value was found.
    if len(key_features[selected]) == 0:
        continue
    flag_col = selected + '1'
    # Add the same 0/1 indicator column to the training features and the test set.
    for frame in (features, test1):
        frame[flag_col] = frame[selected].apply(f, args=(selected,))
    print(flag_col + ' created')

osv1 created
apptype1 created
dev_height1 created
dev_ppi1 created
dev_width1 created
media_id1 created
ntt1 created
fea1_hash1 created


In [6]:
# Categorical features
cate_features = ['apptype','carrier','ntt','version','location','cus_type']
# Columns kept out of the model: 'os' (per the original note it has a single value, Android), 'lan', 'sid'
remove_list = ['os','lan','sid']
# Feature selection: keep every column that is not in remove_list.
col = [c for c in features.columns.tolist() if c not in remove_list]
# .copy() yields an independent frame, so later column assignments do not
# operate on a slice of the previous `features` object.
features = features[col].copy()

In [7]:
import time  #python底层日期函数
from datetime import datetime  #高级封装接口
#print(features['timestamp'][0]/1000)
#print(time.time())
#print(datetime.fromtimestamp(1615081589.8722358))

def get_date(features, start_time=None):
    """Replace the millisecond 'timestamp' column with 'day', 'hour' and 'time_diff'.

    The input frame is left untouched; a modified copy is returned, so the
    function can be called again on the same input.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain a 'timestamp' column holding epoch milliseconds.
    start_time : datetime-like, optional
        Origin used for 'time_diff'. Defaults to the earliest timestamp of
        `features`. Pass the same value for several frames to put their
        'time_diff' columns on a common origin.

    Returns
    -------
    pandas.DataFrame
        Copy of `features` without 'timestamp' (and without any 'year',
        'month', 'minute', 'week_day' columns), with 'day', 'hour' and
        'time_diff' (hours elapsed since `start_time`) added.
    """
    features = features.copy()

    # Milliseconds -> seconds, then to a datetime. datetime.fromtimestamp
    # converts to the local time zone of the machine running the notebook.
    stamps = features['timestamp'].apply(lambda x: datetime.fromtimestamp(x / 1000))

    calendar = pd.DatetimeIndex(stamps)
    features['day'] = calendar.day
    features['hour'] = calendar.hour

    if start_time is None:
        start_time = stamps.min()
    elapsed = stamps - start_time

    # Express the elapsed time in hours (whole days plus the within-day seconds).
    features['time_diff'] = elapsed.dt.days * 24 + elapsed.dt.seconds / 3600

    # Only day, hour and time_diff are kept as model inputs.
    return features.drop(['timestamp', 'year', 'month', 'minute', 'week_day'],
                         axis=1, errors='ignore')

# Derive the time features for the training set.
# NOTE(review): get_date measures time_diff from the earliest timestamp of the
# frame it receives, so the training and test values use two separate origins.
features = get_date(features)
# Derive the same time features for the test set.
test1 = get_date(test1)

In [8]:
# Label-encode osv.
# (Alternative noted by the author: convert it to float, which needs data cleaning.)
from sklearn.preprocessing import LabelEncoder
# Train and test are stacked first so that a single encoder is fitted on both.
all_df = pd.concat([train, test1])
le = LabelEncoder()
all_df['osv'] = le.fit_transform(all_df['osv'].astype('str'))

In [9]:
# Hash values whose string form is longer than 16 characters are treated as outliers and set to 0.
for hash_col in ('fea_hash', 'fea1_hash'):
    features[hash_col] = features[hash_col].map(lambda v: int(v) if len(str(v)) <= 16 else 0)

# version: purely numeric values are kept as int, everything else becomes 0.
# NOTE(review): the original comment described V3=>3, V1=>1, V6=>6, V2=>2, but
# values such as 'V3' are not all-digit and are therefore mapped to 0 here.
features['version'] = features['version'].map(lambda v: 0 if not str(v).isdigit() else int(v))
# Encoded osv for the training rows, i.e. the rows of all_df whose label is set.
train_rows = all_df['label'].notnull()
features['osv'] = all_df.loc[train_rows, 'osv']

In [10]:
# Build the prediction matrix for the test set with the same columns as `features`.
# .copy() makes test_fea an independent frame, so the assignments below do not
# write into a slice of test1.
test_fea = test1[features.columns].copy()

# Hash values whose string form is longer than 16 characters are treated as outliers and set to 0.
test_fea['fea_hash'] = test_fea['fea_hash'].map(lambda x: 0 if len(str(x))>16 else int(x))
test_fea['fea1_hash'] = test_fea['fea1_hash'].map(lambda x: 0 if len(str(x))>16 else int(x))

# version: purely numeric values are kept as int, everything else becomes 0.
test_fea['version'] = test_fea['version'].map(lambda x: int(x) if str(x).isdigit() else 0)
# Encoded osv for the test rows, i.e. the rows of all_df whose label is missing.
test_fea['osv'] = all_df[all_df['label'].isnull()]['osv']

In [11]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

def ensemble_model(clf,train_x,train_y,test,cate_features,n_splits = 10):
    """Fit `clf` on stratified K folds and average its predictions for `test`.

    For every fold the estimator is refitted on the training part, its accuracy
    on the held-out part is printed, and the last column of
    `clf.predict_proba(test)` is stored. The mean accuracy over the folds is
    printed at the end.

    Parameters
    ----------
    clf : estimator
        Must provide `fit(X, y, categorical_feature=...)`, `predict` and
        `predict_proba`. The same object is refitted in every fold.
    train_x, train_y : pandas objects
        Training features and labels; rows are selected with `.iloc`.
    test : features to predict after each fold.
    cate_features : passed to `clf.fit` as `categorical_feature`.
    n_splits : int, default 10
        Number of stratified folds.

    Returns
    -------
    The element-wise mean over the folds of the stored probability arrays.
    """
    sk = StratifiedKFold(n_splits = n_splits,shuffle = True,random_state = 2021)
    prob = []        # per-fold probabilities for the test set
    fold_acc = []    # per-fold validation accuracy
    for k,(train_index,val_index) in enumerate(sk.split(train_x,train_y)):
        train_x_real = train_x.iloc[train_index]
        train_y_real = train_y.iloc[train_index]
        val_x = train_x.iloc[val_index]
        val_y = train_y.iloc[val_index]
        # Train the fold model.
        clf = clf.fit(train_x_real,train_y_real,categorical_feature = cate_features)
        # Evaluate the fold model on the held-out part.
        acc_val = accuracy_score(val_y,clf.predict(val_x))
        print('第{}个子模型 acc{}'.format(k+1,acc_val))
        fold_acc.append(acc_val)
        # Soft prediction: keep the last probability column
        # (presumably the positive class for 0/1 labels; verify against clf.classes_).
        prob.append(clf.predict_proba(test)[:,-1])
    # Average over the folds actually run instead of a hard-coded 10.
    mean_acc = sum(fold_acc)/len(fold_acc)
    print(mean_acc)
    mean_prob = sum(prob)/len(prob)
    return mean_prob

In [12]:
import lightgbm as lgb
# Train with LightGBM.
# Author's note on the improvement over XGBoost: categorical features are handled directly, no one-hot encoding needed.
clf = lgb.LGBMClassifier()
#clf.fit(features,labels,categorical_feature = cate_features)
# Cross-validated ensemble; `result` holds the averaged probabilities returned for test_fea.
result = ensemble_model(clf,features,labels,test_fea,cate_features)
result

第1个子模型 acc0.8841529267765857
第2个子模型 acc0.8834149326805385
第3个子模型 acc0.8814889480884153
第4个子模型 acc0.8843509251925985
第5个子模型 acc0.8819749442004464
第6个子模型 acc0.8842588425884259
第7个子模型 acc0.8828548285482855
第8个子模型 acc0.8820088200882009
第9个子模型 acc0.8845288452884529
0.8832260014946611


array([0.07225999, 0.46052101, 0.02298351, ..., 0.85408489, 0.87709334,
       0.87121425])

In [13]:
# Save the results: one row per test sid.
a = test1['sid'].to_frame()
# Turn the averaged probability into a binary label: below 0.5 -> 0, otherwise 1.
a['label'] = [0 if p < 0.5 else 1 for p in result]
a.to_csv('version5.csv', index=False)

In [14]:
a

Unnamed: 0,sid,label
0,1440682,0
1,1606824,0
2,1774642,0
3,1742535,0
4,1689686,1
...,...,...
149995,1165373,1
149996,1444115,1
149997,1134378,1
149998,1700238,1
