In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

In [2]:
pd.set_option('display.max_columns',None)

In [3]:
#读取数据
age_train = pd.read_csv("age_train.csv", names=['uid','age_group'])
age_test = pd.read_csv("age_test.csv", names=['uid'])
user_basic_info = pd.read_csv("user_basic_info.csv", names=['uid','gender','city','prodName','ramCapacity','ramLeftRation','romCapacity','romLeftRation','color','fontSize','ct','carrier','os'])
user_behavior_info = pd.read_csv("user_behavior_info.csv", names=['uid','bootTimes','AFuncTimes','BFuncTimes','CFuncTimes','DFuncTimes','EFuncTimes','FFuncTimes','FFuncSum'])
user_app_actived = pd.read_csv("user_app_actived.csv", names=['uid','appId'])
#user_app_usage = pd.read_csv("user_app_usage.csv")
app_info = pd.read_csv("app_info.csv", names=['appId', 'category'])

In [2]:
#处理数据量较大的user_app_usage.csv，结合app_info.csv简单统计得到appuseProcessed.csv作为特征
def f(x):
    s = x.value_counts()
    return np.nan if len(s) == 0 else s.index[0]
def processUserAppUsage():
    resTable = pd.DataFrame()
    reader = pd.read_csv("user_app_usage.csv", names=['uid','appId','duration','times','use_date'], iterator=True)
    last_df = pd.DataFrame()
    
    app_info = pd.read_csv("app_info.csv", names=['appId','category'])
    cats = list(set(app_info['category']))
    category2id = dict(zip(sorted(cats), range(0,len(cats))))
    id2category = dict(zip(range(0,len(cats)), sorted(cats)))
    app_info['category'] = app_info['category'].apply(lambda x: category2id[x])
    i = 1
    
    while True:
        try:
            print("index: {}".format(i))
            i+=1
            df = reader.get_chunk(1000000)
            df = pd.concat([last_df, df])
            idx = df.shape[0]-1
            last_user = df.iat[idx,0]
            while(df.iat[idx,0]==last_user):
                idx-=1
            last_df = df[idx+1:]
            df = df[:idx+1]

            now_df = pd.DataFrame()
            now_df['uid'] = df['uid'].unique()
            now_df = now_df.merge(df.groupby('uid')['appId'].count().to_frame(), how='left', on='uid')
            now_df = now_df.merge(df.groupby('uid')['appId','use_date'].agg(['nunique']), how='left', on='uid')
            now_df = now_df.merge(df.groupby('uid')['duration','times'].agg(['mean','max','std']), how='left', on='uid')    

            now_df.columns = ['uid','usage_cnt','usage_appid_cnt','usage_date_cnt','duration_mean','duration_max','duration_std','times_mean','times_max','times_std']


            df = df.merge(app_info, how='left', on='appId')
            now_df = now_df.merge(df.groupby('uid')['category'].nunique().to_frame(), how='left', on='uid')
            #print(df.groupby(['uid'])['category'].value_counts().index[0])
            now_df['usage_most_used_category'] = df.groupby(['uid'])['category'].transform(f)
            resTable = pd.concat([resTable, now_df])
        except StopIteration:
            break
    
    resTable.to_csv("appuseProcessed.csv",index=0)
    
    print("Iterator is stopped")

In [5]:
processUserAppUsage()

index: 1




(1000364, 18.0)
index: 2
(1845664, 7.0)
index: 3
(1000107, 17.0)
index: 4
(1060334, 17.0)
index: 5
(1895014, 17.0)
index: 6
(1000237, 17.0)
index: 7
(1114846, 17.0)
index: 8
(1974375, 17.0)
index: 9
(1000143, 17.0)
index: 10
(1139447, 17.0)
index: 11


KeyboardInterrupt: 

In [4]:
#将user_basic_info.csv 和 user_behavior_info.csv中的字符值编码成可以训练的数值类型，合并
class2id = {}
id2class = {}
def mergeBasicTables(baseTable):
    resTable = baseTable.merge(user_basic_info, how='left', on='uid', suffixes=('_base0', '_ubaf'))
    resTable = resTable.merge(user_behavior_info, how='left', on='uid', suffixes=('_base1', '_ubef'))
    cat_columns = ['city','prodName','color','carrier','os','ct']
    for c in cat_columns:
        resTable[c] = resTable[c].apply(lambda x: x if type(x)==str else str(x))
        sort_temp = sorted(list(set(resTable[c])))  
        class2id[c+'2id'] = dict(zip(sort_temp, range(1, len(sort_temp)+1)))
        id2class['id2'+c] = dict(zip(range(1,len(sort_temp)+1), sort_temp))
        resTable[c] = resTable[c].apply(lambda x: class2id[c+'2id'][x])
        
    return resTable

In [5]:
#处理app使用相关数据
#对user_app_actived.csv简单统计
#将之前训练的appuseProcess.csv进行合并
def mergeAppData(baseTable):
    resTable = baseTable.merge(user_app_actived, how='left', on='uid')
    resTable['appId'] = resTable['appId'].apply(lambda x: len(list(x.split('#'))))
    appusedTable = pd.read_csv("appuseProcessed.csv")
    resTable = resTable.merge(appusedTable, how='left', on='uid')
    resTable[['category', 'usage_most_used_category']] = resTable[['category', 'usage_most_used_category']].fillna(41)
    resTable = resTable.fillna(0)
    #print(resTable[:5])
    return resTable

In [6]:
#合并用户基本特征以及app使用相关特征，作为训练集和测试集
df_train = mergeAppData(mergeBasicTables(age_train))
df_test = mergeAppData(mergeBasicTables(age_test))
print(df_train.shape)
print(df_test.shape)

(2010000, 34)
(502500, 33)


In [None]:
#训练模型

from sklearn.feature_selection import SelectFromModel, VarianceThreshold, SelectKBest, chi2, mutual_info_classif, f_classif
from sklearn.preprocessing import Imputer
from sklearn.ensemble import ExtraTreesClassifier

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold

In [None]:
print("训练模型：")
param = {
        'boosting': 'dart',
        'learning_rate': 0.02,
        'lambda_l1': 0.05,
        'lambda_l2': 0.1,
        'max_depth': 20,
        'objective': 'multiclass',
        'num_class': 7,
        'num_leaves': 16383,
        'min_data_in_leaf': 31,
        'max_bin': 255,
        'feature_fraction': 0.95,
        'metric': 'multi_error',
        'device':'gpu',
        'num_threads': 8,
        'drop_rate': 0.05
        }

X = df_train.drop(['age_group','uid'], axis=1)
y = df_train['age_group']
uid = df_test['uid']
test = df_test.drop('uid', axis=1)

xx_score = []
cv_pred = []
skf = StratifiedKFold(n_splits=5, random_state=2019, shuffle=True)
for index, (train_index, vali_index) in enumerate(skf.split(X, y)):
    print(index)
    x_train, y_train, x_vali, y_vali = np.array(X)[train_index], np.array(y)[train_index], np.array(X)[vali_index], np.array(y)[vali_index]
    train = lgb.Dataset(x_train, y_train)
    vali =lgb.Dataset(x_vali, y_vali)
    print("training start...")
    model = lgb.train(param, train, num_boost_round=10000, valid_sets=[vali], early_stopping_rounds=100)
    xx_pred = model.predict(x_vali,num_iteration=model.best_iteration)
    xx_pred = [np.argmax(x) for x in xx_pred]
    xx_score.append(f1_score(y_vali,xx_pred,average='weighted'))
    y_test = model.predict(test,num_iteration=model.best_iteration)
    y_test = [np.argmax(x) for x in y_test]
    if index == 0:
        cv_pred = np.array(y_test).reshape(-1, 1)
    else:
        cv_pred = np.hstack((cv_pred, np.array(y_test).reshape(-1, 1)))
        
submit = []
for line in cv_pred:
    submit.append(np.argmax(np.bincount(line)))
df = pd.DataFrame({'id':uid.as_matrix(),'label':submit})
df.to_csv('submission.csv',index=False)

训练模型：
0
training start...
[1]	valid_0's multi_error: 0.701493




[2]	valid_0's multi_error: 0.701493
[3]	valid_0's multi_error: 0.701493
[4]	valid_0's multi_error: 0.701493
[5]	valid_0's multi_error: 0.701488
[6]	valid_0's multi_error: 0.701473
[7]	valid_0's multi_error: 0.701356
[8]	valid_0's multi_error: 0.701353
[9]	valid_0's multi_error: 0.700985
[10]	valid_0's multi_error: 0.698209
[11]	valid_0's multi_error: 0.692736
[12]	valid_0's multi_error: 0.687144
[13]	valid_0's multi_error: 0.681485
[14]	valid_0's multi_error: 0.675816
[15]	valid_0's multi_error: 0.670667
[16]	valid_0's multi_error: 0.666
[17]	valid_0's multi_error: 0.661724
[18]	valid_0's multi_error: 0.658065
[19]	valid_0's multi_error: 0.654381
[20]	valid_0's multi_error: 0.65097
[21]	valid_0's multi_error: 0.64759
[22]	valid_0's multi_error: 0.644012
[23]	valid_0's multi_error: 0.64092
[24]	valid_0's multi_error: 0.637761
[25]	valid_0's multi_error: 0.634786
[26]	valid_0's multi_error: 0.631575
[27]	valid_0's multi_error: 0.628659
[28]	valid_0's multi_error: 0.628592
[29]	valid_0's 

[222]	valid_0's multi_error: 0.573816
[223]	valid_0's multi_error: 0.573716
[224]	valid_0's multi_error: 0.573687
[225]	valid_0's multi_error: 0.573706
[226]	valid_0's multi_error: 0.573726
[227]	valid_0's multi_error: 0.573697
[228]	valid_0's multi_error: 0.57358
[229]	valid_0's multi_error: 0.573575
[230]	valid_0's multi_error: 0.57345
[231]	valid_0's multi_error: 0.573373
[232]	valid_0's multi_error: 0.573341
[233]	valid_0's multi_error: 0.573361
[234]	valid_0's multi_error: 0.573328
[235]	valid_0's multi_error: 0.573308
[236]	valid_0's multi_error: 0.573256
[237]	valid_0's multi_error: 0.57348
[238]	valid_0's multi_error: 0.573291
[239]	valid_0's multi_error: 0.573251
[240]	valid_0's multi_error: 0.573284
[241]	valid_0's multi_error: 0.573353
[242]	valid_0's multi_error: 0.573132
[243]	valid_0's multi_error: 0.573214
[244]	valid_0's multi_error: 0.573246
[245]	valid_0's multi_error: 0.572913
[246]	valid_0's multi_error: 0.572918
[247]	valid_0's multi_error: 0.572965
[248]	valid_0's

[439]	valid_0's multi_error: 0.569858
[440]	valid_0's multi_error: 0.569836
[441]	valid_0's multi_error: 0.569886
[442]	valid_0's multi_error: 0.569749
[443]	valid_0's multi_error: 0.569816
[444]	valid_0's multi_error: 0.569808
[445]	valid_0's multi_error: 0.569796
[446]	valid_0's multi_error: 0.569597
[447]	valid_0's multi_error: 0.569609
[448]	valid_0's multi_error: 0.569453
[449]	valid_0's multi_error: 0.56943
[450]	valid_0's multi_error: 0.569537
[451]	valid_0's multi_error: 0.56954
[452]	valid_0's multi_error: 0.569585
[453]	valid_0's multi_error: 0.56953
[454]	valid_0's multi_error: 0.569522
[455]	valid_0's multi_error: 0.569366
[456]	valid_0's multi_error: 0.569348
[457]	valid_0's multi_error: 0.569331
[458]	valid_0's multi_error: 0.569318
[459]	valid_0's multi_error: 0.569326
[460]	valid_0's multi_error: 0.56951
[461]	valid_0's multi_error: 0.569291
[462]	valid_0's multi_error: 0.569356
[463]	valid_0's multi_error: 0.569358
[464]	valid_0's multi_error: 0.569428
[465]	valid_0's 

In [None]:
y_test = model.predict(test)
    y_test = [np.argmax(x) for x in y_test]
    if index == 0:
        cv_pred = np.array(y_test).reshape(-1, 1)
    else:
        cv_pred = np.hstack((cv_pred, np.array(y_test).reshape(-1, 1)))
submit = []
for line in cv_pred:
    submit.append(np.argmax(np.bincount(line)))
df = pd.DataFrame({'id':uid.as_matrix(),'label':submit})
df.to_csv('submission.csv',index=False)

In [12]:
age_train['age_group'].nunique()

6

In [1]:
history

history
