In [1]:
import gc
import time
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from help_function import LoadData
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Behaviour log: one row per (device_id, app_id) usage record with its
# start and close values.
behavior = pd.read_csv(
    '../Demo/deviceid_package_start_close.tsv',
    sep='\t',
    names=['device_id', 'app_id', 'start', 'close'],
)

# App label table: two label columns per app_id.
app_label = pd.read_csv(
    '../Demo/package_label.tsv',
    sep='\t',
    names=['app_id', 'label_1', 'label_2'],
)
# Keep only the text before the first '(' (label_1) or '/' (label_2).
# The vectorised .str accessor propagates missing labels as NaN, whereas a
# per-row lambda calling .split() raises AttributeError on a NaN cell.
app_label['label_1'] = app_label['label_1'].str.split('(').str[0]
app_label['label_2'] = app_label['label_2'].str.split('/').str[0]

In [6]:
# Train / test device tables, read through the project helper LoadData.
train_datapath, test_datapath = (
    '../Demo/deviceid_train.tsv',
    '../Demo/deviceid_test.tsv',
)
train_data, test_data = LoadData(train_datapath, test_datapath)

## start , close

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
# Shared token counter; it is re-fitted (fit_transform) by each hour-feature cell below.
# NOTE(review): scikit-learn's default token pattern only keeps tokens of two or
# more word characters, so the hour tokens must stay zero-padded ('00'..'23') — confirm.
vectorizer=CountVectorizer()

In [8]:
def TransTt2Hour(x):
    """Return the local-time hour ('00'..'23') of a millisecond epoch timestamp.

    ``x`` may be a number or a numeric string; it is passed through ``float``
    and divided by 1000 to obtain seconds. The hour comes from
    ``time.localtime``, so the result depends on the timezone of the machine
    running the notebook.
    """
    epoch_seconds = float(x) / 1000
    return str(time.strftime('%H', time.localtime(epoch_seconds)))
def ret_list(arr):
    """Materialise any iterable as a plain Python list (used as a groupby aggregator)."""
    return [item for item in arr]

In [9]:
# Turn each record's start and close value into an hour-of-day token.
behavior['s_hour'] = behavior['start'].map(TransTt2Hour)
behavior['c_hour'] = behavior['close'].map(TransTt2Hour)

In [11]:
# Per-device grouping of the behaviour log, reused by the hour-count cells below.
group_obj = behavior.groupby('device_id')
# One row per distinct device_id, in order of first appearance.
features = pd.DataFrame({'device_id': pd.unique(behavior['device_id'])})

### start : s_hour

In [12]:
# Collect every start-hour token seen for each device.
groupfeature = (
    group_obj.s_hour.agg(ret_list)
    .reset_index()
    .rename(index=str, columns={0: 's_hour'})
)

# One whitespace-separated "document" of hour tokens per device.
s_hours = groupfeature.s_hour.apply(' '.join).tolist()

sh_counts = vectorizer.fit_transform(s_hours)

# Name every count column after the hour token it really represents instead of
# assuming the fitted vocabulary holds exactly 24 entries: if some hour never
# occurs in the data, a positional 24-name assignment raises a shape error.
token_to_col = vectorizer.vocabulary_
seen_tokens = sorted(token_to_col, key=token_to_col.get)
sh_vector = pd.DataFrame(sh_counts.toarray(),
                         columns=['s' + str(int(tok)) for tok in seen_tokens])

# Always expose the full s0..s23 layout; hours that never occur get a zero column.
f_names = ['s'+str(x) for x in range(24)]
sh_vector = sh_vector.reindex(columns=f_names, fill_value=0)

sh_vector['device_id'] = groupfeature.device_id.values

### close : c_hour

In [13]:
# Collect every close-hour token seen for each device.
groupfeature = (
    group_obj.c_hour.agg(ret_list)
    .reset_index()
    .rename(index=str, columns={0: 'c_hour'})
)

# One whitespace-separated "document" of hour tokens per device.
c_hours = groupfeature.c_hour.apply(' '.join).tolist()

ch_counts = vectorizer.fit_transform(c_hours)

# Name every count column after the hour token it really represents instead of
# assuming the fitted vocabulary holds exactly 24 entries: if some hour never
# occurs in the data, a positional 24-name assignment raises a shape error.
token_to_col = vectorizer.vocabulary_
seen_tokens = sorted(token_to_col, key=token_to_col.get)
ch_vector = pd.DataFrame(ch_counts.toarray(),
                         columns=['c' + str(int(tok)) for tok in seen_tokens])

# Always expose the full c0..c23 layout; hours that never occur get a zero column.
f_names = ['c'+str(x) for x in range(24)]
ch_vector = ch_vector.reindex(columns=f_names, fill_value=0)

ch_vector['device_id'] = groupfeature.device_id.values

In [14]:
# Put start-hour and close-hour counts side by side, keyed by device_id,
# and persist the combined table.
sc_vector = pd.merge(sh_vector, ch_vector, how='left', on='device_id')
sc_vector.to_csv('features/h3.csv', index=False)

In [15]:
# Attach the hour-count features to the train and test device tables.
train_set = pd.merge(train_data, sc_vector, how='left', on='device_id')
test_set = pd.merge(test_data, sc_vector, how='left', on='device_id')

train code

In [16]:
def xgbc_code(train_data, test_data,label, num_class, n_folds=5,
              obj='multi:softprob', metric='mlogloss'):
    """Fit an XGBoost classifier with stratified K-fold CV and collect predictions.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training table. The columns 'device_id', 'sex', 'age' and 'label' are
        dropped before fitting; every remaining column is used as a feature.
    test_data : pandas.DataFrame
        Test table. Only 'device_id' is dropped before predicting.
    label : str
        Name of the column in ``train_data`` used as the target.
    num_class : int
        Number of probability columns allocated for the prediction arrays;
        must match the width of ``predict_proba`` output.
    n_folds : int, default 5
        Number of stratified folds.
    obj : str, default 'multi:softprob'
        XGBoost objective.
    metric : str, default 'mlogloss'
        Evaluation metric used for early stopping.

    Returns
    -------
    metrics : pandas.DataFrame
        One row per fold plus a final 'overall' row. The overall train value is
        the mean of the per-fold train log-losses; the overall validation value
        is the log-loss of the out-of-fold predictions on the full training set.
    train_predvec : numpy.ndarray, shape (n_train, num_class)
        Out-of-fold predicted probabilities for the training rows.
    test_predvec : numpy.ndarray, shape (n_test, num_class)
        Test probabilities averaged over the fold models.
    """
    # 1-D target vector (the scikit-learn convention for y).
    labels = train_data[label]
    train_data = train_data.drop(['device_id','sex','age','label'],axis=1)
    test_data = test_data.drop(['device_id'],axis=1)
    train_predvec = np.zeros((train_data.shape[0], num_class))
    test_predvec = np.zeros((test_data.shape[0], num_class))
    SKF = StratifiedKFold(n_splits = n_folds, shuffle = True, random_state = 2018)
    train_logloss = []
    valid_logloss = []
    for train_indices, valid_indices in SKF.split(train_data,labels):
        # split() yields positional row numbers, so select with .iloc;
        # label-based .loc is only correct on a default 0..n-1 index.
        x_train = train_data.iloc[train_indices]
        y_train = labels.iloc[train_indices]
        # Validation data for the fold
        x_valid = train_data.iloc[valid_indices]
        y_valid = labels.iloc[valid_indices]
        # NOTE(review): `silent` and fit-time `eval_metric` /
        # `early_stopping_rounds` look like the older xgboost sklearn API —
        # confirm against the installed xgboost version.
        xgbc = xgb.XGBClassifier(max_depth=3, learning_rate=0.09, n_estimators=1000,
                         silent=True, objective = obj,
                         booster='gbtree', n_jobs=-1,
                         gamma=0, subsample=1,
                         colsample_bytree=0.6, colsample_bylevel=1.,
                         reg_alpha=0, reg_lambda=1,
                         scale_pos_weight=1,
                         base_score=0.5,
                         max_delta_step = 0,
                         random_state=666)
        xgbc.fit(x_train, y_train,
                 eval_set=[(x_train, y_train),(x_valid, y_valid)],
                 eval_metric = metric,
                 early_stopping_rounds=10,
                 verbose=0)
        # Predict the validation fold once and reuse it for both the score
        # and the out-of-fold matrix.
        valid_proba = xgbc.predict_proba(x_valid)
        # record logloss
        train_logloss.append(log_loss(y_train, xgbc.predict_proba(x_train)))
        valid_logloss.append(log_loss(y_valid, valid_proba))
        train_predvec[valid_indices] = valid_proba
        test_predvec += xgbc.predict_proba(test_data)/n_folds
        # Release the fold's model and data before the next fold.
        del xgbc, x_train, y_train, x_valid, y_valid, valid_proba
        gc.collect()
        print('############## one flod is over ##############')
    # Final 'overall' row: mean train loss, out-of-fold validation loss.
    train_logloss.append(np.mean(train_logloss))
    valid_logloss.append(log_loss(labels, train_predvec))
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names,
                            'train_logloss':train_logloss,
                            'valid_logloss':valid_logloss})
    return metrics, train_predvec, test_predvec

In [161]:
# Joint sex+age target ('label'): 22 classes, 10 folds.
metric, train_1, test_1 = xgbc_code(
    train_set, test_set, label='label', num_class=22, n_folds=10)

[0]	validation_0-mlogloss:3.06968	validation_1-mlogloss:3.07154
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 10 rounds.
[10]	validation_0-mlogloss:2.93741	validation_1-mlogloss:2.95323
[20]	validation_0-mlogloss:2.87419	validation_1-mlogloss:2.902
[30]	validation_0-mlogloss:2.83903	validation_1-mlogloss:2.87767
[40]	validation_0-mlogloss:2.81638	validation_1-mlogloss:2.86526
[50]	validation_0-mlogloss:2.8004	validation_1-mlogloss:2.85862
[60]	validation_0-mlogloss:2.78769	validation_1-mlogloss:2.85486
[70]	validation_0-mlogloss:2.77683	validation_1-mlogloss:2.85286
[80]	validation_0-mlogloss:2.76717	validation_1-mlogloss:2.85167
[90]	validation_0-mlogloss:2.75832	validation_1-mlogloss:2.85115
[100]	validation_0-mlogloss:2.74997	validation_1-mlogloss:2.85054
[110]	validation_0-mlogloss:2.74191	validation_1-mlogloss:2.85043
Stopping. Best iteration:
[104]	validation_0-mlogloss:2

[10]	validation_0-mlogloss:2.93698	validation_1-mlogloss:2.95331
[20]	validation_0-mlogloss:2.87378	validation_1-mlogloss:2.90172
[30]	validation_0-mlogloss:2.83851	validation_1-mlogloss:2.87696
[40]	validation_0-mlogloss:2.81581	validation_1-mlogloss:2.86488
[50]	validation_0-mlogloss:2.79969	validation_1-mlogloss:2.85785
[60]	validation_0-mlogloss:2.78694	validation_1-mlogloss:2.85449
[70]	validation_0-mlogloss:2.77597	validation_1-mlogloss:2.85242
[80]	validation_0-mlogloss:2.76603	validation_1-mlogloss:2.85129
[90]	validation_0-mlogloss:2.75691	validation_1-mlogloss:2.85059
[100]	validation_0-mlogloss:2.74832	validation_1-mlogloss:2.85026
[110]	validation_0-mlogloss:2.74017	validation_1-mlogloss:2.84997
[120]	validation_0-mlogloss:2.73233	validation_1-mlogloss:2.84961
Stopping. Best iteration:
[117]	validation_0-mlogloss:2.73465	validation_1-mlogloss:2.84943

############## one flod is over ##############
[0]	validation_0-mlogloss:3.0698	validation_1-mlogloss:3.07119
Multiple eval 

In [163]:
# Persist the predictions returned by xgbc_code for the 'label' target.
np.save(file='new_feature/h3_train.npy', arr=train_1)
np.save(file='new_feature/h3_test.npy', arr=test_1)

In [164]:
# Age target: 11 classes, 10 folds.
metric, age_train, age_test = xgbc_code(
    train_set, test_set, label='age', num_class=11, n_folds=10)

[0]	validation_0-mlogloss:2.38286	validation_1-mlogloss:2.38348
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 10 rounds.
[10]	validation_0-mlogloss:2.28572	validation_1-mlogloss:2.29291
[20]	validation_0-mlogloss:2.23827	validation_1-mlogloss:2.25077
[30]	validation_0-mlogloss:2.21197	validation_1-mlogloss:2.22957
[40]	validation_0-mlogloss:2.19587	validation_1-mlogloss:2.21863
[50]	validation_0-mlogloss:2.1849	validation_1-mlogloss:2.21245
[60]	validation_0-mlogloss:2.17675	validation_1-mlogloss:2.20877
[70]	validation_0-mlogloss:2.17004	validation_1-mlogloss:2.20652
[80]	validation_0-mlogloss:2.16418	validation_1-mlogloss:2.20501
[90]	validation_0-mlogloss:2.15893	validation_1-mlogloss:2.20443
[100]	validation_0-mlogloss:2.15379	validation_1-mlogloss:2.2039
[110]	validation_0-mlogloss:2.149	validation_1-mlogloss:2.20361
[120]	validation_0-mlogloss:2.1443	validation_1-mlogloss

[130]	validation_0-mlogloss:2.13972	validation_1-mlogloss:2.20232
[140]	validation_0-mlogloss:2.13542	validation_1-mlogloss:2.20205
[150]	validation_0-mlogloss:2.13156	validation_1-mlogloss:2.20161
[160]	validation_0-mlogloss:2.12754	validation_1-mlogloss:2.20117
[170]	validation_0-mlogloss:2.12372	validation_1-mlogloss:2.20095
[180]	validation_0-mlogloss:2.11998	validation_1-mlogloss:2.20053
[190]	validation_0-mlogloss:2.11636	validation_1-mlogloss:2.20085
Stopping. Best iteration:
[180]	validation_0-mlogloss:2.11998	validation_1-mlogloss:2.20053

############## one flod is over ##############
[0]	validation_0-mlogloss:2.38287	validation_1-mlogloss:2.38374
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 10 rounds.
[10]	validation_0-mlogloss:2.28579	validation_1-mlogloss:2.29327
[20]	validation_0-mlogloss:2.23809	validation_1-mlogloss:2.2514
[30]	validation_0-mlogloss:2.212	valid

In [166]:
# Persist the predictions returned by xgbc_code for the 'age' target.
np.save(file='new_feature/age_train.npy', arr=age_train)
np.save(file='new_feature/age_test.npy', arr=age_test)

In [17]:
# Sex target: 2 classes, 10 folds, binary objective with plain log-loss.
metric, sex_train, sex_test = xgbc_code(
    train_set, test_set, label='sex', num_class=2, n_folds=10,
    obj='binary:logistic', metric='logloss')

[0]	validation_0-logloss:0.685316	validation_1-logloss:0.685345
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[10]	validation_0-logloss:0.651996	validation_1-logloss:0.653425
[20]	validation_0-logloss:0.645463	validation_1-logloss:0.648346
[30]	validation_0-logloss:0.64332	validation_1-logloss:0.647397
[40]	validation_0-logloss:0.642093	validation_1-logloss:0.647242
Stopping. Best iteration:
[39]	validation_0-logloss:0.642219	validation_1-logloss:0.647215

############## one flod is over ##############
[0]	validation_0-logloss:0.685312	validation_1-logloss:0.685377
Multiple eval metrics have been passed: 'validation_1-logloss' will be used for early stopping.

Will train until validation_1-logloss hasn't improved in 10 rounds.
[10]	validation_0-logloss:0.652051	validation_1-logloss:0.652878
[20]	validation_0-logloss:0.645614	validation_1-logloss:0.647236
[30]	validatio

In [19]:
# Persist the predictions returned by xgbc_code for the 'sex' target.
np.save(file='new_feature/sex_train.npy', arr=sex_train)
np.save(file='new_feature/sex_test.npy', arr=sex_test)