In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder

from IPython.display import display

def reduce_dimen(dataset,column,toreplace):
    for index,i in dataset[column].duplicated(keep=False).iteritems():
        if i==False:
            dataset.set_value(index,column,toreplace)
    return dataset

def act_data_treatment(dsname):
    dataset = dsname

    for col in list(dataset.columns):
        if col not in ['people_id', 'activity_id', 'date', 'char_38', 'outcome']:
            if dataset[col].dtype == 'object':
                dataset[col].fillna('type 0', inplace=True)
                dataset[col] = dataset[col].apply(lambda x: x.split(' ')[1]).astype(np.int32)
            elif dataset[col].dtype == 'bool':
                dataset[col] = dataset[col].astype(np.int8)

    dataset['year'] = dataset['date'].dt.year
    dataset['month'] = dataset['date'].dt.month
    dataset['day'] = dataset['date'].dt.day
    dataset['isweekend'] = (dataset['date'].dt.weekday >= 5).astype(int)
    dataset = dataset.drop('date', axis = 1)

    return dataset

まずはデータの読み込み、preprocessing

MEMO: char_10をドロップしているのはdata leak関連？

In [2]:
act_train_data = pd.read_csv("../inputs/act_train.csv",dtype={'people_id': np.str, 'activity_id': np.str, 'outcome': np.int8}, parse_dates=['date'])
act_test_data  = pd.read_csv("../inputs/act_test.csv", dtype={'people_id': np.str, 'activity_id': np.str}, parse_dates=['date'])
people_data    = pd.read_csv("../inputs/people.csv", dtype={'people_id': np.str, 'activity_id': np.str, 'char_38': np.int32}, parse_dates=['date'])

print("Before drop:")
print("Train data shape: " + format(act_train_data.shape))
print("Test data shape: " + format(act_test_data.shape))
print("People data shape: " + format(people_data.shape))

act_train_data=act_train_data.drop('char_10',axis=1)
act_test_data=act_test_data.drop('char_10',axis=1)

print("After drop:")
print("Train data shape: " + format(act_train_data.shape))
print("Test data shape: " + format(act_test_data.shape))
print("People data shape: " + format(people_data.shape))

act_train_data  = act_data_treatment(act_train_data)
act_test_data   = act_data_treatment(act_test_data)
people_data = act_data_treatment(people_data)

Before drop:
Train data shape: (2197291, 15)
Test data shape: (498687, 14)
People data shape: (189118, 41)
After drop:
Train data shape: (2197291, 14)
Test data shape: (498687, 13)
People data shape: (189118, 41)


In [3]:
print("Head act_train_data")
display(act_train_data.head())

print("Head people_data")
display(people_data.head())

Head act_train_data


Unnamed: 0,people_id,activity_id,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,outcome,year,month,day,isweekend
0,ppl_100,act2_1734928,4,0,0,0,0,0,0,0,0,0,0,2023,8,26,1
1,ppl_100,act2_2434093,2,0,0,0,0,0,0,0,0,0,0,2022,9,27,0
2,ppl_100,act2_3404049,2,0,0,0,0,0,0,0,0,0,0,2022,9,27,0
3,ppl_100,act2_3651215,2,0,0,0,0,0,0,0,0,0,0,2023,8,4,0
4,ppl_100,act2_4109017,2,0,0,0,0,0,0,0,0,0,0,2023,8,26,1


Head people_data


Unnamed: 0,people_id,char_1,group_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,...,char_33,char_34,char_35,char_36,char_37,char_38,year,month,day,isweekend
0,ppl_100,2,17304,2,5,5,5,3,11,2,...,0,1,1,1,0,36,2021,6,29,0
1,ppl_100002,2,8688,3,28,9,5,3,11,2,...,1,1,1,1,0,76,2021,1,6,0
2,ppl_100003,2,33592,3,4,8,5,2,5,2,...,1,1,0,1,1,99,2022,6,10,0
3,ppl_100004,2,22593,3,40,25,9,4,16,2,...,1,1,1,1,1,76,2022,7,20,0
4,ppl_100006,2,6534,3,40,25,9,3,8,2,...,0,0,1,1,0,84,2022,7,27,0


actionテーブルにpeopleを左結合

reduce_dimenでは、categoricalなcolumnについて、メンバーが1つしか無いカテゴリを全てまとめて1つのカテゴリにしてしまっている。
次元は大きく削減できていそうだけど、これでいい理由はなんだろう？

In [9]:
train = act_train_data.merge(people_data, on='people_id', how='left', left_index=True)
test  = act_test_data.merge(people_data, on='people_id', how='left', left_index=True)
print("Head train:")
display(train.head())

train=train.sort_values(['people_id'], ascending=[1])
test=test.sort_values(['people_id'], ascending=[1])

train_columns = train.columns.values
test_columns = test.columns.values
features = list(set(train_columns) & set(test_columns))

train.fillna('NA', inplace=True)
test.fillna('NA', inplace=True)

y = train.outcome
train_without_outcome=train.drop('outcome',axis=1)

whole=pd.concat([train_without_outcome,test], ignore_index=True)
categorical=['group_1','activity_category','char_1_x','char_2_x','char_3_x','char_4_x','char_5_x','char_6_x','char_7_x','char_8_x','char_9_x','char_2_y','char_3_y','char_4_y','char_5_y','char_6_y','char_7_y','char_8_y','char_9_y']
for category in categorical:
    whole=reduce_dimen(whole,category,9999999)

X=whole[:len(train_without_outcome)]
X_test=whole[len(train_without_outcome):]

print("Head X:")
display(X.head())

Head train:


Unnamed: 0,people_id,activity_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,...,char_33,char_34,char_35,char_36,char_37,char_38,year_y,month_y,day_y,isweekend_y
0,ppl_100,act2_1734928,4,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
0,ppl_100,act2_2434093,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
0,ppl_100,act2_3404049,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
0,ppl_100,act2_3651215,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
0,ppl_100,act2_4109017,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0


Head X:


Unnamed: 0,people_id,activity_id,activity_category,char_1_x,char_2_x,char_3_x,char_4_x,char_5_x,char_6_x,char_7_x,...,char_33,char_34,char_35,char_36,char_37,char_38,year_y,month_y,day_y,isweekend_y
0,ppl_100,act2_1734928,4,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
1,ppl_100,act2_2434093,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
2,ppl_100,act2_3404049,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
3,ppl_100,act2_3651215,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
4,ppl_100,act2_4109017,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0


In [10]:
X=X.sort_values(['people_id'], ascending=[1])

X = X[features].drop(['people_id', 'activity_id'], axis = 1)
X_test = X_test[features].drop(['people_id', 'activity_id'], axis = 1)

In [1]:
categorical=['group_1','activity_category','char_1_x','char_2_x','char_3_x','char_4_x','char_5_x','char_6_x','char_7_x','char_8_x','char_9_x','char_2_y','char_3_y','char_4_y','char_5_y','char_6_y','char_7_y','char_8_y','char_9_y']
not_categorical=[]
for category in X.columns:
    if category not in categorical:
        not_categorical.append(category)

enc = OneHotEncoder(handle_unknown='ignore')
enc=enc.fit(pd.concat([X[categorical],X_test[categorical]]))
X_cat_sparse=enc.transform(X[categorical])
X_test_cat_sparse=enc.transform(X_test[categorical])

NameError: name 'X' is not defined

In [15]:
from scipy.sparse import hstack
X_sparse=hstack((X[not_categorical], X_cat_sparse))
X_test_sparse=hstack((X_test[not_categorical], X_test_cat_sparse))

In [16]:
print("Training data: " + format(X_sparse.shape))
print("Test data: " + format(X_test_sparse.shape))
print("###########")
print("One Hot enconded Test Dataset Script")

print(X_sparse)

Training data: (2197291, 31271)
Test data: (498687, 31271)
###########
One Hot enconded Test Dataset Script
  (0, 2)	1.0
  (0, 3)	36.0
  (0, 4)	26.0
  (0, 5)	29.0
  (0, 6)	1.0
  (0, 7)	1.0
  (0, 13)	6.0
  (0, 15)	1.0
  (0, 16)	1.0
  (0, 17)	1.0
  (0, 18)	2021.0
  (0, 19)	1.0
  (0, 20)	1.0
  (0, 21)	1.0
  (0, 22)	2023.0
  (0, 27)	1.0
  (0, 28)	1.0
  (0, 30)	8.0
  (0, 31)	1.0
  (0, 33)	2.0
  (0, 35)	1.0
  (1, 2)	1.0
  (1, 3)	36.0
  (1, 4)	27.0
  (1, 5)	29.0
  :	:
  (2197289, 31072)	1.0
  (2197289, 31060)	1.0
  (2197289, 31027)	1.0
  (2197289, 30975)	1.0
  (2197289, 30969)	1.0
  (2197289, 11386)	1.0
  (2197290, 31263)	1.0
  (2197290, 31255)	1.0
  (2197290, 31230)	1.0
  (2197290, 31222)	1.0
  (2197290, 31214)	1.0
  (2197290, 31194)	1.0
  (2197290, 31146)	1.0
  (2197290, 31144)	1.0
  (2197290, 31122)	1.0
  (2197290, 31103)	1.0
  (2197290, 31094)	1.0
  (2197290, 31088)	1.0
  (2197290, 31080)	1.0
  (2197290, 31072)	1.0
  (2197290, 31060)	1.0
  (2197290, 31027)	1.0
  (2197290, 30975)	1.0
  (21

In [6]:
dtrain = xgb.DMatrix(X_sparse,label=y)
dtest = xgb.DMatrix(X_test_sparse)

param = {'max_depth':10, 'eta':0.02, 'silent':1, 'objective':'binary:logistic' }
param['nthread'] = 4
param['eval_metric'] = 'auc'
param['subsample'] = 0.7
param['colsample_bytree']= 0.7
param['min_child_weight'] = 0
param['booster'] = "gblinear"

watchlist  = [(dtrain,'train')]
num_round = 300
early_stopping_rounds=10
bst = xgb.train(param, dtrain, num_round, watchlist,early_stopping_rounds=early_stopping_rounds)

ypred = bst.predict(dtest)
output = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred })
output.head()
output.to_csv('without_leak.csv', index = False)

[0]	train-auc:0.886898
Will train until train-auc hasn't improved in 10 rounds.
[1]	train-auc:0.895088
[2]	train-auc:0.903545
[3]	train-auc:0.912225
[4]	train-auc:0.920658
[5]	train-auc:0.928673
[6]	train-auc:0.936026
[7]	train-auc:0.942769
[8]	train-auc:0.949012
[9]	train-auc:0.954792
[10]	train-auc:0.96008
[11]	train-auc:0.964846
[12]	train-auc:0.969078
[13]	train-auc:0.972758
[14]	train-auc:0.975912
[15]	train-auc:0.978599
[16]	train-auc:0.980871
[17]	train-auc:0.982761
[18]	train-auc:0.984354
[19]	train-auc:0.985683
[20]	train-auc:0.986797


KeyboardInterrupt: 