# OOF Stacking 베이스라인 
* (LGBM + XGBOOST + CATBOOST)

In [1]:
import pandas as pd
import os
import random
import numpy as np

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np

## 1. 데이터 로딩

In [2]:
data_dir = '/opt/ml/input/data' # 경로는 상황에 맞춰서 수정해주세요!

dtype = {
    'userID': 'int16',
    'answerCode': 'int8',
    'KnowledgeTag': 'int16'
} 

csv_file_path = os.path.join(data_dir, 'train_data.csv') # 데이터는 대회홈페이지에서 받아주세요 :)
df = pd.read_csv(csv_file_path, dtype=dtype, parse_dates=['Timestamp']) 

df = df.sort_values(by=['userID', 'Timestamp']).reset_index(drop=True)

## 2. Feature Engineering

In [3]:
from feature_engineering import FeatureEngineering

class FeatCog(FeatureEngineering):
    def __init__(self, df):
        super(FeatCog, self).__init__()
        self.df = df

    def __preprocessing__(self, df_past):
        self.df.sort_values(by=["userID", "Timestamp"], inplace=True)        

        # Test-Related
        self.df["test_L"] = self.df["assessmentItemID"].apply(lambda x: int(x[2]))
        self.df["test_M"] = self.df["assessmentItemID"].apply(lambda x: int(x[4:7]))
        self.df["test_S"] = self.df["assessmentItemID"].apply(lambda x: int(x[-3:]))

        correct_t = self.df.groupby(["testId"])["answerCode"].agg(["mean", "sum"])
        correct_t.columns = ["test_mean", "test_sum"]
        correct_k = self.df.groupby(["KnowledgeTag"])["answerCode"].agg(["mean", "sum"])
        correct_k.columns = ["tag_mean", "tag_sum"]

        self.df = pd.merge(self.df, correct_t, on=["testId"], how="left")
        self.df = pd.merge(self.df, correct_k, on=["KnowledgeTag"], how="left")
        

        # User-Related
        self.df["user_correct_answer"] = self.df.groupby("userID")["answerCode"].transform(lambda x: x.cumsum().shift(1))
        self.df["user_total_answer"] = self.df.groupby("userID")["answerCode"].cumcount()
        self.df["user_acc"] = self.df["user_correct_answer"] / self.df["user_total_answer"]


        # Tag-Related
        knowledge_clustered = self.df.loc[:, ['userID', 'KnowledgeTag']].groupby('userID').rolling(window=4, closed='right').std()
        self.df['knowledge_clustered'] = knowledge_clustered.values
        self.df['knowledge_clustered'] = self.df.knowledge_clustered.fillna(method='bfill')
        self.df['knowledge_clustered'][self.df.knowledge_clustered > 100] = 100 


        # Time-Related
        self.df["month"] = self.df["Timestamp"].dt.month
        self.df['week'] = self.df['Timestamp'].dt.isocalendar().week
        self.df["hour"] = self.df["Timestamp"].dt.hour

        diff = self.df.loc[:, ['userID', 'Timestamp']].groupby('userID').diff().fillna(pd.Timedelta(seconds=0))
        diff = diff.fillna(pd.Timedelta(seconds=0))
        diff = diff['Timestamp'].apply(lambda x: x.total_seconds())
        self.df['duration'] = diff.values
        mean_duration = diff[(diff <= 135) & (diff >= 0)].mean()
        # criterion: quantile (75%) or NA => mean duration
        self.df['duration'] = self.df.duration.apply(lambda x: x if x <= 135 else mean_duration)
        self.df['duration'] = self.df.duration.apply(lambda x: x if x > 0 else mean_duration)


        # Past_history-Related
        self.df[[f'past_testid_{i}' for i in range(1, 6)]] = df_past[[f'past_testid_{i}' for i in range(1, 6)]]
        ans = df.loc[:, ['userID', 'answerCode']].groupby('userID').rolling(window=4, closed='left').mean()
        self.df['past_OX'] = ans.values
        self.df['past_OX'] = self.df.past_OX.fillna(method='bfill')
        

        # Drop null values (user_correct_answer, user_acc)
        self.df = self.df.dropna()
        return self.df

In [4]:
df_past = pd.read_csv('train_past_tID.csv')

In [5]:
fe = FeatCog(df)
df = fe(df_past = df_past)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.df['knowledge_clustered'][self.df.knowledge_clustered > 100] = 100


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,test_L,test_M,test_S,test_mean,...,month,week,hour,duration,past_testid_1,past_testid_2,past_testid_3,past_testid_4,past_testid_5,past_OX
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,6,1,2,0.947683,...,3,13,0,3.0,-1,-1,-1,-1,-1,1.0
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,6,1,3,0.947683,...,3,13,0,8.0,-1,-1,-1,-1,-1,1.0
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,6,1,4,0.947683,...,3,13,0,7.0,-1,-1,-1,-1,-1,1.0
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,6,1,5,0.947683,...,3,13,0,7.0,-1,-1,-1,-1,-1,1.0
5,0,A060001007,A060000001,1,2020-03-24 00:17:47,7225,6,1,7,0.947683,...,3,13,0,11.0,-1,-1,-1,-1,-1,1.0


In [6]:
df.columns

Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'test_L', 'test_M', 'test_S', 'test_mean', 'test_sum',
       'tag_mean', 'tag_sum', 'user_correct_answer', 'user_total_answer',
       'user_acc', 'knowledge_clustered', 'month', 'week', 'hour', 'duration',
       'past_testid_1', 'past_testid_2', 'past_testid_3', 'past_testid_4',
       'past_testid_5', 'past_OX'],
      dtype='object')

In [None]:
cate = sum([['testId', 'assessmentItemID', 'KnowledgeTag', 'month', 'hour', 'week'],
            [f'past_testid_{i}' for i in range(1, 6)],
           ], [])
for c in cate:
    df[c] = df[c].astype('category')

In [None]:
df.sample(10)

## 3. Train/Test 데이터 셋 분리

In [None]:
# train과 test 데이터셋은 사용자 별로 묶어서 분리를 해주어야함
random.seed(42)
############################################################ 0.7 #######################################################################
def custom_train_test_split(df, ratio=0.7, split=True):
    
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
    random.shuffle(users)
    
    max_train_data_len = ratio*len(df)
    sum_of_train_data = 0
    user_ids =[]

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)


    train = df[df['userID'].isin(user_ids)]
    test = df[df['userID'].isin(user_ids) == False]

    #test데이터셋은 각 유저의 마지막 interaction만 추출
    test = test[test['userID'] != test['userID'].shift(-1)]
    return train, test

In [None]:
# 유저별 분리
train, test = custom_train_test_split(df)

# 사용할 Feature 설정
FEATS = ['month', 'hour', 'week', 'past_OX', 'test_L', 'test_M', 'test_S', 'knowledge_clustered',
         'KnowledgeTag', 'user_correct_answer', 'user_total_answer', 'duration', 'testId',
         'user_acc', 'test_mean', 'test_sum', 'tag_mean','tag_sum',
         'assessmentItemID', 'past_testid_1', 'past_testid_2', 'past_testid_3', 'past_testid_4', 'past_testid_5']

In [None]:
np.where(test.answerCode == -1)

In [None]:
# drop outliers
# 1. # of items solved <= 30
# 2. 0.1 <= total score <= 0.95

# total_solved = df.groupby('userID').agg({'assessmentItemID':'count'})
# total_score = df.groupby('userID').agg({'answerCode': 'mean'})

# outlier_i = total_solved[total_solved.assessmentItemID <= 30].index
# outlier_a = total_score[(total_score.answerCode <= 0.1) | (total_score.answerCode >= 0.95)].index
# outlier = outlier_i.union(outlier_a)

# train = train.drop(train[train.userID.isin(outlier)].index)

In [None]:
# X, y 값 분리
y_train = train['answerCode']
train = train.drop(['answerCode'], axis=1)

y_test = test['answerCode']
test = test.drop(['answerCode'], axis=1)

## OOF Stacking

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

from matplotlib import pyplot as plt
import matplotlib

### LGBM

In [None]:
X_train = train[FEATS]
X_test  = test[FEATS]
# y_train = train['answerCode']
# y_test = test['answerCode']

# Number of folds
n_folds = 4
# Empty array to store out-of-fold predictions (single column)
S_train_lgb = np.zeros((X_train.shape[0], 1))
# Empty array to store temporary test set predictions made in each fold
S_test_temp = np.zeros((X_test.shape[0], n_folds))
# Empty list to store scores from each fold
scores = []
# Split initialization
kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)

In [None]:
# Loop across folds
for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train, y_train)):
    
    # Split data and target
    X_tr = X_train.iloc[tr_index]
    y_tr = y_train.iloc[tr_index]
    X_te = X_train.iloc[te_index]
    y_te = y_train.iloc[te_index]
    
    # Fit
    lgb_train = lgb.Dataset(X_tr, y_tr)
    lgb_test  = lgb.Dataset(X_te, y_te)
    model = lgb.train(
        {'objective': 'binary',
        'metric': 'auc',
        'max_depth':8,
        'num_leaves':64,
        },  
        lgb_train,
        valid_sets=[lgb_train, lgb_test],
        verbose_eval=100,
        num_boost_round=500,
        early_stopping_rounds=100
    )
    
    # Predict out-of-fold part of train set
    S_train_lgb[te_index, :] = model.predict(X_te).reshape(-1, 1)
    
    # Predict test set
    S_test_temp[:, fold_counter] = model.predict(X_test)
    
    # Print score of current fold
    score = mean_absolute_error(y_te, S_train_lgb[te_index, :])
    scores.append(score)
    print('fold %d: [%.8f]' % (fold_counter, score))

In [None]:
preds = model.predict(test[FEATS])
acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_test, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')

In [None]:
# Compute mean of temporary test set predictions to get final test set prediction
S_test_lgb = np.mean(S_test_temp, axis=1).reshape(-1, 1)

# Mean OOF score + std
print('\nMEAN:   [%.8f] + [%.8f]' % (np.mean(scores), np.std(scores)))

# Full OOF score
# !!! FULL score slightly differs from MEAN score because folds contain
# different number of examples (404 can't be divided by 3)
# If we set n_folds=4 scores will be identical for given metric
print('FULL:   [%.8f]' % (mean_absolute_error(y_train, S_train_lgb)))

In [None]:
zeros = np.where(y_test == 0)[0]
ones = np.where(y_test == 1)[0]

fig, ax = plt.subplots(figsize=(12,8))

ax.set_title('Distribution of predicted zeros and ones')

ax.hist(S_test_lgb[zeros], bins=50, alpha=0.5, stacked=True, density=1, label='Zeros')
ax.hist(S_test_lgb[ones], bins=50, alpha=0.5, stacked=True, density=1, label='Ones')

ax.legend()

### XGBoost

In [None]:
xgb.set_config(verbosity=0)

# Empty array to store out-of-fold predictions (single column)
S_train_xgb = np.zeros((X_train.shape[0], 1))
# Empty array to store temporary test set predictions made in each fold
S_test_temp = np.zeros((X_test.shape[0], n_folds))
# Convert to XGBoost matrix
X_test_xgb = xgb.DMatrix(X_test, enable_categorical=True)

params = {'learning_rate': 0.01,
            'max_depth':8,
            'eta' : 0.1,
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
#           'is_training_metric': True,
            'feature_fraction': 1,
            'seed':42,
            'gpu_id':0
            }
# Loop across folds
for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train, y_train)):
    
    # Split data and target
    X_tr = X_train.iloc[tr_index]
    y_tr = y_train.iloc[tr_index]
    X_te = X_train.iloc[te_index]
    y_te = y_train.iloc[te_index]
    
    # Fit
    xgb_train = xgb.DMatrix(X_tr, y_tr, enable_categorical=True)
    xgb_test  = xgb.DMatrix(X_te, y_te, enable_categorical=True)
    model = xgb.train(
                    params, 
                    xgb_train,
                    evals = [(xgb_train, 'train'), (xgb_test,'eval')],
                    num_boost_round=500,
                    early_stopping_rounds=100
                )
    
    # Predict out-of-fold part of train set
    S_train_xgb[te_index, :] = model.predict(xgb_test).reshape(-1, 1)
    
    # Predict test set
    S_test_temp[:, fold_counter] = model.predict(X_test_xgb)
    
    # Print score of current fold
    score = mean_absolute_error(y_te, S_train_xgb[te_index, :])
    scores.append(score)
    print('fold %d: [%.8f]' % (fold_counter, score))

In [None]:
preds = model.predict(X_test_xgb)
acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_test, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')

In [None]:
# Compute mean of temporary test set predictions to get final test set prediction
S_test_xgb = np.mean(S_test_temp, axis=1).reshape(-1, 1)

# Mean OOF score + std
print('\nMEAN:   [%.8f] + [%.8f]' % (np.mean(scores), np.std(scores)))

# Full OOF score
# !!! FULL score slightly differs from MEAN score because folds contain
# different number of examples (404 can't be divided by 3)
# If we set n_folds=4 scores will be identical for given metric
print('FULL:   [%.8f]' % (mean_absolute_error(y_train, S_train_xgb)))

In [None]:
zeros_tr = np.where(y_train == 0)[0]
ones_tr = np.where(y_train == 1)[0]

fig, ax = plt.subplots(figsize=(12,8))

ax.set_xlabel('Predicted Probability')
ax.set_ylabel('Scaled Counts')
ax.set_title('Distribution of predicted zeros and ones (LGBM)')

ax.hist(S_train_lgb[zeros_tr], bins=50, alpha=0.5, stacked=True, density=1, label='Zeros')
ax.hist(S_train_lgb[ones_tr], bins=50, alpha=0.5, stacked=True, density=1, label='Ones')

ax.legend()

In [None]:
fig, ax = plt.subplots(figsize=(12,8))

ax.set_title('Distribution of predicted zeros and ones')

ax.hist(S_test_xgb[zeros], bins=50, alpha=0.5, stacked=True, density=1, label='Zeros')
ax.hist(S_test_xgb[ones], bins=50, alpha=0.5, stacked=True, density=1, label='Ones')

ax.legend()

### CatBoost

In [None]:
from pandas.api.types import is_numeric_dtype

def get_categorical_indicies(X):
    cats = []
    for col in X.columns:
        if is_numeric_dtype(X[col]):
            pass
        else:
            cats.append(col)
    cat_indicies = []
    for col in cats:
        cat_indicies.append(X.columns.get_loc(col))
    return cat_indicies

train_categorical_indicies = get_categorical_indicies(X_train)
test_categorical_indicies = get_categorical_indicies(X_test)

In [None]:
# Empty array to store out-of-fold predictions (single column)
S_train_cat = np.zeros((X_train.shape[0], 1))
# Empty array to store temporary test set predictions made in each fold
S_test_temp = np.zeros((X_test.shape[0], n_folds))
# Convert to catvoost array
X_test_cat  = cb.Pool(X_test, cat_features=test_categorical_indicies)

scores = []

params = {'learning_rate': 0.01,
            'depth':8,
            'objective': 'Logloss',
            'eval_metric': 'AUC',
#           'is_training_metric': True,
            # 'seed':42,
            'task_type':"GPU",
            # 'devices':0
            }
# Loop across folds
for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train, y_train)):
    
    # Split data and target
    X_tr = X_train.iloc[tr_index]
    y_tr = y_train.iloc[tr_index]
    X_te = X_train.iloc[te_index]
    y_te = y_train.iloc[te_index]
    
    # Fit
    cat_train = cb.Pool(X_tr, y_tr, cat_features=train_categorical_indicies)
    cat_test  = cb.Pool(X_te, y_te, cat_features=train_categorical_indicies)
    model = cb.train(
                    params, 
                    cat_train,
                    evals = [(cat_train, 'train'), (cat_test,'eval')],
                    num_boost_round=500,
                    early_stopping_rounds=100
                )
    
    # Predict out-of-fold part of train set
    S_train_cat[te_index, :] = model.predict(cat_test).reshape(-1, 1)
    
    # Predict test set
    S_test_temp[:, fold_counter] = model.predict(X_test_cat)
    
    # Print score of current fold
    score = mean_absolute_error(y_te, S_train_cat[te_index, :])
    scores.append(score)
    print('fold %d: [%.8f]' % (fold_counter, score))

In [None]:
preds = model.predict(test[FEATS])
acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_test, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')

In [None]:
# Compute mean of temporary test set predictions to get final test set prediction
S_test_cat = np.mean(S_test_temp, axis=1).reshape(-1, 1)

# Mean OOF score + std
print('\nMEAN:   [%.8f] + [%.8f]' % (np.mean(scores), np.std(scores)))

# Full OOF score
# !!! FULL score slightly differs from MEAN score because folds contain
# different number of examples (404 can't be divided by 3)
# If we set n_folds=4 scores will be identical for given metric
print('FULL:   [%.8f]' % (mean_absolute_error(y_train, S_train_cat)))

### Second Level Analysis (Ensemble)

In [None]:
S_train = np.hstack([S_train_lgb, S_train_xgb, S_train_cat])
S_test = np.hstack([S_test_lgb, S_test_xgb, S_test_cat])
S_train.size

In [None]:
# Convert to LGBM dataset
lgb_train = lgb.Dataset(S_train, y_train)
lgb_test = lgb.Dataset(S_test, y_test)

# Initialize and fit 2nd level model
model = lgb.train(
        {'objective': 'binary',
        'metric': 'auc',
        'max_depth':6,
        'num_leaves':16,
        },  
        lgb_train,
        valid_sets=[lgb_train, lgb_test],
        verbose_eval=100,
        num_boost_round=500,
        early_stopping_rounds=100
    )

# Predict
y_pred = model.predict(S_test)

# Final prediction score
from sklearn.metrics import log_loss
print('Final prediction score: %.8f' % log_loss(y_test, y_pred))

## Store Preprocessed

In [None]:
np.save('S_train_lgb.npy', S_train_lgb)
np.save('S_test_lgb.npy', S_test_lgb)
np.save('S_train_xgb.npy', S_train_xgb)
np.save('S_test_xgb.npy', S_test_xgb)

In [None]:
# ?????????????????????????????????????????????? #
train.to_csv('train_preprocessed.csv', sep=',')
test.to_csv('val_preprocessed.csv', sep=',')
# ?????????????????????????????????????????????? #