In [1]:
import pandas as pd
import numpy as np
import os
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import gc
# Number of features, taken from the head of the sorted feature-importance
# table, that create_cat_vars bins into categorical codes.
num_categorical = 10

In [2]:
def create_target_encoding(train_df, test_df, cat_vars):
    """Replace the categorical columns of ``test_df`` with target statistics.

    For every column in ``cat_vars`` the mean, sum and variance of
    ``train_df["target"]`` are computed per category value on ``train_df``
    and looked up for each row of ``test_df``.  The results are appended as
    ``<column>_target_mean``, ``<column>_target_sum`` and
    ``<column>_target_var``; the categorical column itself is removed from
    the returned test frame.  Category values that do not occur in
    ``train_df`` yield NaN.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame the statistics are fitted on.  Must contain a ``"target"``
        column and every column listed in ``cat_vars``.
    test_df : pandas.DataFrame
        Frame the statistics are applied to.  Must contain every column
        listed in ``cat_vars``.  It is not modified.
    cat_vars : iterable of str
        Names of the categorical columns to encode.

    Returns
    -------
    tuple
        ``(train_df, encoded_test_df)``.  ``train_df`` is returned unchanged;
        ``encoded_test_df`` is a new frame that keeps the row order and index
        of ``test_df``.
    """
    aggs = ["mean", "sum", "var"]
    cat_columns = list(cat_vars)

    # Look every statistic up by category value with Series.map.  This keeps
    # the row order and index of test_df intact, so no set_index/reset_index
    # round trip (one full-frame copy per column) is required.
    encoded_columns = {}
    for column in cat_columns:
        stats = train_df.groupby(column)["target"].agg(aggs)
        keys = test_df[column]
        for agg in aggs:
            encoded_columns[column + "_target_" + agg] = keys.map(stats[agg])

    # drop() returns a new frame, so the caller's test_df is left untouched.
    encoded_test_df = test_df.drop(columns=cat_columns).assign(**encoded_columns)
    return train_df, encoded_test_df

In [4]:
def create_cat_vars(train_df, test_df,
                    feat_importance_path=r"../output/feature importance/files/",
                    model_type="lgb", n_categorical=None):
    """Bin a set of features, chosen from a feature-importance file, into 5 codes.

    The directory ``feat_importance_path`` is scanned for files named
    ``<model>_<tag>_<score>.csv``.  Among the files whose ``<model>`` part
    equals ``model_type`` the one with the highest (strictly positive) score
    is read.  Its rows are sorted by ``importance`` in ascending order and the
    first ``n_categorical`` entries of its ``feature`` column are selected.
    Each selected column is then cut into 5 equal-width bins over the
    concatenation of ``train_df`` and ``test_df`` and replaced by the bin code.

    NOTE(review): because the sort is ascending, the *least* important
    features are the ones that get binned -- confirm this is intended.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames sharing the feature columns.  Neither is modified.
    feat_importance_path : str, optional
        Directory holding the feature-importance CSV files.
    model_type : str, optional
        Model prefix a file name must carry to be considered.
    n_categorical : int, optional
        Number of features to bin.  Defaults to the notebook-level
        ``num_categorical`` constant.

    Returns
    -------
    tuple
        ``(train_part, test_part, cat_vars)`` where the two frames are the
        binned counterparts of the inputs and ``cat_vars`` is the array of
        selected feature names.  If no matching file exists the inputs are
        returned unchanged together with ``None``.
    """
    if n_categorical is None:
        n_categorical = num_categorical

    max_score = 0
    file_name = None
    for entry in os.listdir(feat_importance_path):
        parts = entry.split(".csv")[0].split("_")
        # Ignore anything that does not follow <model>_<tag>_<score>; a stray
        # file in the directory must not abort the scan.
        if len(parts) != 3:
            continue
        entry_model_type, _, raw_score = parts
        try:
            score = float(raw_score)
        except ValueError:
            continue
        if model_type == entry_model_type and score > max_score:
            max_score = score
            file_name = entry

    if file_name is None:
        print("No file for model found.")
        return train_df, test_df, None

    print("Using file {}\n".format(file_name))
    feat_importance_df = pd.read_csv(os.path.join(feat_importance_path, file_name),
                                     engine="python")
    cat_vars = feat_importance_df.sort_values(by="importance")[:n_categorical].feature.values

    # Bin over train and test together so both share the same bin edges.
    n_train = len(train_df)
    df = pd.concat([train_df, test_df])
    for column in cat_vars:
        df[column] = pd.cut(df[column], 5).cat.codes

    # Split positionally: the concatenated index may contain duplicate labels.
    return df.iloc[:n_train], df.iloc[n_train:], cat_vars

In [3]:
# Read the raw train / test tables and report their shapes.
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')
shape_report = '\nShape of Train Data: {}\t Shape of Test Data: {}'
print(shape_report.format(train_df.shape, test_df.shape))

# Keep the labels and the row index aside before the non-feature columns go.
train_labels = train_df['target']
train_index = np.array(train_df.index)

# Only feature columns remain in the frames from here on.
train_df = train_df.drop(columns=['ID_code', 'target'])
test_df = test_df.drop(columns=['ID_code'])


Shape of Train Data: (200000, 202)	 Shape of Test Data: (200000, 201)


In [5]:
# Bin the features selected from the feature-importance file into codes.
train_df, test_df, cat_vars = create_cat_vars(train_df, test_df)

# create_cat_vars hands back None when no feature-importance file matched.
assert cat_vars is not None

# Containers for the cross-validation results.
feature_names = list(map(str, train_df.columns))
feature_importance = pd.DataFrame()
oof_preds = np.zeros(len(train_df))
test_preds = np.zeros(len(test_df))

# Outer stratified split; the seed is fixed so folds are reproducible.
skf = StratifiedKFold(shuffle=True, n_splits=2, random_state=42)

Using file lgb_optimal_0.8998770915621923.csv



In [29]:
# For every outer fold, target-encode its training rows out-of-fold: the
# training rows are split again, the statistics are fitted on one inner part
# and applied to the other, and the encoded parts are stitched back together.
for counter, ids in enumerate(skf.split(train_index, train_labels.values)):
    print('\nFold {}'.format(counter+1))
    y_train, y_val = train_labels.values[ids[0]], train_labels.values[ids[1]]

    _skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    _train_index = ids[0]
    _train_labels = y_train

    # Collect the encoded inner parts and concatenate once after the loop,
    # instead of growing a frame that starts out empty on every iteration.
    encoded_parts = []
    for _counter, _ids in enumerate(_skf.split(_train_index, _train_labels)):
        # _ids are positions within _train_index; translate them to row labels.
        fit_rows = _train_index[_ids[0]]
        apply_rows = _train_index[_ids[1]]

        # The fitting part needs the target column next to the features.
        _train_df = pd.concat([train_df.loc[fit_rows, :], train_labels.loc[fit_rows]], axis=1)
        _test_df = train_df.loc[apply_rows, :]
        _train_df, _test_df = create_target_encoding(_train_df, _test_df, cat_vars)
        encoded_parts.append(_test_df)
        del _train_df, _test_df

    X_train = pd.concat(encoded_parts)

    # Sanity check: after sorting, the index must match the outer train ids.
    print(" before sort\n",X_train.index)
    X_train = X_train.sort_index()
    print(" after sort\n",X_train.index)
    print("train ids\n",ids[0])


Fold 1
 before sort
 Int64Index([     1,      7,      9,     10,     14,     17,     18,     29,
                34,     39,
            ...
            199948, 199951, 199953, 199964, 199968, 199980, 199983, 199989,
            199991, 199995],
           dtype='int64', length=100000)
 after sort
 Int64Index([     1,      2,      5,      6,      7,      8,      9,     10,
                14,     15,
            ...
            199977, 199979, 199980, 199981, 199983, 199989, 199990, 199991,
            199995, 199999],
           dtype='int64', length=100000)
train ids
 [     1      2      5 ... 199991 199995 199999]

Fold 2
 before sort
 Int64Index([     0,     12,     13,     23,     24,     25,     35,     36,
                55,     59,
            ...
            199967, 199969, 199972, 199982, 199984, 199992, 199993, 199994,
            199996, 199997],
           dtype='int64', length=100000)
 after sort
 Int64Index([     0,      3,      4,     11,     12,     13,     21,     2