In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/criteo-dataset/dac/test.txt
/kaggle/input/criteo-dataset/dac/readme.txt
/kaggle/input/criteo-dataset/dac/train.txt


In [2]:
import seaborn as sns;
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import log_loss

In [15]:
# Column layout used for train.txt: the click label, 13 integer features
# (I1..I13), then 26 categorical features (C1..C26).
col_names_train = (
    ['Label']
    + ['I{}'.format(i) for i in range(1, 14)]
    + ['C{}'.format(i) for i in range(1, 27)]
)

# The file is tab-separated and has no header row, so names are supplied here.
# Passing chunksize makes read_csv hand back a reader instead of a DataFrame;
# the explicit sizes given to get_chunk() below decide how many rows are read.
df_train = pd.read_csv(
    '/kaggle/input/criteo-dataset/dac/train.txt',
    sep='\t',
    names=col_names_train,
    chunksize=100000,
)

# test.txt is not loaded: the evaluation hold-out is cut from train.txt instead.

# The reader is consumed sequentially, so the two calls below return
# consecutive, non-overlapping slices of the file. Re-running only these two
# lines (without recreating the reader above) would pull *later* rows.
df_train_100 = df_train.get_chunk(1000000)  # first 1,000,000 rows -> analysis / training
df_test_25 = df_train.get_chunk(250000)     # next 250,000 rows   -> testing

In [16]:
# To keep model fitting simple there is no cross-validation: models are
# trained on df_train_100 and their performance is measured on df_test_25.

# Let pandas re-infer the column dtypes of both frames in one pass.
df_train_100, df_test_25 = (
    frame.convert_dtypes() for frame in (df_train_100, df_test_25)
)

## Notes
- Using the first one million records for analysis
- Using the next 250,000 as testing data
- for simplicity of model fitting, no cross-validation process
- directly train on df_train_100 and test for performance on df_test_25

## 有大量缺失的特征和标签间的相关性调查

- above 70% missing: I12, C22

In [None]:
# I12: missing-value ratio 0.770057.
# Box plot of I12 per label, followed by the per-label distribution.
p1 = sns.boxplot(data=df_train_100, x="Label", y="I12")
p2 = sns.displot(data=df_train_100, x="I12", hue="Label")

In [None]:
# Zoom in on small values: keep rows with 1 <= I12 <= 5
# (between() includes both end points by default).
I12_temp = df_train_100[df_train_100["I12"].between(1, 5)]
p1 = sns.boxplot(data=I12_temp, x="Label", y="I12")
p2 = sns.displot(data=I12_temp, x="I12", hue="Label")

In [None]:
# Next range: keep rows with 5 <= I12 <= 25. Both end points are included,
# so rows with I12 == 5 also appear in the 1..5 plot.
I12_temp = df_train_100[df_train_100["I12"].between(5, 25)]
p1 = sns.boxplot(data=I12_temp, x="Label", y="I12")
p2 = sns.displot(data=I12_temp, x="I12", hue="Label")

- I12与Label间无明显相关性（各个值上都出现两种标签），77%缺少 -> 可考虑舍弃该特征
- 或对数值类特征分箱/离散化

In [None]:
# C22: missing-value ratio 0.738959.
# Count of rows per C22 category, split by label.
p1 = sns.catplot(data=df_train_100, x='C22', hue='Label', kind='count')
# Category labels are rotated to vertical so they do not overlap.
plt.xticks(rotation=90)
plt.show()

## 特征工程 - GBDT
- based on 3idiots-preA
- https://github.com/ycjuan/kaggle-2014-criteo
- https://www.kaggle.com/c/criteo-display-ad-challenge/discussion/10555
- replace all numerical missing values with -10; one-hot encoding with selection (only include dense variables)
`target_cat_feats = ['C9-a73ee510', 'C22-', 'C17-e5ba7672', 'C26-', 'C23-32c7478e', 'C6-7e0ccccf', 'C14-b28479f6', 'C19-21ddcdc9', 'C14-07d13a8f', 'C10-3b08e48b', 'C6-fbad5c96', 'C23-3a171ecb', 'C20-b1252a9d', 'C20-5840adea', 'C6-fe6b92e5', 'C20-a458ea53', 'C14-1adce6ef', 'C25-001f3601', 'C22-ad3062eb', 'C17-07c540c4', 'C6-', 'C23-423fab69', 'C17-d4bb7bd8', 'C2-38a947a1', 'C25-e8b83407', 'C9-7cc72ec2']`
- 特征构建逻辑: 大多数数值型特征都只含正值（I2最小值-2），-10是给NA赋了一个特殊值 （**可用其他值，范围外不影响score**）；对所有类别型特征做one-hot后只保留稠密（在原完整训练集中出现超过四百万次）的特征，因为传统gbdt不适合过分稀疏的特征矩阵（prohibitively expensive computation & memory problem），原始one-hot之后的特征需做筛选（**可根据此训练集尝试其他筛选**）

### 3-idiots methods

In [17]:
def pre1_gbdt(df_train, df_test):
    '''
    Preprocess train/test frames for a GBDT model with the methods used in
    Team 3idiots' solution.

    Steps
    -----
    1. Missing numerical values (I1..I13) are replaced with -10 and missing
       categorical values (C1..C26) with the string 'NA'.
    2. The categorical columns are one-hot encoded. The encoder is fitted on
       the training frame only; categories that appear only in the test
       frame are ignored (their rows get all-zero dummies).
    3. Only a fixed list of 26 dense dummy columns is kept, together with all
       13 numerical columns.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames whose columns are ``col_names_train`` in that order (``Label``
        first). The callers' frames are not modified.

    Returns
    -------
    X_train, X_test
        Feature matrices (sparse in practice): the kept dummy columns in
        encoder order, followed by I1..I13.
    y_train, y_test : numpy.ndarray of int
        The ``Label`` column of each frame.

    Notes
    -----
    Columns are located through the fitted encoder's ``categories_`` instead
    of ``ColumnTransformer.get_feature_names()``; that method was removed in
    scikit-learn 1.2 and its ``encoder__x8_...`` naming was version specific.
    '''
    num_cols = col_names_train[1:14]
    cat_cols = col_names_train[14:]

    fill_values = dict.fromkeys(num_cols, -10)
    fill_values.update(dict.fromkeys(cat_cols, 'NA'))

    # fillna returns new frames, so the callers' frames stay untouched.
    df_train = df_train.fillna(fill_values)
    df_test = df_test.fillna(fill_values)

    y_train = df_train.Label.values.astype('int')
    y_test = df_test.Label.values.astype('int')

    ct = ColumnTransformer(transformers=[('encoder',
                                          OneHotEncoder(handle_unknown='ignore'),
                                          cat_cols)],
                           remainder='passthrough')
    X_train = ct.fit_transform(df_train.iloc[:, 1:])
    X_test = ct.transform(df_test.iloc[:, 1:])

    # Dense categorical levels to keep, grouped by source column.
    # 'NA' is the placeholder written by fillna above for a missing value.
    target_cats = {
        'C2': {'38a947a1'},
        'C6': {'7e0ccccf', 'fbad5c96', 'fe6b92e5', 'NA'},
        'C9': {'a73ee510', '7cc72ec2'},
        'C10': {'3b08e48b'},
        'C14': {'b28479f6', '07d13a8f', '1adce6ef'},
        'C17': {'e5ba7672', '07c540c4', 'd4bb7bd8'},
        'C19': {'21ddcdc9'},
        'C20': {'b1252a9d', '5840adea', 'a458ea53'},
        'C22': {'NA', 'ad3062eb'},
        'C23': {'32c7478e', '3a171ecb', '423fab69'},
        'C25': {'001f3601', 'e8b83407'},
        'C26': {'NA'},
    }

    # The encoder emits one block of columns per categorical feature, in the
    # order of cat_cols; inside a block the columns follow categories_.
    encoder = ct.named_transformers_['encoder']
    index_selected = []
    offset = 0
    for col, cats in zip(cat_cols, encoder.categories_):
        wanted = target_cats.get(col)
        if wanted:
            index_selected.extend(offset + k for k, cat in enumerate(cats)
                                  if str(cat) in wanted)
        offset += len(cats)

    # Passthrough (numerical) columns are appended to the right of the
    # encoder output, in their original order I1..I13; keep all of them.
    index_selected.extend(range(offset, offset + len(num_cols)))

    X_train = X_train[:, index_selected]
    X_test = X_test[:, index_selected]

    return X_train, X_test, y_train, y_test

# Build the 3idiots-style features from the 1,000,000-row training frame and
# the 250,000-row hold-out frame.
X_train, X_test, y_train, y_test = pre1_gbdt(df_train_100, df_test_25)

In [8]:
# GBDT model only, on the pre1_gbdt features.
# Default hyper-parameters; fit() returns the fitted estimator itself.
grd = GradientBoostingClassifier().fit(X_train, y_train)

# Column 1 of predict_proba is the predicted probability of y = 1.
y_pred_train_grd, y_pred_test_grd = (
    grd.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

score_train = log_loss(y_train, y_pred_train_grd)
score_test = log_loss(y_test, y_pred_test_grd)

# Displayed as (training log-loss, testing log-loss), rounded to 4 decimals.
tuple(np.round(score, 4) for score in (score_train, score_test))

GradientBoostingClassifier()

training, testing : (0.4982, 0.4968)

### modification on 3-idiots

In [74]:
def pre2_gbdt(df_train, df_test, min_frequency=0.5):
    '''
    Preprocess train/test frames for a GBDT model with a modification of the
    methods in Team 3idiots' solution:
    - I12 is dropped;
    - a one-hot encoded feature is kept only if it is set in more than
      ``min_frequency`` of the training rows.

    Missing numerical values are replaced with -10 and missing categorical
    values with the string 'NA' before encoding. The encoder is fitted on the
    training frame only; categories that appear only in the test frame are
    ignored.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames whose columns are ``col_names_train`` in that order (``Label``
        first). The callers' frames are not modified.
    min_frequency : float, default 0.5
        Fraction of training rows a dummy column must exceed (strictly) to be
        kept. The default reproduces the former fixed cut-off of 500,000
        occurrences on a 1,000,000-row training frame.

    Returns
    -------
    X_train, X_test
        Feature matrices (sparse in practice): the kept dummy columns in
        encoder order, followed by I1..I11 and I13.
    y_train, y_test : numpy.ndarray of int
        The ``Label`` column of each frame.

    Notes
    -----
    Columns are selected by position, not through
    ``ColumnTransformer.get_feature_names()``, which was removed in
    scikit-learn 1.2.
    '''
    num_cols = col_names_train[1:14]
    cat_cols = col_names_train[14:]

    fill_values = dict.fromkeys(num_cols, -10)
    fill_values.update(dict.fromkeys(cat_cols, 'NA'))

    # fillna returns new frames, so the callers' frames stay untouched.
    df_train = df_train.fillna(fill_values)
    df_test = df_test.fillna(fill_values)

    y_train = df_train.Label.values.astype('int')
    y_test = df_test.Label.values.astype('int')

    ct = ColumnTransformer(transformers=[('encoder',
                                          OneHotEncoder(handle_unknown='ignore'),
                                          cat_cols)],
                           remainder='passthrough')
    X_train = ct.fit_transform(df_train.iloc[:, 1:])
    X_test = ct.transform(df_test.iloc[:, 1:])

    # Output layout: all dummy columns first, then the passthrough numerical
    # columns (I1..I13) on the right.
    n_onehot = X_train.shape[1] - len(num_cols)

    # A dummy column holds 0/1, so its column sum is its occurrence count in
    # the training frame.
    onehot_counts = np.asarray(X_train.sum(axis=0)).reshape(-1)[:n_onehot]
    min_count = min_frequency * X_train.shape[0]
    cat_index = np.flatnonzero(onehot_counts > min_count)

    # Keep every numerical column except I12.
    num_index = np.array([n_onehot + k for k, col in enumerate(num_cols)
                          if col != 'I12'], dtype=cat_index.dtype)

    index_selected = np.concatenate([cat_index, num_index])

    X_train = X_train[:, index_selected]
    X_test = X_test[:, index_selected]

    return X_train, X_test, y_train, y_test

# Rebinds X_train / X_test / y_train / y_test to the pre2_gbdt features built
# from the same training and hold-out frames.
X_train, X_test, y_train, y_test = pre2_gbdt(df_train_100, df_test_25)

In [75]:
# GBDT model only, on the pre2_gbdt features.
# Default hyper-parameters; fit() returns the fitted estimator itself.
grd = GradientBoostingClassifier().fit(X_train, y_train)

# Column 1 of predict_proba is the predicted probability of y = 1.
y_pred_train_grd, y_pred_test_grd = (
    grd.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

score_train = log_loss(y_train, y_pred_train_grd)
score_test = log_loss(y_test, y_pred_test_grd)

# Displayed as (training log-loss, testing log-loss), rounded to 4 decimals.
tuple(np.round(score, 4) for score in (score_train, score_test))

(0.5054, 0.5044)

- training, testing : (0.4987, 0.4976) - dropping I12
- training, testing : (0.5038, 0.5027) - dropping I12 & above 40% dummies only (10 variables - `['encoder__x0_05db9164',
 'encoder__x4_25c83c98',
 'encoder__x7_0b153874',
 'encoder__x8_a73ee510',
 'encoder__x16_e5ba7672',
 'encoder__x18_NA',
 'encoder__x19_NA',
 'encoder__x21_NA',
 'encoder__x24_NA',
 'encoder__x25_NA']`)
- training, testing : (0.5054, 0.5044) - dropping I12 & above 50% dummies only (5 variables - `['encoder__x0_05db9164',
 'encoder__x4_25c83c98',
 'encoder__x7_0b153874',
 'encoder__x8_a73ee510',
 'encoder__x21_NA']`)

### discretization on numerical features

## 特征工程：GBDT+LR

- 人工特征工程(+scaling) + GBDT特征 + LR
