# Prepare data

Load and generate train_X, train_Y, test_X, test_Y, then convert to ndarray and save.

Numeric features are stored as float; categorical features are stored as object (str).

Generate the cat_cols list of categorical column names, then convert it to integer column indices and save.

Prepare the Adult, Amazon, Click prediction, KDD appetency, KDD churn, KDD internet, KDD upselling, KDD 98, and Kick prediction datasets, mainly following the CatBoost benchmark: https://github.com/catboost/benchmarks/tree/master/quality_benchmarks.

In [22]:
import pandas as pd
import os
import numpy as np
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from io import StringIO
import re 
import pickle

In [23]:
# Directory that every loader cell below joins its raw-file paths onto.
dataset_path = "/home/v-tyan/NN_for_tabular/datasets_raw/"
# Which dataset to prepare. Names listed for this notebook:
# Adult, Amazon, Click prediction, KDD appetency, KDD churn, KDD internet,
# KDD upselling, KDD 98, Kick prediction
dataset = "KDD appetency"

## Adult

In [24]:
if dataset == 'Adult':
    # The files are read with header=None, so the column names are given here.
    cols = [
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital-status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income',
    ]
    target_col = 'income'
    cat_cols = [
        'workclass', 'education', 'marital-status', 'occupation',
        'relationship', 'race', 'sex', 'native-country',
    ]
    # Whatever is neither categorical nor the target is treated as numeric.
    num_cols = list(set(cols).difference(cat_cols, [target_col]))

    # Fields are separated by ", " and "?" is parsed as a missing value.
    train_df = pd.read_csv(
        os.path.join(dataset_path, 'Adult/adult.data'),
        sep=', ', header=None, names=cols, na_values='?', engine='python',
    )
    # NOTE(review): no rows are skipped for adult.test -- confirm the local
    # copy of the file has no non-data leading line.
    test_df = pd.read_csv(
        os.path.join(dataset_path, 'Adult/adult.test'),
        sep=', ', header=None, names=cols, na_values='?', engine='python',
    )
    # Map the dotted label spellings in the test frame onto the undotted ones.
    test_df = test_df.replace({'<=50K.': '<=50K', '>50K.': '>50K'})

    # Split features from the target for both partitions.
    train_X = train_df.drop(columns=target_col)
    train_Y = train_df[target_col]
    test_X = test_df.drop(columns=target_col)
    test_Y = test_df[target_col]

## Amazon

In [25]:
if dataset == 'Amazon':
    df = pd.read_csv(os.path.join(dataset_path, 'Amazon/train.csv'))
    target_col = 'ACTION'
    cols = df.columns.tolist()
    # Every non-target column is assumed to be categorical, which leaves no
    # numeric columns.
    cat_cols = list(set(cols) - {target_col})
    num_cols = []

    X = df.drop(columns=target_col)
    Y = df[target_col]

    # The index files are read without a header; column 0 is used as
    # positional row indices for the split.
    train_idx = pd.read_csv(os.path.join(dataset_path, "Amazon/stratified_train_idx.txt"), header=None)
    test_idx = pd.read_csv(os.path.join(dataset_path, "Amazon/stratified_test_idx.txt"), header=None)
    train_X, train_Y = X.iloc[train_idx[0]], Y.iloc[train_idx[0]]
    test_X, test_Y = X.iloc[test_idx[0]], Y.iloc[test_idx[0]]

## Click prediction

In [26]:
if dataset == 'Click prediction':
    cols = [
        'click', 'impression', 'url_hash', 'ad_id', 'advertiser_id', 'depth',
        'position', 'query_id', 'keyword_id', 'title_id', 'description_id',
        'user_id',
    ]
    target_col = 'click'
    cat_cols = [
        'impression', 'url_hash', 'ad_id', 'position', 'query_id',
        'keyword_id', 'title_id', 'description_id',
    ]
    # Whatever is neither categorical nor the target is treated as numeric.
    num_cols = list(set(cols).difference(cat_cols, [target_col]))

    def clean_string(s):
        """Return "v_" + str(s) with each run of non-alphanumerics replaced by "_"."""
        return "v_" + re.sub('[^A-Za-z0-9]+', "_", str(s))

    # Whitespace-separated 0-based line numbers of training.txt to keep.
    with open(os.path.join(dataset_path, "Click prediction/track2/subsampling_idx.txt")) as fin:
        ids = [int(token) for token in fin.read().split()]
    unique_ids = set(ids)

    # One pass over the training file, retaining only the requested lines.
    with open(os.path.join(dataset_path, "Click prediction/track2/training.txt")) as fin:
        data_strings = {
            line_no: line for line_no, line in enumerate(fin) if line_no in unique_ids
        }

    # Re-emit the retained lines in the order given by `ids`, then parse them
    # as a headerless table.
    data_rows = [data_strings[line_no] for line_no in ids]
    df = pd.read_table(StringIO("".join(data_rows)), header=None, names=cols)

    X = df.drop(columns=target_col)
    # Label mapping: click == 0 -> 1, anything else -> -1.
    Y = df[target_col].apply(lambda x: 1 if x == 0 else -1)

    for cat_col in cat_cols:
        X[cat_col] = X[cat_col].apply(clean_string)

    # The index files are read without a header; column 0 is used as
    # positional row indices for the split.
    train_idx = pd.read_csv(os.path.join(dataset_path, "Click prediction/track2/stratified_train_idx.txt"), header=None)
    test_idx = pd.read_csv(os.path.join(dataset_path, "Click prediction/track2/stratified_test_idx.txt"), header=None)
    train_X, train_Y = X.iloc[train_idx[0]], Y.iloc[train_idx[0]]
    test_X, test_Y = X.iloc[test_idx[0]], Y.iloc[test_idx[0]]

## KDD appetency, churn, upselling

In [27]:
if dataset in {'KDD appetency', 'KDD churn', 'KDD upselling'}:
    # All three tasks read the same tab-separated feature table; only the
    # label file and the split index files depend on the task.
    df = pd.read_csv(os.path.join(dataset_path, "appetency_churn_upselling/orange_small_train.data"), sep="\t")
    cols = df.columns.tolist()

    # Categorical columns are picked by position: 190-207 and 209-228
    # (position 208 is not in the list).
    cat_positions = list(range(190, 208)) + list(range(209, 229))
    cat_cols = [cols[idx] for idx in cat_positions]
    num_cols = list(set(cols).difference(cat_cols))

    # Last word of the dataset name, used in the file names below
    # (e.g. 'KDD churn' -> 'churn').
    dataset_ = dataset.rsplit(' ', 1)[-1]

    X = df
    # Labels are taken from column 0 of the headerless label file and negated.
    Y = -pd.read_csv(os.path.join(dataset_path, f'appetency_churn_upselling/orange_small_train_{dataset_}.labels'), header=None)[0]

    # The index files are read without a header; column 0 is used as
    # positional row indices for the split.
    train_idx = pd.read_csv(os.path.join(dataset_path, f'appetency_churn_upselling/{dataset_}/stratified_train_idx_{dataset_}.txt'), header=None)
    test_idx = pd.read_csv(os.path.join(dataset_path, f'appetency_churn_upselling/{dataset_}/stratified_test_idx_{dataset_}.txt'), header=None)
    train_X, train_Y = X.iloc[train_idx[0]], Y.iloc[train_idx[0]]
    test_X, test_Y = X.iloc[test_idx[0]], Y.iloc[test_idx[0]]

## Prepare numerical features

In [28]:
# Cast every numeric column to float.
# astype() with a column -> dtype mapping returns a new frame, so nothing is
# written back into train_X/test_X through a column subset. For most datasets
# those frames were produced by X.iloc[...], where such a write-back is a
# chained assignment (SettingWithCopyWarning on pandas versions without
# copy-on-write). Columns not listed in the mapping are left unchanged, and an
# empty num_cols simply yields an unchanged copy.
num_dtypes = dict.fromkeys(num_cols, float)
train_X = train_X.astype(num_dtypes)
test_X = test_X.astype(num_dtypes)

## Prepare category features

In [29]:
# Store every categorical column as Python strings (each value goes through
# str(), so a missing value becomes its string representation).
# For most datasets train_X/test_X were produced by X.iloc[...]; take explicit
# copies first so the column writes below are not chained assignments on a
# slice of another frame (SettingWithCopyWarning on pandas versions without
# copy-on-write).
train_X, test_X = train_X.copy(), test_X.copy()
for cat_col in cat_cols:
    train_X[cat_col] = train_X[cat_col].apply(str)
    test_X[cat_col] = test_X[cat_col].apply(str)

## Convert to ndarray and int

In [30]:
if dataset in {'Adult', 'Amazon', 'Click prediction', 'KDD appetency', 'KDD churn', 'KDD upselling'}:
    # Replace the categorical column *names* with their integer positions in
    # train_X, in column order.
    cat_cols_ = [
        position
        for position, name in enumerate(train_X.columns)
        if name in cat_cols
    ]
    cat_cols = cat_cols_
    # Drop the pandas wrappers: from here on features and targets are ndarrays.
    train_X, test_X = train_X.values, test_X.values
    train_Y, test_Y = train_Y.values, test_Y.values

## Save

In [31]:
# Target file: one .npy per dataset, named after the dataset.
save_dir = f"/home/v-tyan/NN_for_tabular/datasets/{dataset}.npy"
# Everything that gets persisted together: both splits plus the categorical
# column indices.
data = (train_X, train_Y, test_X, test_Y, cat_cols)
# Saving is currently disabled.
# NOTE(review): `data` is a tuple of differently shaped objects -- confirm
# np.save accepts it with the numpy version in use before re-enabling.
# np.save(save_dir, data)