The raw data for the Adult dataset is located at the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/adult).

# Download raw data

In [1]:
# Fetch the raw Adult train and test files from the UCI repository.
# --fail makes curl exit with an error on an HTTP failure instead of writing the
# server's error page into the CSV; --location follows redirects.
!curl --fail --location "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data" > train_data_raw.csv
# `tail -n +2` drops the first line of adult.test before saving
# (presumably a non-data header line -- TODO confirm against the raw file).
!curl --fail --location "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test" | tail -n +2 > test_data_raw.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 3881k  100 3881k    0     0   152k      0  0:00:25  0:00:25 --:--:--  170k:--     0
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1956k  100 1956k    0     0  54508      0  0:00:36  0:00:36 --:--:--  127k


# Read raw data

In [2]:
import sys
sys.path.insert(0, '../..')

import pandas as pd
from data import CVCTRImputer


def adult_data_to_documents_and_labels(path):
    """Read a raw Adult data file and split it into features and labels.

    Args:
        path: Path to a header-less text file whose fields are separated by a
            comma followed by one whitespace character. A field equal to '?'
            is read as a missing value.

    Returns:
        A ``(documents, labels)`` tuple: a DataFrame holding every column but
        the last, and a Series holding the last column.
    """
    data_raw = pd.read_csv(
        path,
        sep=r',\s',  # raw string: '\s' is a regex class, not a string escape
        na_values=['?'],
        header=None,
        engine='python',  # regex separators are only supported by the python engine
    )
    return data_raw.iloc[:, :-1], data_raw.iloc[:, -1]

In [3]:
# Load both downloaded splits as (features, labels) pairs.
train_documents_raw, train_labels_raw = adult_data_to_documents_and_labels(
    path='train_data_raw.csv',
)
test_documents_raw, test_labels_raw = adult_data_to_documents_and_labels(
    path='test_data_raw.csv',
)

  if __name__ == '__main__':


# Preprocess data
- Binarize targets
- Replace categorical features with CTRs, computed via cross-validation for the train set and on the whole train set for the test set
- Replace NaNs/missing values with column means

In [4]:
# Binarize the targets: 1 where the raw label equals the positive-class string, 0 otherwise.
# The test labels are compared against '>50K.' (with a trailing period).
train_labels = train_labels_raw.eq('>50K').astype(int)
test_labels = test_labels_raw.eq('>50K.').astype(int)

In [5]:
# Fit the CTR imputer on the train set only, for the columns listed in cat_features_idxs.
imputer = CVCTRImputer(
    cat_features_idxs=[1, 3, 5, 6, 7, 8, 9, 13],
    n_folds=10,
)
imputer.fit(train_documents_raw, train_labels)

# Train and test go through separate transforms of the same fitted imputer
# (presumably cross-validated CTRs for train, whole-train-set CTRs for test -- see CVCTRImputer).
train_documents = imputer.transform_train(train_documents_raw)
test_documents = imputer.transform_test(test_documents_raw)

In [6]:
# Per-column means of the train set, keyed by column label.
# Columns are looked up by label (not by position) so the keys match what
# `fillna` expects even if the labels are not 0..n-1.
train_set_means = {column: train_documents[column].mean() for column in train_documents.columns}

# Fill missing values in BOTH splits with the train-set means, so no test-set
# statistics are used for the test data.
train_documents = train_documents.fillna(train_set_means)
test_documents = test_documents.fillna(train_set_means)

# Save data

In [7]:
def save_in_catboost_format(documents, labels, output_path, cd_path=None):
    """Write labels and features as one header-less, index-less TSV file.

    The labels are placed before the feature columns, so the label ends up in
    column 0 of the output.

    Args:
        documents: DataFrame of features.
        labels: Series of labels, concatenated column-wise with ``documents``.
        output_path: Destination of the tab-separated file.
        cd_path: Optional path. When given, a one-line file marking column 0
            as ``Target`` is written there.
    """
    table = pd.concat([labels, documents], axis=1)
    table.to_csv(output_path, sep='\t', header=None, index=None)

    if cd_path is None:
        return

    with open(cd_path, 'w') as cd_file:
        cd_file.write('0\tTarget\n')

In [8]:
# Train split: labelled TSV plus the 'cd' file marking column 0 as the target.
save_in_catboost_format(
    documents=train_documents,
    labels=train_labels,
    output_path='train_data_catboost_format.tsv',
    cd_path='cd',
)
# Features-only copy of the train split (no labels, no header, no index).
train_documents.to_csv(
    'train_documents.tsv',
    sep='\t',
    header=None,
    index=None,
)

In [9]:
# Test split: labelled TSV only; no cd_path is passed, so no 'cd' file is written here.
save_in_catboost_format(
    documents=test_documents,
    labels=test_labels,
    output_path='test_data_catboost_format.tsv',
)
# Features-only copy of the test split (no labels, no header, no index).
test_documents.to_csv(
    'test_documents.tsv',
    sep='\t',
    header=None,
    index=None,
)