# Test data preprocessing

In [1]:
import gc
import sqlite3
from pathlib import Path

import numpy as np
import pandas as pd
import tsfresh
from sklearn import preprocessing
from tqdm.notebook import tqdm

In [2]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [3]:
# Shared label encoder; do_work() re-fits it on each categorical column of every chunk.
le = preprocessing.LabelEncoder()

<IPython.core.display.Javascript object>

In [4]:
# Open the SQLite database file that the queries in this notebook read from.
DB_PATH = "../../data/amex-default-prediction/amex-data.sqlite"
con = sqlite3.connect(DB_PATH)
con

<sqlite3.Connection at 0x7fc82113e4e0>

<IPython.core.display.Javascript object>

In [5]:
# One row per distinct customer in test_data. ORDER BY pins the row order
# (SQLite leaves it unspecified otherwise), so anything derived from the
# position of a customer in this frame is the same on every run.
customer_df = pd.read_sql_query(
    "SELECT DISTINCT customer_ID FROM test_data ORDER BY customer_ID", con
)
customer_df

Unnamed: 0,customer_ID
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...
...,...
924616,ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c...
924617,ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3...
924618,ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475...
924619,ffffddef1fc3643ea179c93245b68dca0f36941cd83977...


<IPython.core.display.Javascript object>

In [6]:
# Split the customer IDs into near-equal chunks of roughly 30k IDs each.
# The chunk count is floored, so individual chunks can be slightly larger than
# 30k. It is clamped to 1 because np.array_split raises ValueError when asked
# for 0 sections, which would happen with fewer than 30k customers.
chunks = np.array_split(
    customer_df["customer_ID"].values, max(1, len(customer_df) // 30000)
)
len(chunks)

30

<IPython.core.display.Javascript object>

In [7]:
def do_work(customer_ids):
    """Load the ``test_data`` rows for a set of customers and aggregate them
    to one row per customer.

    Parameters
    ----------
    customer_ids : iterable of str
        Customer IDs whose rows should be read from ``test_data``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``customer_ID``. Non-categorical columns hold the
        per-customer mean (blank strings and missing values are counted as 0)
        and come first, in the table's own column order. The categorical
        columns follow and hold the per-customer maximum of their
        label-encoded codes.

    Notes
    -----
    Uses the notebook-level ``con`` (SQLite connection) and ``le``
    (LabelEncoder). The encoder is re-fitted on every column of every call,
    so integer codes are only comparable between calls when each call sees
    the same set of values for a column.
    NOTE(review): confirm that holds for every chunk, or switch to a fixed
    category mapping shared with the training pipeline.
    """
    cat_columns = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]

    # The IDs are inlined as quoted SQL literals (a chunk may hold more IDs
    # than some SQLite builds accept as bound parameters -- TODO confirm for
    # the deployed SQLite version), so double any embedded single quote to
    # keep the statement well-formed.
    quoted_ids = (
        "'"
        + "','".join(
            str(customer_id).replace("'", "''") for customer_id in customer_ids
        )
        + "'"
    )
    df = pd.read_sql_query(
        "SELECT * FROM test_data WHERE customer_ID IN ({seq})".format(seq=quoted_ids),
        con,
    ).set_index("customer_ID")

    # S_2 is only used to order the rows; it is not part of the aggregated output.
    df = df.sort_values(["customer_ID", "S_2"]).drop(columns="S_2")

    # Keep the table's column order. A set difference would order the columns
    # by string hash, which changes between kernel sessions and makes the
    # layout of the written CSV files irreproducible.
    num_columns = [column for column in df.columns if column not in cat_columns]

    # Cast to str first so missing values become ordinary labels the encoder
    # can handle alongside the real categories.
    df[cat_columns] = df[cat_columns].astype(str)
    for cat_column in cat_columns:
        df[cat_column] = le.fit_transform(df[cat_column])

    categorical_df = df[cat_columns].groupby("customer_ID").max()

    numeric_df = (
        df[num_columns]
        .replace(r"^\s*$", np.nan, regex=True)  # blank strings -> NaN
        .astype(float)
        .fillna(0)
        .groupby("customer_ID")
        .mean()
    )

    return numeric_df.join(categorical_df)

<IPython.core.display.Javascript object>

In [8]:
# Aggregate each chunk of customers and write it to its own CSV file.
output_dir = Path("../../data/amex-default-prediction/test")
# Create the target folder up front; otherwise the first to_csv call fails
# only after a whole chunk has already been loaded and aggregated.
output_dir.mkdir(parents=True, exist_ok=True)

for idx, customer_ids in enumerate(tqdm(chunks), 1):
    test_group_df = do_work(customer_ids)

    # File names are 1-based: test_group_data_1.csv, test_group_data_2.csv, ...
    test_group_df.to_csv(output_dir / "test_group_data_{}.csv".format(idx))

    gc.collect()

  0%|          | 0/30 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>