In [6]:
import warnings
warnings.simplefilter('ignore')

from typing import List

import os
import collections
from datetime import datetime
from functools import wraps

import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier

from tqdm import tqdm

import plotly
import plotly.offline as plt2
import plotly.graph_objs as go
import matplotlib.pyplot as plt
%matplotlib inline

plt2.init_notebook_mode(connected=True)


BASE_DIR = "data"
TRAIN_FILENAME = os.path.join(BASE_DIR, "train.csv")
TEST_FILENAME = os.path.join(BASE_DIR, "test.csv")
RESULT_FILENAME = lambda version: os.path.join(BASE_DIR, "result_{}.csv".format(version))

PERIOD_FIELD = "PERIOD"
IDX_FIELD = "cl_id"
MCC_FIELD = "MCC"
CHANNEL_FIELD = "channel_type"
CURRENCY_FIELD = "currency"
TRANSACTION_FIELD = "TRDATETIME"
AMOUNT_FIELD = "amount"
CATEGORY_FIELD = "trx_category"
TARGET_FIELD = "target_flag"
TARGET_SUM_FIELD = "target_sum_field"

CATEGORY_TRANSFORM = {
    "POS": 0,
    "C2C_OUT": 1,
    "DEPOSIT": 2,
    "WD_ATM_PARTNER": 3,
    "WD_ATM_ROS": 4,
    "BACK_TRX": 5,
    "WD_ATM_OTHER": 6,
    "C2C_IN": 7,
    "CAT": 8,
    "CASH_ADV": 9
}
CHANNEL_TRANSFORM = {
    "type{}".format(i): i for i in range(6)
}

In [2]:
def saver(func):
    @wraps(func)
    def f(*args, version="version", **kwargs):
        filename = RESULT_FILENAME(version)
        
        if not os.path.exists(filename):
            df = func(*args, **kwargs)
            df.to_csv(filename, sep=",", index=False)
        else:
            df = pd.read_csv(filename, sep=",", header=0)
            if len(df.columns) == 1:
                df = df[df.columns[0]]
        
        return df
    
    return f


def one_call(func):
    result = None
    
    @wraps(func)
    def f(*args, **kwargs):
        nonlocal result

        if result is None:
            result = func(*args, **kwargs)
        
        return result
    
    return f
            

def read_data(filename: str) -> str:
    df = pd.read_csv(filename, header=0)
    df[PERIOD_FIELD] = df[PERIOD_FIELD].apply(lambda s: datetime.strptime(s, "%d/%m/%Y"))
    df[TRANSACTION_FIELD] = df[TRANSACTION_FIELD].apply(lambda s: datetime.strptime(s, "%d%b%y:%X"))
    df[CHANNEL_FIELD] = df[CHANNEL_FIELD].fillna("type0")
    
    df[CHANNEL_FIELD] = df[CHANNEL_FIELD].apply(lambda x: CHANNEL_TRANSFORM[x])
    df[CATEGORY_FIELD] = df[CATEGORY_FIELD].apply(lambda x: CATEGORY_TRANSFORM[x])
    return df

In [3]:
train = read_data(TRAIN_FILENAME)
test = read_data(TEST_FILENAME)

train.head(10)

Unnamed: 0,PERIOD,cl_id,MCC,channel_type,currency,TRDATETIME,amount,trx_category,target_flag,target_sum
0,2017-10-01,0,5200,0,810,2017-10-21 00:00:00,5023.0,0,0,0.0
1,2017-10-01,0,6011,0,810,2017-10-12 12:24:07,20000.0,2,0,0.0
2,2017-12-01,0,5921,0,810,2017-12-05 00:00:00,767.0,0,0,0.0
3,2017-10-01,0,5411,0,810,2017-10-21 00:00:00,2031.0,0,0,0.0
4,2017-10-01,0,6012,0,810,2017-10-24 13:14:24,36562.0,1,0,0.0
5,2017-10-01,1,5814,0,810,2017-10-16 00:00:00,380.0,0,0,0.0
6,2017-10-01,1,5814,0,810,2017-10-10 00:00:00,378.0,0,0,0.0
7,2017-10-01,1,5814,0,810,2017-10-16 00:00:00,199.0,0,0,0.0
8,2017-10-01,1,5814,0,810,2017-10-11 00:00:00,400.0,0,0,0.0
9,2017-07-01,1,5411,0,810,2017-07-26 00:00:00,598.0,0,0,0.0


# Visualisation

In [29]:
def bar_visualize(train, group_field, fields):
    df = train.groupby(group_field).agg(fields)
    
    traces = []
    for field_name, field_value in fields.items():
        if isinstance(field_value, collections.Iterable):
            for subfield_name in field_value:
                trace = go.Bar(
                    x=df.index,
                    y=df[field_name][subfield_name]
                )
                traces.append(trace)
        else:
            trace = go.bar(
                x=df.index,
                y=df[field_name]
            )
            traces.append(trace)
    data = go.Data(traces)
    plt2.iplot(data)


def scatter_visualize(train, x_field, y_field, x_agg_func="sum", y_agg_func="sum"):
    df = train.groupby(IDX_FIELD).agg({
        x_field: x_agg_func,
        y_field: y_agg_func,
        TARGET_FIELD: "mean"
    })
    trace = go.Scatter(
        x=df[x_field],
        y=df[y_field],
        mode="markers",
        marker=dict(
            color=df[TARGET_FIELD]
        )
    )
    data = go.Data([trace])
    plt2.iplot(data)

In [21]:
bar_visualize(train, MCC_FIELD, {TARGET_FIELD: ["sum", "count"]})

In [22]:
bar_visualize(train, CATEGORY_FIELD, {TARGET_FIELD: ["sum", "count"]})

In [30]:
scatter_visualize(train, CATEGORY_FIELD, MCC_FIELD)

In [31]:
scatter_visualize(train, MCC_FIELD, CATEGORY_FIELD)

In [4]:
@one_call
def __get_one_hot_encoder(df: pd.DataFrame) -> preprocessing.OneHotEncoder:
    est = preprocessing.OneHotEncoder()
    est.fit(df)
    return est


@one_call
def __get_min_max_scaler(df: pd.DataFrame) -> preprocessing.MinMaxScaler:
    est = preprocessing.MinMaxScaler()
    est.fit(df)
    return est


def __gen_transformed_data(X, est):
    def transform(data):
        return est.transform(data).toarray()
    
    data = []
    for i, (_, line) in enumerate(X.iterrows()):
        data.append(line)
        
        if not i % 1000:
            transform_data = transform(data)
            yield transform_data
            data = []
    
    yield transform(data)


@saver
def preprocessing_(X: pd.DataFrame, scale=False, version: str="version") -> pd.DataFrame:
    categorical_fields = [MCC_FIELD, CHANNEL_FIELD, CURRENCY_FIELD, CATEGORY_FIELD]
    categorical_df = X[categorical_fields]
    base_df = X[[IDX_FIELD, AMOUNT_FIELD]]
    
    est = __get_one_hot_encoder(train[categorical_fields])
    one_hot_data = __gen_transformed_data(categorical_df, est)
    one_hot_df = pd.DataFrame()
    for chunk in tqdm(one_hot_data):
        chunk_df = pd.DataFrame(chunk)
        one_hot_df = one_hot_df.append(chunk_df, ignore_index=True)

    df = pd.concat([base_df, one_hot_df], axis=1)
    for col_name in one_hot_df.columns:
        df[col_name] = df[col_name] * df[AMOUNT_FIELD]
    
    agg_funcs = {col_name: "sum" for name in one_hot_df.columns}
    agg_funcs[AMOUNT_FIELD] = "mean"
    df = df.groupby(IDX_FIELD).agg(agg_funcs)
    
    if scale:
        scaler = __get_min_max_scaler(df)
        data = scaler.transform(df)
        df = pd.DataFrame(data=data, columns=df.columns)
    return df


@saver
def get_y(X: pd.DataFrame, version: str="version") -> pd.Series:
    return X[TARGET_FIELD]


X_train, y_train = preprocessing_(train, version="preprocessing_train"), get_y(train, version="y")
X_test = preprocessing_(test, version="preprocessing_test")

492it [04:17,  1.91it/s]
520it [04:40,  1.86it/s]


In [5]:
X_train.head()

Unnamed: 0_level_0,418,amount
cl_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.0,12876.6
1,0.0,2572.865769
5,0.0,3847.66993
9,0.0,21777.31
10,0.0,2428.388747


In [None]:
def fit_(X_train: pd.DataFrame, y_train: pd.Series):
    est = XGBClassifier()
    est.fit(X_train, y_train)
    
    return est


est = fit_(X_train, y_train)

In [None]:
def evaluate_(est):
    y_predict = est.predict(X_train)
    cm = confusion_matrix(y_train, y_predict)
    print(cm)


evaluate_(est)

In [None]:
def write(X_test: pd.DataFrame, est, version="0.0.1"):
    y_predict = est.predict_proba(X_test)
    result = pd.DataFrame(data={"_ID_": X_test[IDX_FIELD], "_VAL_": y_predict[:, 1]})
    
    filename = RESULT_FILENAME(version)
    result.to_csv(filename, sep=",", index=False)

    
write(X_test, est)