In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm

# Default size (width, height) applied to every matplotlib figure in the notebook.
plt.rc('figure', figsize=(18, 9))

# Data

In [None]:
# Load the raw train and test tables from the parent directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('../train.csv', '../test.csv'))

In [None]:
# Columns kept from the raw tables. 'risk_category' is listed last because
# later cells slice the final column off as the target.
columns = [
    'region', 'company_type', 'company_name', 'inn',
    'incorporation_year', 'okved_osn_code', 'init_equities', 'init_equity_types',
    'purpose', 'other_reasons_for_check', 'check_date', 'type',
    'kpp', 'type_nasel_punkt', 'name_nasel_punkt', 'index',
    'zipcode', 'risk_category',
]

In [None]:
# Restrict both tables to the selected columns, in the order given by `columns`.
train, test = train[columns], test[columns]

In [None]:
# Columns removed before modelling.
useless = ['init_equities']

# errors='ignore' keeps this cell re-runnable: once the columns are gone a
# second execution is a no-op instead of raising KeyError.
train = train.drop(columns=useless, errors='ignore')
test = test.drop(columns=useless, errors='ignore')

In [None]:
# Columns treated as categorical. This is every remaining column except the
# 'risk_category' target.
cat_features = [
    'region', 'company_type', 'company_name', 'inn',
    'incorporation_year', 'okved_osn_code', 'init_equity_types', 'purpose',
    'other_reasons_for_check', 'check_date', 'type', 'kpp',
    'type_nasel_punkt', 'name_nasel_punkt', 'index', 'zipcode',
]

# Positional indices of the categorical columns; these are handed to CatBoost
# `Pool` objects further down. Note that `columns` is rebound here to the
# current column list of `train`.
columns = train.columns.tolist()
cat_features_idx = list(map(columns.index, cat_features))

In [None]:
# Missing categorical values become the literal string "none"; anything still
# missing afterwards (columns outside `cat_features`) is replaced with 0.
train[cat_features] = train[cat_features].fillna(value="none")
train = train.fillna(value=0)

# Same treatment for the test table.
test[cat_features] = test[cat_features].fillna(value="none")
test = test.fillna(value=0)

Change column types

In [None]:
# Every categorical column is cast to str in both tables.
categorical_as_str = dict.fromkeys(cat_features, str)
train = train.astype(categorical_as_str)
test = test.astype(categorical_as_str)

# The target is stored as an integer label (train only).
train = train.astype({'risk_category': int})

# Benchmarks

## Catboost

In [None]:
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import f1_score, precision_score, recall_score

In [None]:
# Fix the global numpy RNG so the split below is reproducible.
np.random.seed(0)

# Number of distinct INN values assigned to the training part.
nums = 30000

# Shuffle the distinct INN values in place and keep the first `nums` of them
# as a set for fast membership tests.
inns = train['inn'].unique()
np.random.shuffle(inns)
inns = set(inns[:nums])

In [None]:
# Boolean mask over the rows of `train`: True where the row's INN belongs to
# the sampled `inns` set. Vectorised `isin` replaces the original row-by-row
# `iterrows()` loop, which builds a Series per row and is far slower on large
# tables; the resulting array is the same.
mask_train = train['inn'].isin(inns).values

In [None]:
# Validation rows are exactly the rows not selected for training.
mask_val = np.logical_not(mask_train)

### Train/Validation

In [None]:
# `DataFrame.as_matrix()` was deprecated and later removed from pandas;
# `.values` returns the same ndarray on old and new versions. The frame is
# converted once instead of twice.
train_matrix = train.values

# All columns but the last are features; the last column ('risk_category')
# is the integer target.
X = train_matrix[:, :-1]
y = train_matrix[:, -1].astype(int)

# Split by the INN-based masks so rows sharing an INN stay on the same side.
X_train = X[mask_train]
y_train = y[mask_train]

X_val = X[mask_val]
y_val = y[mask_val]

### Class

In [None]:
def generate_data(X, y, clazz):
    """Build a binary "clazz versus rest" data set from a multi-class one.

    All rows whose label equals ``clazz`` are kept as positives (label 1).
    The same number of rows is drawn at random, without replacement, from the
    remaining rows as negatives (label 0). If there are fewer negatives than
    positives, every negative is used rather than raising ``ValueError``.

    Parameters
    ----------
    X : numpy.ndarray
        Feature rows; the first axis is indexed with masks built from ``y``.
    y : numpy.ndarray
        Labels, one per row of ``X``.
    clazz
        Label value treated as the positive class.

    Returns
    -------
    (X_new, y_new)
        Positives followed by the sampled negatives, and the matching
        float array of ones followed by zeros.
    """
    indx_pos = (y == clazz)
    # Count once with numpy instead of calling Python's sum() on the mask
    # three times.
    n_pos = int(np.count_nonzero(indx_pos))

    neg_candidates = np.flatnonzero(~indx_pos)
    n_neg = min(n_pos, neg_candidates.size)
    if neg_candidates.size == 0:
        # Nothing to sample from; np.random.choice rejects an empty population.
        indx_neg = neg_candidates
    else:
        indx_neg = np.random.choice(neg_candidates, size=n_neg, replace=False)

    X_new = np.concatenate((X[indx_pos], X[indx_neg]))
    y_new = np.concatenate((np.ones(n_pos), np.zeros(n_neg)))
    return X_new, y_new

In [None]:
def train_model(X_train, y_train, X_val, y_val, clazz, plot=True, logging_level='Silent'):
    """Fit a binary CatBoost classifier for one class and print its scores.

    Both splits are first passed through ``generate_data`` to obtain a
    ``clazz``-versus-rest data set. The model is fitted on the resulting
    training pool with the validation pool as ``eval_set``; F1, precision and
    recall are then printed for both parts.

    Parameters
    ----------
    X_train, y_train : training features and multi-class labels.
    X_val, y_val : validation features and multi-class labels.
    clazz : label value treated as the positive class.
    plot : forwarded to ``CatBoostClassifier.fit``.
    logging_level : forwarded to ``CatBoostClassifier.fit``.

    Returns
    -------
    The fitted ``CatBoostClassifier``.
    """
    X_train_, y_train_ = generate_data(X_train, y_train, clazz)
    X_val_, y_val_ = generate_data(X_val, y_val, clazz)

    hyper_params = dict(
        iterations=60,
        learning_rate=0.1,
        depth=3,
        loss_function='Logloss',
        eval_metric='F1',
        rsm=0.8,
        thread_count=5,
    )
    model = CatBoostClassifier(**hyper_params)

    # Categorical columns are identified by position via the notebook-level
    # `cat_features_idx`.
    model.fit(
        Pool(X_train_, y_train_, cat_features_idx),
        eval_set=Pool(X_val_, y_val_, cat_features_idx),
        plot=plot,
        logging_level=logging_level,
    )

    def report(title, features, target):
        # Predict on one part and print its three binary scores under `title`.
        predicted = model.predict(features).squeeze().astype(int)
        print(title)
        print('F1: {}'.format(f1_score(target, predicted)))
        print('Precision: {}'.format(precision_score(target, predicted)))
        print('Recall: {}'.format(recall_score(target, predicted)))

    report('Train', X_train_, y_train_)
    report('\nValidate', X_val_, y_val_)

    return model

In [None]:
# Train one binary (class-versus-rest) model per class label 1..6 in a loop
# instead of six copy-pasted cells. The models are trained in the same order
# as before, so the random sampling inside `train_model` is consumed in the
# same sequence.
class_labels = range(1, 7)

models = []
for clazz in class_labels:
    # Header so the six train/validate reports can be told apart.
    print('\n=== Class {} ==='.format(clazz))
    models.append(
        train_model(X_train, y_train, X_val, y_val,
                    clazz=clazz, plot=False, logging_level='Silent')
    )

# Keep the individual names available for any cell that still refers to them.
model1, model2, model3, model4, model5, model6 = models

In [None]:
def get_prob(models, X):
    """Collect each model's second ``predict_proba`` column for ``X``.

    Parameters
    ----------
    models : iterable of objects exposing ``predict_proba(X)`` that returns a
        2-D array.
    X : samples passed unchanged to every model.

    Returns
    -------
    numpy.ndarray
        One row per model, holding column 1 of that model's probabilities.
    """
    positive_scores = []
    for estimator in models:
        class_probabilities = estimator.predict_proba(X)
        positive_scores.append(class_probabilities[:, 1])
    return np.array(positive_scores)

In [None]:
# Rows of `prob` follow the order of `models` (classes 1..6), so the winning
# row index plus one is the predicted label. `argmax` finds the maximum
# directly instead of fully sorting every column; on exact ties it picks the
# lowest class, whereas the sorted version picked the last sorted index.
prob = get_prob(models, X_train)
y_pred = np.argmax(prob, axis=0) + 1
f1_score(y_train, y_pred, average='weighted')

In [None]:
# Rows of `prob` follow the order of `models` (classes 1..6), so the winning
# row index plus one is the predicted label. `argmax` finds the maximum
# directly instead of fully sorting every column; on exact ties it picks the
# lowest class, whereas the sorted version picked the last sorted index.
prob = get_prob(models, X_val)
y_pred = np.argmax(prob, axis=0) + 1
f1_score(y_val, y_pred, average='weighted')

In [None]:
# Per-class scores on the validation part.
# NOTE: `statistics` is defined in the "Statistics" section at the end of the
# notebook; that cell must have been executed before this one.
statistics(y_true=y_val, y_pred=y_pred)

### Pool

In [None]:
# CatBoost pools for the multi-class model; categorical columns are given by
# position through `cat_features_idx`.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features_idx)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features_idx)

In [None]:
# Hyper-parameters of the single multi-class CatBoost model.
multiclass_params = {
    'iterations': 60,
    'learning_rate': 0.1,
    'depth': 5,
    'loss_function': 'MultiClass',
    'eval_metric': 'TotalF1',
    'rsm': 0.8,
    'thread_count': 20,
}
model = CatBoostClassifier(**multiclass_params)

In [None]:
# Fit on the training pool while tracking the metric on the validation pool.
model.fit(
    train_pool,
    eval_set=val_pool,
    plot=True,
)

**Train**

In [None]:
# Predicted labels for the training part, flattened to a 1-D integer array,
# followed by the weighted F1 score (displayed as the cell output).
raw_pred = model.predict(X_train)
y_pred = np.squeeze(raw_pred).astype(int)
f1_score(y_train, y_pred, average='weighted')

**Validate**

In [None]:
# Predicted labels for the validation part, flattened to a 1-D integer array,
# followed by the weighted F1 score (displayed as the cell output).
raw_pred = model.predict(X_val)
y_pred = np.squeeze(raw_pred).astype(int)
f1_score(y_val, y_pred, average='weighted')

In [None]:
# Per-class scores of the multi-class model on the validation part.
# NOTE: `statistics` is defined in the "Statistics" section at the end of the
# notebook; that cell must have been executed before this one.
statistics(y_true=y_val, y_pred=y_pred)

In [None]:
# Feature names are all columns of `train` except the last (the target).
# NOTE: `draw_importance` is defined in the "Statistics" section at the end of
# the notebook; that cell must have been executed before this one.
feature_names = train.columns[:-1]
draw_importance(feature_names, model.feature_importances_)

**Test**

In [None]:
# `DataFrame.as_matrix()` was deprecated and later removed from pandas;
# `.values` returns the same ndarray on old and new versions.
# The last column is dropped so the test features line up with X_train.
X_test = test.values[:, :-1]
y_test = model.predict(X_test).squeeze().astype(int)

### BagBoo

In [None]:
# Five bootstrap samples of the training rows: each bag holds `rows` row
# indices drawn with replacement.
rows = X_train.shape[0]
row_ids = np.arange(rows)

bags = []
for _ in range(5):
    bags.append(np.random.choice(row_ids, size=rows, replace=True))

In [None]:
# Hyper-parameters shared by every bagged multi-class model.
bagging_params = {
    'iterations': 60,
    'learning_rate': 0.1,
    'depth': 5,
    'loss_function': 'MultiClass',
    'eval_metric': 'TotalF1',
    'rsm': 0.8,
    'thread_count': 20,
    'logging_level': 'Silent',
}

# One model per bootstrap bag. This rebinds `models` (previously the
# per-class binary models) and, inside the loop, `model`.
models = []

for mask in tqdm(bags):
    pool = Pool(data=X_train[mask], label=y_train[mask], cat_features=cat_features_idx)

    model = CatBoostClassifier(**bagging_params)
    model.fit(pool, eval_set=val_pool, plot=False)

    models.append(model)

**Train**

In [None]:
# Sum the per-class probabilities of all bagged models and pick the best
# column per sample.
prob = np.array([model.predict_proba(X_train) for model in models])
best_column = np.argmax(np.sum(prob, axis=0), axis=1)

# A column position is not a label: the original compared 0-based positions
# with `y_train` directly, while the one-vs-rest code above treats labels as
# 1..6. Translate positions back through the sorted distinct labels.
# NOTE(review): assumes every bag contained every class and that
# predict_proba columns follow ascending label order; confirm against CatBoost.
label_values = np.unique(y_train)
y_pred = label_values[best_column]
f1_score(y_train, y_pred, average='weighted')

In [None]:
# Per-class scores of the bagged ensemble on the training part.
# NOTE: `statistics` is defined in the "Statistics" section at the end of the
# notebook; that cell must have been executed before this one.
statistics(y_true=y_train, y_pred=y_pred)

**Validate**

In [None]:
# Sum the per-class probabilities of all bagged models and pick the best
# column per sample.
prob = np.array([model.predict_proba(X_val) for model in models])
best_column = np.argmax(np.sum(prob, axis=0), axis=1)

# A column position is not a label: the original compared 0-based positions
# with `y_val` directly, while the one-vs-rest code above treats labels as
# 1..6. Translate positions back through the sorted distinct training labels.
# NOTE(review): assumes every bag contained every class and that
# predict_proba columns follow ascending label order; confirm against CatBoost.
label_values = np.unique(y_train)
y_pred = label_values[best_column]
f1_score(y_val, y_pred, average='weighted')

In [None]:
# Per-class scores of the bagged ensemble on the validation part.
# NOTE: `statistics` is defined in the "Statistics" section at the end of the
# notebook; that cell must have been executed before this one.
statistics(y_true=y_val, y_pred=y_pred)

**Test**

In [None]:
# `DataFrame.as_matrix()` was deprecated and later removed from pandas;
# `.values` returns the same ndarray on old and new versions.
# The last column is dropped so the test features line up with X_train.
X_test = test.values[:, :-1]

# Sum the per-class probabilities of all bagged models and pick the best
# column per sample.
prob = np.array([model.predict_proba(X_test) for model in models])
best_column = np.argmax(np.sum(prob, axis=0), axis=1)

# Translate column positions back to label values; the original wrote the
# 0-based positions out as if they were labels.
# NOTE(review): assumes every bag contained every class and that
# predict_proba columns follow ascending label order; confirm against CatBoost.
label_values = np.unique(y_train)
y_test = label_values[best_column]

In [None]:
# Write the test predictions to disk.
# NOTE: `save_result` is defined in the "Output" section at the end of the
# notebook; that cell must have been executed before this one.
save_result(y_test=y_test)

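# Statistics

The helpers in this section (`statistics`, `draw_importance`) are called by the Benchmarks cells above, so on a fresh kernel run these cells before the benchmark cells.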

In [None]:
def statistics(y_true, y_pred):
    """Print F1, precision and recall computed with ``average=None``.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted labels.
    """
    report_lines = (
        ('F1: {}', f1_score),
        ('Precision: {}', precision_score),
        ('Recall: {}', recall_score),
    )
    for template, metric in report_lines:
        print(template.format(metric(y_true, y_pred, average=None)))

In [None]:
def draw_importance(columns, importances):
    """Draw a horizontal bar chart of feature importances, sorted ascending.

    Parameters
    ----------
    columns : sequence of feature names (list, numpy array or pandas Index),
        one per importance value.
    importances : sequence of numeric importance values.
    """
    # Coerce both inputs to arrays so plain Python lists work as well; the
    # original `columns[order]` only worked for array-like containers that
    # support fancy indexing.
    importances = np.asarray(importances)
    order = np.argsort(importances)
    objects = np.asarray(columns)[order]
    y_pos = np.arange(len(objects))
    performance = importances[order]

    plt.barh(y_pos, performance, align='center', alpha=0.5)
    plt.yticks(y_pos, objects)
    plt.xlabel('Importance')
    plt.title('Feature')

    plt.show()

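# Output

`save_result` is called at the end of the BagBoo section above, so on a fresh kernel run this cell before that one.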

In [None]:
def save_result(y_test, path='y_test.csv'):
    """Write predictions to a text file, one value per line.

    Parameters
    ----------
    y_test : iterable of predictions; each item is formatted with ``str.format``.
    path : destination file. Defaults to ``'y_test.csv'`` in the current
        working directory, which is what the function always used before the
        parameter was added. An existing file is overwritten.
    """
    with open(path, 'w') as f:
        f.writelines('{}\n'.format(y) for y in y_test)