In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm

# Default canvas size (width, height in inches) for every figure drawn below.
plt.rcParams.update({'figure.figsize': (18, 9)})

# Statistics

In [None]:
def statistics(y_true, y_pred):
    """Print F1, precision and recall for the given predictions.

    Every metric is evaluated with ``average=None`` and the result is printed
    on its own line, in the order F1, precision, recall.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    """
    reports = (
        ('F1: {}', f1_score),
        ('Precision: {}', precision_score),
        ('Recall: {}', recall_score),
    )
    for template, metric in reports:
        print(template.format(metric(y_true, y_pred, average=None)))
    
def draw_importance(columns, importances):
    """Draw a horizontal bar chart of feature importances, sorted ascending.

    Parameters
    ----------
    columns : sequence of str
        Feature names, one per importance value. Any sequence is accepted
        (list, tuple, ``pandas.Index``, ``numpy.ndarray``).
    importances : sequence of float
        Importance of each feature, aligned with ``columns``.

    Raises
    ------
    ValueError
        If ``columns`` and ``importances`` have different lengths, because the
        bars could not be labelled reliably.
    """
    # Convert up front: a plain Python list cannot be indexed with the integer
    # array returned by ``argsort``.
    names = np.asarray(columns)
    scores = np.asarray(importances)
    if len(names) != len(scores):
        raise ValueError(
            'columns and importances must have the same length, got {} and {}'
            .format(len(names), len(scores))
        )

    order = np.argsort(scores)
    y_pos = np.arange(len(order))

    plt.barh(y_pos, scores[order], align='center', alpha=0.5)
    plt.yticks(y_pos, names[order])
    plt.xlabel('Importance')
    plt.title('Feature')

    plt.show()

# Output

In [None]:
def save_result(y_test, path='y_test.csv'):
    """Write predictions to a text file, one value per line.

    Parameters
    ----------
    y_test : iterable
        Predicted values; each one is formatted with ``str.format`` and
        followed by a newline.
    path : str, optional
        Destination file. Defaults to ``'y_test.csv'`` in the current working
        directory. An existing file is overwritten.
    """
    with open(path, 'w') as f:
        f.writelines('{}\n'.format(y) for y in y_test)

# Data

In [None]:
# Read both tables from the parent directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('../train.csv', '../test.csv'))

In [None]:
# Remove the 'zipcode' column from both frames.
# Note: re-running this cell raises KeyError because the column is already gone.
train = train.drop('zipcode', axis=1)
test = test.drop('zipcode', axis=1)

In [None]:
# Columns that are removed from both frames before modelling.
useless = ['init_equities']
train, test = train.drop(useless, axis=1), test.drop(useless, axis=1)

In [None]:
# Columns that are treated as categorical throughout the notebook.
cat_features = [
    'region', 'company_type', 'company_name', 'inn', 'incorporation_year',
    'okved_osn_code', 'init_equity_types', 'purpose', 'other_reasons_for_check',
    'check_date', 'type', 'kpp', 'type_nasel_punkt', 'name_nasel_punkt', 'index',
]

# Position of each categorical column inside the current `train` frame.
columns = train.columns.tolist()
cat_features_idx = list(map(columns.index, cat_features))

In [None]:
# Missing values in categorical columns become the literal string "none" ...
train[cat_features] = train[cat_features].fillna(value="none")
test[cat_features] = test[cat_features].fillna(value="none")

# ... and every remaining missing value becomes 0.
train, test = train.fillna(value=0), test.fillna(value=0)

Change column types

In [None]:
# Store every categorical column as strings in both frames.
for name in cat_features:
    for frame in (train, test):
        frame[name] = frame[name].astype(str)

# The target column is cast to integers (present in `train` only in this cell).
train['risk_category'] = train['risk_category'].astype(int)

# Benchmarks

## CatBoost

In [None]:
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import f1_score, precision_score, recall_score

In [None]:
# Fix the global NumPy RNG so the split below is reproducible.
np.random.seed(0)

# Number of distinct `inn` values kept for the training part of the split.
nums = 30000

# Shuffle the distinct `inn` values and keep the first `nums` of them as a set
# for fast membership tests.
inns = train['inn'].unique()
np.random.shuffle(inns)
inns = set(inns[:nums])

In [None]:
# Boolean mask over the rows of `train`: True where the row's `inn` was sampled
# into `inns`. `Series.isin` does the membership test in one vectorised call
# instead of iterating row by row, and always yields a boolean array (even for
# an empty frame), which `~mask_train` relies on.
mask_train = train['inn'].isin(inns).values

In [None]:
# Validation rows are exactly the rows that are not in the training mask.
mask_val = np.logical_not(mask_train)

### Train/Test

In [None]:
# Build the matrix once. `.values` is used because `DataFrame.as_matrix()` was
# deprecated and later removed from pandas.
# NOTE(review): this assumes the target (`risk_category`) is the LAST column of
# `train` — confirm against the CSV layout.
features_and_target = train.drop(columns=['inn']).values
X = features_and_target[:, :-1]
y = features_and_target[:, -1].astype(int)

# Split by the inn-based masks.
X_train, y_train = X[mask_train], y[mask_train]
X_val, y_val = X[mask_val], y[mask_val]

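**Warning!** The categorical feature indices below are set by position: they assume that, once `inn` is dropped, every categorical feature sits in the leading columns of `X`.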

In [None]:
# `X` was built without the `inn` column, so the number of categorical columns
# is len(cat_features) minus that one entry (14 for the list defined above).
# NOTE(review): this still assumes the categorical features occupy the leading
# columns of `X` — confirm against the column order of the CSV.
cat_features_idx = np.arange(len([name for name in cat_features if name != 'inn']))

### Pool

In [None]:
# Wrap both splits in CatBoost pools, marking the categorical columns by index.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features_idx)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features_idx)

In [None]:
# Hyper-parameters of the single CatBoost baseline model.
catboost_params = dict(
    iterations=90,
    learning_rate=0.08,
    depth=5,
    loss_function='MultiClass',
    eval_metric='TotalF1',
    rsm=0.8,
    thread_count=5,
    logging_level='Silent',
)
model = CatBoostClassifier(**catboost_params)

In [None]:
# Fit on the training pool; the validation pool is passed as the evaluation set.
model.fit(X=train_pool, eval_set=val_pool, plot=False)

**Train**

In [None]:
# Flatten the prediction array to 1-D integer labels, then show the weighted F1
# on the training rows as the cell output.
y_pred = np.squeeze(model.predict(X_train)).astype(int)
f1_score(y_true=y_train, y_pred=y_pred, average='weighted')

**Validate**

In [None]:
# Flatten the prediction array to 1-D integer labels, then show the weighted F1
# on the validation rows as the cell output.
y_pred = np.squeeze(model.predict(X_val)).astype(int)
f1_score(y_true=y_val, y_pred=y_pred, average='weighted')

In [None]:
# Per-class F1 / precision / recall for the validation predictions above.
statistics(y_true=y_val, y_pred=y_pred)

In [None]:
# Names used to label the importance plot: every categorical feature except
# `inn`, which was dropped before the feature matrix was built. Filtering by
# name keeps this correct even if `cat_features` is reordered.
z = np.array([name for name in cat_features if name != 'inn'])

In [None]:
# Plot the fitted model's feature importances, labelled with the names in `z`.
draw_importance(columns=z, importances=model.feature_importances_)

**Test**

In [None]:
# Drop `inn` exactly as for the training matrix so that column positions line
# up with what the model was fitted on. `.values` is used because
# `DataFrame.as_matrix()` was deprecated and later removed from pandas.
# NOTE(review): the trailing column of `test` is discarded here, mirroring the
# target column of `train` — confirm that `test` really has such a column.
X_test = test.drop(columns=['inn']).values[:, :-1]
y_test = model.predict(X_test).squeeze().astype(int)

### BagBoo

In [None]:
rows = X.shape[0]

# Draw the bootstrap samples from the TRAINING rows only. Sampling from every
# row of `X` would put validation rows into the bags and make the validation
# scores computed on `X_val` optimistic (data leakage).
# To refit on all labelled rows for a final submission, resample from
# `np.arange(rows)` instead, once validation is finished.
train_rows = np.flatnonzero(mask_train)

# Five bags of row indices into `X`, each sampled with replacement and as large
# as the training split.
bags = [np.random.choice(train_rows, size=len(train_rows), replace=True) for _ in range(5)]

In [None]:
# Class weights: log of (number of samples / number of samples in the class),
# computed for every distinct label in `y`, in sorted label order.
class_counts = np.unique(y, return_counts=True)[1]
inverse_frequency = len(y) / class_counts
# NOTE(review): the leading 0 presumably is the weight of a label 0 that never
# occurs in `y` (labels starting at 1) — confirm against the data.
w = [0] + list(np.log(inverse_frequency))

In [None]:
# Hyper-parameters shared by every bagged model; the same as the single model
# above except for `thread_count` and the added class weights.
bag_params = dict(
    iterations=90,
    learning_rate=0.08,
    depth=5,
    loss_function='MultiClass',
    eval_metric='TotalF1',
    rsm=0.8,
    thread_count=20,
    logging_level='Silent',
    class_weights=w,
)

models = []

# Fit one model per bootstrap bag; `mask` is an array of row indices into X / y.
for mask in tqdm(bags):
    pool = Pool(X[mask], y[mask], cat_features_idx)

    model = CatBoostClassifier(**bag_params)
    model.fit(pool, eval_set=val_pool, plot=False)

    models.append(model)

In [None]:
# Stack the per-model class probabilities: (n_models, n_samples, n_classes).
prob = np.array([member.predict_proba(X_train) for member in models])

# Sum over models and take, for each sample, the column with the largest total.
# NOTE(review): the result is a probability-column index; this presumably equals
# the class label — confirm against the model's class order.
summed = prob.sum(axis=0)
y_pred = np.argsort(summed, axis=-1)[:, -1]
f1_score(y_true=y_train, y_pred=y_pred, average='weighted')

In [None]:
# Per-class F1 / precision / recall of the ensemble on the training rows.
statistics(y_true=y_train, y_pred=y_pred)

**Validate**

In [None]:
# Stack the per-model class probabilities: (n_models, n_samples, n_classes).
prob = np.array([member.predict_proba(X_val) for member in models])

# Sum over models and take, for each sample, the column with the largest total.
# NOTE(review): the result is a probability-column index; this presumably equals
# the class label — confirm against the model's class order.
summed = prob.sum(axis=0)
y_pred = np.argsort(summed, axis=-1)[:, -1]
f1_score(y_true=y_val, y_pred=y_pred, average='weighted')

In [None]:
# Per-class F1 / precision / recall of the ensemble on the validation rows.
statistics(y_true=y_val, y_pred=y_pred)

**Test**

In [None]:
# Same column layout as the training matrix: `inn` removed, trailing column
# discarded. `.values` is used because `DataFrame.as_matrix()` was deprecated
# and later removed from pandas.
# NOTE(review): confirm that the last column of `test` is not a real feature.
X_test = test.drop(columns=['inn']).values[:, :-1]

# Sum the class probabilities over all bagged models and take, for each sample,
# the column with the largest total.
# NOTE(review): the result is a probability-column index; this presumably equals
# the class label — confirm against the model's class order.
prob = np.array([member.predict_proba(X_test) for member in models])
y_test = np.argsort(np.sum(prob, axis=0))[:, -1]

In [None]:
# Write the ensemble's test predictions to disk, one label per line.
save_result(y_test=y_test)