# Generating prob for test dataset

In [1]:
import pandas as pd
import numpy as np
import category_encoders as ce
import xgboost as xgb
import lightgbm as lgb
import catboost as ctb

import sys
import json

import matplotlib.pyplot as plt

from collections import defaultdict
from tqdm import tqdm_notebook as tqdm

from sklearn.metrics import precision_score, confusion_matrix, f1_score, make_scorer, explained_variance_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer, StandardScaler

In [2]:
class CatHandler:
    encoders = {
        'ordinal': ce.OrdinalEncoder,
        'target': ce.TargetEncoder,
        'one_hot': ce.OneHotEncoder,
        'leave_one_out': ce.LeaveOneOutEncoder,
    }
    
    def __init__(self, max_categories, params, encoder_type='ordinal'):
        assert encoder_type in self.encoders.keys()
        self._max_categories = max_categories
        self._enc_type = encoder_type
        self._params = params
        self._encoder = None
        
    def fit(self, df, y=None):
        cat_cols = df.select_dtypes(object).columns.tolist() + ['Var118', 'Var73']  # categorical columns
        unique_values = sorted([(col_name, len(df[col_name].unique())) for col_name in cat_cols], key=lambda x: x[1])
        self._chosen_cat_cols = [v[0] for v in unique_values if v[1] < self._max_categories]  # cat columns with less than MAX_CAT unique cats
        self._many_cat_cols = list(set(cat_cols) - set(self._chosen_cat_cols))  # cat columns with more than MAX_CAT unique cats
        
        self._encoder = self.encoders[self._enc_type](cols=self._chosen_cat_cols, **self._params)
        if self._enc_type in ['target', 'leave_one_out']:
            assert not y is None
            self._encoder.fit(df, y)
        else:
            self._encoder.fit(df)
        
    def transform(self, df, y):
        assert not self._encoder is None
        if self._enc_type in ['target', 'leave_one_out'] and not y is None:
            df = self._encoder.transform(df, y)
        else:
            df = self._encoder.transform(df)
        df = df.drop(self._many_cat_cols, axis=1)
        return df

    def fit_transform(self, df, y=None):
        self.fit(df, y)
        return self.transform(df, y)
    
    
class MLNAFiller:
    def __init__(self, fit_na_threshold, fill_na_threshold, strategy='mean', rest_strategy='drop'):
        """
        fit_na_threshold: columns used for training
        fill_na_threshold: columns which will be tried to fill with models
        other_strategy: what to do with bad fills
        
        """
        assert fit_na_threshold < fill_na_threshold
        assert strategy in ['mean', 'median', 'drop', 'leave']
        assert rest_strategy in ['leave', 'drop']
        self._fill_na_threshold = fill_na_threshold
        self._fit_na_threshold = fit_na_threshold
        self._inputer = None
        self._strategy = strategy
        self._rest_strategy = rest_strategy
        if strategy in ['mean', 'median']:
            self._inputer = SimpleImputer(strategy=strategy)
        
    def fit(self, df):     
        self._fit_cols = df.columns[df.isna().mean() < self._fit_na_threshold].tolist()
        to_fill_cols = df.columns[df.isna().mean() <= self._fill_na_threshold]
        self._rest_cols = df.columns[df.isna().mean() > self._fill_na_threshold]
        
        scores = {}
        for fill_col in tqdm(to_fill_cols, desc='Searching for best columns...'):
            fit_cols = ~df[self._fit_cols].columns.isin([fill_col, 'class'])
            non_na_idx = ~df[fill_col].isna()
            model = xgb.sklearn.XGBRegressor(n_estimators=100, n_jobs=-1)
            X = df.loc[non_na_idx, fit_cols]
            y = df.loc[non_na_idx, fill_col]
            train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)
            train_X.reset_index(drop=True), train_y.reset_index(drop=True) 
            test_X.reset_index(drop=True), test_y.reset_index(drop=True) 
            model.fit(train_X, train_y)
            pred_y = model.predict(test_X)
            score = explained_variance_score(test_y, pred_y)
            scores[fill_col] = score
            
        chosen_to_fill = [k for k in scores if scores[k] >= 0.8]
        self._inputer_cols = [k for k in scores if scores[k] < 0.8]
        self._models = {}
        
        for fill_col in tqdm(chosen_to_fill, desc='Fitting for best columns...'):
            fit_cols = ~df[self._fit_cols].columns.isin([fill_col, 'class'])
            non_na_idx = ~df[fill_col].isna()
            model = xgb.sklearn.XGBRegressor(n_estimators=100, n_jobs=-1)
            train_X = df.loc[non_na_idx, fit_cols]
            train_y = df.loc[non_na_idx, fill_col]
            model.fit(train_X, train_y)
            self._models[fill_col] = model
            
        if self._strategy in ['mean', 'median']:
            self._inputer.fit(df[self._inputer_cols])
    
    def transform(self, df):
        df = df.copy()
            
        for fill_col, model in tqdm(self._models.items()):
            fit_cols = ~df[self._fit_cols].columns.isin([fill_col, 'class'])
            pred_X = df.loc[:, fit_cols]
            pred_y = model.predict(pred_X)
            df.loc[:, fill_col] = df.loc[:, fill_col].fillna(pd.Series(pred_y))
            
        if self._strategy in ['mean', 'median']:
            df[self._inputer_cols] = self._inputer.transform(df[self._inputer_cols])
        elif self._strategy == 'drop':
            df = df.drop(self._inputer_cols, axis=1)
            
        if self._rest_strategy == 'drop':
            df = df.drop(self._rest_cols, axis=1)
            
        return df
    
    def fit_transform(self, df):
        self.fit(df)
        return self.transform(df)
    
    
class Preprocessor:
    def __init__(self, cat_handler, na_filler):
        self.cat_handler = cat_handler
        self.na_filler = na_filler
        self.scaler = StandardScaler()
        
    def fit(self, df, y=None):
        na_cols = df.columns[df.isna().all()].tolist()
        df = df.drop(na_cols, axis=1)
        self.na_cols = na_cols
        
        df = self.cat_handler.fit_transform(df, y)
        non_normalize_cols = self.cat_handler._chosen_cat_cols
        features = df.loc[:, ~df.columns.isin(non_normalize_cols)]
        features_cols = features.columns
        df.loc[:, ~df.columns.isin(non_normalize_cols)] = pd.DataFrame(self.scaler.fit_transform(features), columns=features_cols)
        df = self.na_filler.fit(df)
        
    def transform(self, df, y=None):
        df = df.drop(self.na_cols, axis=1)
        df = self.cat_handler.transform(df, y)
        non_normalize_cols = self.cat_handler._chosen_cat_cols
        features = df.loc[:, ~df.columns.isin(non_normalize_cols)]
        features_cols = features.columns
        df.loc[:, ~df.columns.isin(non_normalize_cols)] = pd.DataFrame(self.scaler.transform(features), columns=features_cols)
        df = self.na_filler.transform(df)
        return df
        
    def fit_transform(self, df, y=None):
        self.fit(df, y)
        return self.transform(df, y)

In [3]:
train_df = pd.read_csv('../data/train.txt', sep=' ')
train_df = train_df.reset_index(drop=True)
print(f'Columns: {len(train_df.columns)}')
train_df.head()

Columns: 231


Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230,class
0,,,,,,931.0,7.0,,,,...,catzS2D,LM8l689qOp,,ELof,szEZ,ZI9m,ib5G6X1eUxUn6,,,0
1,,,,,,245.0,7.0,,,,...,bTV7qqc,LM8l689qOp,,,fKCe,RAYp,F2FyR07IdsN7I,,,0
2,,,,,,791.0,7.0,,,,...,2JfQ3DB,jySVZNlOJy,,ELof,7aLG,RAYp,F2FyR07IdsN7I,am7c,,0
3,,,,,,1036.0,7.0,,,,...,hHJsvbM,LM8l689qOp,,,Qcbd,6fzt,SbOd7O8ky1wGNxp0Arj0Xs,,,0
4,,,,,,518.0,7.0,,,,...,APgVoGr,LM8l689qOp,,,kwS7,02N6s8f,xwM2aC7IdeMC0,,,0


In [4]:
test_df = pd.read_csv('../data/testx.txt', sep=' ')
test_df = test_df.reset_index(drop=True)
print(f'Columns: {len(test_df.columns)}')
test_df.head()

Columns: 230


Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230
0,,,,,,,0.0,,,,...,oslk,CE7uk3u,LM8l689qOp,,,FSa2,RAYp,F2FyR07IdsN7I,,
1,,,,,,1141.0,7.0,,,,...,oslk,EPqQcw6,LM8l689qOp,,kG3k,FSa2,RAYp,55YFVY9,,
2,,,,,,490.0,7.0,,,,...,zCkv,catzS2D,LM8l689qOp,,kG3k,WqMG,ZI9m,ib5G6X1eUxUn6,mj86,
3,,,,,,2268.0,0.0,,,,...,oslk,QKXEsaq,LM8l689qOp,,xG3x,Qu4f,RAYp,F2FyR07IdsN7I,,
4,,,,,,3633.0,7.0,,,,...,oslk,kYwEsaq,LM8l689qOp,,,PM2D,RAYp,F2FyR07IdsN7I,,


# Catboost

In [5]:
c = CatHandler(max_categories=2000, params={}, encoder_type='ordinal')
m = MLNAFiller(fit_na_threshold=0.1, fill_na_threshold=0.3, strategy='leave', rest_strategy='leave')
p = Preprocessor(c, m)

In [6]:
train_X = train_df.drop('class', axis=1)
train_y = train_df['class']
test_X = test_df

train_X = p.fit_transform(train_X, train_y)
test_X = p.transform(test_X)

HBox(children=(IntProgress(value=0, description='Searching for best columns...', max=70, style=ProgressStyle(d…




HBox(children=(IntProgress(value=0, description='Fitting for best columns...', max=15, style=ProgressStyle(des…




HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [7]:
model = ctb.CatBoostClassifier(iterations=1000, depth=6, eval_metric="AUC", task_type='GPU', logging_level='Silent', l2_leaf_reg=1)
model.fit(train_X, train_y, cat_features=p.cat_handler._chosen_cat_cols)

<catboost.core.CatBoostClassifier at 0x7f634aa00a20>

In [8]:
y_train_proba = model.predict_proba(train_X)[:,1]
auc = roc_auc_score(train_y.values, y_train_proba)
auc

0.8997191503568653

In [9]:
y_preds_proba = model.predict_proba(test_X)[:,1]
y_preds_proba

array([0.00220563, 0.0672546 , 0.03408449, ..., 0.0962345 , 0.03364061,
       0.01599291])

In [10]:
with open('../BARPAS.txt', 'w') as f:
    f.write('"BARPAS"\n')
    for x in y_preds_proba:
        f.write(f'{x}\n')