# Logistic regression + naive bayes

In [42]:
import pandas as pd
import numpy as np
import category_encoders as ce
import xgboost as xgb

import sys
import json

from collections import defaultdict
from tqdm import tqdm_notebook as tqdm

from sklearn.metrics import precision_score, confusion_matrix, f1_score, make_scorer, explained_variance_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.naive_bayes import GaussianNB

In [2]:
class CatHandler:
    encoders = {
        'ordinal': ce.OrdinalEncoder,
        'target': ce.TargetEncoder,
        'one_hot': ce.OneHotEncoder,
        'leave_one_out': ce.LeaveOneOutEncoder,
    }
    
    def __init__(self, max_categories, params, encoder_type='ordinal'):
        assert encoder_type in self.encoders.keys()
        self._max_categories = max_categories
        self._enc_type = encoder_type
        self._params = params
        self._encoder = None
        
    def fit(self, df, y=None):
        cat_cols = df.select_dtypes(object).columns.tolist() + ['Var118', 'Var73']  # categorical columns
        unique_values = sorted([(col_name, len(df[col_name].unique())) for col_name in cat_cols], key=lambda x: x[1])
        self._chosen_cat_cols = [v[0] for v in unique_values if v[1] < self._max_categories]  # cat columns with less than MAX_CAT unique cats
        self._many_cat_cols = list(set(cat_cols) - set(self._chosen_cat_cols))  # cat columns with more than MAX_CAT unique cats
        
        self._encoder = self.encoders[self._enc_type](cols=self._chosen_cat_cols, **self._params)
        if self._enc_type in ['target', 'leave_one_out']:
            assert not y is None
            self._encoder.fit(df, y)
        else:
            self._encoder.fit(df)
        
    def transform(self, df, y):
        assert not self._encoder is None
        if self._enc_type in ['target', 'leave_one_out'] and not y is None:
            df = self._encoder.transform(df, y)
        else:
            df = self._encoder.transform(df)
        df = df.drop(self._many_cat_cols, axis=1)
        return df

    def fit_transform(self, df, y=None):
        self.fit(df, y)
        return self.transform(df, y)
    
    
class MLNAFiller:
    def __init__(self, fit_na_threshold, fill_na_threshold, strategy='mean', rest_strategy='drop'):
        """
        fit_na_threshold: columns used for training
        fill_na_threshold: columns which will be tried to fill with models
        other_strategy: what to do with bad fills
        
        """
        assert fit_na_threshold < fill_na_threshold
        assert strategy in ['mean', 'median', 'drop', 'leave']
        assert rest_strategy in ['leave', 'drop']
        self._fill_na_threshold = fill_na_threshold
        self._fit_na_threshold = fit_na_threshold
        self._inputer = None
        self._strategy = strategy
        self._rest_strategy = rest_strategy
        if strategy in ['mean', 'median']:
            self._inputer = SimpleImputer(strategy=strategy)
        
    def fit(self, df):     
        self._fit_cols = df.columns[df.isna().mean() < self._fit_na_threshold].tolist()
        to_fill_cols = df.columns[df.isna().mean() <= self._fill_na_threshold]
        self._rest_cols = df.columns[df.isna().mean() > self._fill_na_threshold]
        
        scores = {}
        for fill_col in tqdm(to_fill_cols, desc='Searching for best columns...'):
            fit_cols = ~df[self._fit_cols].columns.isin([fill_col, 'class'])
            non_na_idx = ~df[fill_col].isna()
            model = xgb.sklearn.XGBRegressor(n_estimators=100, n_jobs=-1)
            X = df.loc[non_na_idx, fit_cols]
            y = df.loc[non_na_idx, fill_col]
            train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)
            train_X.reset_index(drop=True), train_y.reset_index(drop=True) 
            test_X.reset_index(drop=True), test_y.reset_index(drop=True) 
            model.fit(train_X, train_y)
            pred_y = model.predict(test_X)
            score = explained_variance_score(test_y, pred_y)
            scores[fill_col] = score
            
        chosen_to_fill = [k for k in scores if scores[k] >= 0.8]
        self._inputer_cols = [k for k in scores if scores[k] < 0.8]
        self._models = {}
        
        for fill_col in tqdm(chosen_to_fill, desc='Fitting for best columns...'):
            fit_cols = ~df[self._fit_cols].columns.isin([fill_col, 'class'])
            non_na_idx = ~df[fill_col].isna()
            model = xgb.sklearn.XGBRegressor(n_estimators=100, n_jobs=-1)
            train_X = df.loc[non_na_idx, fit_cols]
            train_y = df.loc[non_na_idx, fill_col]
            model.fit(train_X, train_y)
            self._models[fill_col] = model
            
        if self._strategy in ['mean', 'median']:
            self._inputer.fit(df[self._inputer_cols])
    
    def transform(self, df):
        df = df.copy()
            
        for fill_col, model in tqdm(self._models.items()):
            fit_cols = ~df[self._fit_cols].columns.isin([fill_col, 'class'])
            pred_X = df.loc[:, fit_cols]
            pred_y = model.predict(pred_X)
            df.loc[:, fill_col] = df.loc[:, fill_col].fillna(pd.Series(pred_y))
            
        if self._strategy in ['mean', 'median']:
            df[self._inputer_cols] = self._inputer.transform(df[self._inputer_cols])
        elif self._strategy == 'drop':
            df = df.drop(self._inputer_cols, axis=1)
            
        if self._rest_strategy == 'drop':
            df = df.drop(self._rest_cols, axis=1)
            
        return df
    
    def fit_transform(self, df):
        self.fit(df)
        return self.transform(df)
    
    
class Preprocessor:
    def __init__(self, cat_handler, na_filler):
        self.cat_handler = cat_handler
        self.na_filler = na_filler
        self.scaler = StandardScaler()
        
    def fit(self, df, y=None):
        na_cols = df.columns[df.isna().all()].tolist()
        df = df.drop(na_cols, axis=1)
        self.na_cols = na_cols
        
        df = self.cat_handler.fit_transform(df, y)
        non_normalize_cols = self.cat_handler._chosen_cat_cols
        features = df.loc[:, ~df.columns.isin(non_normalize_cols)]
        features_cols = features.columns
        df.loc[:, ~df.columns.isin(non_normalize_cols)] = pd.DataFrame(self.scaler.fit_transform(features), columns=features_cols)
        df = self.na_filler.fit(df)
        
    def transform(self, df, y=None):
        df = df.drop(self.na_cols, axis=1)
        df = self.cat_handler.transform(df, y)
        non_normalize_cols = self.cat_handler._chosen_cat_cols
        features = df.loc[:, ~df.columns.isin(non_normalize_cols)]
        features_cols = features.columns
        df.loc[:, ~df.columns.isin(non_normalize_cols)] = pd.DataFrame(self.scaler.transform(features), columns=features_cols)
        df = self.na_filler.transform(df)
        return df
        
    def fit_transform(self, df, y=None):
        self.fit(df, y)
        return self.transform(df, y)

## Part 1 - filling means

In [3]:
df = pd.read_csv('../data/train.txt', sep=' ')
df = df.reset_index(drop=True)
print(f'Columns: {len(df.columns)}')
df.head()

Columns: 231


Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230,class
0,,,,,,931.0,7.0,,,,...,catzS2D,LM8l689qOp,,ELof,szEZ,ZI9m,ib5G6X1eUxUn6,,,0
1,,,,,,245.0,7.0,,,,...,bTV7qqc,LM8l689qOp,,,fKCe,RAYp,F2FyR07IdsN7I,,,0
2,,,,,,791.0,7.0,,,,...,2JfQ3DB,jySVZNlOJy,,ELof,7aLG,RAYp,F2FyR07IdsN7I,am7c,,0
3,,,,,,1036.0,7.0,,,,...,hHJsvbM,LM8l689qOp,,,Qcbd,6fzt,SbOd7O8ky1wGNxp0Arj0Xs,,,0
4,,,,,,518.0,7.0,,,,...,APgVoGr,LM8l689qOp,,,kwS7,02N6s8f,xwM2aC7IdeMC0,,,0


In [4]:
c = CatHandler(max_categories=2000, params={'smoothing': 300}, encoder_type='target')
m = MLNAFiller(fit_na_threshold=0.1, fill_na_threshold=0.3)
p = Preprocessor(c, m)

In [79]:
df = p.fit_transform(df)
X = df.drop('class', axis=1)
y = df['class']

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


HBox(children=(IntProgress(value=0, description='Searching for best columns...', max=70, style=ProgressStyle(d…




HBox(children=(IntProgress(value=0, description='Fitting for best columns...', max=15, style=ProgressStyle(des…






HBox(children=(IntProgress(value=0, max=15), HTML(value='')))




In [83]:
model = LogisticRegression(class_weight='balanced')
params = {
    'class_weight': ['balanced'],
    'C': [1, 3, 6, 9, 12 ,15],
    'solver': ['newton-cg', 'liblinear']
}

clf = GridSearchCV(estimator=model, param_grid=params, scoring='roc_auc' ,cv=3, n_jobs=-1)
clf.fit(X, y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'class_weight': ['balanced'], 'C': [1, 3, 6, 9, 12, 15], 'solver': ['newton-cg', 'liblinear']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [84]:
clf.best_params_

{'C': 15, 'class_weight': 'balanced', 'solver': 'newton-cg'}

In [85]:
clf.cv_results_



{'mean_fit_time': array([ 8.95195826, 24.02596529,  9.01067241, 33.14362152,  7.25270788,
        40.52217325, 11.09555101, 31.85800616,  8.23669402, 26.6539309 ,
        10.03358229, 17.21585655]),
 'std_fit_time': array([0.67891729, 3.13602634, 1.34455902, 3.84561155, 0.79757415,
        1.49706325, 1.06590489, 6.63194749, 1.08488833, 1.2791305 ,
        0.32122748, 0.93630782]),
 'mean_score_time': array([0.06707358, 0.03722405, 0.02646693, 0.07146184, 0.03120279,
        0.04109526, 0.04863564, 0.03675119, 0.02642051, 0.01738167,
        0.02207065, 0.00822735]),
 'std_score_time': array([0.01366012, 0.00194921, 0.00216691, 0.02459016, 0.01155402,
        0.00963554, 0.01282431, 0.0333763 , 0.00463831, 0.00201464,
        0.00707541, 0.00079619]),
 'param_C': masked_array(data=[1, 1, 3, 3, 6, 6, 9, 9, 12, 12, 15, 15],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
        fill_value='?',
             dtyp

In [7]:
df = pd.read_csv('../data/train.txt', sep=' ')
df = df.reset_index(drop=True)
print(f'Columns: {len(df.columns)}')
df.head()

Columns: 231


Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230,class
0,,,,,,931.0,7.0,,,,...,catzS2D,LM8l689qOp,,ELof,szEZ,ZI9m,ib5G6X1eUxUn6,,,0
1,,,,,,245.0,7.0,,,,...,bTV7qqc,LM8l689qOp,,,fKCe,RAYp,F2FyR07IdsN7I,,,0
2,,,,,,791.0,7.0,,,,...,2JfQ3DB,jySVZNlOJy,,ELof,7aLG,RAYp,F2FyR07IdsN7I,am7c,,0
3,,,,,,1036.0,7.0,,,,...,hHJsvbM,LM8l689qOp,,,Qcbd,6fzt,SbOd7O8ky1wGNxp0Arj0Xs,,,0
4,,,,,,518.0,7.0,,,,...,APgVoGr,LM8l689qOp,,,kwS7,02N6s8f,xwM2aC7IdeMC0,,,0


In [29]:
train_df, test_df = train_test_split(df, test_size=0.3)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [38]:
c = CatHandler(max_categories=60, params={'smoothing': 300}, encoder_type='target')
m = MLNAFiller(fit_na_threshold=0.1, fill_na_threshold=0.3, strategy='median')
p = Preprocessor(c, m)

In [39]:
train_X = train_df.drop('class', axis=1)
train_y = train_df['class']
test_X = test_df.drop('class', axis=1)
test_y = test_df['class']

train_X = p.fit_transform(train_X, train_y)
test_X = p.transform(test_X)

HBox(children=(IntProgress(value=0, description='Searching for best columns...', max=64, style=ProgressStyle(d…




HBox(children=(IntProgress(value=0, description='Fitting for best columns...', max=17, style=ProgressStyle(des…




HBox(children=(IntProgress(value=0, max=17), HTML(value='')))




HBox(children=(IntProgress(value=0, max=17), HTML(value='')))




In [40]:
model = LogisticRegression(C=15, class_weight='balanced', solver='newton-cg')
model.fit(train_X, train_y)

LogisticRegression(C=15, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='newton-cg', tol=0.0001, verbose=0, warm_start=False)

In [41]:
def custom_score(y_true, y_pred):
    ind = np.argsort(y_pred)[::-1]
    check_size = int(np.ceil(0.1 * len(y_true)))
    y_true = y_true[ind][:check_size]
    return np.mean(y_true)

y_preds_proba = model.predict_proba(test_X)[:,1]
c_score = custom_score(test_y, y_preds_proba)
auc = roc_auc_score(test_y, y_preds_proba)
c_score, auc

(0.32666666666666666, 0.8001066414455243)

# Naive Bayes

In [43]:
df = pd.read_csv('../data/train.txt', sep=' ')
df = df.reset_index(drop=True)
print(f'Columns: {len(df.columns)}')
df.head()

Columns: 231


Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230,class
0,,,,,,931.0,7.0,,,,...,catzS2D,LM8l689qOp,,ELof,szEZ,ZI9m,ib5G6X1eUxUn6,,,0
1,,,,,,245.0,7.0,,,,...,bTV7qqc,LM8l689qOp,,,fKCe,RAYp,F2FyR07IdsN7I,,,0
2,,,,,,791.0,7.0,,,,...,2JfQ3DB,jySVZNlOJy,,ELof,7aLG,RAYp,F2FyR07IdsN7I,am7c,,0
3,,,,,,1036.0,7.0,,,,...,hHJsvbM,LM8l689qOp,,,Qcbd,6fzt,SbOd7O8ky1wGNxp0Arj0Xs,,,0
4,,,,,,518.0,7.0,,,,...,APgVoGr,LM8l689qOp,,,kwS7,02N6s8f,xwM2aC7IdeMC0,,,0


In [44]:
train_df, test_df = train_test_split(df, test_size=0.3)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [63]:
c = CatHandler(max_categories=60, params={'smoothing': 300}, encoder_type='target')
m = MLNAFiller(fit_na_threshold=0.1, fill_na_threshold=0.3, strategy='median')
p = Preprocessor(c, m)

In [64]:
train_X = train_df.drop('class', axis=1)
train_y = train_df['class']
test_X = test_df.drop('class', axis=1)
test_y = test_df['class']

train_X = p.fit_transform(train_X, train_y)
test_X = p.transform(test_X)

HBox(children=(IntProgress(value=0, description='Searching for best columns...', max=64, style=ProgressStyle(d…




HBox(children=(IntProgress(value=0, description='Fitting for best columns...', max=17, style=ProgressStyle(des…




HBox(children=(IntProgress(value=0, max=17), HTML(value='')))




HBox(children=(IntProgress(value=0, max=17), HTML(value='')))




In [65]:
model = GaussianNB()
model.fit(train_X, train_y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [66]:
y_preds_proba = model.predict_proba(test_X)[:,1]
c_score = custom_score(test_y, y_preds_proba)
auc = roc_auc_score(test_y, y_preds_proba)
c_score, auc

(0.16583333333333333, 0.6937247807387528)