In [1]:
import numpy as np
import pandas as pd
# from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer


In [2]:
def get_overview(*dfs, topn=5, onlyin=True):
    '''
    This function provides a side-by-side overview of the dataframes.
    Parameters
    ----------
    dfs : pd.DataFrame or pd.DataFrame, str
        A sequence of dataframes that can be accompanied with string suffixes.
    topn : int
        Number of most frequent values for each column of each dataframe.
    onlyin : bool
        With multiple dataframes, show the number of unique values exclusive to each dataframe.
    '''
    from pandas.api.types import is_numeric_dtype
    params = {'dfs': [], 'suffix': []}
    for p in dfs:
        if type(p) is pd.DataFrame:
            params['dfs'].append(p)
        elif type(p) is str:
            params['suffix'].append(p)
        else:
            raise TypeError('First arguments must be either DataFrame or str. '
                            'Use keyword parameters to set `topn` and `onlyin`.')
    dfs, suffix = params['dfs'], params['suffix']
    assert (len(dfs) > 0), 'No dataframe is found.'
    suffix = [str(i) for i in range(1, len(dfs)+1)] if (len(suffix) != len(dfs)) else suffix
    
    def na_percent(df, col):
        na_cnt = df[col].isna().sum()
        if na_cnt == 0:
            return '-'
        if na_cnt == df[col].shape[0]:
            return 'ALL'
        p = na_cnt / df[col].shape[0] * 100
        if p < 1:
            return '<1%'
        if p > 99:
            return '>99%'
        return f'{round(p)}%'
    
    all_stats = None
    for i, df in enumerate(dfs):        
        df_stats = pd.DataFrame(df.dtypes, columns=[f'dtypes{suffix[i]}'])
        df_stats[f'nan{suffix[i]}'] = df_stats.index.map(lambda col: na_percent(df, col))
        df_stats[f'uniques{suffix[i]}'] = df_stats.index.map(lambda col: df[col].nunique()).astype('str')
        df_stats[f'minMedMax{suffix[i]}'] = df_stats.index.map(lambda col: 
                                                           f'[{df[col].min()} < {df[col].median()} < {df[col].max()}]'
                                                           if is_numeric_dtype(df[col]) else '?')
        df_stats[f'top{topn}values{suffix[i]}'] = df_stats.index.map(lambda col: df[col].value_counts().index.values[:topn])
        if all_stats is not None:
            all_stats = pd.merge(all_stats, df_stats, left_index=True, right_index=True, how='outer')
        else:
            all_stats = df_stats
    
    all_stats.index.name = 'Columns'
    if len(dfs) == 1 or not onlyin:
        all_stats_cols_order = [col+sx for col in ['dtypes','nan','uniques','minMedMax',f'top{topn}values'] for sx in suffix]
        return all_stats[all_stats_cols_order].fillna('-')
    
    for col in all_stats.index:
        unique_vals = {}
        for i in range(len(dfs)):
            unique_vals[i] = set(dfs[i][col].dropna()) if col in dfs[i].columns else set()
        for i, df in enumerate(dfs):
            if col in df.columns:
                onlyin = unique_vals[i]
                for j in range(len(dfs)):
                    if i != j:
                        onlyin = onlyin - unique_vals[j]
                all_stats.loc[col, f'uniques{suffix[i]}'] += f' ({len(onlyin)})'
                                                             
    index_order = all_stats.isna().sum(axis=1).sort_values().index    
    cols_order = [col+sx for col in ['dtypes', 'nan', 'uniques', 'minMedMax', f'top{topn}values'] for sx in suffix]
    return all_stats.loc[index_order, cols_order].fillna('___')

In [3]:
from abc import ABC, abstractmethod
from sklearn.base import BaseEstimator

class BaseTransformer(ABC, BaseEstimator):
    '''
    Base abstract class for the encoders created in this section.
    '''
    
    def __init__(self):
        self.model = None
    
    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)
    
    @abstractmethod
    def fit(self, X, y):        
        '''
        Build model.
        '''
        assert (X.index.equals(y.index)), 'Mismatched X and y.'        
        pass
    
    @abstractmethod
    def transform(self, X):
        assert (self.model is not None), 'Fit model first.'
        pass
    

class TextExpander(BaseTransformer):
    
    def __init__(self, new_part_labels, splitters=[''], column=None, copy=True, keep_column=False):
        '''        
        Parameters
        ----------
        splitters : list of str or regex
            These strs are sequentially used to split the texts into two parts every time. Use '' (empty str) for whitespace.
            Example: ['', r'\(|\)|\.', '.', ' ']
        new_part_labels : list of str
            Names for the newly created columns in the same order. Use `None` to skip unwanted parts. 
            Example:['firstname', None, 'lastname']
        column : str
            Name of the default column that is used for the transformation.
        copy : bool
            If False, passed dataframe will be expanded.
        keep_column : bool
            If True, the transformed column will be removed from the output.
        '''
        assert (len(splitters) > 0), 'No splitter is defined.'
        assert (len(splitters) + 1 == len(new_part_labels)), 'len(new_part_labels) must be equal to len(splitters)+1'
        super()
        self.model = True
        self.new_part_labels = new_part_labels
        self.splitters = splitters
        self.column = column
        self.copy = copy
        self.keep_column = keep_column
    
    def fit(self, X, y=None):
        super().fit(X, X)
        self.model = True

    def transform(self, X):        
        assert (self.column is not None or type(X) is pd.Series or len(X.columns) == 1), 'Transform one column at a time.'
        super().transform(X)        
        X = X.copy() if self.copy else X
        X = pd.DataFrame(X) if type(X) is pd.Series else X
        column = X.columns[0] if self.column is None or len(X.columns) == 1 else self.column
        assert (column in X.columns), f'X does not have a {column} column.'
        tmp = X[column]
        for i, sp in enumerate(self.splitters):            
            pat = None if sp == '' else sp
            tmp = tmp.str.split(pat, n=1, expand=True)
            partA = self.new_part_labels[i]
            if partA is not None:
                X[partA] = tmp[0]            
            tmp = tmp[1]        
        if self.new_part_labels[-1] is not None:
            X[self.new_part_labels[-1]] = tmp
        X = X.drop(columns=column, axis=1) if not self.keep_column else X
        return X

In [4]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, mutual_info_classif, chi2
from statsmodels.stats.outliers_influence import variance_inflation_factor

class FeatureSelector(BaseTransformer):
    '''Select features according to the k highest scores and and a specified multicollinearity.
    Parameters
    ----------
    k : int
        Number of top features to select. Maximum number of features to select if `max_variance_inflation` is set.
    score_func : callable or str from {'chi2', 'mutual_info_classif', 'f_classif'}
        Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues) or a single array with scores.
        Default is 'chi2'.
    variance_inflation_factor : bool or float
        A positive value that shows the amount of inflation that is allowed between features.
        Set it to False to ignore multicollinearity. Default is 10.0.
        ref: https://machinelearningmastery.com/feature-selection-machine-learning-python/
        ref: https://towardsdatascience.com/how-to-identify-the-right-independent-variables-for-machine-learning-supervised-algorithms-439986562d32
    Attributes
    ----------
    model : list
        List of selected features.
    scores_ : list
        A serie of features and their scores in an ascending order of the scores.
    '''
    
    def __init__(self, k, score_func='chi2', max_variance_inflation=10.0):
        self.k = k
        functions = {'chi2': chi2, 'mutual_info_classif': mutual_info_classif, 'f_classif': f_classif}
        self.score_func = score_func if callable(score_func) else functions[score_func]
        self.max_variance_inflation = max_variance_inflation
        self.scores_ = None
    
    def fit(self, X, y):
        '''This algorithm removes the feature with highest factor in every iteration until all remaining features
        have the required max_variance_inflation value. Then it chooses top k features according to their scores.
        '''
        super().fit(X, y)
        assert (len(X.columns) >= self.k)        
        kBest = SelectKBest(score_func=self.score_func)
        kBest.fit(X, y)
        self.scores_ = pd.Series(kBest.scores_, index=X.columns).sort_values(ascending=False)
        self.model = list(self.scores_.iloc[:self.k].index)
        
        if self.max_variance_inflation:                        
            columns = set(X.columns)
            remove = True
            while remove:
                features = list(columns)
                vif = pd.DataFrame()                
                vif['features'] = features
                vif['factor'] = [variance_inflation_factor(X[features].values, i) for i in range(len(features))]                
                if vif[vif['factor'] > self.max_variance_inflation].shape[0] == 0:                    
                    remove = False
                else:                    
                    # remove the column with highest factor value
                    col_to_remove = vif.loc[vif['factor'].idxmax(), 'features']                    
                    columns.remove(col_to_remove)            
            self.scores_ = self.scores_[columns].sort_values(ascending=False)
            self.model = list(self.scores_.iloc[:self.k].index)
    
    def transform(self, X):
        return X[self.model]

In [5]:
class TopFeatureClassifier:
    """
    Estimation based on single most descriptive feature.
    
    parameters
    ----------
    strategy : str        
        percent: for numerical features in range [0, 100], all samples equal or above 50 will be positive.        
        singleout: find a single feature value that best splits data into positive and negative.
        mxtptn: mark feature values in a way that maximizes tp + tn.
    
    attribute
    ---------
    model_ : dict        
    """
    
    def __init__(self, strategy='distrib'):
        assert (strategy in ['percent','singleout','mxtptn']), 'strategy must be one of these: percent, singleout, or mxtptn.'
        self.strategy = strategy
        self.model_ = {'col': None, 'val': None, 'pred': None}
        
    def fit(self, X, y):
        assert (X.index.equals(y.index)), 'Mismatched X and y.'        
        
        if self.strategy == 'percent':
            self.model_['val'], self.model_['pred'] = 50, 1
            max_acc = 0.0
            columns = (col for col in X if (X[col].min() >= 0 and X[col].max() <= 100))
            for col in columns:
                corrects = 0
                acc = X[((X[col] >= 50) & (y == 1)) | ((X[col] < 50) & (y == 0))].shape[0] / X.shape[0]
                if acc > max_acc:
                    self.model_['col'] = col
                    max_acc = acc
        
        elif self.strategy == 'singleout':
            max_acc = 0.0
            for col in X:
                pcat, tp1 = list(X.loc[y==1, col].value_counts().items())[0]
                tn1 = X[(X[col] != pcat) & (y==0)].shape[0]
                ncat, tn2 = list(X.loc[y==0, col].value_counts().items())[0]
                tp2 = X[(X[col] != ncat) & (y==1)].shape[0]
                plan = (1, col, pcat) if tp1 + tn1 >= tp2 + tn2 else (0, col, ncat)
                acc = (1/X.shape[0]) * (X[(y==plan[0]) & (X[plan[1]] == plan[2])].shape[0] + 
                                        X[(y==(1-plan[0])) & (X[plan[1]] != plan[2])].shape[0])
                if acc > max_acc:
                    self.model_['pred'], self.model_['col'], self.model_['val'] = plan[0], plan[1], plan[2]
                    max_acc = acc                    
        
        else: # self.strategy == 'mxtptn'
            self.model_['pred'] = 1
            max_acc = 0.0
            for col in X:
                corrects = 0
                uniqs = X[col].unique()
                o = set()
                for v in uniqs:
                    positives = X[(X[col] == v) & (y==1)].shape[0]
                    negatives = X[(X[col] == v) & (y==0)].shape[0]
                    if positives >= negatives:         # always mark v as positive
                        o.add(v)
                        corrects += positives
                    else:                              # always mark v as negative
                        corrects += negatives
                if corrects / X.shape[0] > max_acc:
                    self.model_['col'] = col
                    self.model_['val'] = o
                    max_acc = corrects / X.shape[0]
    
    def predict(self, X):
        assert(self.model_ != None), 'Classifier is not trained'
        col, val, pred = self.model_['col'], self.model_['val'], self.model_['pred']
        predictions = pd.Series(1-pred, name='predictions', index=X.index)
        if self.strategy == 'percent':
            predictions[X[col] >= val] = pred
        elif self.strategy == 'singleout':
            predictions[X[col] == val] = pred
        else: # self.strategy == 'mxtptn'
            for v in val:
                predictions[X[col] == v] = pred
        return predictions
    
    def __str__(self):
        col, val, pred = self.model_['col'], self.model_['val'], self.model_['pred']
        if col == None:
            return f'{type(self).__name__}(strategy={self.strategy})'
        if self.strategy == 'percent':
            return f'all X[{col}] >= {val} are {pred} otherwise {1-pred}.'
        elif self.strategy == 'singleout':
            return f'all X[{col}] == {val} are {pred} otherwise {1-pred}.'
        else: # self.strategy == 'mxtptn'
            return f'all X[{col}] in {val} are {pred} otherwise {1-pred}.'

In [6]:
class MyImputer(BaseTransformer):
    def __init__(self, strategy='most_frequent', columns=None, copy=True):
        self.strategy = strategy
        self.columns = columns
        self.copy= copy
        
    def fit(self, X, y):
        super().fit(X, y)
        self.columns = X.columns if self.columns is None else self.columns
        self.model = {}        
        for column in self.columns:
            if self.strategy == 'median':                
                self.model[column] = X[column].median()
            elif self.strategy == 'mean':
                self.model[column] = X[column].mean()
            else:                
                self.model[column] = X[column].mode()[0]
    
    def transform(self, X):        
        X = X.copy() if self.copy else X
        for column in self.model:            
            X.loc[X[column].isna(), column] = self.model[column]            
        return X

In [7]:
class Mapper:
    def __init__(self, X, y):        
        self._X = X
        self._y = y
    
    def get_maps(self, min_bin_size, columns=None, overlap_categories=True):
        '''
        ----        
        min_bin_size: dict, int, or float [0.0 < float < 1.0]
        '''
        return self._get_maps(min_bin_size, columns, overlap_categories)[0]
    
    def _get_maps(self, min_bin_size, columns=None, overlap_categories=True):
        if type(min_bin_size) is dict:
            for k in min_bin_size:
                if type(min_bin_size[k]) is int:
                    assert min_bin_size[k] >= 1
                else:
                    assert 0.0 < min_bin_size[k] < 1.0
                    min_bin_size[k] = int(min_bin_size[k] * len(self._y))
        else:
            if type(min_bin_size) is int:
                assert min_bin_size >= 1
            else:
                assert 0.0 < min_bin_size < 1.0
                min_bin_size = int(min_bin_size * len(self._y))
        
        class Map:
            def __init__(self, mapping, na_code):
                self.mapping = pd.Series(mapping, dtype='float32', name='codes')
                if pd.isna(self.mapping.index).sum() == 0:
                    # if we have zero data about NaN, set its code to the positive rate
                    self.na_cat_code = na_code
                    self.mapping = self.mapping.append(pd.Series({np.nan:self.na_cat_code}))
                else:
                    self.na_cat_code = self.mapping[pd.isna(self.mapping.index)].values[0]
            def __str__(self):
                return 'Map:\nColumn Value -> Category Code\n' + str(self.mapping)
            def __repr__(self):
                return 'Map:\nColumn Value -> Category Code\n' + repr(self.mapping)
            def __call__(self, key):            
                return self[key]
            def __getitem__(self, key):
                try:
                    return self.mapping[key]
                except:
                    return self.na_cat_code

        from pandas.api.types import is_numeric_dtype
        y = self._y.copy(deep=True)
        df = self._X.copy(deep=True).join(y, how='inner')
        columns = self._X.columns if columns is None else columns
        y_col = y.name    
        maps, stats = {}, {}        
        for col in columns:            
            mbz = min_bin_size[col] if type(min_bin_size) is dict else min_bin_size
                        
            if is_numeric_dtype(df[col]):
                # covert to ordinal intervals                
                bin_count = max(df.shape[0] // mbz, 2)
#                 bins = pd.interval_range(df[col].min(), df[col].max(), bin_count)
                bins = pd.cut(df[col], bin_count, duplicates='drop').dtype.categories.sort_values()
                bins = pd.IntervalIndex.from_tuples(
                    [(float('-inf'), bins[0].right)] + 
                    bins[1:-1].to_tuples().tolist() + 
                    [(bins[-1].left, float('inf'))])
                df[col] = pd.cut(df[col], bins=bins, duplicates='drop')
            
            # remove outlier categories (values that do not have enough representatives)
            vcounts = df[col].value_counts()
            topn = len(vcounts[vcounts >= mbz])
            keep, setna = set(vcounts.iloc[:topn].index), set(vcounts.iloc[topn:].index)
            mp = {x: x for x in keep}
            if len(setna) > 0 and type(next(iter(setna))) is pd.Interval:
                mergecat = None
                count = mbz # set it to mbz to be reset in the first iteration
                for idx in sorted(setna):
                    if count >= mbz:                        
                        mergecat = idx
                        count = 0
                    mp[idx] = mergecat
                    count += vcounts[idx]                
            else:
                mp.update({x: np.nan for x in setna})            
            df[col] = df[col].map(mp)            

            stat_df = df.groupby(col, dropna=False)[y_col].agg([('count', 'count'), ('positive', sum)])
            stat_df['ratio'] = stat_df['positive'] / stat_df['count']
            stat_df = stat_df.sort_values(by=['ratio', 'count', col])
            na_code = round(self._y.mean() * 100)            
            mapping = {}
            if not overlap_categories:
                ### append decimal codes to the end of groups with the same percentage to make them distinguishable
                stat_df['percent'] = round(stat_df['ratio']*100)
                for percentage, intevals in stat_df.groupby('percent'):
                    adjustment = 0.0
                    for i in range(0, len(intevals)):
                        mapping[intevals.index[i]] = percentage + adjustment
                        adjustment += 0.5 / len(intevals)            
            else:
                mapping = {v:round(stat_df.iloc[i]['ratio']*100) for i, v in enumerate(stat_df.index)}
            maps[col] = Map(mapping, na_code)
            stats[col] = stat_df            
        return maps, stats
    
    def optimal_bin_size(self, columns, min_bin_size=1, max_min_bin_size=None):
        from math import log
        max_min_bin_size = len(self._X)//2 if max_min_bin_size is None else max_min_bin_size
        assert max_min_bin_size > min_bin_size >= 1
        optimal_mbz = {col: min_bin_size-1 for col in columns}
        optimal_scores = {col: 0.0 for col in columns}
        optimal_cat_cnt = {col: None for col in columns}
        pos_cnt, all_cnt = self._y.sum(), len(self._y)
        baseline = pos_cnt / all_cnt
        for col in columns:
            bc = min_bin_size
            while bc <= max_min_bin_size:
                maps, stats = self._get_maps(bc, [col])
                score = 0.0
                stat_df = stats[col]
                for i in range(stat_df.shape[0]):
                    # power two is for the synergy effect
                    score += abs(stat_df.iloc[i]['ratio'] - baseline) * (stat_df.iloc[i]['count'] / all_cnt)**2
                
                if score <= optimal_scores[col]:
                    # bc+1, bc+2, bc+4, bc+6, bc+8, bc+12, bc+16, bc+24, ...
                    bc += 2**int(log(bc - optimal_mbz[col], 2)) 
                else:                    
                    optimal_mbz[col] = bc
                    optimal_scores[col] = score
                    optimal_cat_cnt[col] = maps[col].mapping.nunique()
                    bc += 1
            
        return optimal_mbz, optimal_cat_cnt

In [8]:
class PercentageEncoder(BaseTransformer):
    """
    This class performs following tasks:
    1) Discretizes continous values to a set of bins.
    2) Selects only categories that have the minimum number of samples defined by min_bin_size. Rest of the categories
    will be assigned to nan group.
    3) Replaces each category value to the chance of that category to be a positive sample (according to y).

    Parameters
    ----------
    columns : list of str or 'auto'.
        Columns of X dataframe that are supposed to be mapped. If 'auto' is passed, all columns will be considered.
    min_bin_sizes : int, dict of features -> int, or 'auto'.
        Defines the minimum number of samples that each category must represent. If it is set to 'auto', this values is
        automatically calculated based on the X and y datasets.
    max_min_bin_sizes : int, or 'auto'.
        Defines the largest threshold that is considered to find the optimal `min_bin_size`. This value must be 
        greater than `min_bin_sizes` for each column. If this value is not defined, it will be assigned to the half of 
        the size of the dataframe.
    overlap_categories : bool.
        Whether or not let percentages have decimal points that separate categories with the same ratio of positive samples.
    copy : bool.
        Map a copied X or the original one.
    """
    
    def __init__(self, columns='auto', min_bin_sizes='auto', max_min_bin_sizes=None, overlap_categories=True, copy=True):
        assert (columns == 'auto' or type(columns) is list), "columns must be either a list of str, or 'auto'."
        assert (min_bin_sizes == 'auto' or type(min_bin_sizes) in [int, dict]), 'invalid min_bin_sizes value.'
        assert (max_min_bin_sizes is None or type(max_min_bin_sizes) is int), 'invalid max_min_bin_sizes value.'
        if type(min_bin_sizes) is int and type(max_min_bin_sizes) is int:
            assert (min_bin_sizes < max_min_bin_sizes), 'min_bin_sizes must be smaller than max_min_bin_sizes.'
        elif type(min_bin_sizes) is dict:
            if columns != 'auto':
                assert (all(key in columns for key in min_bin_sizes)), 'inconsistent columns and min_bin_sizes.'
            assert (all(value == 'auto' or type(value) is int for value in min_bin_sizes.values())), 'invalid values in min_bin_sizes.'
            if type(max_min_bin_sizes) is int:
                assert (all(value < max_min_bin_sizes for value in min_bin_sizes.values() if type(value) is int)), 'min_bin_sizes must be smaller than max_min_bin_sizes.'
        super().__init__()
        self.columns = columns
        self.min_bin_sizes = min_bin_sizes
        self.max_min_bin_sizes = max_min_bin_sizes
        self.overlap_categories = overlap_categories
        self.copy = copy
    
    def fit(self, X, y):
        super().fit(X, y)
        features = list(X.columns) if self.columns == 'auto' else self.columns
        mapper = Mapper(X, y)        
        mapper_bin_sizes = {f: self.min_bin_sizes for f in features}
        if self.min_bin_sizes == 'auto':
            mapper_bin_sizes, _ = mapper.optimal_bin_size(columns=features, max_min_bin_size=self.max_min_bin_sizes)            
        elif type(self.min_bin_sizes) is dict:
            mapper_bin_sizes = self.min_bin_sizes.copy()
            auto_cols = [col for col, mb in self.min_bin_sizes.items() if mb=='auto']
            auto_sizes, _ = mapper.optimal_bin_size(columns=auto_cols, max_min_bin_size=self.max_min_bin_sizes)
            for col in auto_cols:
                mapper_bin_sizes[col] = auto_sizes[col]
        self.model = mapper.get_maps(mapper_bin_sizes, features, self.overlap_categories)
        

    def transform(self, X):
        X = X.copy() if self.copy else X
        for col in self.model:
            X[col] = X[col].map(self.model[col])
        return X
    
    def __str__(self):
        return f'{type(self).__name__}.model is :\n{self.model}'

In [9]:
from itertools import combinations, product

class JointProbabilityExtender(BaseTransformer):
    """
    This class calculates the joint probability of two or multiple columns to specify of a positive instance.

    Parameters
    ----------
    min_group_size : int
        If the number of samples in the set defined by joint columns is greater than min_group_size, the positive sample
        rate in the set will be considered as the probability. Otherwise, the probability will be the product of the
        probabilities of the columns.
    level : int
        Maximum number of columns that can participate in joint probabilities. 
        Caution: actual counts grow exponentially!
    copy : bool
        Map a copied X or the original one.
    
    Attributes
    ----------
    efficiency_ : tuple
        This tuple shows the number of times we actually reached a subset that is not obtainable by simple calculations
        on previous columns.
        1st element: number of times that joint columns lead to a subset bigger than min_group_size,
        2nd element: number of total combinations.
    """
    
    def __init__(self, min_group_size=5, level=2, copy=True):
        assert (type(level) is int and level >= 1), 'level must be integer and greater than 0.'
        assert (type(min_group_size) is int and min_group_size >1), 'min_group_size must be integer and greater than 1.'
        super().__init__()                
        self.min_group_size = min_group_size
        self.level = level
        self.copy = copy
        self.baseline = None
        self.efficiency_ = None        
        
    def fit(self, X, y):
        super().fit(X, y)
        self.baseline = round(y.mean()*100)
        if self.level == 1:
            return
        columns = X.columns
        X = round(X) # remove decimals from percentages to make joint sets larger
        X = X.join(y, how='inner')
        ycol= y.name
        self.model = {}
        unq_vals = {}
        for col in columns:
            unq_vals[col] = X[col].unique()
        col_combo = []
        [col_combo.extend(combinations(columns, l)) for l in range(2, self.level+1)]
        
        col_val_combinations = {}
        for cols in col_combo:
            vals = (unq_vals[c] for c in cols)
            col_val_combinations[cols] = product(*vals)
        
        totcount = effectivecount = 0
        for cols in col_val_combinations:
            self.model[cols] = {}
            for vals in col_val_combinations[cols]:
                totcount += 1
                expr = (X[c] == v for c, v in zip(cols, vals))
                joint_ser = X.loc[np.logical_and.reduce(list(expr)), ycol]
                if joint_ser.size > self.min_group_size:
                    effectivecount += 1
                    self.model[cols][vals] = round(joint_ser.mean() * 100)
                else:
                    tmp_df = pd.DataFrame(data=[(c, v, (X[c] == v).sum()) for c, v in zip(cols, vals)],
                                          columns=['col', 'val', 'count'])                    
                    self.model[cols][vals] = round((tmp_df['val'] * tmp_df['count']).sum() / tmp_df['count'].sum())
        self.efficiency_ = (effectivecount, totcount)

    def transform(self, X):
        if self.level == 1:
            return X
        X = X.copy() if self.copy else X        
        for cols in self.model.keys():
            for vals in self.model[cols].keys():
                new_col_name = '_'.join(cols)                
                expr = (X[c] == v for c, v in zip(cols, vals))
                X.loc[np.logical_and.reduce(list(expr)), new_col_name] = self.model[cols][vals]
            X[new_col_name] = X[new_col_name].fillna(self.baseline)
            X[new_col_name] = X[new_col_name].astype('int16')
        return X

# Titanic

First insight on the data:

In [10]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

print(f'train shape:', train_df.shape)
print(f'test shape: ', test_df.shape)
print('Duplicated rows in train data:', train_df.duplicated().sum())
print('Duplicated rows in test data: ', test_df.duplicated().sum())

get_overview(train_df, 'TS', test_df, 'TR', topn=10)

train shape: (891, 12)
test shape:  (418, 11)
Duplicated rows in train data: 0
Duplicated rows in test data:  0


Unnamed: 0_level_0,dtypesTS,dtypesTR,nanTS,nanTR,uniquesTS,uniquesTR,minMedMaxTS,minMedMaxTR,top10valuesTS,top10valuesTR
Columns,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Age,float64,float64,20%,21%,88 (19),79 (10),[0.42 < 28.0 < 80.0],[0.17 < 27.0 < 76.0],"[24.0, 22.0, 18.0, 19.0, 30.0, 28.0, 21.0, 25....","[24.0, 21.0, 22.0, 30.0, 18.0, 27.0, 26.0, 25...."
Cabin,object,object,77%,78%,147 (110),76 (39),?,?,"[G6, C23 C25 C27, B96 B98, F2, E101, F33, C22 ...","[B57 B59 B63 B66, B45, A34, E34, C80, C23 C25 ..."
Embarked,object,object,<1%,-,3 (0),3 (0),?,?,"[S, C, Q]","[S, C, Q]"
Fare,float64,float64,-,<1%,248 (112),169 (33),[0.0 < 14.4542 < 512.3292],[0.0 < 14.4542 < 512.3292],"[8.05, 13.0, 7.8958, 7.75, 26.0, 10.5, 7.925, ...","[7.75, 26.0, 8.05, 13.0, 7.8958, 10.5, 7.775, ..."
Name,object,object,-,-,891 (889),418 (416),?,?,"[Williams, Mr. Leslie, Brocklebank, Mr. Willia...","[O'Keefe, Mr. Patrick, Rosenshine, Mr. George ..."
Parch,int64,int64,-,-,7 (0),8 (1),[0 < 0.0 < 6],[0 < 0.0 < 9],"[0, 1, 2, 5, 3, 4, 6]","[0, 1, 2, 3, 9, 4, 6, 5]"
PassengerId,int64,int64,-,-,891 (891),418 (418),[1 < 446.0 < 891],[892 < 1100.5 < 1309],"[891, 293, 304, 303, 302, 301, 300, 299, 298, ...","[1023, 1128, 1156, 1157, 1158, 1159, 1160, 116..."
Pclass,int64,int64,-,-,3 (0),3 (0),[1 < 3.0 < 3],[1 < 3.0 < 3],"[3, 1, 2]","[3, 1, 2]"
Sex,object,object,-,-,2 (0),2 (0),?,?,"[male, female]","[male, female]"
SibSp,int64,int64,-,-,7 (0),7 (0),[0 < 0.0 < 8],[0 < 0.0 < 8],"[0, 1, 2, 4, 3, 8, 5]","[0, 1, 2, 4, 3, 8, 5]"


In [11]:
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [12]:
pd.merge(train_df, test_df, on='Ticket')[['Name_x', 'Name_y', 'Survived']]

Unnamed: 0,Name_x,Name_y,Survived
0,"Cumings, Mrs. John Bradley (Florence Briggs Th...","Cumings, Mr. John Bradley",1
1,"McCarthy, Mr. Timothy J","Hilliard, Mr. Herbert Henry",0
2,"Palsson, Master. Gosta Leonard","Palsson, Master. Paul Folke",0
3,"Palsson, Miss. Torborg Danira","Palsson, Master. Paul Folke",0
4,"Palsson, Miss. Stina Viola","Palsson, Master. Paul Folke",0
...,...,...,...
292,"Compton, Miss. Sara Rebecca","Compton, Mrs. Alexander Taylor (Mary Eliza Ing...",1
293,"Compton, Miss. Sara Rebecca","Compton, Mr. Alexander Taylor Jr",1
294,"Lines, Miss. Mary Conover","Lines, Mrs. Ernest H (Elizabeth Lindsey James)",1
295,"Aks, Mrs. Sam (Leah Rosen)","Aks, Master. Philip Frank",1


In [13]:
train_df.sample(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
274,275,1,3,"Healy, Miss. Hanora ""Nora""",female,,0,0,370375,7.75,,Q
869,870,1,3,"Johnson, Master. Harold Theodor",male,4.0,1,1,347742,11.1333,,S


In [14]:
test_df.sample(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
114,1006,1,"Straus, Mrs. Isidor (Rosalie Ida Blun)",female,63.0,1,0,PC 17483,221.7792,C55 C57,S
91,983,3,"Pedersen, Mr. Olaf",male,,0,0,345498,7.775,,S
124,1016,3,"Kennedy, Mr. John",male,,0,0,368783,7.75,,Q


In [15]:
print("TRAINNNNNNNNNNNNNNN")
print(train_df.info())
print("TESTTTTTTTTTTTTTTTT")
print(test_df.info())

TRAINNNNNNNNNNNNNNN
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
TESTTTTTTTTTTTTTTTT
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Passe

In [16]:
def expand_names(df):    
    df_exp = df.Name.str.split(', ', n=1, expand=True)
    df['Lastname'] = df_exp[0]
    df_exp = df_exp[1].str.split('. ', n=1, expand=True)    
    df['Title'] = df_exp[0]
    df['Firstname'] = df_exp[1]

# def expand_ticket(df):    
#     s = (df['Ticket'].str.split(expand=True))[0]    
#     s[s.str.isdigit()] = np.nan
#     df['Ticket_grp'] = s

expand_names(train_df)
expand_names(test_df)
# expand_ticket(train_df)
# expand_ticket(test_df)

In [17]:
test_df.head(3)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Lastname,Title,Firstname
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,Kelly,Mr,James
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,Wilkes,Mrs,James (Ellen Needs)
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,Myles,Mr,Thomas Francis


In [18]:
train_df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Lastname,Title,Firstname
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Braund,Mr,Owen Harris
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings,Mrs,John Bradley (Florence Briggs Thayer)
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen,Miss,Laina


In [19]:
get_overview(train_df, 'TS', test_df, 'TR', topn=10)

Unnamed: 0_level_0,dtypesTS,dtypesTR,nanTS,nanTR,uniquesTS,uniquesTR,minMedMaxTS,minMedMaxTR,top10valuesTS,top10valuesTR
Columns,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Age,float64,float64,20%,21%,88 (19),79 (10),[0.42 < 28.0 < 80.0],[0.17 < 27.0 < 76.0],"[24.0, 22.0, 18.0, 19.0, 30.0, 28.0, 21.0, 25....","[24.0, 21.0, 22.0, 30.0, 18.0, 27.0, 26.0, 25...."
Cabin,object,object,77%,78%,147 (110),76 (39),?,?,"[G6, C23 C25 C27, B96 B98, F2, E101, F33, C22 ...","[B57 B59 B63 B66, B45, A34, E34, C80, C23 C25 ..."
Embarked,object,object,<1%,-,3 (0),3 (0),?,?,"[S, C, Q]","[S, C, Q]"
Fare,float64,float64,-,<1%,248 (112),169 (33),[0.0 < 14.4542 < 512.3292],[0.0 < 14.4542 < 512.3292],"[8.05, 13.0, 7.8958, 7.75, 26.0, 10.5, 7.925, ...","[7.75, 26.0, 8.05, 13.0, 7.8958, 10.5, 7.775, ..."
Firstname,object,object,-,-,799 (749),377 (327),?,?,"[John, James, Mary, William, William Henry, Be...","[Patrick, John, William, Joseph, Harry, Kate, ..."
Lastname,object,object,-,-,667 (523),352 (208),?,?,"[Andersson, Sage, Johnson, Skoog, Panula, Cart...","[Asplund, Davies, Sage, Ware, Thomas, Ryerson,..."
Name,object,object,-,-,891 (889),418 (416),?,?,"[Williams, Mr. Leslie, Brocklebank, Mr. Willia...","[O'Keefe, Mr. Patrick, Rosenshine, Mr. George ..."
Parch,int64,int64,-,-,7 (0),8 (1),[0 < 0.0 < 6],[0 < 0.0 < 9],"[0, 1, 2, 5, 3, 4, 6]","[0, 1, 2, 3, 9, 4, 6, 5]"
PassengerId,int64,int64,-,-,891 (891),418 (418),[1 < 446.0 < 891],[892 < 1100.5 < 1309],"[891, 293, 304, 303, 302, 301, 300, 299, 298, ...","[1023, 1128, 1156, 1157, 1158, 1159, 1160, 116..."
Pclass,int64,int64,-,-,3 (0),3 (0),[1 < 3.0 < 3],[1 < 3.0 < 3],"[3, 1, 2]","[3, 1, 2]"


In [20]:
features = ['Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title', 'Lastname']
pe = PercentageEncoder(overlap_categories=False)
X = pe.fit_transform(train_df[features], train_df.Survived)
X.head(3)

Unnamed: 0,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Lastname
0,19.0,42.0,54.0,34.0,30.0,32.0,30.0,34.0,16.0,40.0
1,74.0,42.0,54.0,34.0,30.0,66.0,100.188232,51.0,70.0,40.0
2,74.0,42.0,35.0,34.0,30.0,32.0,30.0,34.0,70.0,40.0


In [21]:
fe = JointProbabilityExtender(level=2)
X = fe.fit_transform(X, train_df.Survived)
X.head(3)

Unnamed: 0,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,Lastname,...,Fare_Cabin,Fare_Embarked,Fare_Title,Fare_Lastname,Cabin_Embarked,Cabin_Title,Cabin_Lastname,Embarked_Title,Embarked_Lastname,Title_Lastname
0,19.0,42.0,54.0,34.0,30.0,32.0,30.0,34.0,16.0,40.0,...,29,29,13,33,27,11,31,14,35,16
1,74.0,42.0,54.0,34.0,30.0,66.0,100.188232,51.0,70.0,40.0,...,38,83,90,71,38,38,38,80,52,76
2,74.0,42.0,35.0,34.0,30.0,32.0,30.0,34.0,70.0,40.0,...,29,29,62,33,27,60,31,65,35,76


In [22]:
features = ['Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title']
# features = ['Age', 'SibSp', 'Parch', 'Fare']
# features = ['Sex', 'Ticket_grp', 'Cabin', 'Embarked', 'Title']

In [28]:
%%time

from sklearn.model_selection import cross_val_score, ShuffleSplit, GridSearchCV

y = train_df['Survived'].copy()
X = train_df[features].copy()

# numerical_transformer2 = MyImputer(strategy='median', columns=['Age', 'Fare', 'SibSp', 'Parch'])
# categorical_transformer2 = MyImputer(strategy='most_frequent', columns=['Title', 'Ticket_grp', 'Embarked', 'Cabin'])

numerical_transformer = SimpleImputer(strategy='median')
categorical_transformer = make_pipeline(SimpleImputer(strategy='most_frequent'), 
                                        OneHotEncoder(handle_unknown='ignore'))
preprocessor_2 = ColumnTransformer(transformers=[
    ('num', numerical_transformer, make_column_selector(dtype_include=np.number)), 
    ('cat', categorical_transformer, make_column_selector(dtype_exclude=np.number))])

my_pipeline = make_pipeline(    
                            PercentageEncoder(overlap_categories=True, min_bin_sizes=2), 
                            JointProbabilityExtender(level=2, min_group_size=2), 
#                             FeatureSelector(k=5, max_variance_inflation=False), 
#                             SVC(C=10, degree=1, random_state=0),
                            RandomForestClassifier(random_state=0, max_depth=4)
)
cv = ShuffleSplit(n_splits=10, test_size=.33, random_state=0)
scores = cross_val_score(my_pipeline, X, y, cv=cv, scoring='accuracy')
print(f'Model accuracy: {scores.mean()}')

Model accuracy: 0.8098305084745763
Wall time: 1min 53s


In [29]:
%%time
y = train_df['Survived'].copy()
X = train_df[features].copy()

parameters = {'randomforestclassifier__random_state':[0], 
              'randomforestclassifier__max_depth': [3,4,5,None], 
              'randomforestclassifier__criterion': ['gini', 'entropy'], 
              'randomforestclassifier__n_estimators': [50, 300, 600, 1000]}
# parameters = {'svc__kernel':['linear', 'rbf', 'sigmoid'], 'svc__C':[1, 0.5, 5, 10], 'svc__degree':[1,2,3]}
# parameters = {'logisticregression__solver': ['newton-cg', 'lbfgs', 'liblinear'], 'logisticregression__C': [0.001, 0.01, 0.1, 0.5, 1, 5, 10], 'logisticregression__random_state':[0]}

# model = make_pipeline(PercentageEncoder(min_bin_sizes=10), JointProbabilityExtender(level=2), RandomForestClassifier())
# model = make_pipeline(PercentageEncoder(), SVC(random_state=0))
model = make_pipeline(preprocessor_2, RandomForestClassifier())
# model = make_pipeline(preprocessor_2, StandardScaler(), SVC(random_state=0)) 
# model = make_pipeline(preprocessor_2, StandardScaler(), LogisticRegression())


cv = ShuffleSplit(n_splits=10, test_size=.33, random_state=0)
clf = GridSearchCV(model, parameters, cv=cv, scoring='accuracy')
clf.fit(X, y)
print(clf.best_estimator_)
print(clf.cv_results_['mean_test_score'][clf.best_index_])

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  SimpleImputer(strategy='median'),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x0000024313F77A08>),
                                                 ('cat',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x0000024313F6DD08>)])),
                ('randomforestclassifier',
                 RandomForestClassifier(criterion='

In [30]:
from sklearn.decomposition import KernelPCA

arr = np.array([[-1, -1], [-1, 1], [1,-1], [1,1]])
dcm = KernelPCA(n_components=2)
dcm.fit_transform(arr)

array([[-0.        ,  1.41421356],
       [ 1.41421356,  0.        ],
       [-1.41421356,  0.        ],
       [-0.        , -1.41421356]])

In [31]:
class MajorityVoting:    
    def fit(self, X, y):
        pass
    
    def predict(self, X):
        return X.apply(lambda r: 1 if (r>=50).sum() > len(r)//2 else 0, axis=1)

In [32]:
%%time

# from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

y = train_df['Survived'].copy()
X = train_df[features].copy()

preprocessor_1 = make_pipeline(Percen tageEncoder(), 
                               JointProbabilityExtender(),
#                                FeatureSelector(k=3, max_variance_inflation=False)
                              )
numerical_transformer2 = SimpleImputer(strategy='median')
categorical_transformer2 = make_pipeline(SimpleImputer(strategy='most_frequent'), 
                                        OneHotEncoder(handle_unknown='ignore', sparse=False))
preprocessor_2 = ColumnTransformer(transformers=[
    ('num', numerical_transformer2, make_column_selector(dtype_include=np.number)), 
    ('cat', categorical_transformer2, make_column_selector(dtype_exclude=np.number))])

models = {'topf_singleout1': TopFeatureClassifier(strategy='singleout'), 
          'topf_mxtptn1': TopFeatureClassifier(strategy='mxtptn'), 
          'topf_percent1': TopFeatureClassifier(strategy='percent'), 
          'voting1': MajorityVoting(),
          'nbayes1': GaussianNB(),
          'rf1': RandomForestClassifier(max_depth=4, n_estimators=300, random_state=0),
          'svc1': make_pipeline(StandardScaler(), SVC(C=0.5, degree=1, random_state=0)), 
          'lr1': make_pipeline(StandardScaler(), LogisticRegression(C=0.01, random_state=0, solver='liblinear')), 
          'knn1': make_pipeline(StandardScaler(), KNeighborsClassifier()),          
          'nbayes2': GaussianNB(),
          'rf2': RandomForestClassifier(criterion='entropy', n_estimators=300, random_state=0),
          'svc2': make_pipeline(StandardScaler(), SVC(C=5, degree=1, kernel='sigmoid', random_state=0)), 
          'lr2': make_pipeline(StandardScaler(), LogisticRegression(C=0.01, random_state=0, solver='liblinear')), 
          'knn2': make_pipeline(StandardScaler(), KNeighborsClassifier())}

n_splits = 10
results = pd.DataFrame(index = range(n_splits), dtype='float64')
for i in range(n_splits):
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, random_state=i)
    results.loc[i, 'train_baseline'] = max(y_train.mean(), 1- y_train.mean())
    results.loc[i, 'test_baseline'] = max(y_valid.mean(), 1- y_valid.mean())
    
    model = TopFeatureClassifier(strategy='singleout')
    model.fit(X_train, y_train)
    print(model)
    predictions_train = model.predict(X_train)
    results.loc[i, 'train_topf_singleout_raw'] = accuracy_score(predictions_train, y_train)
    predictions_test = model.predict(X_valid)
    results.loc[i, 'test_topf_singleout_raw'] = accuracy_score(predictions_test, y_valid)
            
    Xtp1 = preprocessor_1.fit_transform(X_train.copy(), y_train.copy())
    Xvp1 = preprocessor_1.transform(X_valid.copy())    
    Xtp2 = preprocessor_2.fit_transform(X_train.copy(), y_train.copy())
    Xvp2 = preprocessor_2.transform(X_valid.copy())
    
    print('*C:', {name: mapp.mapping.nunique() for name, mapp in preprocessor_1['percentageencoder'].model.items()})
    
    for name, model in models.items():        
        Xt = Xv = None        
        if name[-1] == '1':
            Xt, Xv = Xtp1, Xvp1
        elif name[-1] == '2':
            Xt, Xv = Xtp2, Xvp2
        
        model.fit(Xt, y_train)
        pred_train = model.predict(Xt)        
        score_train = accuracy_score(pred_train, y_train)            
        results.loc[i, f'train_{name}'] = score_train
        pred_test = model.predict(Xv)
        score_test = accuracy_score(pred_test, y_valid)        
        results.loc[i, f'test_{name}'] = score_test
        if name[:-1] in ['topf_singleout', 'topf_mxtptn', 'topf_percent']:
            print('**', model)

all X[Sex] == female are 1 otherwise 0.
*C: {'Sex': 3, 'Age': 3, 'SibSp': 4, 'Parch': 7, 'Ticket': 7, 'Fare': 4, 'Cabin': 6, 'Embarked': 2, 'Title': 2}
** all X[Sex] == 76.0 are 1 otherwise 0.
** all X[Ticket_Title] in {100, 77, 17, 93, 94} are 1 otherwise 0.
** all X[Ticket_Title] >= 50 are 1 otherwise 0.
all X[Sex] == female are 1 otherwise 0.
*C: {'Sex': 3, 'Age': 5, 'SibSp': 4, 'Parch': 4, 'Ticket': 7, 'Fare': 5, 'Cabin': 5, 'Embarked': 2, 'Title': 2}
** all X[Sex] == 75.0 are 1 otherwise 0.
** all X[Sex_Ticket] in {100, 73, 75, 76, 93} are 1 otherwise 0.
** all X[Sex_Ticket] >= 50 are 1 otherwise 0.
all X[Title] == Mr are 0 otherwise 1.
*C: {'Sex': 3, 'Age': 4, 'SibSp': 4, 'Parch': 5, 'Ticket': 5, 'Fare': 8, 'Cabin': 5, 'Embarked': 2, 'Title': 2}
** all X[Ticket_Cabin] == 100 are 1 otherwise 0.
** all X[Ticket_Title] in {72, 24, 100, 94} are 1 otherwise 0.
** all X[Sex_Ticket] >= 50 are 1 otherwise 0.
all X[Sex] == female are 1 otherwise 0.
*C: {'Sex': 3, 'Age': 4, 'SibSp': 4, 'Pa

In [33]:
print('test_topf_singleout_raw:', results['test_topf_singleout_raw'].mean())
cols_train = results.columns[results.columns.str.contains('train')]
cols_test = results.columns[results.columns.str.contains('test')]
results[cols_test].apply(lambda x: x- results['test_topf_singleout_raw']).describe().T.sort_values(by='mean', ascending=False)

test_topf_singleout_raw: 0.78


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
test_svc2,10.0,0.058305,0.00929,0.047458,0.049153,0.059322,0.063559,0.074576
test_lr2,10.0,0.053559,0.01354,0.030508,0.044915,0.052542,0.063559,0.074576
test_svc1,10.0,0.046441,0.009325,0.033898,0.040678,0.045763,0.05,0.064407
test_rf2,10.0,0.040339,0.009379,0.027119,0.037288,0.038983,0.04322,0.061017
test_rf1,10.0,0.038983,0.010387,0.023729,0.032203,0.038983,0.047458,0.050847
test_lr1,10.0,0.036271,0.012383,0.016949,0.028814,0.038983,0.04661,0.050847
test_knn1,10.0,0.032881,0.019574,-0.00339,0.017797,0.038983,0.044068,0.061017
test_topf_mxtptn1,10.0,0.030508,0.010955,0.013559,0.024576,0.028814,0.033898,0.050847
test_topf_percent1,10.0,0.028814,0.009621,0.013559,0.023729,0.028814,0.030508,0.044068
test_nbayes1,10.0,0.020678,0.014154,-0.00339,0.014407,0.023729,0.023729,0.044068


In [34]:
print('train_topf_singleout_raw:', results['train_topf_singleout_raw'].mean())
cols_train = results.columns[results.columns.str.contains('train')]
cols_test = results.columns[results.columns.str.contains('test')]
results[cols_train].apply(lambda x: x- results['train_topf_singleout_raw']).describe().T.sort_values(by='mean', ascending=False)

train_topf_singleout_raw: 0.7894295302013423


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
train_rf2,10.0,0.209899,0.010944,0.196309,0.201762,0.208893,0.216443,0.228188
train_svc2,10.0,0.203356,0.009841,0.191275,0.19547,0.201342,0.208473,0.219799
train_lr2,10.0,0.202685,0.010276,0.189597,0.195889,0.199664,0.209732,0.218121
train_nbayes2,10.0,0.169128,0.010635,0.154362,0.162332,0.168624,0.171141,0.186242
train_rf1,10.0,0.114597,0.007832,0.102349,0.111158,0.113255,0.121644,0.124161
train_svc1,10.0,0.111913,0.006525,0.100671,0.10948,0.111577,0.11745,0.120805
train_knn1,10.0,0.110906,0.00934,0.100671,0.102349,0.109899,0.118289,0.125839
train_lr1,10.0,0.108725,0.00766,0.098993,0.103188,0.108221,0.113255,0.120805
train_topf_mxtptn1,10.0,0.072651,0.007032,0.062081,0.066275,0.075503,0.077181,0.082215
train_topf_percent1,10.0,0.071141,0.006994,0.060403,0.066275,0.072148,0.075503,0.082215


In [35]:
cols_train = results.columns[results.columns.str.contains('train')]
cols_test = results.columns[results.columns.str.contains('test')]
results[cols_test].mean()

test_baseline              0.614915
test_topf_singleout_raw    0.780000
test_topf_singleout1       0.741695
test_topf_mxtptn1          0.810508
test_topf_percent1         0.808814
test_voting1               0.725763
test_nbayes1               0.800678
test_rf1                   0.818983
test_svc1                  0.826441
test_lr1                   0.816271
test_knn1                  0.812881
test_nbayes2               0.460339
test_rf2                   0.820339
test_svc2                  0.838305
test_lr2                   0.833559
test_knn2                  0.509831
dtype: float64

# Loan data

In [36]:
train_df = pd.read_csv('data/loan_train.csv', usecols=['loan_status', 'Principal', 'terms', 'effective_date', 'due_date', 'age', 'education', 'Gender'])
test_df = pd.read_csv('data/loan_test.csv', usecols=['loan_status', 'Principal', 'terms', 'effective_date', 'due_date', 'age', 'education', 'Gender'])

print(f'train dataset size: {train_df.shape}')
print(f'test dataset size:  {test_df.shape}')

get_overview(train_df, 'TR', test_df, 'TS', topn=10)

train dataset size: (346, 8)
test dataset size:  (54, 8)


Unnamed: 0_level_0,dtypesTR,dtypesTS,nanTR,nanTS,uniquesTR,uniquesTS,minMedMaxTR,minMedMaxTS,top10valuesTR,top10valuesTS
Columns,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
loan_status,object,object,-,-,2 (0),2 (0),?,?,"[PAIDOFF, COLLECTION]","[PAIDOFF, COLLECTION]"
Principal,int64,int64,-,-,5 (2),4 (1),[300 < 1000.0 < 1000],[300 < 1000.0 < 1000],"[1000, 800, 300, 500, 900]","[1000, 800, 300, 700]"
terms,int64,int64,-,-,3 (0),3 (0),[7 < 30.0 < 30],[7 < 30.0 < 30],"[30, 15, 7]","[30, 15, 7]"
effective_date,object,object,-,-,7 (0),7 (0),?,?,"[9/11/2016, 9/12/2016, 9/10/2016, 9/14/2016, 9...","[9/11/2016, 9/12/2016, 9/10/2016, 9/13/2016, 9..."
due_date,object,object,-,-,23 (7),17 (1),?,?,"[10/10/2016, 9/25/2016, 9/26/2016, 10/11/2016,...","[9/25/2016, 10/10/2016, 10/11/2016, 10/9/2016,..."
age,int64,int64,-,-,32 (13),19 (0),[18 < 30.0 < 51],[20 < 30.5 < 50],"[26, 29, 30, 27, 28, 35, 34, 31, 33, 25]","[29, 26, 37, 33, 30, 32, 36, 24, 27, 43]"
education,object,object,-,-,4 (0),4 (0),?,?,"[High School or Below, college, Bechalor, Mast...","[college, High School or Below, Bechalor, Mast..."
Gender,object,object,-,-,2 (0),2 (0),?,?,"[male, female]","[male, female]"


In [37]:
train_df.loc[train_df.loan_status == 'PAIDOFF', 'loan_status'] = 1
train_df.loc[train_df.loan_status == 'COLLECTION', 'loan_status'] = 0
test_df.loc[test_df.loan_status == 'PAIDOFF', 'loan_status'] = 1
test_df.loc[test_df.loan_status == 'COLLECTION', 'loan_status'] = 0
# train_df[[ 'Principal']] = train_df[['Principal']].astype('object')
# test_df[['terms', 'Principal']] = test_df[['terms', 'Principal']].astype('object')
train_df['due_date'] = pd.to_datetime(train_df['due_date'])
train_df['effective_date'] = pd.to_datetime(train_df['effective_date'])
test_df['due_date'] = pd.to_datetime(test_df['due_date'])
test_df['effective_date'] = pd.to_datetime(test_df['effective_date'])
train_df['dayofweek'] = train_df['effective_date'].dt.dayofweek
test_df['dayofweek'] = test_df['effective_date'].dt.dayofweek
train_df.head()

Columns,loan_status,Principal,terms,effective_date,due_date,age,education,Gender,dayofweek
0,1,1000,30,2016-09-08,2016-10-07,45,High School or Below,male,3
1,1,1000,30,2016-09-08,2016-10-07,33,Bechalor,female,3
2,1,1000,15,2016-09-08,2016-09-22,27,college,male,3
3,1,1000,30,2016-09-09,2016-10-08,28,college,female,4
4,1,1000,30,2016-09-09,2016-10-08,29,college,male,4


In [38]:
features = set(train_df.columns) - set(['loan_status'])
y = train_df['loan_status'].copy().astype('int32')
X = train_df[features].copy()
y_test = test_df['loan_status'].copy().astype('int32')
X_test = test_df[features].copy()

mapper = Mapper(X, y)
min_bin_sizes, catcount = mapper.optimal_bin_size(features)
maps = mapper.get_maps(min_bin_sizes, features, overlap_categories=False)
for col in features:
    X[col] = X[col].map(maps[col])    
    X_test[col] = X_test[col].map(maps[col])
    
# fex = JointProbabilityExtender
# X = fex.fit_transform(X, y)
# X_test = fex.transform(X_test)

# pca = PCA(n_components=2)
# X = pd.DataFrame(pca.fit_transform(X))
# X_test = pd.DataFrame(pca.transform(X_test))

print(min_bin_sizes)

{'terms': 129, 'education': 65, 'Principal': 6, 'effective_date': 129, 'age': 83, 'Gender': 1, 'due_date': 44, 'dayofweek': 129}


In [39]:
test = SelectKBest(score_func=f_classif, k=5)
fit = test.fit(X_test, y_test)
new_features = X.columns[pd.Series(fit.scores_).sort_values(ascending=False).index]
new_features

Index(['dayofweek', 'education', 'age', 'due_date', 'Principal', 'terms',
       'effective_date', 'Gender'],
      dtype='object', name='Columns')

In [40]:
y = train_df['loan_status'].copy().astype('int32')
X = train_df[features].copy()

cv = ShuffleSplit(n_splits=10, test_size=.15, random_state=0)

parameters = {'kneighborsclassifier__n_neighbors': range(1, 20)}
# parameters = {'randomforestclassifier__max_depth': [2,3,4], 
#               'randomforestclassifier__n_estimators': [10, 50], 
#               'randomforestclassifier__criterion': ['gini', 'entropy']}
# parameters = {'svc__kernel':['linear', 'rbf', 'sigmoid'], 'svc__C':[0.01, 0.1, 0.5], 'svc__degree':[1,2,3,4]}

# parameters = {'percentageencoder__overlap_categories': [True, False], 
#               'jointprobabilityextender__level': [2,3]}

# model = make_pipeline(PercentageEncoder(), 
#                       JointProbabilityExtender(), 
#                       TopFeatureClassifier(strategy='percent'))

# model = make_pipeline(KNeighborsClassifier())
# model = make_pipeline(preprocessor_1, RandomForestClassifier(random_state=0))
# model = make_pipeline(preprocessor_2, RandomForestClassifier(random_state=0))
# model = make_pipeline(SimpleImputer(), StandardScaler(), SVC())
clf = GridSearchCV(model, parameters, cv=cv, scoring='jaccard')
clf.fit(X, y)
print(clf.best_estimator_)
clf.cv_results_['mean_test_score'][clf.best_index_]

Traceback (most recent call last):
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\prepr

Traceback (most recent call last):
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\prepr

Traceback (most recent call last):
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\prepr

Traceback (most recent call last):
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\prepr

Traceback (most recent call last):
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\prepr

Traceback (most recent call last):
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\prepr

Traceback (most recent call last):
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\prepr

Traceback (most recent call last):
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\prepr

Traceback (most recent call last):
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 330, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 296, in _fit
    **fit_params_steps[name])
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "C:\Users\Abtin\.conda\envs\ml\lib\site-packages\sklearn\prepr

ValueError: could not convert string to float: 'High School or Below'

In [None]:
%%time

# from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


features = set(train_df.columns) - set(['loan_status']) #- set(['effective_date', 'due_date'])
y = train_df['loan_status'].copy().astype('int32')
X = train_df[features].copy()

preprocessor_1 = make_pipeline(PercentageEncoder(), 
                               JointProbabilityExtender(),
#                                FeatureSelector(k=3, max_variance_inflation=15)
                              )
numerical_transformer2 = SimpleImputer(strategy='median')
categorical_transformer2 = make_pipeline(SimpleImputer(strategy='most_frequent'), 
                                        OneHotEncoder(handle_unknown='ignore', sparse=False))
preprocessor_2 = ColumnTransformer(transformers=[
    ('num', numerical_transformer2, make_column_selector(dtype_include=np.number)), 
    ('cat', categorical_transformer2, make_column_selector(dtype_exclude=np.number))])

models = {'topf_singleout1': TopFeatureClassifier(strategy='singleout'), 
          'topf_mxtptn1': TopFeatureClassifier(strategy='mxtptn'), 
          'topf_percent1': TopFeatureClassifier(strategy='percent'), 
          'voting1': MajorityVoting(),
          'nbayes1': GaussianNB(),
          'rf1': RandomForestClassifier(max_depth=2),
          'svc1': make_pipeline(StandardScaler(), SVC()), 
          'lr1': make_pipeline(StandardScaler(), LogisticRegression()), 
          'knn1': make_pipeline(StandardScaler(), KNeighborsClassifier()),          
          'nbayes2': GaussianNB(),
          'rf2': RandomForestClassifier(),
          'svc2': make_pipeline(StandardScaler(), SVC()), 
          'lr2': make_pipeline(StandardScaler(), LogisticRegression()), 
          'knn2': make_pipeline(StandardScaler(), KNeighborsClassifier())}

n_splits = 10
results = pd.DataFrame(index = range(n_splits), dtype='float64')
for i in range(n_splits):
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1)
    results.loc[i, 'train_baseline'] = max(y_train.mean(), 1- y_train.mean())
    results.loc[i, 'test_baseline'] = max(y_valid.mean(), 1- y_valid.mean())
    
    model = TopFeatureClassifier(strategy='singleout')
    model.fit(X_train, y_train)
    print(model)
    predictions_train = model.predict(X_train)
    results.loc[i, 'train_topf_singleout_raw'] = accuracy_score(predictions_train, y_train)
    predictions_test = model.predict(X_valid)
    results.loc[i, 'test_topf_singleout_raw'] = accuracy_score(predictions_test, y_valid)
            
    Xtp1 = preprocessor_1.fit_transform(X_train.copy(), y_train.copy())
    Xvp1 = preprocessor_1.transform(X_valid.copy())    
    Xtp2 = preprocessor_2.fit_transform(X_train.copy(), y_train.copy())
    Xvp2 = preprocessor_2.transform(X_valid.copy())
    
    print('*C:', {name: mapp.mapping.nunique() for name, mapp in preprocessor_1['percentageencoder'].model.items()})
    
    for name, model in models.items():        
        Xt = Xv = None        
        if name[-1] == '1':
            Xt, Xv = Xtp1, Xvp1
        elif name[-1] == '2':
            Xt, Xv = Xtp2, Xvp2
        
        model.fit(Xt, y_train)
        pred_train = model.predict(Xt)        
        score_train = accuracy_score(pred_train, y_train)            
        results.loc[i, f'train_{name}'] = score_train
        pred_test = model.predict(Xv)
        score_test = accuracy_score(pred_test, y_valid)        
        results.loc[i, f'test_{name}'] = score_test
        if name[:-1] in ['topf_singleout', 'topf_mxtptn', 'topf_percent']:
            print('**', model)

In [None]:
print('test_baseline:', results['test_baseline'].mean())
cols_train = results.columns[results.columns.str.contains('train')]
cols_test = results.columns[results.columns.str.contains('test')]
results[cols_test].apply(lambda x: x- results['test_baseline']).describe().T.sort_values(by='mean', ascending=False)

In [None]:
print('train_baseline:', results['train_baseline'].mean())
cols_train = results.columns[results.columns.str.contains('train')]
cols_test = results.columns[results.columns.str.contains('test')]
results[cols_train].apply(lambda x: x- results['train_baseline']).describe().T.sort_values(by='mean', ascending=False)

In [None]:
from sklearn.metrics import jaccard_score
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

df_evaluation = pd.DataFrame(index=['KNN', 'Decision Tree', 'SVM', 'LogisticRegression'], columns=['Jaccard', 'F1-Score', 'LogLoss'])

features = set(train_df.columns) - set(['loan_status'])
y = train_df['loan_status'].copy().astype('int32')
X = train_df[features].copy()
y_test = test_df['loan_status'].copy().astype('int32')
X_test = test_df[features].copy()

preprocessor = make_pipeline(PercentageEncoder(overlap_categories=True), JointProbabilityExtender(level=3))
X = preprocessor.fit_transform(X, y)
X_test = preprocessor.transform(X_test)

model = KNeighborsClassifier(n_neighbors=7)
model.fit(X, y)
y_hat_knn = model.predict(X_test)
df_evaluation.loc['KNN', 'Jaccard'] = jaccard_score(y_test, y_hat_knn)
df_evaluation.loc['KNN', 'F1-Score'] = f1_score(y_test, y_hat_knn, average='weighted')

model = DecisionTreeClassifier()
model.fit(X, y)
y_hat_dtree = model.predict(X_test)
df_evaluation.loc['Decision Tree', 'Jaccard'] = jaccard_score(y_test, y_hat_dtree)
df_evaluation.loc['Decision Tree', 'F1-Score'] = f1_score(y_test, y_hat_dtree, average='weighted')


model = make_pipeline(StandardScaler(), SVC(C=0.01, degree=1, kernel='linear'))
model.fit(X, y)
y_hat_svm = model.predict(X_test)
df_evaluation.loc['SVM', 'Jaccard'] = jaccard_score(y_test, y_hat_svm)
df_evaluation.loc['SVM', 'F1-Score'] = f1_score(y_test, y_hat_svm, average='weighted')

model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X, y)
y_hat_logreg = model.predict(X_test)
y_hat_logreg_prob = model.predict_proba(X_test)
df_evaluation.loc['LogisticRegression', 'Jaccard'] = jaccard_score(y_test, y_hat_logreg)
df_evaluation.loc['LogisticRegression', 'F1-Score'] = f1_score(y_test, y_hat_logreg, average='weighted')
df_evaluation.loc['LogisticRegression', 'LogLoss'] = log_loss(y_test, y_hat_logreg_prob)

model = RandomForestClassifier(max_depth=2, n_estimators=50)
model.fit(X, y)
y_hat_rf = model.predict(X_test)
df_evaluation.loc['Random Forest', 'Jaccard'] = jaccard_score(y_test, y_hat_rf)
df_evaluation.loc['Random Forest', 'F1-Score'] = f1_score(y_test, y_hat_rf, average='weighted')

df_evaluation

In [None]:
y_hat_rf

In [None]:
maps