In [None]:
def arange(arg1, arg2=None, arg3=None, arg4=None):
    '''
    Build an evenly spaced numpy array, optionally including the stop value.

    default:
        arg1 - start
        arg2 - stop
        arg3 - step
        arg4 - endpoint (True: includes, False: not includes)

    variations:
        arange(arg1) -> (start=0, stop=arg1, step=1, endpoint=False)

        arange(arg1, arg2):
            arange(num, num) -> (start=arg1, stop=arg2, step=1, endpoint=False)
            arange(num, bool) -> (start=0, stop=arg1, step=1, endpoint=arg2)

        arange(arg1, arg2, arg3):
            arange(num, num, num) -> (start=arg1, stop=arg2, step=arg3, endpoint=False)
            arange(num, num, bool) -> (start=arg1, stop=arg2, step=1, endpoint=arg3)

        arange(arg1, arg2, arg3, arg4) -> (start=arg1, stop=arg2, step=arg3, endpoint=arg4)

    returns:
        numpy array. The values are cast to int when the function is called
        with one to three arguments that are all Python ints (bool counts
        as int); the four-argument form is never cast.

    raises:
        TypeError if arg1 is None or the arguments are not supplied
        left to right without gaps (e.g. arg3 given while arg2 is None).
    '''

    if arg1 is None:
        raise TypeError('arange() requires at least arg1')

    # which of the optional arguments were supplied
    given = (arg2 is not None, arg3 is not None, arg4 is not None)

    step = 1
    endpoint = False
    is_int = False

    if given == (False, False, False):
        # arange(stop)
        start, stop = 0, arg1
        is_int = isinstance(arg1, int)
    elif given == (True, False, False):
        if isinstance(arg2, bool):
            # arange(stop, endpoint)
            start, stop, endpoint = 0, arg1, arg2
        else:
            # arange(start, stop)
            start, stop = arg1, arg2
        is_int = isinstance(arg1, int) and isinstance(arg2, int)
    elif given == (True, True, False):
        start, stop = arg1, arg2
        if isinstance(arg3, bool):
            # arange(start, stop, endpoint)
            endpoint = arg3
        else:
            # arange(start, stop, step)
            step = arg3
        is_int = all(isinstance(arg, int) for arg in (arg1, arg2, arg3))
    elif given == (True, True, True):
        # arange(start, stop, step, endpoint); result is left uncast
        start, stop, step, endpoint = arg1, arg2, arg3, arg4
    else:
        raise TypeError(
            'arange() arguments must be supplied left to right without gaps')

    # safe form of np.arange(start, stop, step)
    arr = step * np.arange(start/step, stop/step)

    if endpoint:
        # value the grid would take next; for an empty grid that is start
        next_value = arr[-1] + step if arr.size else start
        # tolerant comparison: 0.2 + 0.1 != 0.3 in floating point, so an
        # exact equality test would silently drop the endpoint
        if np.isclose(next_value, stop, rtol=1e-12, atol=abs(step) * 1e-9):
            arr = np.concatenate([arr, [stop]])

    if is_int:
        # round before casting so 3.9999999999999996 becomes 4, not 3
        arr = np.rint(arr).astype(int)

    return arr

In [1]:
def axis_rstyle(
        y_ticks=None,
        x_ticks=None,
        y_slice=None,
        x_slice=None,
        y_lim=None,
        x_lim=None,
        offset_left=5,
        offset_bottom=5,
        width=custom_axis_linewidth,
        margin=True,
        color=custom_axis_color,
        grid=False,
        ax=None):
    
    '''
    Restyle the bottom and left axes of a matplotlib Axes: the spines are
    moved outward and bounded by the first and last tick.

    x_ticks: tuple (x_min, x_max, step) or (x_min, x_max); when the step is
        omitted the distance between the first two current x ticks is used
    y_ticks: tuple (y_min, y_max, step) or (y_min, y_max); same rule as x_ticks
    x_slice, y_slice: arguments for slice(), applied to the tick arrays
    x_lim, y_lim: (low, high) pairs passed to ax.set_xlim / ax.set_ylim
    offset_left, offset_bottom: outward offset of the left / bottom spine
    width: line width applied to both spines and the ticks (skipped if falsy)
    margin: None - leave margins untouched; iterable - unpacked into
        ax.margins(); True - 0.01; number - used as the x margin, with the
        y margin scaled by ax_width / ax_height
    color: color applied to both spines and the ticks (skipped if falsy)
    grid: when falsy, the grid is switched off
    ax: Axes to modify; defaults to plt.gca()

    The Axes is modified in place; nothing is returned.
    '''

    if ax is None: ax = plt.gca()

    # order of steps is important:
        # 1 - get ticks
        # 2 - set margins if necessary
        # 3 - manipulations with ticks
        # 4 - update ticks
        # 5 - spines modification
        # 6 - set limits
        # 7 - tick params
        # 8 - grid

    # get the ticks matplotlib has chosen so far
    xticks = ax.get_xticks()
    yticks = ax.get_yticks()

    if margin is not None:
        if isinstance(margin, collections.abc.Iterable):
            # explicit margins: forwarded to ax.margins() unchanged
            ax.margins(*margin)
        else:
            margin = 0.01 if margin is True else margin
            # Compute margin_x and margin_y so that both margins are equal
            # on screen.
            # 1st step: take the size of the ax as (ax_width, ax_height)
            # 2nd step: for a given margin_x (e.g. 0.025) require
                # ax_width * margin_x = ax_height * margin_y
                # margin_y = (margin_x * ax_width) / ax_height
            # Values computed this way make both margins equal and
            # independent of the figure (or ax) size.
            ax_height, ax_width = ax.bbox.height, ax.bbox.width
            margin_y = margin * ax_width / ax_height
            ax.margins(x=margin, y=margin_y)

    # build xticks and yticks from the requested (min, max[, step])
    if x_ticks is not None:
        # step not specified: reuse the spacing of the current ticks
        if len(x_ticks) == 2:
            x_step = xticks[1] - xticks[0]
            x_ticks = np.append(x_ticks, x_step)
        # endpoint=True so that x_max itself can become a tick
        xticks = arange(x_ticks[0], x_ticks[1], x_ticks[2], True)
    if y_ticks is not None:
        # step not specified: reuse the spacing of the current ticks
        if len(y_ticks) == 2:
            y_step = yticks[1] - yticks[0]
            y_ticks = np.append(y_ticks, y_step)
        # endpoint=True so that y_max itself can become a tick
        yticks = arange(y_ticks[0], y_ticks[1], y_ticks[2], True)

    # keep only a slice of the ticks if requested
    if x_slice is not None:
        x_slice_ = slice(*x_slice)
        xticks = xticks[x_slice_]
    if y_slice is not None:
        y_slice_ = slice(*y_slice)
        yticks = yticks[y_slice_]

    # update ticks
    ax.set_xticks(xticks)
    ax.set_yticks(yticks)

    # customize spines: draw them only between the first and the last tick
    # and push them away from the plotting area
    ax.spines['bottom'].set_bounds(xticks[0], xticks[-1])
    ax.spines['bottom'].set_position(('outward', offset_bottom))
    ax.spines['left'].set_bounds(yticks[0], yticks[-1])
    ax.spines['left'].set_position(('outward', offset_left))
    
    if color:
        ax.spines['bottom'].set_color(color)
        ax.spines['left'].set_color(color)
        ax.tick_params(which='both', color=color)

    if width:
        ax.spines['bottom'].set_linewidth(width)
        ax.spines['left'].set_linewidth(width)
        ax.tick_params(which='both', width=width)

    # set limits if necessary
    if x_lim is not None:
        ax.set_xlim(x_lim[0], x_lim[1])
    if y_lim is not None:
        ax.set_ylim(y_lim[0], y_lim[1])
    
    # show outward-pointing ticks on the bottom and left axes
    ax.tick_params(
        which='both', direction='out', bottom=True, left=True)

    # turn off grid
    if not grid:
        ax.grid(False)

In [None]:
def data_describe(data):
    '''
    Summarize every column of a DataFrame.

    data: pandas DataFrame to describe (not modified)

    returns: DataFrame indexed by the column names of data with columns
        'Type' (dtype), 'Count' (non-null values), 'Unique' (distinct
        non-null values), 'NaN' (missing values) and 'Percentages'
        (share of missing values, rounded to a whole percent),
        sorted by 'NaN' in descending order.
    '''
    nan_mask = data.isnull()
    # Scale to percent BEFORE rounding to an integer: 0.29 * 100 evaluates
    # to 28.999999999999996, so truncating an already rounded fraction
    # would report 28 instead of 29. fillna(0) covers frames without rows,
    # where the mean is NaN.
    nan_percent = (nan_mask.mean() * 100).round().fillna(0).astype('int64')

    columns = [
        data.dtypes.rename('Type'),
        data.count().rename('Count'),
        data.nunique().rename('Unique'),
        nan_mask.sum().rename('NaN'),
        nan_percent.rename('Percentages'),
    ]
    results = pd.concat(columns, axis=1)

    return results.sort_values(['NaN'], ascending=False)

In [None]:
def rename_columns(data):
    '''
    Return a copy of data in which the column names that contain digits
    ('condition1', '1stflrsf', ...) are replaced by spelled-out names.
    Columns that are not listed in the mapping keep their names.
    '''
    spelled_out = {
        'condition1': 'condition_first',
        'condition2': 'condition_second',
        'exterior1st': 'exterior_first',
        'exterior2nd': 'exterior_second',
        'bsmtfintype1': 'bsmtfintype_first',
        'bsmtfinsf1': 'bsmtfinsf_first',
        'bsmtfintype2': 'bsmtfintype_second',
        'bsmtfinsf2': 'bsmtfinsf_second',
        '1stflrsf': 'first_flrsf',
        '2ndflrsf': 'second_flrsf',
        '3ssnporch': 'three_ssnporch',
    }
    renamed = data.copy().rename(columns=spelled_out)
    return renamed

In [None]:
def df_outliers_create(df, quant_features, scale=1.5, boundaries=False):
    '''
    Count IQR-based outliers for every feature in quant_features.

    A value is an outlier when it lies below q25 - scale * IQR or above
    q75 + scale * IQR (quartiles are computed ignoring NaNs).

    returns: DataFrame with one column per feature and the rows listed in
        row_labels (all values rounded to 2 decimals); when boundaries is
        truthy, a tuple (that DataFrame, {feature: [lower fence, upper fence]})
        with unrounded fences.
    '''
    row_labels = ['Count',
                  'Outliers', 'Lower Outliers', 'Upper Outliers',
                  'Lower Fence', 'Upper Fence',
                  'Q25', 'Q75', 'IQR', 'Scale']

    # feature -> [lower fence, upper fence]
    fences = {}
    # feature -> statistics in the order of row_labels
    summary = {}

    for name in quant_features:
        values = df[name]

        q25, q75 = np.nanpercentile(values, [25, 75])
        iqr = q75 - q25
        lower_fence = q25 - (scale * iqr)
        upper_fence = q75 + (scale * iqr)

        n_lower = len(df[values < lower_fence])
        n_upper = len(df[values > upper_fence])

        stats = [
            values.count(),  # number of non-null values of the feature
            n_lower + n_upper, n_lower, n_upper,
            lower_fence, upper_fence,
            q25, q75, iqr, scale
        ]

        summary[f'{name}'] = [round(value, 2) for value in stats]
        fences[name] = [lower_fence, upper_fence]

    outliers_df = pd.DataFrame(summary, index=row_labels)

    if boundaries:
        return outliers_df, fences
    return outliers_df

In [None]:
def plot_data_outliers(df_outliers):
    '''
    Draw a bar plot with the share of outliers (in percent) per feature.

    df_outliers: DataFrame with one column per feature and the rows
        'Outliers' and 'Count'.

    The figure is shown with plt.show(); nothing is returned.
    '''
    # outliers as a percentage of the values of each feature
    outliers_share = (
        df_outliers.loc['Outliers'] / df_outliers.loc['Count'] * 100)

    plt.figure(figsize=(6, 3))
    ax = sns.barplot(
        x=df_outliers.columns, y=outliers_share,
        width=0.6, color=palette[0])

    ax.tick_params(axis='both', which='major', labelsize=9)
    ax.set_ylabel('Persentages', fontsize=10, weight='bold')
    ax.yaxis.set_label_coords(-0.11, 0.46)

    plt.title('Outliers', fontsize=10, weight='bold')
    plt.xticks(rotation=90)
    plt.show()

In [None]:
def fillna_na(data, features_list):
    '''
    Return a copy of data in which the missing values of every column
    named in features_list are replaced by the string 'NA'.
    '''
    filled = data.copy()
    for name in features_list:
        column = filled[name]
        filled[name] = column.fillna('NA')
    return filled

In [None]:
def overview_transformation(
        data, features_na, imputer_categorical,
        imputer_quantitative, variable_target, log_target=False):
    '''
    Prepare a raw data set: lower-case and rename the columns, impute
    missing values and optionally take the log of the target.

    data: input DataFrame (not modified)
    features_na: columns whose NaNs are replaced by the string 'NA'
    imputer_categorical, imputer_quantitative: objects whose transform()
        is applied to the frame; presumably already fitted and returning
        a DataFrame - verify against the caller
    variable_target: name of the target column
    log_target: when truthy, the target is replaced by its natural log
    '''
    frame = data.copy()
    frame.columns = [str.lower(name) for name in frame.columns]
    # rename predictors for linear models (statsmodels)
    frame = rename_columns(frame)

    # categorical variables: explicit 'NA' level first, then the imputer
    frame[features_na] = frame[features_na].fillna('NA')
    frame = imputer_categorical.transform(frame)

    # a missing garage year is replaced by the year the house was built
    no_garage_year = frame['garageyrblt'].isna()
    year_built = frame.loc[no_garage_year, 'yearbuilt']
    frame.loc[no_garage_year, 'garageyrblt'] = (
        frame.loc[no_garage_year, 'garageyrblt'].fillna(year_built))

    # remaining quantitative variables are handled by the imputer
    frame = imputer_quantitative.transform(frame)

    if log_target:
        frame[variable_target] = np.log(frame[variable_target])

    return frame

In [None]:
def plot_feature_importance(
        features, importance, labels=False,
        width=6, height=15, coeff_xaxis=-1.5,
        top=None, lower_limit=None):
    '''
    Draw a horizontal lollipop chart of feature importances, most
    important feature at the top.

    features: feature names (strings)
    importance: importance values, aligned with features
    labels: when truthy, the value is printed next to every marker
    width, height: figure size
    coeff_xaxis: the top x axis is moved outward by coeff_xaxis * height
    top: keep only the `top` most important features
    lower_limit: keep only features with importance >= lower_limit

    returns: the matplotlib Figure
    '''
    feature_importance = pd.DataFrame({
        'Feature': features,
        'Importance': importance
    })
    feature_importance = (feature_importance
                          .sort_values('Importance', ascending=False)
                          .reset_index(drop=True))
    if lower_limit:
        loc_limit_rows = (feature_importance['Importance'] >= lower_limit)
        feature_importance = feature_importance.loc[loc_limit_rows, :]
    if top:
        feature_importance = feature_importance.iloc[:top]

    # plot from the least to the most important feature, so that the most
    # important one gets the highest position on the y axis
    x = feature_importance['Importance'].iloc[::-1]
    y = feature_importance['Feature'].iloc[::-1]

    f, ax = plt.subplots(figsize=(width, height))
    xlim = feature_importance['Importance'].max() * 1.1

    plt.hlines(
        xmin=0, xmax=x, y=y, linewidth=2,
        color=palette[-2], alpha=0.55)
    plt.plot(
        x, y, 'o', markersize=3.5, color=palette[-4], alpha=1)
    if labels:
        for value, name in zip(x, y):
            plt.text(
                x=value*1.2, y=name, s='{:.3f}'.format(value), size=8,
                horizontalalignment='center', verticalalignment='bottom',
                bbox={
                    'boxstyle': 'round',
                    'facecolor': 'none',
                    'edgecolor': '0.75'
                }
            )
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_xlim(0, xlim)
    ax.xaxis.tick_top()
    ax.spines['top'].set_visible(True)
    ax.spines['top'].set_position(('outward', coeff_xaxis*height))
    ax.spines['bottom'].set_visible(False)
    ax.spines['left'].set_visible(True)
    ax.spines['left'].set_bounds((-0.5, len(feature_importance)-0.5))
    ax.spines['left'].set_position(('outward', 10))
    # NOTE(review): grid(None) is called twice, exactly as before this
    # change; confirm whether a single ax.grid(False) was intended
    plt.grid(None)
    # Tick labels must follow the sorted and filtered table, not the raw
    # `features` argument: otherwise the labels do not match the plotted
    # values and their number is wrong when top/lower_limit drop rows.
    yticks = np.arange(len(feature_importance)-1, -1, -1)
    yticklabels = [str.upper(i) for i in feature_importance['Feature']]
    plt.yticks(ticks=yticks, labels=yticklabels, fontsize=9)
    plt.grid(None)
    return f

In [None]:
def log_features(data, features_log):
    '''
    Return a copy of data in which every column named in features_log is
    replaced by log(column + 1) and renamed to 'lg_' + original name.
    '''
    shift = 1
    transformed = data.copy()
    for name in features_log:
        transformed[name] = np.log(transformed[name] + shift)
        transformed = transformed.rename(columns={name: 'lg_' + name})
    return transformed

In [None]:
def transform_feature_selection(data, features_log, target, factors=True):
    '''
    Engineer additional features on a copy of data and log-transform a
    fixed set of quantitative columns.

    data: DataFrame with lower-cased, renamed columns ('first_flrsf',
        'bedroomabvgr', 'yrsold', ...)
    features_log: NOTE(review): this argument is overwritten by a
        hard-coded list before it is used, so the passed value is ignored
    target: name of the target column; it is moved to the last position
    factors: when truthy, a 0/1 '<feature>_exst' indicator (value != 0)
        is added for a fixed list of features

    returns: the transformed DataFrame
    '''

    df = data.copy()
    # weighted floor area: the second floor counts with weight 0.7
    df['flrsfmean'] = ((df['first_flrsf']
                       + 0.7*df['second_flrsf']) / 2)
    # rooms above ground that are neither bedrooms nor kitchens
    df['totrms'] = (df['totrmsabvgrd']
                   - df['bedroomabvgr']
                   - df['kitchenabvgr'])
    df['bedroomsze'] = (df['bedroomabvgr'] / df['grlivarea'])
    df['kitchensze'] = (df['kitchenabvgr'] / df['grlivarea'])
    # 'bedroomfracrms' feature
    df['bedroomfracrms'] = (df['bedroomabvgr'] / df['totrms'])
    # max value of 'bedroomfracrms' except inf
    loc_value = (~np.isinf(df['bedroomfracrms']), 'bedroomfracrms')
    value = df.loc[loc_value].max()
    # fill inf values (division by zero) with max value
    loc_r = np.isinf(df['bedroomfracrms'])
    df.loc[loc_r, 'bedroomfracrms'] = value
    # NOTE(review): unlike the two ratios below, NaNs (0 / 0) of
    # 'bedroomfracrms' are not filled with 0 - confirm this is intended
    # 'kitchenfracrms' feature
    df['kitchenfracrms'] = (df['kitchenabvgr'] / df['totrms'])
    # max value of 'kitchenfracrms' except inf
    loc_value = (~np.isinf(df['kitchenfracrms']), 'kitchenfracrms')
    value = df.loc[loc_value].max()
    # fill inf values with max value
    loc_r = np.isinf(df['kitchenfracrms'])
    df.loc[loc_r, 'kitchenfracrms'] = value
    # fill NaN values by 0
    df['kitchenfracrms'] = df['kitchenfracrms'].fillna(0)
    # 'bathsfracbedr' feature
    df['bathsfracbedr'] = (df['fullbath'] / df['bedroomabvgr'])
    # max value of 'bathsfracbedr' except inf
    loc_value = (~np.isinf(df['bathsfracbedr']), 'bathsfracbedr')
    value = df.loc[loc_value].max()
    # fill inf values with max value
    loc_r = np.isinf(df['bathsfracbedr'])
    df.loc[loc_r, 'bathsfracbedr'] = value
    # fill NaN values by 0
    df['bathsfracbedr'] = df['bathsfracbedr'].fillna(0)

    if factors:
        # 0/1 indicators: does the house have the feature at all (value != 0)
        features_to_factor = [
            'yearremodadd', 'masvnrarea', 'bsmtfinsf_first', 'bsmtfinsf_second', 
            'totalbsmtsf', 'bsmtunfsf', 'lowqualfinsf', 'second_flrsf', 'garagearea',
            'wooddecksf', 'openporchsf', 'enclosedporch', 'three_ssnporch',
            'screenporch', 'poolarea', 'miscval'
        ]
        for feature in features_to_factor:
            new_feature_name = feature + '_exst'
            df[new_feature_name] = (df[feature] != 0).astype(int)
        
    # 1 when the remodel year differs from the construction year; this
    # overwrites the 'yearremodadd_exst' indicator created above
    df['yearremodadd_exst'] = (df['yearremodadd']!=df['yearbuilt']).astype(int)
    # 'modage': years between the sale and the remodel (or the
    # construction, for houses that were not remodeled)
    cond = (df['yearremodadd_exst']==1)
    outcome1 = (df['yrsold'] - df['yearremodadd'])
    outcome0 = (df['yrsold'] - df['yearbuilt'])
    df['modage'] = np.where(cond, outcome1, outcome0)
    
    # age of the house and of the garage at the time of sale
    df['houseage'] = df['yrsold'] - df['yearbuilt']
    df['garageage'] = df['yrsold'] - df['garageyrblt']
    
    # NOTE(review): this assignment replaces the features_log argument
    features_log = [
        'masvnrarea', 'bsmtfinsf_first', 'bsmtfinsf_second', 'bsmtunfsf',
        'totalbsmtsf', 'first_flrsf', 'second_flrsf', 'lowqualfinsf',
        'grlivarea', 'garageyrblt', 'garagearea', 'wooddecksf',
        'openporchsf', 'enclosedporch', 'three_ssnporch', 'screenporch',
        'poolarea', 'miscval', 'houseage', 'lotfrontage', 'lotarea'
    ]
    
    df = log_features(df, features_log)

    # NOTE(review): log_features in this file already renames the columns
    # to 'lg_<name>', so this loop looks redundant - confirm before removing
    for feature in features_log:
        df = df.rename(columns={feature: 'lg_'+feature})

    # move the target to the last column
    col = df.pop(target)
    df.insert(len(df.columns), target, col)
    
    return df

In [None]:
def plot_corr_matrix(
        data, target, num_features=None, vars_color='0.3', vars_weight='medium',
        width=0.7, height=0.4, annot=5, labelsize=9, full=True,
        abs_results=True, plot=True, linecolor='light', df=False, df_limit=None):
    '''
    Plot a correlation matrix restricted to the variables that correlate
    most strongly with the target.

    data: square correlation matrix (DataFrame) that contains target
    target: name of the target variable
    num_features: number of variables to keep besides the target;
        None keeps all of them
    vars_color, vars_weight, labelsize: style of the tick labels
    width, height: figure size per variable
    annot: font size of the cell annotations
    full: when falsy, cells selected by np.triu(data) are masked
    abs_results: rank the variables by absolute (True) or signed (False)
        correlation with the target
    plot: draw the heatmap; when falsy no figure is created
    linecolor: 'dark' for dark cell borders, anything else for white ones
    df: when truthy, also return the correlations with the target
    df_limit: number of rows of that frame (None - all)

    returns: the Figure (None when plot is falsy); with df truthy a tuple
        (figure, DataFrame with the correlations with the target sorted
        by absolute value, the target itself excluded)
    '''
    data_copy = data.copy()
    # stays None when plot is falsy, so the return statements below no
    # longer fail with a NameError
    f = None

    if num_features:
        idx = num_features + 1
        width = num_features * width
        height = num_features * height
    else:
        # None keeps every row; a slice end of 0 would select nothing
        idx = None
        width = len(data_copy) * width
        height = len(data_copy) * height

    if abs_results:
        data = data.sort_values(target, ascending=False, key=abs)[:idx]
    else:
        data = data.sort_values(target, ascending=False)[:idx]

    # same variables, in the same order, on both axes; the first row
    # (expected to be the target itself) is dropped
    cols = data.index
    data = data[cols]
    data = data.iloc[1:]

    if plot:
        f = plt.figure(figsize=(width, height))
        cmap = corr_matrix
        if linecolor == 'dark':
            linecolor = '#282828'
        else:
            linecolor = '#FFFFFF'
        # the mask is computed from the unrounded values
        mask = None if full else np.triu(data)
        data = np.round(data, 2)
        ax = sns.heatmap(
            data=data, cmap=cmap, annot=True, vmax=1, vmin=-1,
            center=0, square=False, linewidths=0.5, linecolor=linecolor,
            mask=mask, annot_kws={'size': annot}, cbar=False
        )
        ax.xaxis.tick_top()
        # upper-case labels, centered on the cells
        xtickslabels = data.columns.tolist()
        xtickslabels = [str.upper(i) for i in xtickslabels]
        xtickslabels = xtickslabels[::-1]
        xticks = np.arange(len(xtickslabels)-1, -1, -1)
        xticks = [(i+0.5) for i in xticks]
        plt.xticks(
            ticks=xticks, labels=xtickslabels, fontsize=labelsize,
            weight=vars_weight, rotation=90)
        plt.tick_params(
            axis='x', labelcolor=vars_color, bottom=False,
            top=True, labelbottom=False, pad=7)
        ytickslabels = data.index.tolist()
        ytickslabels = [str.upper(i) for i in ytickslabels]
        ytickslabels = ytickslabels[::-1]
        yticks = np.arange(len(ytickslabels)-1, -1, -1)
        yticks = [(i+0.5) for i in yticks]
        plt.yticks(
            ticks=yticks, labels=ytickslabels, fontsize=labelsize,
            weight=vars_weight)
        plt.tick_params(axis='y', labelcolor=vars_color, pad=7)
        plt.show()

    if df:
        # rank all variables by absolute correlation with the target
        data_copy['sort'] = abs(data_copy[target])
        data_copy = data_copy.sort_values('sort', ascending=False)
        data_copy = data_copy.drop('sort', axis=1)

        if df_limit:
            last_idx = df_limit + 1
        else:
            last_idx = len(data_copy)

        # position 0 is skipped: expected to be the target itself
        data_copy = data_copy[target][1:last_idx].to_frame()

        return f, data_copy
    else:
        return f

In [None]:
def importance_categorical_create(x, y, variables):
    '''
    For every variable in variables, fit an OLS regression of y on the
    dummy-encoded variable and collect the R-squared of the fit.

    returns: 1-D numpy array with the R-squared values (rounded to
        3 decimals) in the order of variables
    '''
    scores = []
    for name in variables:
        dummies = pd.get_dummies(x[name], dtype='float')
        fitted = sm.OLS(y, dummies).fit()
        scores.append(np.round(fitted.rsquared, 3))

    return np.array(scores, dtype=float)

In [None]:
def plot_features_inline(
        data, features, target, ci=95, kind='line',
        order=False, rotation=False, **kwargs):
    '''
    Plot the target against each feature, one panel per feature, in a
    single row.

    data: DataFrame with the features and the target
    features: names of the columns to plot
    target: name of the target column
    ci: not used by this function
    kind: 'line' - scatter with a regression line (sns.regplot);
        anything else - point plot of the target per category
    order: point plots only; when truthy, categories are ordered by the
        mean of the target, descending
    rotation: rotation of the x tick labels (skipped if falsy)
    kwargs: forwarded to the seaborn plotting function

    returns: the Figure (after plt.show())
    '''
    n_panels = len(features)
    fig, axes = plt.subplots(1, n_panels, figsize=(5*n_panels, 3))

    for position, feature in enumerate(features, start=1):
        plt.subplot(1, n_panels, position)

        if kind == 'line':
            scatter_style = {'alpha': 0.75}
            line_style = {'color': palette[-2], 'alpha': 0.5}
            sns.regplot(
                data=data, x=feature, y=target, marker='.',
                scatter_kws=scatter_style, line_kws=line_style,
                **kwargs
            )
        else:
            category_order = None
            if order:
                # categories sorted by the mean target value
                target_means = data[[feature, target]].groupby(feature).mean()
                category_order = (target_means
                                  .sort_values(target, ascending=False)
                                  .index
                                  .tolist())
            sns.pointplot(
                data=data, x=feature, y=target, order=category_order,
                errwidth=1.5, scale=0.65, capsize=0.05, join=False,
                **kwargs
            )

        plt.ylabel(None)
        if rotation:
            plt.xticks(rotation=rotation)

    plt.show()
    return fig

In [None]:
def log_var(df, var_list, except_zeros=True):
    '''
    Replace the strictly positive values of every column in var_list by
    their natural logarithm; other values are left untouched.

    df is modified in place and returned. except_zeros is not used.
    '''
    for name in var_list:
        positive = df[name] > 0
        df.loc[positive, name] = np.log(df.loc[positive, name])

    return df

In [None]:
def var_exist_feature_create(df, var_list):
    '''
    Add a 0/1 indicator column '<var>_exst' for every variable in
    var_list: 1 where the variable is greater than 0, otherwise 0.

    df is modified in place and returned.
    '''
    for name in var_list:
        is_positive = df[name] > 0
        df[name + '_exst'] = is_positive.astype(int)

    return df

In [None]:
def plot_features_numeric(data, variable, target, hue=None):
    '''
    Show a numeric variable in two panels: its histogram (left) and a
    scatter plot with a regression line against the target (right).

    returns: the Figure (after plt.show())
    '''
    figure = plt.figure(figsize=(9, 3.5))
    figure.suptitle(
        variable.title(), fontsize=11,
        weight='bold', color='0.45')

    # left panel: distribution of the variable
    plt.subplot(1, 2, 1)
    hist_ax = sns.histplot(data=data, x=variable, hue=hue, alpha=0.95)

    # right panel: relation with the target
    plt.subplot(1, 2, 2)
    scatter_style = {'alpha': 0.75}
    line_style = {'color': palette[-2], 'alpha': 0.75}
    reg_ax = sns.regplot(
        data=data, x=variable, y=target, marker='.',
        scatter_kws=scatter_style, line_kws=line_style)

    # the variable name is already in the title
    hist_ax.set(xlabel=None)
    reg_ax.set(xlabel=None)
    reg_ax.set_ylabel(reg_ax.get_ylabel().capitalize())
    hist_ax.set_ylabel(hist_ax.get_ylabel().capitalize())

    plt.subplots_adjust(wspace=0.3)
    plt.show()
    return figure

In [None]:
def plot_features_categorical(
        data, variable, target, ci=95,
        rotation=None, order=None):
    '''
    Show a categorical variable in two panels: the number of rows per
    category (left) and the mean of the target per category with a
    confidence interval (right).

    ci: size of the confidence interval of the point plot
    rotation: rotation of the x tick labels
    order: order of the categories on the x axis

    returns: the Figure (after plt.show())
    '''
    figure = plt.figure(figsize=(9, 3.5))
    figure.suptitle(
        variable.title(), fontsize=11,
        weight='bold', color='0.45')

    # left panel: category frequencies
    plt.subplot(1, 2, 1)
    count_ax = sns.countplot(
        data=data, x=variable, order=order,
        color=palette[0], saturation=0.75, width=0.5)
    plt.xticks(rotation=rotation)

    # right panel: target per category
    plt.subplot(1, 2, 2)
    point_ax = sns.pointplot(
        data=data, x=variable, y=target, order=order,
        errorbar=('ci', ci), errwidth=1.5, scale=0.65,
        capsize=0.05, join=False, color=palette[0])

    # the variable name is already in the title
    count_ax.set(xlabel=None)
    point_ax.set(xlabel=None)
    count_ax.set_ylabel(count_ax.get_ylabel().capitalize())
    point_ax.set_ylabel(point_ax.get_ylabel().capitalize())

    plt.xticks(rotation=rotation)
    plt.subplots_adjust(wspace=0.3)
    plt.show()
    return figure

In [None]:
def check_columns_match(data):
    '''
    Check that, row by row, every column of data equals the first column.

    returns: None (after printing 'All values matched') when all rows
        match, otherwise a DataFrame with the rows that do not.
    '''
    frame = data.copy()
    reference = frame.iloc[:, 0]
    # 1 where every column of the row equals the first column
    frame['is_equal'] = frame.eq(reference, axis=0).all(axis=1).astype(int)

    if frame['is_equal'].sum() == len(frame):
        print('All values matched')
        return None

    mismatched_rows = frame['is_equal'] == 0
    value_columns = frame.columns != 'is_equal'
    return frame.loc[mismatched_rows, value_columns].copy()

In [None]:
def vif_preprop(data, drop_first=False):
    '''
    Prepare data for the VIF computation: dummy-encode it with
    pd.get_dummies and scale every column to the range [0, 1].

    drop_first: forwarded to pd.get_dummies

    returns: DataFrame with the scaled values, same index as data
    '''
    encoded = pd.get_dummies(data.copy(), drop_first=drop_first, dtype=int)
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_values = scaler.fit_transform(encoded)

    return pd.DataFrame(
        data=scaled_values,
        index=encoded.index,
        columns=encoded.columns
    )

In [None]:
def vif(data):
    '''
    Compute the variance inflation factor and the tolerance of every
    column of data.

    Each column is regressed on all other columns:
    tolerance = 1 - R^2, VIF = 1 / (1 - R^2).

    returns: DataFrame indexed by column name with the columns 'VIF' and
        'Tolerance', sorted by 'VIF' in descending order
    '''
    columns = data.columns
    vif_values = {}
    tolerance_values = {}

    for column in columns:
        others = [name for name in columns if name != column]
        predictors, response = data[others], data[column]

        model = LinearRegression().fit(predictors, response)
        r_squared = model.score(predictors, response)

        vif_values[column] = 1/(1 - r_squared)
        tolerance_values[column] = 1 - r_squared

    table = pd.DataFrame({
        'VIF': vif_values,
        'Tolerance': tolerance_values
    })

    return table.sort_values('VIF', ascending=False)

In [None]:
def create_df_group(features):
    '''
    Return a copy of the module-level frame `train` restricted to the
    given features (a list) plus the module-level `target` column.
    '''
    selected = features + [target]
    return train[selected].copy()

In [None]:
def correlation_w_target(data, target):
    '''
    Return the correlations of all columns of data with target as a
    one-column DataFrame, sorted by absolute value in descending order.
    The first entry of the sorted result (expected to be the target
    itself) is dropped.
    '''
    with_target = data.corr()[target]
    ranked = with_target.sort_values(ascending=False, key=abs)
    return ranked.iloc[1:].to_frame()

In [None]:
def columns_correct_after_oh(data1, data2):
    '''
    Give two (one-hot encoded) frames the same set and order of columns.

    Columns missing from one of the frames are added and filled with 0;
    existing NaNs are replaced by 0 as well. The column order is the
    columns of data1 followed by the columns found only in data2.

    data1, data2: DataFrames (not modified)

    returns: tuple (df1, df2) of new DataFrames that keep the index of
        data1 / data2
    '''
    # union of the column names, first occurrence wins, order preserved
    columns = list(dict.fromkeys([*data1.columns, *data2.columns]))

    # reindex aligns by column NAME; assigning df1.columns to df2 instead
    # would relabel the columns by position and attach values to the
    # wrong names whenever the two frames differ
    df1 = data1.reindex(columns=columns).fillna(0)
    df2 = data2.reindex(columns=columns).fillna(0)

    return df1, df2

In [None]:
def check_new_features(data, features_list):
    '''
    Fit a linear regression of the module-level `target` column on each
    feature set and report the in-sample error.

    data: DataFrame that contains the features and the target column
    features_list: iterable of feature sets; each set is a list of
        column names

    returns: DataFrame sorted by 'mrse' with the columns
        'features' - names of the set joined by ', '
        'mrse' - root mean squared error on the data used for fitting
        'vif_max_value' - first VIF value of the table returned by vif()
            for the standardized features; 0 for single-feature sets
    '''
    df = data.copy()
    y = df[target]
    results_dct = {
        'features': [],
        'mrse': [],
        'vif_max_value': []
    }
    for features in features_list:
        # selecting with a list always gives a 2-D frame, so one- and
        # multi-feature sets share the same scaling path
        x = StandardScaler().fit_transform(df[list(features)])

        lr = LinearRegression()
        lr.fit(x, y)
        # Square root of the MSE instead of squared=False: that argument
        # was deprecated in scikit-learn 1.4 and removed in 1.6.
        mrse = mean_squared_error(y, lr.predict(x)) ** 0.5

        results_dct['features'].append(features)
        results_dct['mrse'].append(mrse)
        if len(features) > 1:
            vif_value = vif(pd.DataFrame(x)).iloc[0, 0]
            results_dct['vif_max_value'].append(vif_value)
        else:
            # VIF is not defined for a single predictor
            results_dct['vif_max_value'].append(0)

    results = pd.DataFrame(results_dct)
    results = results.sort_values('mrse')
    results['features'] = results['features'].str.join(', ')

    return results

In [None]:
def cv_results_partial(cv_results, slices=None, n_folds=5):
    '''
    Extract a readable table from grid-search style cross-validation
    results for a model with 'alpha' and 'l1_ratio' parameters.

    cv_results: mapping with the keys 'mean_test_score', 'params',
        'std_test_score' and 'split<i>_test_score' for i < n_folds
    slices: optional (start, stop) pair; only the candidates in that
        range are reported
    n_folds: number of cross-validation splits

    returns: DataFrame with one row per candidate and the columns
        'rmse' (absolute mean test score), 'params' ('alpha/l1_ratio'),
        'std', 'std_err' (std / sqrt(n_folds)) and 'split0' ...
        'split<n_folds-1>' (absolute test score of every split)
    '''
    # one column per split; a fixed list of five columns would break for
    # any other number of folds
    split_columns = ['split' + str(split) for split in range(n_folds)]
    results = pd.DataFrame(
        columns=['rmse', 'params', 'std', 'std_err'] + split_columns)

    if slices:
        slice_x, slice_y = slices[0], slices[1]
    else:
        slice_x, slice_y = 0, len(cv_results['mean_test_score'])

    scorescv = cv_results['mean_test_score'][slice_x:slice_y]
    paramscv = cv_results['params'][slice_x:slice_y]
    stdcv = cv_results["std_test_score"][slice_x:slice_y]
    splitscv = [
        cv_results['split' + str(split) + '_test_score'][slice_x:slice_y]
        for split in range(n_folds)
    ]

    for idx in range(len(scorescv)):
        dct = paramscv[idx]
        alpha = str(np.round(dct['alpha'], 4))
        l1_ratio = str(np.round(dct['l1_ratio'], 1))
        std = stdcv[idx]

        new_row = [
            abs(scorescv[idx]),        # rmse
            alpha + '/' + l1_ratio,    # params
            std,
            std / np.sqrt(n_folds),    # standard error of the mean score
        ]
        new_row.extend(abs(split_scores[idx]) for split_scores in splitscv)

        results.loc[len(results), results.columns] = new_row

    return results

In [None]:
def cv_results_params_transform(cv_results_df, param_cols, round_list):
    '''
    Add a compact 'parameters' column to a cross-validation results frame.

    cv_results_df: DataFrame with the columns in param_cols and a
        'params' column (not modified)
    param_cols: parameter columns; each one is cast to float and rounded
    round_list: number of decimals for each column of param_cols

    returns: copy of the frame with the rounded parameter columns and a
        new column 'parameters' (the rounded values joined by '/')
        inserted right before 'params'
    '''
    frame = cv_results_df.copy()

    for column, decimals in zip(param_cols, round_list):
        frame[column] = frame[column].astype(float).round(decimals)

    # build 'v1/v2/...' in a temporary column
    frame['params_new'] = ''
    for column in param_cols:
        frame['params_new'] = (
            frame['params_new'] + '/' + frame[column].astype(str))
    frame['params_new'] = frame['params_new'].str.lstrip('/')

    # move the temporary column in front of 'params' under its final name
    position = frame.columns.get_loc('params')
    frame.insert(
        loc=position, column='parameters', value=frame.pop('params_new'))

    return frame

In [None]:
def lr_model_data_formula(data, target, predictors=None):
    '''
    Build a formula string 'target ~ p1 + p2 + ...' for a linear model.

    predictors: names to put on the right-hand side; when None, all
        columns of data except target are used and target is moved to
        the last column of the returned frame

    returns: tuple (copy of data, formula string)
    '''
    if predictors is not None:
        frame = data.copy()
        right_side = ' + '.join(predictors)
    else:
        # put target to the end of data
        ordered = data.columns.tolist()
        position = ordered.index(target)
        ordered = ordered[:position] + ordered[position + 1:] + [target]
        frame = data[ordered].copy()
        right_side = ' + '.join(frame.columns[:-1])

    return frame, target + ' ~ ' + right_side

In [None]:
def simulation_regressions(
        data, target, model1, model2, model_names,
        sample_frac=0.3, n_folds=1000):
    '''
    Compare two fitted models on bootstrap samples of data.

    data: DataFrame with the predictors and the target
    target: name of the target column; every other column is passed to
        the models as a predictor
    model1, model2: objects with a predict(x) method
    model_names: names for the two models, used as column names
    sample_frac: fraction of rows drawn (with replacement) per sample
    n_folds: number of samples

    returns: DataFrame with one row per sample and the RMSE of each
        model in the column named after it
    '''
    # put target column to the end of columns
    cols = data.columns.tolist()
    cols.append(cols.pop(cols.index(target)))
    df = data[cols].copy()
    dct = {name: [] for name in model_names}

    for _ in np.arange(0, n_folds, 1):
        smpl = df.sample(frac=sample_frac, replace=True)
        x = smpl.iloc[:, :-1]
        y = smpl.iloc[:, -1]

        # Square root of the MSE instead of squared=False: that argument
        # was deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse1 = mean_squared_error(y, model1.predict(x)) ** 0.5
        rmse2 = mean_squared_error(y, model2.predict(x)) ** 0.5

        dct[model_names[0]].append(rmse1)
        dct[model_names[1]].append(rmse2)

    return pd.DataFrame(dct)

In [None]:
def breusch_pagan_lagrange_test(model):
    '''
    Run the Breusch-Pagan test on the residuals of a fitted model and
    print the conclusion at the 0.05 level.

    model: fitted results object exposing resid and model.exog

    returns: dict with the four values returned by het_breuschpagan
    '''
    labels = ['Lagrange multiplier statistic', 'p_value', 'f_value', 'f_p_value']
    values = sm.stats.het_breuschpagan(model.resid, model.model.exog)
    report = dict(zip(labels, values))

    # either p-value below 0.05 is reported as heteroscedasticity
    rejected = (report['p_value'] < 0.05) | (report['f_p_value'] < 0.05)
    if rejected:
        print('The test assumes heteroscedasticity')
    else:
        print('The test assumes homoscedasticity')

    return report

In [None]:
def omnibus_test(model):
    '''
    Run the omnibus normality test on the residuals of a fitted model
    and print the conclusion at the 0.05 level.

    returns: dict with the statistic and its p-value
    '''
    labels = ['Chi^2', 'Two-tail probability']
    values = sm.stats.omni_normtest(model.resid)
    report = dict(zip(labels, values))

    if report['Two-tail probability'] < 0.05:
        print('The test assumes, that residuals distribution not gaussian')
    else:
        print('The test assumes, that residuals distribution is gaussian')

    return report

In [None]:
def goldfeld_quandt_test(model):
    '''
    Run the Goldfeld-Quandt test on a fitted model and print the verdict.

    The test is computed from `model.resid` and `model.model.exog`;
    heteroscedasticity is reported when the p-value is below 0.05.

    Returns: dict with 'F statistic' and 'p_value'.
    '''
    labels = ["F statistic", "p_value"]
    outcome = sms.het_goldfeldquandt(model.resid, model.model.exog)
    # only the first two returned values are named and kept
    report = dict(zip(labels, outcome))
    rejected = report['p_value'] < 0.05
    if rejected:
        print('The test assumes heteroscedasticity')
    else:
        print('The test assumes homoscedasticity')
    return report

In [None]:
def jarque_bera_test(model):
    '''
    Run the Jarque-Bera test on the residuals of a fitted model.

    Prints whether skewness and kurtosis match a normal distribution
    at the 0.05 level.

    Returns: dict with the statistic, its p-value, skew and kurtosis.
    '''
    labels = ["Jarque-Bera", "Chi^2 Two-tail probability", "Skew", "Kurtosis"]
    report = dict(zip(labels, sms.jarque_bera(model.resid)))
    rejected = report['Chi^2 Two-tail probability'] < 0.05
    if rejected:
        print('The test assumes, that skewness and kurtosis'
              ' not matching a normal distribution')
    else:
        print('The test assumes, that skewness and kurtosis'
              ' matching a normal distribution')
    return report

In [None]:
def test_ramsey_reset(model, alpha=0.05):
    '''
    Ramsey's RESET check for a fitted model (reset_ramsey with degree=5).

    Returns: one-row DataFrame with columns 'Test', 'P or Statistic (s)'
    (p-value rounded to 2 decimals) and 'Condition'.
    '''
    p_reset = reset_ramsey(model, degree=5).pvalue
    if p_reset < alpha:
        verdict = 'Non-linear effects'
    else:
        verdict = 'No non-linear effects'

    return pd.DataFrame({
        'Test': 'Ramsey’s RESET',
        'P or Statistic (s)': [np.round(p_reset, 2)],
        'Condition': [verdict],
    })

In [None]:
def test_lr_multicollinearity(model):

    tests_names = 'Condition Number (s)'
    # category_names = 'Predictors'
    # result = []
    condition = []
    
    # condition number
    mult = np.linalg.cond(model.model.exog)
    if mult < 15:
        condition.append('No multicollinearity')
        # result.append('Passed')
    else:
        condition.append('Multicollinearity')
        # result.append('Not passed')
        
    pvalue = np.round(mult, 4)
    
    results_df = pd.DataFrame({
        'Test': tests_names,
        # 'Category': category_names,
        'P or Statistic (s)': pvalue,
        'Condition': condition,
        # 'Result': result
    })
    
    return results_df

In [None]:
def test_lr_cooks_distance(model):

    tests_names = "Cook's Distance (s)"
    # category_names = 'Outliers'
    condition = []
    # result = []
    # cook's distance
    cooksd = model.get_influence().cooks_distance[0]
    cooksd_border = 4/len(cooksd)
    outliers_num = (cooksd > cooksd_border).sum()
    if outliers_num > 0:
        condition.append('Outliers detected')
        # result.append('Not passed')
    else:
        condition.append('No outliers')
        # result.append('Passed')
        
    pvalue = np.round(outliers_num, 4)
    results_df = pd.DataFrame({
        'Test': tests_names,
        # 'Category': category_names,
        'P or Statistic (s)': pvalue,
        'Condition': condition,
        # 'Result': result
    })
    
    return results_df

In [None]:
def _normality_row(name, value, rejected):
    '''Build one (test name, reported value, condition label) report row.'''
    return name, value, 'Not normal' if rejected else 'Normal'


def test_normality(data, alpha=0.05):
    '''
    Run five normality tests on `data` and collect the verdicts.

    Tests: Kolmogorov-Smirnov, Anderson-Darling, Shapiro-Wilk,
    Jarque-Bera and D'Agostino-Pearson.

    Parameters:
        data - one-dimensional sample
        alpha - significance level for the p-value based tests

    Returns: DataFrame with columns 'Test', 'P or Statistic (s)' and
    'Condition'. Values are rounded to 4 decimals; for Anderson-Darling
    the reported value is the test statistic, not a p-value.
    '''
    # Kolmogorov-Smirnov: compare with a normal distribution that has the
    # sample's own mean and standard deviation. Comparing with the standard
    # normal (the kstest default) rejects any sample whose scale is not 1.
    # NOTE: the parameters are estimated from the same data, so the p-value
    # is approximate (conservative).
    sample = np.asarray(data, dtype=float)
    ks = stats.kstest(
        data, 'norm', args=(sample.mean(), sample.std(ddof=1)))

    # Anderson-Darling: the statistic is compared with critical_values[2],
    # the 5% entry for dist='norm', so this check does not follow `alpha`
    and_dar = stats.anderson(data, dist='norm')

    # Shapiro-Wilk
    pvalue_sw = stats.shapiro(data).pvalue

    # Jarque-Bera: second returned value is the p-value
    pvalue_jb = sms.jarque_bera(data)[1]

    # D’Agostino and Pearson
    pvalue_dagp = stats.normaltest(data).pvalue

    rows = [
        _normality_row(
            'Kolmogorov-Smirnov', ks.pvalue, ks.pvalue < alpha),
        _normality_row(
            'Anderson-Darling (s)', and_dar.statistic,
            and_dar.statistic > and_dar.critical_values[2]),
        _normality_row('Shapiro-Wilk', pvalue_sw, pvalue_sw < alpha),
        _normality_row('Jarque-Bera', pvalue_jb, pvalue_jb < alpha),
        _normality_row('D’Agostino-Pearson', pvalue_dagp, pvalue_dagp < alpha),
    ]

    results_df = pd.DataFrame({
        'Test': [row[0] for row in rows],
        'P or Statistic (s)': [np.round(row[1], 4) for row in rows],
        'Condition': [row[2] for row in rows],
    })

    return results_df

In [None]:
def test_lr_residuals_mean_ttest(resid, alpha=0.05):

    category_names = []
    tests_names = []
    pvalue = []
    condition = []
    # result = []
    
    # mean by student
    stmn = stats.ttest_1samp(resid, popmean=0)
    pvalue_stmn = stmn.pvalue
    # category_names.append('Residuals')
    tests_names.append('One Sample t-test')
    pvalue.append(pvalue_stmn)
    if pvalue_stmn < alpha:
        condition.append('Mean not equals zero')
        # result.append('Not passed')
    else:
        condition.append('Mean equals zero')
        # result.append('Passed')

    pvalue = [np.round(i, 4) for i in pvalue]
    results_df = pd.DataFrame({
        'Test': tests_names,
        # 'Category': category_names,
        'P or Statistic (s)': pvalue,
        'Condition': condition,
        # 'Result': result
    })
    
    return results_df

In [None]:
def test_lr_heteroscedasticity(model, alpha=0.05):
    '''
    Run White's, Breusch-Pagan and Goldfeld-Quandt tests on a fitted model.

    All three tests use `model.resid` and `model.model.exog`; a p-value
    below `alpha` is reported as 'Heteroscedasticity'.

    Returns: three-row DataFrame with columns 'Test',
    'P or Statistic (s)' (p-value rounded to 4 decimals) and 'Condition'.
    '''
    resid = model.resid
    exogs = model.model.exog

    # in each returned tuple the element at position 1 is the p-value
    # that the report uses
    p_by_test = {
        "White's": sms.het_white(resid,  exogs)[1],
        'Breusch-Pagan-Lagrange': sm.stats.het_breuschpagan(resid, exogs)[1],
        'Goldfeld-Quandt': sms.het_goldfeldquandt(resid, exogs, drop=0.2)[1],
    }

    verdicts = [
        'Heteroscedasticity' if p_value < alpha else 'Homoscedasticity'
        for p_value in p_by_test.values()
    ]

    return pd.DataFrame({
        'Test': list(p_by_test),
        'P or Statistic (s)': [np.round(p, 4) for p in p_by_test.values()],
        'Condition': verdicts,
    })

In [None]:
def test_lr_r_squared(model, alpha):

    # category_names = []
    tests_names = []
    pvalue = []
    condition = []
    # result = []
    
    df1 = int(model.df_model)
    df2 = int(model.nobs - model.df_model - 1)
    fvalue = model.fvalue
    fpvalue = 1 - stats.f.cdf(fvalue, df1, df2, loc=0, scale=1)
    # category_names.append('Model')
    tests_names.append('Fisher Criterion')
    pvalue.append(fpvalue)
    if fpvalue < alpha:
        condition.append('Significant')
        # result.append('Passed')
    else:
        condition.append('Insignificant')
        # result.append('Not passed')

    pvalue = [np.round(i, 4) for i in pvalue]
    results_df = pd.DataFrame({
        'Test': tests_names,
        # 'Category': category_names,
        'P or Statistic (s)': pvalue,
        'Condition': condition,
        # 'Result': result
    })

    return results_df

In [None]:
def test_lr_autocorrelation(model, alpha=0.05):
    '''
    Check residual autocorrelation with Durbin-Watson and Breusch-Godfrey.

    Durbin-Watson: a statistic strictly between 1.5 and 2.5 is reported as
    'No autocorrelation'. Breusch-Godfrey (nlags=3): a p-value below
    `alpha` is reported as 'Autocorrelation'.

    Returns: two-row DataFrame with columns 'Test', 'P or Statistic (s)'
    (Durbin-Watson statistic rounded to 2 decimals, Breusch-Godfrey
    p-value rounded to 4) and 'Condition'.
    '''
    # durbin-watson on the model residuals
    dw_stat = sm.stats.stattools.durbin_watson(model.resid)
    dw_verdict = 'No autocorrelation' if 1.5 < dw_stat < 2.5 else 'Autocorrelation'

    # Breusch-Godfrey: element at position 1 of the result is the p-value
    bg_outcome = sm.stats.diagnostic.acorr_breusch_godfrey(model, nlags=3)
    bg_p_value = bg_outcome[1]
    bg_verdict = 'Autocorrelation' if bg_p_value < alpha else 'No autocorrelation'

    return pd.DataFrame({
        'Test': ['Durbin-Watson (s)', 'Breusch-Godfrey'],
        'P or Statistic (s)': [np.round(dw_stat, 2), np.round(bg_p_value, 4)],
        'Condition': [dw_verdict, bg_verdict],
    })

In [None]:
def test_vif(X):
    '''
    Multicollinearity check based on the variance inflation factor.

    Reads the first row of the 'VIF' column returned by `vif(X)`
    (presumably the largest value - confirm against `vif`); a value above
    10 is reported as 'Multicollinearity'.

    Returns: one-row DataFrame with columns 'Test', 'P or Statistic (s)'
    and 'Condition'.
    '''
    leading_vif = vif(X)['VIF'].iloc[0]
    verdict = 'Multicollinearity' if leading_vif > 10 else 'No multicollinearity'

    return pd.DataFrame({
        'Test': ['VIF (s)'],
        'P or Statistic (s)': [leading_vif],
        'Condition': [verdict],
    })

In [None]:
def wilcoxon(data, alpha=0.05):
    '''
    One-sample Wilcoxon signed-rank test of `data` against zero.

    The test is two-sided so that it matches the reported conditions
    ('Mean equals zero' / 'Mean not equals zero'): with a one-sided
    'greater' alternative a sample shifted below zero is never flagged.

    Parameters:
        data - one-dimensional sample
        alpha - significance level

    Returns: one-row DataFrame with columns 'Test', 'P or Statistic (s)'
    (p-value) and 'Condition'.
    '''
    _, p_value = scipy.stats.wilcoxon(data, alternative='two-sided')

    if p_value < alpha:
        condition = 'Mean not equals zero'
    else:
        condition = 'Mean equals zero'

    results_df = pd.DataFrame({
        'Test': ['One Sample Wilcoxon test'],
        'P or Statistic (s)': [p_value],
        'Condition': [condition],
    })

    return results_df

In [None]:
def test_one_sided_bootstrap(
        data, value, statistic=np.mean, n_bootstrap=10000, confidence_level=0.95):

    '''
    Null hypothesis: value is real mean (or statistic) of data

    Builds a bootstrap confidence interval with ci_bootstrap and checks
    whether `value` lies inside it (both bounds included).

    Returns: True when the hypothesis can't be rejected, False otherwise.
    '''

    interval = ci_bootstrap(
        data=data, statistic=statistic, n_bootstrap=n_bootstrap,
        confidence_level=confidence_level)
    lower, upper = interval['ci_min'], interval['ci_max']

    # inside the interval: we can't reject that 'value' is the real statistic
    return bool(lower <= value <= upper)

In [None]:
def test_lr_residuals_mean_bootstrap(resid, n_bootstrap=10000, alpha=0.05):
    '''
    Bootstrap check of the hypothesis that the mean of `resid` is zero.

    Delegates to test_one_sided_bootstrap with confidence level 1 - alpha.

    Returns: one-row DataFrame with columns 'Test', 'P or Statistic (s)'
    (the number of bootstrap resamples, not a p-value) and 'Condition'.
    '''
    zero_is_plausible = test_one_sided_bootstrap(
        resid, value=0, statistic=np.mean,
        n_bootstrap=n_bootstrap, confidence_level=1 - alpha)
    verdict = 'Mean equals zero' if zero_is_plausible else 'Mean not equals zero'

    return pd.DataFrame({
        'Test': 'One Sample Bootstrap test (s)',
        'P or Statistic (s)': n_bootstrap,
        'Condition': [verdict],
    })

In [None]:
def regression_diagnostics(model, X, alpha=0.05):
    '''
    Run the full set of regression diagnostics and return one report.

    Parameters:
        model - fitted regression results object
        X - predictors passed to the VIF check
        alpha - significance level passed to every test that accepts one
                (including RESET and autocorrelation, which previously
                always ran at 0.05)

    Returns: DataFrame with columns 'Test', 'Category',
    'P or Statistic (s)' and 'Condition'. Values are formatted as strings:
    counts without decimals, everything else with 4 decimals.
    '''
    # residuals calculate
    resid = model.resid

    # (result, category) pairs listed in report order
    sections = [
        (test_ramsey_reset(model, alpha=alpha), 'Model'),
        (test_lr_r_squared(model, alpha=alpha), 'Model'),
        (test_vif(X), 'Predictors'),
        (test_lr_multicollinearity(model), 'Predictors'),
        (test_lr_cooks_distance(model), 'Residuals'),
        (test_lr_residuals_mean_ttest(resid, alpha=alpha), 'Residuals'),
        (wilcoxon(resid, alpha=alpha), 'Residuals'),
        (test_lr_residuals_mean_bootstrap(resid, alpha=alpha), 'Residuals'),
        (test_normality(resid, alpha=alpha), 'Residuals'),
        (test_lr_heteroscedasticity(model, alpha=alpha), 'Residuals'),
        (test_lr_autocorrelation(model, alpha=alpha), 'Residuals'),
    ]

    frames = []
    for frame, category in sections:
        frame = frame.copy()
        # place 'Category' right after 'Test'
        frame.insert(frame.columns.get_loc('Test') + 1, 'Category', category)
        frames.append(frame)

    results = pd.concat(frames, axis=0).reset_index(drop=True)

    # these rows hold counts and are shown without decimals
    round_to_zero_list = [
        "Cook's Distance (s)", 'One Sample Bootstrap test (s)'
    ]
    # replace the whole column at once: writing strings cell by cell into
    # a numeric column is an upcasting assignment that pandas deprecates
    results['P or Statistic (s)'] = [
        '{:.0f}'.format(value) if name in round_to_zero_list
        else '{:,.4f}'.format(value)
        for name, value in zip(results['Test'], results['P or Statistic (s)'])
    ]

    return results

In [None]:
def get_cooksd_outliers_idxs(model, data):
    '''
    Return index labels of `data` whose Cook's distance exceeds 4/n.

    Distances come from `model.get_influence().cooks_distance[0]` and are
    matched to the rows of `data` by position.
    '''
    distances = model.get_influence().cooks_distance[0]
    threshold = 4/len(distances)
    # align the distances with the index of data, then keep flagged labels
    flagged = pd.Series(distances, index=data.index) > threshold
    return flagged[flagged].index.tolist()

In [None]:
def plot_lr_coef(model, orient='v', figsize=(4, 6)):
    '''
    Plot model coefficients with their confidence intervals.

    Parameters:
        model - fitted model exposing `params` and `conf_int()`
        orient - 'v': coefficients on the x axis, one row per term;
                 'h': coefficients on the y axis, one column per term
        figsize - size of the created figure

    Returns: the created matplotlib figure.

    Notes:
        - for any `orient` other than 'h' or 'v' nothing is drawn and `ax`
          is never assigned, so the `ax.tick_params` call below raises
          NameError
        - after the orientation branch the y ticks and both axis labels are
          cleared, which also removes the 'Coefficients' label set in the
          'h' branch
    '''

    f = plt.figure(figsize=figsize)
    # marker size of the coefficient points
    s=15
    coeff_df = pd.DataFrame(model.params, columns=['coeff'])
    conf_df = model.conf_int().rename(columns={0: 'min_ci', 1: 'max_ci'})
    conf_plot = pd.concat([coeff_df, conf_df], axis=1)
    # drop the first row (presumably the intercept - confirm)
    conf_plot = conf_plot.iloc[1:]
    # largest absolute coefficient first
    conf_plot = conf_plot.sort_values('coeff', ascending=False, key=abs)
    
    if orient == 'h':
        ax = sns.scatterplot(
            data=conf_plot,
            x=conf_plot.index,
            y='coeff',
            s=s,
            alpha=0.9
        )
        # errobars
        # distances from the coefficient to the lower and upper CI bound
        yerr = ([conf_plot['coeff'] - conf_plot['min_ci'],
                 conf_plot['max_ci'] - conf_plot['coeff']])
        plt.errorbar(
            x=conf_plot.index.tolist(),
            y=conf_plot['coeff'],
            yerr=yerr, fmt='none', elinewidth=1, capsize=1.2, alpha=0.9)

        # zero reference line
        plt.axhline(0, color=palette[0], lw=0.75, alpha=0.25)
    
        plt.ylabel('Coefficients')
        plt.xticks(rotation=90)

    if orient == 'v':
        ax = sns.scatterplot(
            data=conf_plot,
            x='coeff',
            y=conf_plot.index,
            s=s,
            alpha=0.9
        )
        # errobars
        # distances from the coefficient to the lower and upper CI bound
        xerr = ([conf_plot['coeff'] - conf_plot['min_ci'],
                 conf_plot['max_ci'] - conf_plot['coeff']])
        plt.errorbar(
            x=conf_plot['coeff'], y=conf_plot.index.tolist(),
            xerr=xerr, fmt='none', elinewidth=1,
            capsize=1.2, capthick=1, alpha=0.9
        )
        # zero reference line
        plt.axvline(
            0, ymin=0.02, ymax=0.98, color=palette[0], lw=0.5, alpha=0.75)

    # NOTE(review): yticks / ylabels are computed but never used below
    yticks = conf_plot.index
    ylabels = [str.lower(i) for i in yticks]
    # hide the y ticks and clear both axis labels
    plt.yticks([])
    plt.xticks(color=alpha_color(palette[0], 0.85))
    plt.ylabel(None)
    plt.xlabel(None)
    # blank title with padding
    plt.title(' ', loc='left', pad=40)
    ax.tick_params(
        bottom=False,
        axis='x', which='major', labelsize=7, pad=5)
    ax.tick_params(axis='y', pad=10)
    # thin visible frame on all four sides
    ax.spines[['bottom', 'top', 'left', 'right']].set_visible(True)
    ax.spines[['bottom', 'top', 'left', 'right']].set_linewidth(0.5)
    plt.grid(False)
    plt.show()
    
    return f

In [None]:
def plot_lr_residuals(model, data, target):
    '''
    Draw a seaborn residual plot with a lowess trend line.

    `model.fittedvalues` is passed as x, the `target` column of `data`
    as y. The plot is shown immediately; nothing is returned.
    '''
    point_style = {
        'alpha': 0.9,
        's': 10,
        'ec': '0.85',
        'linewidth': 0.35
    }
    trend_style = {
        'color': palette[1],
        'lw': 1.5,
        'alpha': 0.35
    }
    axes = sns.residplot(
        x=model.fittedvalues, y=target, data=data,
        lowess=True,
        scatter_kws=point_style,
        line_kws=trend_style)
    # title and axis labels
    axes.set_title('Residuals vs Predicted')
    axes.set_xlabel('Fitted values')
    axes.set_ylabel('Residuals')
    plt.show()

In [None]:
def plot_leverage_resid_square(model, figsize):
    '''
    Scatter leverage against squared z-scored residuals, one labelled
    point per observation.

    Leverage is `hat_matrix_diag` of the model's influence object; labels
    come from `model.model.data.row_labels`. The plot is shown
    immediately; nothing is returned.
    '''
    influence = model.get_influence()
    leverage = influence.hat_matrix_diag
    resid_square = zscore(influence.resid)**2
    labels = model.model.data.row_labels

    fig, ax = plt.subplots(figsize=figsize)
    plt.scatter(x=resid_square, y=leverage)
    # annotate every point with its row label
    for label, x_pos, y_pos in zip(labels, resid_square, leverage):
        ax.annotate(label, (x_pos, y_pos), fontsize=7)
    ax.set_xlabel('Normalized Residuals Squared')
    ax.set_ylabel('Levarage')
    ax.set_title('Levarage vs. Normalized Residuals Squared')
    plt.show()

In [None]:
def outliers_column_iqr(data, feature, scale=1.5):
    '''
    Flag values of one column that fall outside the IQR fences.

    The fences are q1 - scale*iqr and q3 + scale*iqr. The input is not
    modified.

    Returns: copy of `data` with an extra 0/1 column '<feature>_is_out'.
    '''
    flagged = data.copy()
    column = flagged[feature]

    q1, q3 = column.quantile(0.25), column.quantile(0.75)
    iqr = q3 - q1
    below = column < q1 - scale*iqr
    above = column > q3 + scale*iqr
    flagged[feature+'_is_out'] = (below | above).astype(int)

    return flagged

In [None]:
def simulation_enet_features(X, y, alphas):
    '''
    Fit an ElasticNet for every alpha and record what it keeps.

    For each alpha the estimator is fitted on (X, y) and scored by
    in-sample RMSE; features with a non-zero coefficient are collected
    and their VIF table is computed with `vif`.

    Parameters:
        X - DataFrame of predictors
        y - target values
        alphas - iterable of regularisation strengths

    Returns: dict of lists (one entry per alpha) with keys 'score',
    'alpha', 'features_num', 'vif_max_value' and 'features_list'.
    '''

    X_ = X.copy()
    keys = [
        'score', 'alpha', 'features_num',
        'vif_max_value', 'features_list'
    ]
    res_dct = {key: [] for key in keys}

    for alpha in alphas:
        res_dct['alpha'].append(alpha)
        # score
        estimator = ElasticNet(alpha=alpha, random_state=seed)
        estimator.fit(X_, y)
        y_pred = estimator.predict(X_)
        # RMSE is taken as sqrt(MSE): the 'squared' argument of
        # mean_squared_error is deprecated and missing in recent scikit-learn
        score = np.sqrt(mean_squared_error(y, y_pred))
        res_dct['score'].append(score)
        # features and features number
        summary = pd.DataFrame({
            'feature': estimator.feature_names_in_,
            'coeff': estimator.coef_
        })
        summary = summary.loc[summary['coeff'] != 0]
        features = summary['feature'].tolist()
        res_dct['features_list'].append(features)
        res_dct['features_num'].append(len(summary))
        # vif estimation: the top-left cell of the table is reported
        # (presumably the largest VIF - confirm against `vif`)
        vif_df = vif(X_[features])
        if vif_df.empty:
            vif_max_value = 0
        else:
            vif_max_value = vif_df.iloc[0, 0]
        vif_max_value = np.round(vif_max_value, 1)
        res_dct['vif_max_value'].append(vif_max_value)

    return res_dct

In [None]:
def check_sum(data, sum_column, columns, return_diff=False):
    '''
    Check that `sum_column` equals the sum of `columns` in every row.

    Parameters:
        data - DataFrame holding all referenced columns
        sum_column - name of the column that should hold the total
        columns - names of the columns that should add up to the total
        return_diff - when True and a mismatch exists, return the
                      mismatching rows instead of True

    Returns:
        False - no mismatch found
        True - mismatch found and return_diff is False
        DataFrame - mismatching rows with an extra 'diff' column, sorted
                    by absolute difference (largest first), when
                    return_diff is True
    '''
    parts = list(columns)
    df = data[parts + [sum_column]].copy()
    df['diff'] = df[sum_column]

    for column in parts:
        df['diff'] = df['diff'] - df[column]

    if not df['diff'].any():
        return False
    # the flag is the 'return_diff' parameter; the previous code read an
    # undefined name here and raised NameError on every mismatch
    if not return_diff:
        return True

    result = df.loc[df['diff'] != 0]
    return result.sort_values('diff', ascending=False, key=abs)

In [None]:
def check_garage_discrepancy(data, features_garage):
    '''
    Run both garage consistency checks on the garage columns of `data`.

    Both checks print their verdict; nothing is returned.

    Parameters:
        data - DataFrame holding the garage columns
        features_garage - names of the garage columns to check
    '''
    df = data[features_garage].copy()

    garage_len_check(df, features_garage)
    # check the same frame as above; the previous code passed the global
    # `test` here, so the second check ignored the `data` argument
    garage_na_zero_check(df, features_garage)

In [None]:
def garage_len_check(df, features_garage):
    '''
    Compare how many rows mark "no garage" in each garage column.

    Numeric columns use 0 as the marker, categorical ones use 'NA'.
    Prints whether all six counts agree. `features_garage` is not used.
    '''
    markers = (
        ('garagearea', 0),
        ('garagecars', 0),
        ('garagecond', 'NA'),
        ('garagefinish', 'NA'),
        ('garagequal', 'NA'),
        ('garagetype', 'NA'),
    )
    # counts are produced lazily and comparison stops at the first mismatch
    counts = (len(df[df[column] == marker]) for column, marker in markers)
    reference = next(counts)
    consistent = all(count == reference for count in counts)

    if consistent:
        print('Garage Features NA-zeroes length: No discrepancy')
    else:
        print('Garage Features NA-zeroes length: Discrepancy detected')

In [None]:
def garage_na_zero_check(data, features_garage):
    '''
    Check that "no garage" is marked consistently across garage columns.

    Rows where any garage column carries the "no garage" marker (0 for
    numeric columns, 'NA' for categorical ones) are selected; zeros are
    turned into 'NA' and every column is compared with the first one.
    The verdict is printed. `features_garage` is not used and `data` is
    not modified.

    Returns: None when every selected row is consistent, otherwise a
    DataFrame with the inconsistent rows.
    '''
    check_na_vars = [
        'garagecars', 'garagearea', 'garagecond',
        'garagefinish', 'garagequal', 'garagetype'
    ]
    loc_r = ((data['garagecars'] == 0)
             | (data['garagearea'] == 0)
             | (data['garagecond'] == 'NA')
             | (data['garagefinish'] == 'NA')
             | (data['garagequal'] == 'NA')
             | (data['garagetype'] == 'NA'))
    garage_na = data.loc[loc_r, check_na_vars].copy()

    # replace whole columns instead of writing strings into numeric columns
    # through .loc, which is an upcasting assignment that pandas deprecates
    for feature in check_na_vars:
        garage_na[feature] = garage_na[feature].map(
            lambda value: 'NA' if value == 0 else value)

    is_equal = garage_na.eq(garage_na.iloc[:, 0], axis=0).all(axis=1)

    if is_equal.all():
        print('Garage Features NA-zeroes: No discrepancy')
        return None

    print('Garage Features NA-zeroes: Discrepancy detected')
    return garage_na.loc[~is_equal].copy()

In [None]:
def check_bsmt_discrepancy(data, features_bsmt):
    '''
    Check that total basement area equals the sum of its three parts.

    The verdict is printed.

    Returns: DataFrame with the mismatching rows (including the
    'bsmnt_check' difference column) or None when every row adds up.
    '''
    df = data[features_bsmt].copy()

    # total minus unfinished, second and first finished areas, in that order
    remainder = df['totalbsmtsf']
    for part in ('bsmtunfsf', 'bsmtfinsf_second', 'bsmtfinsf_first'):
        remainder = remainder - df[part]
    df['bsmnt_check'] = remainder

    if not df['bsmnt_check'].any():
        print('Basement Features: No discrepancy')
        return None

    print('Basement Features: Discrepancy detected')
    return df.loc[df['bsmnt_check'] != 0, :]

In [None]:
def square_feet_check_discrepancy(data, features_square):
    '''
    Run three consistency checks on area-related columns.

    Checks, in order:
        1. living area equals first floor + second floor + low quality area
        2. masonry veneer type 'NA' goes together with zero veneer area
        3. zero pool area goes together with pool quality 'NA'

    Every check prints its verdict. The function stops at the first check
    that finds a discrepancy and returns the offending rows.

    Parameters:
        data - DataFrame holding all referenced columns
        features_square - area column names; 'masvnrtype' and 'poolqc'
                          are added internally, the argument is not
                          modified

    Returns: DataFrame with the rows of the first failed check, or None
    when all three checks pass.
    '''
    features = list(features_square)
    features.extend(['masvnrtype', 'poolqc'])
    df = data[features].copy()

    # livarea
    df['livarea_check'] = (df['grlivarea']
                           - df['first_flrsf']
                           - df['second_flrsf']
                           - df['lowqualfinsf'])
    if df['livarea_check'].any():
        print('Livarea Features: Discrepancy detected')
        # filter on the column computed above (the previous code used the
        # basement check column, which does not exist here)
        return df.loc[df['livarea_check'] != 0, :]
    print('Livarea Features: No discrepancy')

    # masvnrtype: each comparison is parenthesised because '&' binds
    # tighter than '!='
    cond1 = ((df['masvnrtype'] == 'NA')
             & (df['masvnrarea'] != 0))
    cond2 = ((df['masvnrarea'] == 0)
             & (df['masvnrtype'] != 'NA'))
    masvnr_unaccord = df.loc[cond1 | cond2, ['masvnrarea', 'masvnrtype']]
    if not masvnr_unaccord.empty:
        print('Masvnrtype Features: Discrepancy detected')
        return masvnr_unaccord
    print('Masvnrtype Features: No discrepancy')

    # poolarea: rows without a pool by either column
    no_pool = ((df['poolarea'] == 0)
               | (df['poolqc'] == 'NA'))
    pool = df.loc[no_pool, ['poolarea', 'poolqc']].copy()
    # replace the whole column instead of writing strings into a numeric
    # column through .loc
    pool['poolarea'] = pool['poolarea'].map(
        lambda value: 'NA' if value == 0 else value)
    is_equal = pool.eq(pool.iloc[:, 0], axis=0).all(axis=1)

    if is_equal.all():
        print('Pool Features NA-zeroes: No discrepancy')
        return None

    print('Pool Features NA-zeroes: Discrepancy detected')
    return pool.loc[~is_equal].copy()

In [None]:
def np_std_ddof1(x):
    '''
    Sample standard deviation of x (numpy std with ddof=1).

    The previous body used ddof=0, which contradicts the function name.
    '''
    return np.std(x, ddof=1)

In [None]:
def year_check_discrepancy(data, features_year):
    '''
    Check the year columns for impossible orderings.

    Prints one message per detected problem (remodel before build, sale
    before build). 'No discrepancy' is printed only when neither problem
    is found; previously it was also printed after a remodel problem
    whenever the sale years were fine.

    Parameters:
        data - DataFrame holding the year columns
        features_year - names of the year columns to read
    '''
    df = data[features_year]
    found = False

    if (df['yearremodadd'] < df['yearbuilt']).any():
        print('Modernization is earlier than built')
        found = True
    if (df['yrsold'] < df['yearbuilt']).any():
        print('Sold earlier than built')
        found = True
    if not found:
        print('Year Features: No discrepancy')

In [None]:
def models_evaluation(
        X, y, names, estimators, parameters,
        n_folds=10, full_results=False):
    '''
    Grid-search several estimators and summarise the best configuration
    of each.

    Every estimator is tuned with GridSearchCV (scoring
    'neg_root_mean_squared_error', cv=n_folds). For the best parameter
    set the mean/std test score and the fit/score times are recorded,
    together with the wall time of the whole search.

    Parameters:
        X, y - training data
        names - model names, one per estimator
        estimators - estimators to tune
        parameters - parameter grids, one per estimator
        n_folds - value passed to GridSearchCV as `cv`
        full_results - when True, also return the fitted searches

    Returns: summary DataFrame sorted by 'MeanScore' (descending); with
    full_results=True a tuple (summary, dict name -> fitted GridSearchCV).
    '''
    columns = [
        'Model', 'MeanScore', 'StdScore',
        'FitTime', 'StdFitTime', 'ScoreTime', 'StdScoreTime',
        'HyperSearchTime', 'BestEstimator'
    ]
    started = time.time()
    collected = {column: [] for column in columns}
    searches = dict.fromkeys(names)

    for name, estimator, params in zip(names, estimators, parameters):
        search_started = time.time()
        search = GridSearchCV(
            estimator=estimator,
            param_grid=params,
            scoring='neg_root_mean_squared_error',
            cv=n_folds
        )
        search.fit(X, y)

        # statistics of the best parameter set
        best = search.best_index_
        report = search.cv_results_
        row = [
            name,
            report['mean_test_score'][best],
            report['std_test_score'][best],
            report['mean_fit_time'][best],
            report['std_fit_time'][best],
            report['mean_score_time'][best],
            report['std_score_time'][best],
        ]
        best_estimator = search.best_estimator_
        # wall time of this search, formatted as H:MM:SS
        elapsed = time.time() - search_started
        row.append(str(dt.timedelta(seconds=np.round(elapsed))))
        row.append(best_estimator)

        for column, value in zip(columns, row):
            collected[column].append(value)

        if full_results:
            searches[name] = search

    summary = pd.DataFrame(collected)
    summary = summary.sort_values('MeanScore', ascending=False)
    summary = summary.reset_index(drop=True)

    # total wall time, repeated in every row, placed before 'BestEstimator'
    total = str(dt.timedelta(seconds=np.round(time.time() - started)))
    summary.insert(
        loc=summary.columns.get_loc('BestEstimator'),
        column='TotalTime', value=total)

    if full_results:
        return summary, searches
    return summary

In [None]:
def get_colors(n=5):
    '''
    Build n hex colour strings from evenly spaced HSV tuples.

    For step x in range(n) the tuple is (x/n, 0.85*x/n, 0.85*x/n); it is
    converted to RGB, each channel is scaled by 255 and truncated to int.

    Returns: list of '#rrggbb' strings.
    '''
    codes = []
    for step in range(n):
        channels = colorsys.hsv_to_rgb(step/n, 0.85*step/n, 0.85*step/n)
        red, green, blue = (int(channel * 255) for channel in channels)
        codes.append('#%02x%02x%02x' % (red, green, blue))
    return codes

In [None]:
def rgb_to_hex(x):
    '''Convert a colour to its hex string with matplotlib.colors.to_hex.'''
    return matplotlib.colors.to_hex(x)

In [None]:
def feature_importance_display(
        features, importance,
        top=None, imp_min_level=None, only_features=True):
    '''
    Build a feature importance table with optional filtering.

    Parameters:
        features - feature names
        importance - importance values, aligned with `features`
        top - keep only this many rows with the highest importance
        imp_min_level - keep only rows with importance above this level
        only_features - return just the 'Feature' column

    The table is sorted by importance (descending) only when `top` or
    `imp_min_level` is given.

    Returns: Series of feature names, or the DataFrame with columns
    'Feature' and 'Importance' when only_features is False.
    '''

    def ranked(frame):
        # highest importance first, with a fresh 0..n-1 index
        return (frame
                .sort_values('Importance', ascending=False)
                .reset_index(drop=True))

    table = pd.DataFrame({
        'Feature': features,
        'Importance': importance
    })

    if imp_min_level is not None:
        table = ranked(table.loc[table['Importance'] > imp_min_level, :])
    if top is not None:
        # label slice on the fresh index: rows 0 .. top-1 inclusive
        table = ranked(table).loc[0:top-1]

    return table['Feature'] if only_features else table

In [None]:
def ci_t_distribution(
        data=None, mean=None, std=None, n=None, confidence_level=0.95):
    '''
    Confidence interval for a mean based on the Student t distribution.

    Pass either `data`, or the summary statistics `mean`, `std` and `n`.
    When `data` is given, `n` and `mean` are taken from it; if `std` is
    passed as well, the standard error is computed from that `std`
    instead of from the data.

    Parameters:
        data - sample values
        mean, std, n - summary statistics used when `data` is None
        confidence_level - two-sided confidence level

    Returns: dict with keys 'min', 'max', 'mean', 'margin' and 't'.

    Raises: ValueError when neither `data` nor all of `mean`, `std`
    and `n` are provided.
    '''
    se = None

    if data is not None:
        arr = np.array(data)
        n = len(arr)
        mean = np.mean(arr)
        se = scipy.stats.sem(arr)

    # compare with None explicitly: a truthiness test rejected the valid
    # inputs mean=0 and std=0 and left the standard error undefined
    if mean is not None and std is not None and n is not None:
        se = std / np.sqrt(n)

    if se is None:
        raise ValueError(
            'provide either data or all of mean, std and n')

    t = scipy.stats.t.ppf((1+confidence_level) / 2, n-1)
    margin = t * se

    return {
        'min': mean - margin,
        'max': mean + margin,
        'mean': mean,
        'margin': margin,
        't': t
    }

In [None]:
def ci_bootstrap(
        data, statistic=np.mean, method='BCa', n_bootstrap=1000,
        confidence_level=0.95, random_state=42):
    '''
    Bootstrap confidence interval of a statistic (scipy.stats.bootstrap).

    For a DataFrame the point statistic and the standard deviation are
    computed per column; otherwise on the data as a whole.
    'proxi_margin' is the bootstrap standard error multiplied by 1.96,
    whatever `confidence_level` is.

    Returns: dict(statistic, std, ci_min, ci_max, proxi_margin)
    '''
    outcome = scipy.stats.bootstrap(
        data=(data,),
        statistic=statistic,
        method=method,
        n_resamples=n_bootstrap,
        confidence_level=confidence_level,
        random_state=random_state
    )
    interval = outcome.confidence_interval

    if isinstance(data, pd.DataFrame):
        point = np.array(data.apply(statistic))
        spread = np.array(np.std(data, ddof=1))
    else:
        point = statistic(data)
        spread = np.std(data, ddof=1)

    return {
        'statistic': point,
        'std': spread,
        'ci_min': interval[0],
        'ci_max': interval[1],
        'proxi_margin': outcome.standard_error*1.96
    }

In [None]:
def pointplot(
        x, y, err, figsize=(3,4), s=40, fmt='none',
        linestyle='none', capsize=2, capthick=1, linewidth=1, ylim=None,
        scatter_kwargs=None, err_kwargs=None):
    '''
    Scatter points with vertical error bars on a new figure.

    Parameters:
        x, y - point coordinates
        err - error bar size(s), passed to plt.errorbar as `yerr`
        figsize - size of the created figure
        s - marker size for plt.scatter
        fmt, linestyle, capsize, capthick, linewidth - passed to plt.errorbar
        ylim - optional y-axis limits
        scatter_kwargs - extra keyword arguments for plt.scatter
        err_kwargs - extra keyword arguments for plt.errorbar

    Returns: the created figure
    '''
    scatter_kwargs = {} if scatter_kwargs is None else scatter_kwargs
    err_kwargs = {} if err_kwargs is None else err_kwargs

    # a single figure: it is both drawn on and returned
    fig = plt.figure(figsize=figsize)
    plt.scatter(
        x=x,
        y=y,
        s=s,
        **scatter_kwargs
    )
    plt.errorbar(
        x=x,
        y=y,
        yerr=err,
        fmt=fmt,
        linestyle=linestyle,
        capsize=capsize,
        capthick=capthick,
        linewidth=linewidth,
        **err_kwargs
    )
    # widen the x-axis by half a unit on both sides
    xmin, xmax, _, _ = plt.axis()
    plt.xlim(xmin-0.5, xmax+0.5)
    if ylim is not None:
        plt.ylim(ylim)

    return fig

In [None]:
def legend_customized(
        figure, legend_x=200, legend_y=16.5, ncols=1, fontsize=8, titlesize=9,
        alignment='left', ax=None, **kwargs):
    '''
    Add a legend anchored relative to the figure size in pixels.

    `legend_x` and `legend_y` are pixel offsets. They are added to the
    figure width / height (in pixels) and divided by it; the resulting
    fractions are passed to `legend` as `bbox_to_anchor`.

    Parameters:
        figure - figure whose size and dpi define the pixel dimensions
        legend_x - horizontal offset in pixels (scaled by 0.1 if ncols != 1)
        legend_y - vertical offset in pixels
        ncols - number of legend columns
        fontsize - legend font size
        titlesize - legend title font size
        alignment - legend alignment
        ax - object providing `legend` (defaults to pyplot)
        **kwargs - extra keyword arguments for `legend`
    '''
    fig_x, fig_y = figure.get_size_inches() * figure.dpi

    if ncols == 1:
        legend_x = (fig_x + legend_x) / fig_x
    else:
        legend_x = (fig_x + 0.1*legend_x) / fig_x
    # the vertical offset is normalised by the figure height, not its width
    legend_y = (fig_y + legend_y) / fig_y

    if ax is None:
        ax = plt
    ax.legend(
        fontsize=fontsize, ncols=ncols,
        title_fontproperties={'size': titlesize}, alignment=alignment,
        bbox_to_anchor=(legend_x, legend_y), **kwargs
    )

In [None]:
def plot_gridplot(
        data, features, target=None, figsize=None, ncols=3, kind='reg',
        plot_shape='rectangle', markersize=15, hscale=1, pscale=1, regplot_kwargs=None,
        pointplot_kwargs=None, scatterplot_kwargs=None, histplot_kwargs=None):
    '''
    Draw one seaborn subplot per feature on a grid and show the figure.

    Parameters:
        data - DataFrame with the features (and the target)
        features - column names; one subplot is drawn per feature
        target - column used as y (not used for kind='hist')
        figsize - figure size; if None it is derived from `plot_shape`,
                  `ncols`, the number of rows and `pscale`
        ncols - number of grid columns
        kind - 'reg', 'point', 'hist' or 'scatter'; any other value
               produces an empty figure
        plot_shape - 'square' or 'rectangle' (used only when figsize is None)
        markersize - marker size passed to the plotting function
        hscale - multiplier for the vertical spacing between subplots
        pscale - multiplier for the automatic figure size and the error
                 line width of kind='point'
        regplot_kwargs, pointplot_kwargs, scatterplot_kwargs,
        histplot_kwargs - extra keyword arguments for the seaborn call
                          of the corresponding kind

    Returns: the created figure

    Raises:
        ValueError - if figsize is None and `plot_shape` is unknown
    '''
    regplot_kwargs = regplot_kwargs or {}
    pointplot_kwargs = pointplot_kwargs or {}
    scatterplot_kwargs = scatterplot_kwargs or {}
    histplot_kwargs = histplot_kwargs or {}

    nrows = math.ceil(len(features) / ncols)

    if figsize is None:
        # (width, height) of a single subplot, in inches before scaling
        shape_scales = {'square': (2, 2), 'rectangle': (4, 2.5)}
        if plot_shape not in shape_scales:
            raise ValueError("'plot_shape' must be 'square' or 'rectangle'")
        unit_width, unit_height = shape_scales[plot_shape]
        figsize = (unit_width * ncols * pscale, unit_height * nrows * pscale)

    fig = plt.figure(figsize=figsize)

    if kind in ('reg', 'point', 'hist', 'scatter'):
        for position, feature in enumerate(features, start=1):
            plt.subplot(nrows, ncols, position)
            if kind == 'reg':
                sns.regplot(
                    data=data,
                    x=feature,
                    y=target,
                    scatter_kws={
                        'ec': '#606060',
                        's': markersize,
                        'alpha': 0.9
                    },
                    **regplot_kwargs
                )
            elif kind == 'point':
                sns.pointplot(
                    data=data,
                    x=feature,
                    y=target,
                    markersize=markersize,
                    linestyle='none',
                    capsize=0.031,
                    err_kws={'lw': 0.81*pscale},
                    **pointplot_kwargs
                )
            elif kind == 'hist':
                sns.histplot(
                    data=data,
                    x=feature,
                    alpha=0.95,
                    **histplot_kwargs
                )
            else:
                sns.scatterplot(
                    data=data,
                    x=feature,
                    y=target,
                    s=markersize,
                    **scatterplot_kwargs
                )
            plt.ylabel(None)
            if kind == 'point':
                plt.xticks(rotation=45)

    plt.subplots_adjust(hspace=0.4*hscale)
    plt.show()
    return fig

In [None]:
def plot_regression_diagnostics(model, data, target, figsize=(11, 12)):
    '''
    Draw five diagnostic plots for a fitted regression model:
    actual vs predicted, residual plot with lowess line, residual
    histogram, Q-Q plot of residuals, and leverage vs squared
    z-scored residuals.

    Parameters:
        model - fitted results object exposing `resid`, `fittedvalues`,
                `model.endog` and `get_influence()` (presumably a
                statsmodels regression result - confirm)
        data - DataFrame passed to sns.residplot
        target - name of the target column in `data`
        figsize - figure size

    Relies on the notebook-level names `scatter` (keyword arguments for
    scatter plots), `palette`, `zscore`, `sm` and `sns`.

    Returns: the created figure
    '''
    resid = model.resid
    y_pred = model.fittedvalues

    f = plt.figure(figsize=figsize)
    # two tall rows on top, two short rows below
    spec = f.add_gridspec(4, 2, height_ratios=[2, 2, 1, 1])

    # plot #1: actual vs predicted with the identity line
    ax = f.add_subplot(spec[0, :])
    ax.scatter(x=model.model.endog, y=y_pred, **scatter)
    ax.plot(
        model.model.endog, model.model.endog, linestyle=':', linewidth=1.5,
        color=palette[1], alpha=0.5)
    ax.set_xlabel('Actual')
    ax.set_ylabel('Predicted')
    ax.tick_params(labelsize=7)

    # plot #2: residual plot with lowess trend
    ax = f.add_subplot(spec[1, :])
    sns.residplot(
        x=y_pred, y=target, data=data,
        lowess=True,
        scatter_kws={
            'alpha': 1,
            'ec': '#FDFDFD',
            'linewidths': 0.15
        },
        line_kws={
            'ls': ':',
            'color': palette[1],
            'lw': 1.5,
            'alpha': 1
        },
        ax=ax
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Residuals')
    ax.tick_params(labelsize=7)

    # plot #3: histogram of residuals
    ax = f.add_subplot(spec[2, 0])
    ax.hist(resid, bins=50)
    ax.set_xlabel('Residuals')
    ax.set_ylabel('Count')
    ax.tick_params(labelsize=7)

    # plot #4: Q-Q plot of residuals
    ax = f.add_subplot(spec[2, 1])
    sm.qqplot(resid, line='s', ax=ax)
    points, reference_line = ax.get_lines()[0], ax.get_lines()[1]
    points.set_marker('.')
    reference_line.set_color(palette[1])
    reference_line.set_linestyle(':')
    reference_line.set_alpha(1)
    ax.set_xlabel('Theoretical quant.')
    ax.set_ylabel('Sample quant.')
    ax.tick_params(labelsize=7)

    # plot #5: leverage vs squared z-scored residuals
    ax = f.add_subplot(spec[3, :])
    infl = model.get_influence()
    leverage = infl.hat_matrix_diag
    resid_square = zscore(infl.resid)**2
    ax.scatter(x=resid_square, y=leverage, **scatter)
    ax.set_xlabel('Normalized Residuals Squared')
    ax.set_ylabel('Leverage')
    ax.tick_params(labelsize=7)

    plt.subplots_adjust(hspace=0.5)
    return f

In [None]:
def loadit(name, dir='files'):
    '''
    Read a pickled object from '<folder>/<name>.pkl'.

    `dir` is either the default 'files' folder or the name of a
    sub-folder inside 'files/'.

    Returns: the unpickled object
    '''
    # NOTE: unpickling can execute arbitrary code - load trusted files only
    folder = 'files' if dir == 'files' else f'files/{dir}'
    return pd.read_pickle(f'{folder}/{name}.pkl')

def saveit(file, name, dir='files'):
    '''
    Pickle `file` to '<folder>/<name>.pkl'.

    `dir` is either the default 'files' folder or the name of a
    sub-folder inside 'files/'. Missing folders are created.
    '''
    if dir != 'files':
        dir = f'files/{dir}'
    # makedirs also creates the parent 'files' folder when it is missing
    os.makedirs(dir, exist_ok=True)
    # the context manager closes the file even if pickling fails
    with open(f'{dir}/{name}.pkl', 'wb') as filehandler:
        pickle.dump(file, filehandler)

def savefig(name, dir='img', format='png', dpi=100, transparent=True,  figure=None, **kwargs):
    '''
    Save a figure to '<folder>/<name>.<format>' and print a confirmation.

    Parameters:
        name - file name without extension
        dir - 'img' (default) or the name of a sub-folder inside 'img/';
              missing folders are created
        format - file extension / image format; for 'svg' dpi is
                 ignored (set to None)
        dpi - resolution passed to `figure.savefig`
        transparent - passed to `figure.savefig`
        figure - figure to save; when omitted, the notebook-level
                 global `fig` is used
        **kwargs - extra keyword arguments for `figure.savefig`
    '''
    if figure is None:
        # fall back to the notebook-level global figure
        figure = fig
    if dir != 'img':
        dir = f'img/{dir}'
    if format == 'svg':
        dpi = None
    # makedirs also creates the parent 'img' folder when it is missing
    os.makedirs(dir, exist_ok=True)
    figure.savefig(
        f'{dir}/{name}.{format}',
        transparent=transparent,
        bbox_inches='tight',
        dpi=dpi,
        **kwargs
    )
    print(f"Image '{name}.{format}' successfully saved into '{dir}' directory")

In [None]:
def check_fit(names_list, estimators_list):
    '''
    Print which of the given estimators are fitted.

    An estimator counts as fitted when `check_is_fitted(estimator)`
    returns without raising.

    Parameters:
        names_list - display names, aligned with `estimators_list`
        estimators_list - estimators to check
    '''
    fitted_names = []
    for name, estimator in zip(names_list, estimators_list):
        try:
            check_is_fitted(estimator)
        except Exception:
            # any regular error means "not fitted"; KeyboardInterrupt and
            # SystemExit are deliberately not swallowed
            continue
        fitted_names.append(f"'{name}'")

    if not fitted_names:
        print('All estimators not fitted')
    elif len(fitted_names) == len(names_list):
        print('All estimators fitted')
    else:
        print(f'Estimators fitted: {", ".join(fitted_names)}')

In [None]:
def order_X_y(data, target):
    '''
    Return a copy of `data` with the `target` column moved to the last
    position; the other columns keep their relative order.
    '''
    ordered = data.columns.tolist()
    position = ordered.index(target)
    ordered = ordered[:position] + ordered[position+1:] + [target]
    return data[ordered].copy()

In [None]:
def simulation(
        datasets_list, features_list, target, estimators_list, estimators_names,
        datasets_indexes, features_indexes,
        sample_frac=1, replace=True, full_results=False, n_folds=1000):
    '''
    Score already fitted estimators on random resamples of their datasets.

    In every fold one set of row positions is drawn and shared by all
    estimators; each estimator predicts on its own dataset / feature
    subset and the RMSE against the target column is recorded.

    Parameters:
        datasets_list - list of DataFrames; the sample is sized by the
                        length of the first one
        features_list - list of feature-name lists
        target - name of the target column
        estimators_list - fitted estimators exposing `predict`
        estimators_names - names used as keys of the results
        datasets_indexes - per estimator, index into `datasets_list`
        features_indexes - per estimator, index into `features_list`
        sample_frac - sample size as a fraction of the data size
        replace - draw rows with (True) or without (False) replacement
        full_results - if True, return the per-fold true and predicted
                       values instead of the RMSE arrays
        n_folds - number of resampling rounds

    Returns:
        full_results=False: dict name -> array of RMSE values (one per fold)
        full_results=True: dict name -> {'fold_<k>': {'y_true', 'y_pred'}}
    '''
    data_size = len(datasets_list[0])
    # randint / choice need an integer size
    sample_size = int(np.round(data_size * sample_frac))

    # per-estimator inputs do not depend on the fold: prepare them once
    prepared = []
    zip_ = zip(
        estimators_list,
        estimators_names,
        datasets_indexes,
        features_indexes
    )
    for estimator, name, datasets_idx, features_idx in zip_:
        ordered = order_X_y(datasets_list[datasets_idx], target)
        # positions must refer to the reordered frame (target moved last)
        columns = ordered.columns.tolist()
        features = features_list[features_idx]
        feature_positions = [columns.index(feature) for feature in features]
        prepared.append(
            (estimator, name, np.array(ordered), features, feature_positions))

    scores = {name: [] for name in estimators_names}
    results_full_dict = {name: {} for name in estimators_names}

    for fold in range(n_folds):
        fold_name = f'fold_{fold}'
        if replace:
            sample_idxs = np.random.randint(
                low=0, high=data_size, size=sample_size)
        else:
            sample_idxs = np.random.choice(
                data_size, size=sample_size, replace=False)

        for estimator, name, values, features, feature_positions in prepared:
            sample = values[sample_idxs, :]
            X = pd.DataFrame(
                data=sample[:, feature_positions], columns=features)
            y = sample[:, -1]
            y_pred = estimator.predict(X)
            # sqrt of MSE works on every scikit-learn version
            rmse = np.sqrt(mean_squared_error(y, y_pred))
            scores[name].append(rmse)
            results_full_dict[name][fold_name] = {
                'y_true': y,
                'y_pred': y_pred
            }

    if full_results:
        return results_full_dict
    return {name: np.array(values) for name, values in scores.items()}

In [2]:
def plot_bar_horizontal(
        values, labels, labelsize=9, labelweight='medium', labelcolor='0.3',
        figsize=(8,3), color='#707070',
        width=0.5, s=6,  kind='lol', x_lim_right=None, grid=False):
    '''
    Horizontal lollipop / bar chart titled 'Feature importance'.

    Values and labels are drawn bottom-to-top in reversed input order,
    so the first item ends up at the top. Labels are lower-cased.
    Horizontal lines from 0 to each value are always drawn; for
    kind='lol' a marker is added at the end of each line. Colors of the
    marks come from the notebook-level `palette`.

    Returns: the created figure
    '''
    reversed_values = np.flip(np.array(values))
    tick_labels = [str.lower(label) for label in np.flip(np.array(labels))]
    positions = np.arange(0, len(tick_labels), 1)

    f, ax = plt.subplots(figsize=figsize)

    if kind == 'lol':
        # marker at the tip of every line
        ax.scatter(
            x=reversed_values, y=positions, linewidth=width, ec='face',
            s=s, color=palette[0], clip_on=False
        )
    elif kind != 'bar':
        print("Select type of plot: 'lol' or 'bar'")

    ax.hlines(
        xmin=0, xmax=reversed_values, y=positions,
        linewidth=width, color=palette[-3], alpha=0.25, clip_on=False)

    # x-axis on top, y-axis on the left, both pushed outwards
    ax.xaxis.tick_top()
    for side in ('top', 'left'):
        if side == 'left':
            ax.spines['bottom'].set_visible(False)
        ax.spines[side].set_visible(True)
        ax.spines[side].set_color(color)
        ax.spines[side].set_position(('outward', 15))

    ax.set_xlim(left=0, right=x_lim_right)
    ax.set_ylim(bottom=0, top=positions[-1])
    ax.set_yticks(
        ticks=positions, labels=tick_labels, weight=labelweight, color=color)
    plt.grid(grid)

    ax.tick_params(
        axis='x', direction='out', size=2, colors=color)
    ax.tick_params(
        axis='y', direction='out', size=2, left=True,
        labelsize=labelsize, labelcolor=labelcolor, colors=color, pad=10)

    plt.title('Feature importance')
    plt.show()

    return f

In [None]:
def plot_lr_enet_comparison(data):
    '''
    Three-panel figure comparing two columns of simulation results,
    labelled 'Linear Regression' and 'Elastic Net' in the legend.

    Layout (2x2 gridspec):
        top-left - confidence intervals (plot_interval_confidence)
        top-right - prediction intervals (plot_interval_prediction, kind='std')
        bottom row - histogram with KDE of all values, colored by column

    Axis ticks and limits are hard-coded; colors come from the
    notebook-level `palette`. The figure is shown and returned.

    Parameters:
        data - DataFrame; presumably one column of RMSE values per
               model, LR first and ENET second (confirm with the caller)

    Returns: the created figure
    '''
    
    # shared marker / error-bar styling for both interval plots
    s=7
    lw = 1
    capsize=1.2
    scatter_alpha=0.9
    error_alpha=0.9
    legend_alpha=0.9

    # long format ('variable', 'value') for the histogram
    data_histplot = data.melt()
    
    # figure
    f = plt.figure(figsize=(7, 4))
    # title
    f.suptitle(
        'Comparing LR and ENET: simulations on Train Data',
        x=0.357, y=1.03, fontsize=10)
    # spec
    spec = f.add_gridspec(2, 2, height_ratios=[1, 1])
    
    # histplot
    f.add_subplot(spec[1, :])
    ax = sns.histplot(
        data=data_histplot,
        x='value',
        hue='variable',
        bins=75,
        alpha=0.5,
        kde=True,
        palette=[palette[0], palette[1]])
    axis_rstyle(
        x_ticks=[0.115, 0.155, 0.005],
        y_ticks=[0, 40, 10]
    )
    # hide legend
    ax.get_legend().remove()
    # axis labels
    plt.xlabel(None)
    plt.ylabel(str.capitalize('count'))
    plt.tick_params(axis='y', which='major', pad=5)
    plt.tick_params(axis='both', labelsize=7)
    # title
    plt.title('RMSE distribution', fontsize=9)
    plt.grid(False)
    plt.ylim(0, 45)
    
    # CI plot
    f.add_subplot(spec[0, 0])
    plot_interval_confidence(
        data=data,
        scatter_kws={
            's': s,
            'alpha': scatter_alpha
        },
        error_kws={
            'lw': lw,
            'capsize': capsize,
            'alpha': error_alpha}
    )
    axis_rstyle()
    # xaxis customization
    plt.gca().spines['bottom'].set_visible(False)
    plt.tick_params(bottom=False, labelbottom=False)
    plt.tick_params(axis='y', labelsize=7)
    # yaxis customization
    # 4th argument True = include the endpoint (see `arange` docstring)
    yticks = arange(0.134, 0.136, 0.001, True)
    plt.yticks(yticks)
    plt.ylim(0.134, 0.136)
    plt.ylabel('RMSE')
    # title
    plt.title('Confidence intervals (95%)', fontsize=9)
    plt.grid(False)
    
    
    # PI plot
    f.add_subplot(spec[0, 1])
    plot_interval_prediction(
        data=data,
        kind='std',
        scatter_kws={
            's': s,
            'alpha': scatter_alpha
        },
        error_kws={
            'lw': lw,
            'capsize': capsize,
            'alpha': error_alpha}
    )
    axis_rstyle(
        y_ticks=[0.125, 0.15, 0.010]
    )
    # xaxis customization
    plt.gca().spines['bottom'].set_visible(False)
    plt.tick_params(bottom=False, labelbottom=False)
    plt.tick_params(axis='y', labelsize=7)
    # yaxis
    plt.ylim(0.125, 0.145)
    plt.ylabel(None)
    # title
    plt.title('Prediction intervals', fontsize=9)
    plt.grid(False)
    # create legend
    # empty Line2D objects serve only as legend handles
    patch_lr = Line2D(
        [], [], label='Linear Regression', marker='o',
        markersize=3, color=palette[0], linestyle='None', alpha=legend_alpha)
    patch_enet = Line2D(
        [], [], label='Elastic Net', marker='o',
        markersize=3, color=palette[1], linestyle='None', alpha=legend_alpha)
    plt.legend(
        handles=[patch_lr, patch_enet], fontsize=8,
        markerscale=1, frameon=False, alignment='left', handletextpad=-0.15,
        loc='upper left', bbox_to_anchor=(1,1)
    )
    # subplot adjust
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    plt.show()
    
    return f

In [None]:
def plot_interval_prediction(data, ax=None, palette=None, kind='pi', scatter_kws=None, error_kws=None):
    '''
    Plot the mean of every column with a symmetric error bar of
    z * standard deviation.

    Parameters:
        data - DataFrame; one point with error bar is drawn per column
        ax - axes to draw on (defaults to the current axes)
        palette - list of colors, one per column; reused cyclically if
                  shorter than the number of columns (defaults to the
                  matplotlib color cycle)
        kind - 'std' (z = 1) or 'pi' (z = 1.96)
        scatter_kws - extra keyword arguments for `ax.scatter`
        error_kws - extra keyword arguments for `ax.errorbar`

    Returns: the container returned by the last `ax.errorbar` call
             (None if there are no columns to plot)

    Raises:
        ValueError - if `kind` is not 'std' or 'pi'
    '''
    multipliers = {'std': 1, 'pi': 1.96}
    if kind not in multipliers:
        raise ValueError("'kind' must be 'std' or 'pi'")
    z = multipliers[kind]

    scatter_kws = {} if scatter_kws is None else scatter_kws
    error_kws = {} if error_kws is None else error_kws

    if ax is None:
        ax = plt.gca()

    if palette is None:
        palette = plt.rcParams['axes.prop_cycle'].by_key()['color']

    mean = data.mean()
    std = data.std()

    pi_plot = None
    for position, column in enumerate(mean.index):
        # wrap around so extra columns do not overflow the palette
        color = palette[position % len(palette)]
        ax.scatter(
            x=column,
            y=mean[column],
            color=color,
            **scatter_kws
        )
        pi_plot = ax.errorbar(
            x=column,
            y=mean[column],
            yerr=z*std[column],
            linestyle='none',
            color=color,
            **error_kws
        )
    # half a unit of padding around the first and last column position
    ax.set_xlim(-0.5, len(data.columns) - 0.5)

    return pi_plot

In [None]:
def plot_interval_confidence(data, ax=None, scatter_kws={}, error_kws={}):
    '''
    For every column of `data`, plot the bootstrap statistic returned by
    `ci_bootstrap` with an error bar of its approximate margin.

    Parameters:
        data - DataFrame; one point with error bar is drawn per column
        ax - axes to draw on (defaults to the current axes)
        scatter_kws - extra keyword arguments for `ax.scatter`
        error_kws - extra keyword arguments for `ax.errorbar`

    Colors are taken by column position from the notebook-level `palette`.

    Returns: the container returned by the last `ax.errorbar` call
    '''
    axes = plt.gca() if ax is None else ax
    tick_positions = np.arange(0, len(data.columns), 1)

    for position, column in enumerate(data.columns):
        estimate = ci_bootstrap(data[column])
        center = estimate['statistic']
        half_width = estimate['proxi_margin']
        color = palette[position]

        axes.scatter(
            x=column, y=center, color=color, **scatter_kws)
        ci_plot = axes.errorbar(
            x=column, y=center, yerr=half_width,
            linestyle='none', color=color, **error_kws)

    # half a unit of padding around the first and last column position
    axes.set_xlim(-0.5, tick_positions[-1]+0.5)

    return ci_plot

In [None]:
def _comparing_style_spines(axis, spines_width, spines_color):
    '''Hide the bottom spine and restyle the left spine of `axis`.'''
    bottom = axis.spines['bottom']
    bottom.set_visible(False)
    bottom.set_alpha(0.5)
    bottom.set_linewidth(spines_width)
    left = axis.spines['left']
    left.set_position(('outward', 5))
    left.set_color(spines_color)
    left.set_linewidth(spines_width)


def _comparing_finish_axis(
        axis, xticks, y_ticks, ylabel, grid, ticks_color, ticklabels_color):
    '''Apply y ticks and limits, x limits, y label and horizontal grid.'''
    if y_ticks is not None:
        # y_ticks is [start, stop, step]; stop is included
        y_ticks_ = np.arange(
            y_ticks[0],
            y_ticks[1] + y_ticks[2],
            y_ticks[2])
        axis.set_yticks(ticks=y_ticks_)
        axis.set_ylim(y_ticks_[0], y_ticks_[-1])
    axis.tick_params(
        axis='y', left=True, direction='out',
        color=ticks_color, labelcolor=ticklabels_color, pad=5)
    axis.set_xlim(xticks[0]-0.5, xticks[-1]+0.5)
    if ylabel is not None:
        axis.set_ylabel(ylabel)
    if grid:
        # grid (only horizontal and not at the ends of the axis)
        axis.grid(visible=False, axis='y')
        for tick in axis.get_yticks()[1:-1]:
            axis.axhline(tick, lw=0.5, ls=':', color='0.85')
    else:
        axis.grid(False)


def plot_estimators_comparing(
        data,
        labels,
        ylabels=None,
        kind='bar',
        figsize=(8, 4),
        width=0.2,
        markersize=4,
        linewidth=0.5,
        palette=None,
        title_plot=None,
        ax0_y_ticks=None,
        ax1_y_ticks=None,
        x_ticks_fontsize=8,
        x_ticks_weight='bold',
        x_labels_color='#7F7F7F',
        capitalize=True,
        spines_width=0.75,
        spines_color=custom_axis_color,
        ticks_step=None,
        ticks_color=custom_axis_color,
        ticklabels_color=custom_axis_color,
        grid=True):
    '''
    Two stacked plots comparing the columns of `data` row by row.

    Top axes: every column as a line with markers plus a horizontal line
    at its mean. Bottom axes: the difference (second column - first
    column) per row, colored by sign; its legend shows how many rows
    have a negative and a non-negative difference.

    Parameters:
        data - DataFrame; the first two columns are used for the difference
        labels - legend labels for the first and the second column
        ylabels - optional (top, bottom) y-axis labels
        kind - style of the bottom plot: 'bar', 'lol' or 'line'
        figsize - figure size
        width - bar width (kind='bar')
        markersize - marker size of the scatter points
        linewidth - line width of the top plot
        palette - list of colors (at least two); defaults to the
                  matplotlib color cycle
        title_plot - optional title of the top axes
        ax0_y_ticks, ax1_y_ticks - optional [start, stop, step] for the
                                   y ticks of the top / bottom axes
        x_ticks_fontsize, x_ticks_weight, x_labels_color - styling of
                                   the top x tick labels
        capitalize - capitalize the column names
        spines_width, spines_color - styling of the left spines
        ticks_step - optional spacing of the major x ticks
        ticks_color, ticklabels_color - styling of the y ticks
        grid - draw dotted horizontal grid lines at interior y ticks

    Returns: the created figure
    '''
    df = data.copy()
    xticks = np.arange(0, len(df), 1)

    if capitalize:
        df.columns = [i.capitalize() for i in df.columns]

    # resolve the palette once so that every later use has real colors
    if palette is None:
        palette = plt.rcParams['axes.prop_cycle'].by_key()['color']
    color0, color1 = palette[0], palette[1]

    f, ax = plt.subplots(2,1, sharex=False, figsize=figsize)

    # AX0: values of every column
    for col, color in zip(df.columns, palette):
        ax[0].scatter(
            x=xticks, y=df[col], label=col, s=markersize, color=color)
        ax[0].plot(
            xticks, df[col], lw=linewidth, color=color)
        ax[0].hlines(
            df[col].mean(), xticks[0], xticks[-1], lw=0.75, color=color)
    _comparing_style_spines(ax[0], spines_width, spines_color)
    ax[0].set_xticks(
        ticks=xticks, labels=xticks, weight=x_ticks_weight,
        fontsize=x_ticks_fontsize)
    ax[0].tick_params(
        bottom=False, pad=9, labelcolor=x_labels_color)
    _comparing_finish_axis(
        ax[0], xticks, ax0_y_ticks,
        None if ylabels is None else ylabels[0],
        grid, ticks_color, ticklabels_color)
    if title_plot is not None:
        ax[0].set_title(title_plot)

    # AX1: difference between the second and the first column
    df_columns = list(df.columns)
    delta = df[df_columns[1]] - df[df_columns[0]]
    # negative difference -> color0, otherwise -> color1
    colors = [color0 if value < 0 else color1 for value in delta]
    # counts for the legend; a sign that never occurs is reported as 0
    count_negative = int((delta < 0).sum())
    count_other = len(delta) - count_negative

    if kind == 'bar':
        ax[1].bar(
            x=xticks, height=delta, width=width,
            ec='none', color=colors, zorder=10)
        # line 0
        ax[1].hlines(0, xticks[0], xticks[-1], lw=0.5, color='0.90', zorder=1)
    if kind == 'lol':
        ax[1].scatter(
            x=xticks, y=delta, s=markersize, ec='face',
            color=colors, zorder=10)
        for position, value, color in zip(xticks, delta, colors):
            ax[1].plot(
                [position, position], [0, value], lw=0.5, alpha=0.35,
                color=color, clip_on=False, zorder=1)
        # line 0
        ax[1].hlines(0, xticks[0], xticks[-1], lw=1, color='0.90', zorder=10)
    if kind == 'line':
        for position, value, color in zip(xticks, delta, colors):
            ax[1].plot(
                [position, position], [0, value], lw=1.5,
                color=color, clip_on=False, zorder=10)
        # line 0
        ax[1].hlines(0, xticks[0], xticks[-1], lw=1, color='0.90', zorder=1)
    _comparing_style_spines(ax[1], spines_width, spines_color)
    ax[1].set_xticks(ticks=xticks)
    ax[1].tick_params(
        axis='x', bottom=False, labelbottom=False, labelcolor=ticklabels_color)
    _comparing_finish_axis(
        ax[1], xticks, ax1_y_ticks,
        None if ylabels is None else ylabels[1],
        grid, ticks_color, ticklabels_color)

    # legend: empty Line2D objects serve only as handles
    handles00 = [
        Line2D(
            [], [], label=label, marker='o',
            markersize=3, color=color, linestyle='None')
        for label, color in zip(labels[:2], (color0, color1))
    ]
    handles10 = [
        Line2D(
            [], [], label=count, marker='s',
            markersize=3, color=color, linestyle='None')
        for count, color in (
            (count_negative, color0), (count_other, color1))
    ]
    for axis, handles in ((ax[0], handles00), (ax[1], handles10)):
        axis.legend(
            handles=handles, fontsize=8, alignment='left', markerscale=1,
            handletextpad=0.75, handlelength=0.75, frameon=False,
            bbox_to_anchor=(1+figsize[0]*0.0015, 1), loc='upper left',
            labelcolor='0.3')

    # set ticks step
    if ticks_step is not None:
        # a locator is bound to a single axis, so each axes gets its own
        for axis in ax:
            axis.xaxis.set_major_locator(
                matplotlib.ticker.MultipleLocator(base=ticks_step))

    # subplots adjust
    plt.subplots_adjust(hspace=0.25)
    plt.show()

    return f

In [None]:
def prediction_interval(model, X_train, y_train, x0, alpha: float = 0.05):
    ''' Compute a prediction interval around the model's prediction of x0
    with Bootstrap 632 rule.

    INPUT
      model
        A predictive model with `fit` and `predict` methods. It is
        refitted several times, the last time on the full training data.
      X_train: numpy array of shape (n_samples, n_features)
        A numpy array containing the training input data
      y_train: numpy array of shape (n_samples,)
        A numpy array containing the training target data
      x0
        A new data point, of shape (n_features,)
      alpha: float = 0.05
        The prediction uncertainty

    OUTPUT
      A triple (`lower`, `pred`, `upper`) with `pred` being the prediction
      of the model and `lower` and `upper` constituting the lower- and upper
      bounds for the prediction interval around `pred`, respectively. '''

    # Number of training samples
    n = X_train.shape[0]

    # The authors choose the number of bootstrap samples as the square root
    # of the number of samples
    nbootstraps = np.sqrt(n).astype(int)

    # Compute the m_i's and the validation residuals
    bootstrap_preds, val_residuals = np.empty(nbootstraps), []
    all_idxs = np.arange(n)
    for b in range(nbootstraps):
        train_idxs = np.random.choice(range(n), size=n, replace=True)
        # out-of-bag rows, sorted ascending (set difference instead of a
        # quadratic membership scan)
        val_idxs = np.setdiff1d(all_idxs, train_idxs)
        model.fit(X_train[train_idxs, :], y_train[train_idxs])
        preds = model.predict(X_train[val_idxs])
        val_residuals.append(y_train[val_idxs] - preds)
        bootstrap_preds[b] = model.predict(x0)
    bootstrap_preds -= np.mean(bootstrap_preds)
    val_residuals = np.concatenate(val_residuals)

    # Compute the prediction and the training residuals
    model.fit(X_train, y_train)
    preds = model.predict(X_train)
    train_residuals = y_train - preds

    # Take percentiles of the training- and validation residuals to enable
    # comparisons between them
    val_residuals = np.percentile(val_residuals, q=np.arange(100))
    train_residuals = np.percentile(train_residuals, q=np.arange(100))

    # Compute the .632+ bootstrap estimate for the sample noise and bias
    no_information_error = np.mean(np.abs(
        np.random.permutation(y_train) - np.random.permutation(preds)))
    generalisation = np.abs(val_residuals.mean() - train_residuals.mean())
    no_information_val = np.abs(no_information_error - train_residuals)
    relative_overfitting_rate = np.mean(generalisation / no_information_val)
    weight = 0.632 / (1 - 0.368 * relative_overfitting_rate)
    residuals = (1 - weight) * train_residuals + weight * val_residuals

    # Construct the C set (every centered bootstrap prediction plus every
    # residual) and get the percentiles
    C = (bootstrap_preds[:, None] + residuals[None, :]).ravel()
    qs = [100 * alpha / 2, 100 * (1 - alpha / 2)]
    percentiles = np.percentile(C, q=qs)

    return percentiles[0], model.predict(x0), percentiles[1]

In [None]:
def cross_validation(
        estimator, data, features, target, pipeline_transform, n_folds=10):
    '''
    K-fold cross-validation (no shuffling) with per-fold preprocessing.

    For every fold the transformation pipeline is fitted on the training
    part only and applied to both parts; then the estimator is fitted
    and scored with RMSE on the test part.

    Parameters:
        estimator - estimator with `fit` and `predict`
        data - DataFrame with the features and the target
        features - list of feature column names
        target - name of the target column
        pipeline_transform - transformer with `fit_transform` / `transform`
        n_folds - number of folds

    Timing uses the notebook-level `stopwatch` helper.

    Returns: dict(test_score, fit_time) - arrays with one value per fold
    '''
    k_fold = KFold(n_splits=n_folds, random_state=None, shuffle=False)

    fit_time = np.empty(n_folds)
    test_score = np.empty(n_folds)

    for i, (train_index, test_index) in enumerate(k_fold.split(data)):
        # index the same frame that was split
        data_train = data.iloc[train_index, :]
        data_test = data.iloc[test_index, :]

        X_train = data_train[features].copy()
        y_train = data_train[target].copy()
        X_test = data_test[features].copy()
        y_test = data_test[target].copy()

        # fit the transformation on the training part only
        X_train[features] = pipeline_transform.fit_transform(X_train, y_train)
        X_test[features] = pipeline_transform.transform(X_test)

        st = stopwatch.start()
        estimator.fit(X_train, y_train)
        fit_time[i] = stopwatch.stop_sec(st)

        y_pred = estimator.predict(X_test)
        # sqrt of MSE works on every scikit-learn version
        test_score[i] = np.sqrt(mean_squared_error(y_test, y_pred))

    return {
        'test_score': test_score,
        'fit_time': fit_time
    }

In [None]:
def save_session(name, directory='sessions'):
    '''
    Dump the current interpreter session with dill to
    'sessions/<name>' or 'sessions/<directory>/<name>'.
    Missing folders are created.
    '''
    if directory != 'sessions':
        directory = f'sessions/{directory}/'
    else:
        directory = 'sessions/'
    # makedirs also creates the parent 'sessions' folder when it is missing
    os.makedirs(directory, exist_ok=True)
    # save session
    dill.dump_session(directory+name)


def load_session(name, directory='sessions'):
    '''
    Restore an interpreter session saved by `save_session` from
    'sessions/<name>' or 'sessions/<directory>/<name>'.
    '''
    # NOTE: loading a session unpickles data - load trusted files only
    if directory != 'sessions':
        # interpolate the `directory` argument, not the builtin `dir`
        directory = f'sessions/{directory}/'
    else:
        directory = 'sessions/'
    # load session
    dill.load_session(directory+name)

In [None]:
def plot_actual_predicted(data_pred_actual, variable, points_num=20):
    '''
    Plot actual vs predicted values for the first `points_num` rows.

    Parameters:
        data_pred_actual - DataFrame with 'actual' and 'predicted' columns
        variable - 'index' to plot against the (sorted) index, or a
                   column name to plot against that column; in the
                   latter case the two series are shifted slightly left
                   and right and each pair is joined by a connector
        points_num - number of rows taken from the top of the frame

    Styling uses the notebook-level `palette`, `alpha_color`,
    `title_inline` and `legend_inline`.
    '''
    df = data_pred_actual[:points_num].copy()

    if variable == 'index':
        df = df.sort_index()
        for column, label, color in (
                ('actual', 'Actual', palette[0]),
                ('predicted', 'Predicted', palette[1])):
            plt.scatter(
                df.index, df[column],
                s=10, ec='none', color=color, zorder=10, label=label)
            plt.plot(
                df.index, df[column],
                color=alpha_color(color, 0.15), zorder=1)

    else:
        df = df.sort_values(variable)
        xticks = df[variable].tolist()
        # horizontal shift: 1% of the x range
        delta = (xticks[-1] - xticks[0]) / 100
        x_actual = df[variable] - delta
        x_predicted = df[variable] + delta

        plt.scatter(
            x_actual, df['actual'],
            s=10, ec='none', color=palette[0], zorder=10, label='Actual')
        plt.scatter(
            x_predicted, df['predicted'],
            s=10, ec='none', color=palette[1], zorder=10, label='Predicted')

        # a single call with 2 x N data draws one connector per row;
        # repeating it per row would only stack identical lines
        plt.plot(
            [x_actual, x_predicted], [df['actual'], df['predicted']],
            color=alpha_color(palette[0], 0.25), zorder=1
        )

    plt.title(variable, **title_inline)
    plt.legend(**legend_inline, ncols=2)
    plt.grid(False)

    plt.show()

In [None]:
def plot_actual_predicted_single_var(data, variable):
    '''
    Plot actual vs predicted values against a single variable.

    The two series are shifted slightly left and right of the variable
    value and each pair is joined by a connector. X ticks are placed at
    the distinct values of the variable.

    Parameters:
        data - DataFrame with 'actual', 'predicted' and `variable` columns
        variable - name of the column used for the x-axis

    Styling uses the notebook-level `palette`, `alpha_color` and
    `legend_inline`.
    '''
    df = data.copy()
    df = df.sort_values(variable)
    xticks = df[variable].tolist()
    # horizontal shift: 1% of the x range
    delta = (xticks[-1] - xticks[0]) / 100
    x_actual = df[variable] - delta
    x_predicted = df[variable] + delta

    plt.scatter(
        x_actual, df['actual'],
        s=10, ec='none', color=palette[0], zorder=10, label='Actual')

    plt.scatter(
        x_predicted, df['predicted'],
        s=10, ec='none', color=palette[1], zorder=10, label='Predicted')

    # a single call with 2 x N data draws one connector per row;
    # repeating it per row would only stack identical lines
    plt.plot(
        [x_actual, x_predicted], [df['actual'], df['predicted']],
        lw=0.5, color=alpha_color(palette[0], 0.15), zorder=1
    )

    plt.legend(**legend_inline, ncols=2)
    plt.grid(False)
    # ticks come from the frame passed in, computed once so that
    # positions and labels stay aligned
    unique_values = list(set(data[variable]))
    plt.xticks(ticks=unique_values, labels=unique_values)
    plt.show()

In [None]:
def to_round(x, scale=1, error='skip'):
    '''
    Round `x` to `scale` digits when it supports rounding.

    If rounding raises TypeError, `error` decides what happens:
        'skip' - return x unchanged
        'type' - print a message and return x unchanged
        other - print a usage hint and return None
    '''
    try:
        rounded = round(x, ndigits=scale)
    except TypeError:
        if error == 'skip':
            return x
        if error == 'type':
            print(f'TypeError: {x}')
            return x
        print("'error' must be 'type' or 'skip'")
        return None
    return rounded

In [None]:
def skewness(df):
    '''
    Skewness of the numeric columns of `df`, sorted by absolute value
    in descending order.

    Returns: DataFrame indexed by column name with
        'Skewness' - skewness of the column
        'Highly skewed' - True when |skewness| > 0.5
    '''
    table = pd.DataFrame(df.skew(numeric_only=True), columns=['Skewness'])
    magnitude = abs(table['Skewness'])

    table['Highly skewed'] = magnitude > 0.5
    # temporary helper column used only for ordering
    table['abs'] = magnitude

    ordered = table.sort_values(by=['abs', 'Highly skewed'], ascending=False)
    return ordered.drop('abs', axis=1)

In [None]:
def kurtosis(df):

    '''
    Summarise the kurtosis of every numeric column.

    Returns a DataFrame indexed by column name with:
        Kurtosis - value of DataFrame.kurtosis(numeric_only=True)
        Type - 'Too Peaked' (> 1), 'Too Flat' (< -1),
               'Normal' (from -1 to 1 inclusive),
               NaN when the kurtosis itself is NaN
    Rows are ordered by absolute kurtosis, largest first.
    '''

    result = pd.DataFrame({'Kurtosis': df.kurtosis(numeric_only=True)})
    values = result['Kurtosis']

    # Build the labels in an object-dtype Series: writing strings into a
    # float NaN column relies on implicit upcasting, which pandas deprecates.
    kinds = pd.Series(np.nan, index=result.index, dtype=object)
    kinds.loc[values > 1] = 'Too Peaked'
    kinds.loc[values < -1] = 'Too Flat'
    kinds.loc[values.between(-1, 1)] = 'Normal'
    result['Type'] = kinds

    # 'abs' is a temporary sort key and is removed before returning
    result['abs'] = abs(values)
    result = result.sort_values(by=['abs', 'Type'], ascending=False)
    result = result.drop('abs', axis=1)

    return result

In [None]:
def list_add_after_every(lst, element, add_every):

    '''
    Return a new list with `element` inserted after every complete group
    of `add_every` items of `lst`.

    A trailing group shorter than `add_every` is copied without a
    following `element`; a complete final group does get one.
    '''

    total = len(lst)
    result = []
    for start in range(0, total, add_every):
        stop = start + add_every
        # separator only follows groups that are completely filled
        separator = [element] if stop <= total else []
        result.extend(lst[start:stop] + separator)
    return result

In [None]:
def axis_add_xaxis(
            ticks,
            labels,
            width=0.5,
            offset=0,
            offset_first_axis=5,
            color_labels=None,
            color_ticks=None,
            ax=None):

    '''
    Stack extra x axes (labels and tick marks) underneath an existing axes.

    For every (tick count, label list) pair two twin axes are created with
    ax.twiny(): one that shows only the labels and one that shows only the
    tick marks. After each level `offset` grows by 15, pushing the next
    level further outward.

    parameters:
        ticks - sequence of tick counts, one entry per added level
        labels - sequence of label lists, one entry per added level
        width - line width of the tick marks
        offset - outward offset of the first level's labels
        offset_first_axis - outward offset of the first level's tick marks
        color_labels - label colour (left untouched when falsy)
        color_ticks - tick mark colour (left untouched when falsy)
        ax - axes to extend; defaults to the current axes
    '''

    if ax is None:
        ax = plt.gca()

    base_ticks = ax.get_xticks()
    base_count = len(base_ticks)
    first_tick, last_tick = base_ticks[0], base_ticks[-1]
    base_xlim = ax.get_xlim()
    all_spines = ['top', 'left', 'bottom', 'right']

    for level, (count, level_labels) in enumerate(zip(ticks, labels)):
        is_first = level == 0

        if count < base_count:
            # Fewer groups than base ticks: use 2*count + 1 positions and put
            # each label on an odd position, padded with blank labels.
            positions = 2 * count + 1
            level_labels = [''] + list_add_after_every(level_labels, ' ', 1)
        else:
            positions = count

        locations = np.linspace(first_tick, last_tick, positions)

        # axes for labels
        ax_labels = ax.twiny()
        ax_labels.grid(False)
        ax_labels.set_xticks(ticks=locations, labels=level_labels)

        if is_first:
            ax_labels.set_xlim(base_xlim)

        ax_labels.spines[all_spines].set_visible(False)
        ax_labels.spines['bottom'].set_position(('outward', offset))

        ax_labels.tick_params(
            axis='x', direction='out',
            top=False, left=False, right=False, bottom=False,
            labeltop=False, labelbottom=True, labelleft=False, labelright=False)
        if color_labels:
            ax_labels.tick_params(axis='x', labelcolor=color_labels)

        # axes for ticks
        ax_ticks = ax.twiny()
        ax_ticks.grid(False)
        ax_ticks.set_xticks(locations)

        if color_ticks:
            ax_ticks.tick_params(axis='x', color=color_ticks)

        if is_first:
            # honour the caller's value (was overwritten by the tuple (5,))
            pad = offset_first_axis
            ax_ticks.set_xlim(base_xlim)
            size = 3
        else:
            pad = offset + 5
            size = 10

        ax_ticks.tick_params(
            axis='x', direction='out', width=width, size=size,
            bottom=True, labelbottom=False, top=False, labeltop=False)

        ax_ticks.spines[all_spines].set_visible(False)
        ax_ticks.spines['bottom'].set_position(('outward', pad))

        if not is_first:
            major_ticks = ax_ticks.xaxis.get_major_ticks()
            # hide every second tick (not counting first and last) ...
            for tick in major_ticks[1:-1][::2]:
                tick.set_visible(False)
            # ... and the two outermost ticks
            major_ticks[0].set_visible(False)
            major_ticks[-1].set_visible(False)

        offset += 15

In [None]:
def put_column_after(data, column_to_move, column_insert_after):

    '''
    Return a copy of 'data' in which 'column_to_move' sits directly to the
    right of 'column_insert_after'. The input frame is left untouched.

    Before:
     col1 | column_insert_after | col2 | col3 | col4 | column_to_move | col5
    -------------------------------------------------------------------------

    After:
     col1 | column_insert_after | column_to_move | col2 | col3 | col4 | col5
    -------------------------------------------------------------------------
    '''

    result = data.copy()

    # take the column out first so the anchor position is computed
    # on the frame without it
    moved_values = result.pop(column_to_move)
    anchor_position = result.columns.get_loc(column_insert_after)
    result.insert(loc=anchor_position + 1, column=column_to_move, value=moved_values)

    return result