# Functions and useful definitions

In [1]:
from ml_belt.prep import Prep
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from dateutil.parser import parse as date_parse
import os.path

from IPython.display import set_matplotlib_formats
from IPython.core.interactiveshell import InteractiveShell
import warnings

In [2]:
# Notebook-wide behaviour: echo every expression of a cell, silence warnings.
InteractiveShell.ast_node_interactivity = "all"
warnings.filterwarnings(action='ignore')

# pandas display/mode options: no truncation of columns or rows, and
# infinite values are treated as missing.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('mode.use_inf_as_na', True)

In [3]:
# Plot defaults: colours, colormap and figure size reused by the plotting helpers

default_color = (0.003602, 0.195911, 0.441564)
default_light_color = 'white'
default_dark_color = (0.185453, 0.258914, 0.426788)
colormap = 'cividis'  # plt.cm.cool
figsize = (15, 5)

# IPython `%store` magic: saves each variable in IPython's persistent storage
# (presumably so other notebooks can load them with `%store -r` — confirm).
%store default_color
%store default_light_color
%store default_dark_color
%store colormap
%store figsize

# Ask IPython to render inline figures in both PDF and PNG formats.
set_matplotlib_formats('pdf', 'png')
# Show floats in pandas output with two decimal places.
pd.options.display.float_format = '{:.2f}'.format
# Matplotlib rc overrides handed to seaborn below (sizes, fonts, line widths).
rc={'savefig.dpi': 75, 'figure.autolayout': False, 'figure.figsize': figsize, 'axes.labelsize': 12,\
   'axes.titlesize': 18, 'font.size': 14, 'lines.linewidth': 2.0, 'lines.markersize': 8, 'legend.fontsize': 10,\
   'xtick.labelsize': 12, 'ytick.labelsize': 12}

# Apply the rc overrides and make `colormap` the default seaborn palette.
sns.set(rc=rc)
sns.set_palette(sns.color_palette(colormap))

In [4]:
def get_meta(train):
    """Build a metadata table describing every column of `train`.

    The role and level of a column are derived from its name (the text before
    the first underscore is treated as a prefix) and, when the prefix is not
    recognised, from its dtype.

    Args:
        train: pandas.DataFrame whose columns are described. Column names
            must be strings.

    Returns:
        pandas.DataFrame indexed by `varname` with the columns `role`,
        `level`, `keep`, `dtype` and `prefix`. `level` is `'unknown'` for a
        column whose prefix is not recognised and whose dtype is neither a
        float nor an integer dtype.
    """
    # Measurement level implied by each known column-name prefix.
    prefix_levels = {
        'bin': 'binary',
        'nom': 'nominal',
        'ord': 'ordinal',
        'dis': 'discrete',
        'per': 'percentual',
        'con': 'interval/continuous',
    }

    data = []
    for col in train.columns:
        dtype = train[col].dtype
        prefix = col.split('_')[0]

        # 'target' and 'id' are special column names; everything else is input.
        role = col if col in ('target', 'id') else 'input'

        if col == 'target':
            level = 'binary'
        elif prefix in prefix_levels:
            level = prefix_levels[prefix]
        elif pd.api.types.is_float_dtype(dtype):
            level = 'interval'
        elif pd.api.types.is_integer_dtype(dtype):
            level = 'discrete'
        else:
            # Explicit fallback: without it `level` was either undefined
            # (first column) or silently inherited from the previous column.
            level = 'unknown'

        data.append({
            'varname': col,
            'role': role,
            'level': level,
            # Every variable is kept except the identifier column.
            'keep': col != 'id',
            'dtype': dtype,
            'prefix': prefix,
        })

    meta = pd.DataFrame(data, columns=['varname', 'role', 'level', 'keep', 'dtype', 'prefix'])
    meta.set_index('varname', inplace=True)
    return meta

In [5]:
def replace_by_index(df, indexes, cols, val):
    """Write `val` into the columns `cols` at the row positions `indexes`.

    Rows are addressed by integer position (`DataFrame.iat`), columns by
    name. `df` is modified in place and also returned.
    """
    targets = ((row_pos, name) for row_pos in indexes for name in cols)
    for row_pos, name in targets:
        col_pos = df.columns.get_loc(name)
        df.iat[row_pos, col_pos] = val

    return df

In [6]:
def _to_float_or_nan(value):
    """Return `float(value)`, or `np.nan` when the value cannot be converted."""
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        return np.nan


def check_float(df, cols):
    """Convert the columns `cols` to float, turning bad values into `np.nan`.

    Every value that `float()` cannot convert becomes `np.nan`. The columns
    are rebuilt positionally, so rows that share an index label are converted
    independently of each other.

    Args:
        df: pandas.DataFrame, modified in place.
        cols: iterable of column names to convert.

    Returns:
        The same `df`; each converted column has dtype float64.

    Raises:
        KeyError: if a name in `cols` is not a column of `df`.
    """
    for col in cols:
        converted = [_to_float_or_nan(value) for value in df[col]]
        df[col] = np.array(converted, dtype=float)
    return df

In [7]:
def replace_nan(df, cols_dict):
    """Replace the values described by `cols_dict` with `np.nan`.

    Thin wrapper around `pandas.DataFrame.replace` so the operation can be
    chained in a `Prep` pipeline. Returns the result of `replace`.
    """
    cleaned = df.replace(to_replace=cols_dict, value=np.nan)
    return cleaned

In [8]:
def bin_to_num(df, cols, one=('sim',), zero=None):
    """Encode the columns `cols` as 1 / 0 / `np.nan`.

    String values found in `one` become 1.
    If `zero` is None, every other value becomes 0.
    If `zero` is passed, string values found in `zero` become 0 and every
    value in neither `one` nor `zero` becomes `np.nan`.

    Args:
        df: pandas.DataFrame, modified in place.
        cols: iterable of column names to encode.
        one: collection of strings mapped to 1.
        zero: collection of strings mapped to 0, or None.

    Returns:
        The same `df`. Columns are rebuilt positionally, so rows that share
        an index label are encoded independently; the resulting dtype is
        inferred by pandas from the encoded values.
    """
    def _encode(val):
        is_text = isinstance(val, str)
        if is_text and val in one:
            return 1
        if zero is None or (is_text and val in zero):
            return 0
        return np.nan

    for col in cols:
        df[col] = [_encode(val) for val in df[col]]
    return df

In [None]:
# TODO: add a row- or column-level APPLY_CUSTOM helper to ML Belt
def transform_month(df, col):
    """Rescale column `col` in place as `(value - 2017) / 10000`.

    The arithmetic is applied to the whole column at once instead of row by
    row.

    NOTE(review): the constants 2017 and 10000 are not explained in this
    file — confirm the expected encoding of the column with the caller.

    Args:
        df: pandas.DataFrame, modified in place.
        col: name of the numeric column to rescale.

    Returns:
        The same `df`.
    """
    df[col] = (df[col] - 2017) / 10000
    return df

In [None]:
# TODO: move this into ML Belt
def filter_valid(df, col, valid_value):
    """Return only the rows of `df` whose `col` equals `valid_value`."""
    keep_row = df[col] == valid_value
    return df[keep_row]

In [None]:
def calc_per_acum(df):
    """Fill missing `per_acum_acumulado` values in place.

    Each row whose `per_acum_acumulado` is missing receives
    `per_peso_kpi * per_pontos_acumulado`; other rows are left untouched.
    Rows are selected by position, so the result does not depend on index
    labels being unique.

    Args:
        df: pandas.DataFrame holding the three columns named above.

    Returns:
        The same `df`.
    """
    # A plain boolean array avoids any label alignment on the index.
    missing = df['per_acum_acumulado'].isna().to_numpy()
    product = df.loc[missing, 'per_peso_kpi'] * df.loc[missing, 'per_pontos_acumulado']
    df.loc[missing, 'per_acum_acumulado'] = product.to_numpy()
    return df

In [None]:
# TODO: move this into ML Belt
def astype(df, cols, new_type):
    """Cast each column named in `cols` to `new_type`, in place.

    Returns the same `df` so the call can be chained.
    """
    for name in cols:
        converted = df[name].astype(new_type)
        df[name] = converted
    return df

In [None]:
def plot_numerical(df, col, title):
    """Print the null statistics of `df[col]` and plot its value counts.

    Two count plots are drawn side by side: the values of the column ordered
    by descending frequency, and the number of null versus non-null entries.

    Args:
        df: pandas.DataFrame holding the column.
        col: name of the column to inspect.
        title: figure title, also used as the x label of the first plot.
    """
    # Checking nulls
    values = df[col]
    is_null = values.isna()
    df_size = len(df)
    not_nulls = values.count()
    nulls = is_null.sum()
    print('Total: {} - Nulls: {} ({:.02f}%) / Not Nulls: {} ({:.02f}%)'.format(
        df_size, nulls, nulls/df_size*100, not_nulls, not_nulls/df_size*100))

    # Plotting
    descending_order = values.value_counts().sort_values(ascending=False).index

    fig, axs = plt.subplots(1, 2, figsize=figsize)
    plt.suptitle(title)

    # The series is passed as `x=` explicitly: in current seaborn the first
    # positional parameter of countplot is `data`, not `x`.
    ax1 = sns.countplot(x=values, palette=colormap, order=descending_order, ax=axs[0])
    ax1.set_xlabel(title)
    ax1.set_ylabel('Observações')

    ax2 = sns.countplot(x=is_null, palette=colormap, ax=axs[1])
    ax2.set_xlabel('Nulos')
    ax2.set_ylabel('Observações')

    # Vertical tick labels keep long category names readable.
    for axis in axs:
        for tick in axis.get_xticklabels():
            tick.set_rotation(90)

    plt.show()

In [None]:
def check_nulls(df):
    """Draw the missingno bar, matrix and heatmap charts for `df`."""
    size = (20, 8)
    msno.bar(df, figsize=size, color=default_color, fontsize=18, labels=True)
    msno.matrix(df, figsize=size, fontsize=14)
    msno.heatmap(df, figsize=size, cmap=colormap)

In [None]:
def mat_corr(df):
    """Plot the correlation matrix of `df` as an annotated square heatmap."""
    _, axis = plt.subplots(figsize=(20, 20))
    heatmap_options = {
        'linewidths': 0.1,
        'vmax': 1.0,
        'square': True,
        'cmap': colormap,
        'linecolor': 'white',
        'annot': True,
    }
    sns.heatmap(df.corr(), ax=axis, **heatmap_options)