# Functions and useful definitions

In [1]:
from ml_belt.prep import Prep
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from dateutil.parser import parse as date_parse
import os.path

from IPython.display import set_matplotlib_formats
from IPython.core.interactiveshell import InteractiveShell
import warnings

In [2]:
# Notebook-wide session configuration (IPython display, warnings, pandas options).

# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries in use — confirm that is intended.
warnings.filterwarnings('ignore')
# Lift pandas' display truncation: frames are rendered with all columns and all rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
# Make pandas treat +/-inf as missing values.
# NOTE(review): this option is deprecated in recent pandas releases — confirm against the installed version.
pd.options.mode.use_inf_as_na = True

In [3]:
# Shared plotting defaults: colours, colormap, figure size and seaborn rc settings.

# Colour triples — presumably matplotlib-style RGB with components in [0, 1]; verify against usage.
default_color = (0.003602, 0.195911, 0.441564)
default_light_color = 'white'
default_dark_color = (0.185453, 0.258914, 0.426788)
colormap = 'cividis'  # colormap name; alternative noted by the author: plt.cm.cool
figsize = (15, 5)

# Persist the values with IPython's %store magic so they can be restored
# elsewhere (e.g. in another notebook) with `%store -r`.
%store default_color
%store default_light_color
%store default_dark_color
%store colormap
%store figsize

# Request both PDF and PNG representations for inline figures.
set_matplotlib_formats('pdf', 'png')
# Render floats with two decimal places when pandas displays them.
pd.options.display.float_format = '{:.2f}'.format
# rc overrides (figure size, font sizes, line widths, save DPI) handed to seaborn below.
rc={'savefig.dpi': 75, 'figure.autolayout': False, 'figure.figsize': figsize, 'axes.labelsize': 12,\
   'axes.titlesize': 18, 'font.size': 14, 'lines.linewidth': 2.0, 'lines.markersize': 8, 'legend.fontsize': 10,\
   'xtick.labelsize': 12, 'ytick.labelsize': 12}

# Apply the rc overrides, then make the chosen colormap the default seaborn palette.
sns.set(rc=rc)
sns.set_palette(sns.color_palette(colormap))

In [4]:
def get_meta(train):
    """Build a metadata table describing every column of `train`.

    For each column the table records:

    * ``role``   -- ``'target'`` or ``'id'`` when the column has exactly that
      name, otherwise ``'input'``.
    * ``level``  -- measurement level. The ``target`` column is ``'binary'``;
      otherwise the text before the first underscore of the name is looked up
      (``bin``, ``nom``, ``ord``, ``dis``, ``per``, ``con``); otherwise the
      level falls back on the dtype (any float -> ``'interval'``, any integer
      -> ``'discrete'``). Columns matching none of these get ``'unknown'``.
    * ``keep``   -- ``False`` for the ``id`` column, ``True`` for all others.
    * ``dtype``  -- the dtype of the column.
    * ``prefix`` -- the text before the first underscore of the column name.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame whose columns are described. Column names must be strings,
        because the prefix is obtained with ``str.split``.

    Returns
    -------
    pandas.DataFrame
        One row per column of `train`, indexed by ``varname``, with the
        columns ``role``, ``level``, ``keep``, ``dtype`` and ``prefix``.
    """
    # Naming convention: the prefix before the first underscore encodes the level.
    prefix_levels = {
        'bin': 'binary',
        'nom': 'nominal',
        'ord': 'ordinal',
        'dis': 'discrete',
        'per': 'percentual',
        'con': 'interval/continuous',
    }

    records = []
    for col in train.columns:
        if col == 'target':
            role = 'target'
        elif col == 'id':
            role = 'id'
        else:
            role = 'input'

        prefix = col.split('_')[0]
        dtype = train[col].dtype

        if col == 'target':
            level = 'binary'
        elif prefix in prefix_levels:
            level = prefix_levels[prefix]
        elif pd.api.types.is_float_dtype(dtype):
            level = 'interval'
        elif pd.api.types.is_integer_dtype(dtype):
            level = 'discrete'
        else:
            # Without this branch `level` would either be undefined (first
            # column) or silently keep the value of the previous column.
            level = 'unknown'

        records.append({
            'varname': col,
            'role': role,
            'level': level,
            'keep': col != 'id',  # everything is kept except the identifier
            'dtype': dtype,
            'prefix': prefix,
        })

    meta = pd.DataFrame(
        records,
        columns=['varname', 'role', 'level', 'keep', 'dtype', 'prefix'],
    )
    return meta.set_index('varname')

In [5]:
def replace_by_index(df, indexes, cols, val):
    """Write `val` into the cells at row positions `indexes` of columns `cols`.

    The frame is modified in place and also returned so the call can be
    chained.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    indexes : iterable of int
        Row *positions* (0-based, as used by ``DataFrame.iat``), not index
        labels.
    cols : iterable
        Labels of the columns to overwrite.
    val : object
        Value written into every selected cell.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in.

    Raises
    ------
    KeyError
        If a label in `cols` is not a column of `df`.
    """
    # Resolve labels to positions once: the lookup does not depend on the row,
    # and materialising the list lets `cols` be any iterable (even one-shot).
    col_positions = [df.columns.get_loc(col) for col in cols]

    for row_pos in indexes:
        for col_pos in col_positions:
            df.iat[row_pos, col_pos] = val

    return df

In [6]:
def _to_float_or_nan(value):
    """Return ``float(value)``, or ``np.nan`` when the conversion fails."""
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        return np.nan


def check_float(df, cols):
    """Convert the columns `cols` to float, turning unconvertible values into NaN.

    Every value is passed through the built-in ``float``; values it rejects
    (non-numeric text, ``None``, ...) become ``np.nan``. The frame is modified
    in place and also returned so the call can be chained.

    Conversion is done column by column and by position, so rows sharing an
    index label are converted independently. Each converted column ends up
    with a float dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    cols : iterable
        Labels of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in.

    Raises
    ------
    KeyError
        If a label in `cols` is not a column of `df`.
    """
    for col in cols:
        # Assigning a plain array avoids index alignment, which would
        # misbehave on duplicate index labels.
        df[col] = np.array(
            [_to_float_or_nan(value) for value in df[col]],
            dtype=float,
        )
    return df

In [7]:
def replace_nan(df, cols_dict):
    """Replace the values described by `cols_dict` with ``np.nan``.

    Thin wrapper around ``pandas.DataFrame.replace`` so the operation can be
    chained as a step of a `Prep` pipeline. `cols_dict` is forwarded
    unchanged as the ``to_replace`` argument; the result of ``replace`` is
    returned.
    """
    replaced = df.replace(to_replace=cols_dict, value=np.nan)
    return replaced

In [8]:
def bin_to_num(df, cols):
    """Encode yes/no answers in the columns `cols` as 1/0.

    Matching is case-insensitive: ``'sim'`` becomes 1, ``'não'`` / ``'nao'``
    become 0. Every other value, including any non-string value, becomes
    ``np.nan``. The frame is modified in place and also returned so the call
    can be chained.

    Encoding is done column by column and by position, so rows sharing an
    index label are encoded independently.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    cols : iterable
        Labels of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same `df` object that was passed in.

    Raises
    ------
    KeyError
        If a label in `cols` is not a column of `df`.
    """
    answer_codes = {'sim': 1, 'não': 0, 'nao': 0}

    for col in cols:
        # Assigning a plain list avoids index alignment, which would
        # misbehave on duplicate index labels.
        df[col] = [
            answer_codes.get(value.lower(), np.nan) if isinstance(value, str) else np.nan
            for value in df[col]
        ]
    return df