#### Importing libs

In [1]:
import pandas as pd
import numpy as np

# Data Import

##### Importing features

In [3]:
# Google Sheets workbook with the indicator data (one worksheet per company).
sheet_id = "1xOPRcRkJrGzxb65dieEHrWHnqXYrrSpx7Ack3pr6Pdw"
features_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=xlsx"
# sheet_name=None makes read_excel return a dict: worksheet name -> DataFrame.
features = pd.read_excel(features_url, sheet_name=None)
print(features.keys())  # companies

dict_keys(['BOMBRIL', 'COSAN', 'METALFRIO', 'TAESA', 'TUPY', 'JHSF', 'KLABIN', 'VIBRA', 'VALE', 'RANDON', 'IRANI', 'CYRELA', 'UNIPAR', 'SPRINGS', 'HAGA', 'DIRECIONAL', 'SIDERURGICA ALIPERTI', 'AZEVEDO E TRAVASSOS', 'PANATLANTICA', 'TELEFONICA', 'ALUPAR', 'PETRORIO', 'DOMMO', 'MRV', 'HOTEIS OTHON', 'JOÃO FORTES', 'MANGELS', 'ENAUTA', 'FRAS-LE', 'NATURA', 'PETTENATTI', 'IMC', 'NUTRIPLANT', 'OCEANPACT', 'AMERICANAS', 'PBG', 'ROSSI', 'TEKNO', 'TIM', 'ENERGISA', '3R PETROLEUM', 'ENEVA', 'ETERNIT', 'EUCATEX', 'FER HERINGER', 'GPC PART', 'LUPATECH', 'MMX MINER', 'OI', 'OSX BRASIL', 'PDG REALT', 'PET MANGUINHOS', 'POMIFRUTAS', 'RECRUSUL', 'REDE ENERGIA', 'SANSUY', 'SARAIVA', 'TECNOSOLO', 'TEKA', 'VIVER', 'WETZEL', 'BRASKEM', 'CEMIG', 'DURATEX', 'EZTEC', 'INDS ROMI', 'LITEL', 'MARCOPOLO', 'METAL IGUAÇU', 'SLC AGRICOLA', 'SONDOTECNICA', 'ULTRAPAR'])


##### Importing targets

In [9]:
# Google Sheets workbook with the classification of each company.
sheet_id = "116cM2eSTve3UHHYESOdWXRiYxx8gGAPPLlonNDNIn6M"
target_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=xlsx"

# Index the rows by company name (the "Empresa" column becomes the index).
target = pd.read_excel(target_url).set_index("Empresa")

# Build the binary target: 1 for the two distressed classes, 0 otherwise.
print("Classificações: ",list(set(target["Classificação"])))
classes_positivas = ["Recuperação Judicial", "Amostra Falida"]
target["Alvo"] = target["Classificação"].isin(classes_positivas).astype(int)
target.head()

Classificações:  ['Recuperação Judicial', 'Não Falida', 'Amostra Par', 'Amostra Falida']


Unnamed: 0_level_0,Ticker,Setor,Classificação,Data do Evento,Alvo
Empresa,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3R PETROLEUM,RRRP,"Petróleo, Gás e Biocombustíveis",Não Falida,2013-01-17,0
ALUPAR,ALUP,Utilidade Pública,Amostra Par,2014-12-09,0
AMERICANAS,AMER,Consumo Cíclico,Não Falida,2018-11-23,0
AVIANCA (OCEANAIR),,Bens Industriais,Amostra Falida,NaT,1
AZEVEDO E TRAVASSOS,AZEV,Bens Industriais,Amostra Par,2012-08-03,0


##### Validation

In [14]:
# Cross-check the two workbooks: count the companies in `features` that also
# appear in the index of `target`, and list the ones that do not.
empresas_no_alvo = set(target.index)  # built once, so each membership test is O(1)
miss = [empresa for empresa in features.keys() if empresa not in empresas_no_alvo]
res = len(features.keys()) - len(miss)

print("Número de Empresas (Target):", res)
print("Número de Empresas (Features):", len(features.keys())) # number of companies
print("Empresas Faltando:", miss)

Número de Empresas (Target): 72
Número de Empresas (Features): 72
Empresas Faltando: []


# Data Wrangling

Próximos passos: 
* tratar NA's

In [28]:
# Cross-sectional design: the data is analysed at one specific moment, as
# opposed to a longitudinal (time-series) design.
# Indicator names are read from the first column of the BOMBRIL worksheet,
# starting at row position 35.
# NOTE(review): presumably every worksheet shares this layout -- confirm.
planilha_referencia = features["BOMBRIL"]
indicadores = planilha_referencia.iloc[35:, 0].tolist()

def wrangle(ano_pre_fal):
    """Build the cross-sectional dataset for a given number of years before failure.

    For every company worksheet in the global ``features`` dict and every name
    in the global ``indicadores`` list, the first row whose first-column value
    equals the indicator name is located and the value at column position
    ``6 - ano_pre_fal`` is taken. The ``Alvo`` column of the global ``target``
    frame is then joined on the company index.

    Parameters
    ----------
    ano_pre_fal : int
        Number of years before failure, from 1 to 5.

    Returns
    -------
    pandas.DataFrame
        One row per key of ``features``, one column per indicator plus
        ``Alvo``. Cells equal to ``'#DIV/0!'`` are replaced by NaN.

    Raises
    ------
    ValueError
        If ``ano_pre_fal`` is outside 1..5. Without this check a value of 6
        would silently read the label column (position 0) and larger values
        would wrap around to negative positions.
    IndexError
        If an indicator is missing from a worksheet, or a worksheet has fewer
        columns than required.
    """
    if not 1 <= ano_pre_fal <= 5:
        raise ValueError(
            f"ano_pre_fal must be between 1 and 5, got {ano_pre_fal!r}"
        )

    posicao_valor = 6 - ano_pre_fal

    # Slice every worksheet once (label column, value column) instead of
    # re-filtering the whole sheet for each (indicator, company) pair.
    recortes = {
        empresa: (data[data.columns[0]], data.iloc[:, posicao_valor])
        for empresa, data in features.items()
    }

    # Collect all columns first and build the frame in one go; inserting the
    # columns one at a time fragments the DataFrame.
    colunas = {
        indicador: [
            valores[rotulos == indicador].iloc[0]
            for rotulos, valores in recortes.values()
        ]
        for indicador in indicadores
    }
    df = pd.DataFrame(colunas, index=list(recortes))

    df = df.join(target["Alvo"])  # append the target column

    # Spreadsheet division errors become NaN. infer_objects() turns the
    # affected object columns back into numeric ones explicitly, rather than
    # relying on replace() to downcast them (deprecated in recent pandas).
    return df.replace('#DIV/0!', np.nan).infer_objects()

In [31]:
# Dataset built from the values one year before failure.
df_1y = wrangle(ano_pre_fal=1)
df_1y.head(5)

Unnamed: 0,Dispon/PC,Dispon/Receita Liq Operacional,Dispon/AC,Dispon/AT,AC/PC,AC/AT,PC/AT,PC/Receita Liq Operacional,Estoque/AT,Quick assets/AT,...,VA/IMOBILIZADO,VA/AT,VA/RLO,Contas a pagar/RLO,AC/RLO,Estoque/RLO,NOWC/RLO,Contas a receber/RLO,RLO/AT,Alvo
BOMBRIL,0.0374,0.022244,0.101842,0.029547,0.367232,0.290121,0.790021,0.594763,0.031404,0.258718,...,1.912136,0.573094,0.431451,0.095822,0.218416,0.023642,-0.112902,0.130422,1.328294,1
COSAN,1.644152,0.042668,0.535983,0.063081,3.067545,0.117692,0.038367,0.025951,0.012073,0.105619,...,-94.153913,-0.273755,-0.185168,0.004062,0.079607,0.008166,0.056392,0.009919,1.478415,0
METALFRIO,0.714739,0.581478,0.562656,0.390647,1.270295,0.694291,0.546559,0.813553,0.154425,0.539866,...,-0.557492,-0.091249,-0.135825,0.192842,1.033452,0.229861,0.795228,0.176731,0.671817,0
TAESA,0.621887,0.963256,0.842059,0.172986,0.738532,0.205432,0.278163,1.548924,0.0,0.205432,...,64.438054,0.164443,0.915685,0.01421,1.14393,0.0,1.12769,0.178644,0.179584,0
TUPY,1.260625,0.444879,0.556159,0.265093,2.266661,0.476648,0.210287,0.352903,0.067499,0.409149,...,-0.199108,-0.066526,-0.111644,0.08611,0.799912,0.113277,0.613744,0.158185,0.595876,0


# Tratamento de NA's

In [56]:
# Number of companies before and after discarding rows with any missing value.
n_empresas = df_1y.shape[0]
n_empresas_completas = df_1y.dropna().shape[0]
print("DF Original:", n_empresas, "empresas")
print("DF sem NA's:", n_empresas_completas, "empresas")

DF Original: 72 empresas
DF sem NA's: 65 empresas


In [57]:
# Indicators ranked by number of missing values, largest first.
faltantes_por_indicador = df_1y.isna().sum()
faltantes_por_indicador.sort_values(ascending=False)

VA/IMOBILIZADO                         7
FCFF/Rec Liq Operacional               6
PC/Receita Liq Operacional             6
GTP/RLO                                6
Lucro liquido/RLO                      6
Capital de giro/Rec Liq Operacional    6
VA/RLO                                 6
Contas a pagar/RLO                     6
AC/RLO                                 6
Estoque/RLO                            6
NOWC/RLO                               6
Contas a receber/RLO                   6
Dispon/Receita Liq Operacional         6
EBITDA/Rec Liq Operacional             6
Juros/VA                               5
FCFF/VA                                5
EBIT/VA                                5
LL/VA                                  5
PL/AT                                  2
Divida total/PL                        2
PL/PE                                  2
Dispon/PC                              2
Divida total/AT                        2
Divida LP/AT                           2
VA/AT           

In [78]:
# Companies that have at least one missing indicator.
tem_faltante = df_1y.isna().any(axis="columns")
df_1y[tem_faltante]

Unnamed: 0,Dispon/PC,Dispon/Receita Liq Operacional,Dispon/AC,Dispon/AT,AC/PC,AC/AT,PC/AT,PC/Receita Liq Operacional,Estoque/AT,Quick assets/AT,...,VA/IMOBILIZADO,VA/AT,VA/RLO,Contas a pagar/RLO,AC/RLO,Estoque/RLO,NOWC/RLO,Contas a receber/RLO,RLO/AT,Alvo
DOMMO,501.020619,,0.971032,0.031559,515.96701,0.0325,6.3e-05,,0.0,0.0325,...,,0.0,,,,,,,0.0,1
PETTENATTI,,,,,,,,,,,...,,,,,,,,,,0
PDG REALT,0.007415,,0.008783,0.000188,0.84428,0.021363,0.025303,,0.0,0.021363,...,,0.0,,,,,,,0.0,1
REDE ENERGIA,0.116592,,0.384236,0.025591,0.303438,0.066601,0.219488,,0.0,0.066601,...,,-0.000444,,,,,,,0.0,1
EZTEC,,,,,,,,,,,...,,,,,,,,,,0
LITEL,101.816754,,0.049432,0.001341,2059.717277,0.027121,1.3e-05,,0.0,0.027121,...,,0.0,,,,,,,0.0,0
ULTRAPAR,0.109518,1.760172,0.174831,0.013422,0.626423,0.076772,0.122556,16.072,0.0,0.076772,...,,-0.001961,-0.257165,0.007913,10.067866,0.0,8.600109,6.851807,0.007625,0


# Exportando a Base

In [92]:
# Build one dataset per horizon (1 to 5 years before failure) and write each
# one to its own worksheet of a single workbook.
sheet_names = ["df_" + str(i) + "y" for i in range(1,6)]
dfs = [wrangle(i) for i in range(1,6)]

# The context manager saves and closes the workbook on exit, even if a write
# fails. ExcelWriter.save() was removed in pandas 2.0, so calling it raises
# AttributeError on current versions.
with pd.ExcelWriter('../data/dados.xlsx', engine='xlsxwriter') as writer:
    for sheet_name, frame in zip(sheet_names, dfs):
        frame.to_excel(writer, sheet_name = sheet_name)