# Tratamento inicial da planilha

Primeiramente vamos ler a planilha e salvá-la no formato parquet.

In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

import config as cfg
import helpers as hlp

In [2]:
# Open the workbook once; the cells below parse individual sheets from this handle.
workbook_path = cfg.PATH_EXCEL_DATABASE
excel_file = pd.ExcelFile(workbook_path)

# Show the names of the sheets contained in the workbook.
excel_file.sheet_names

['Screener', 'Portfolios', 'PreLast', 'Volume']

In [3]:
def save_to_parquet(df, path):
    """Write a DataFrame to a Parquet file.

    The frame is converted to a pyarrow ``Table`` and written to ``path``
    with ``pyarrow.parquet.write_table``.

    Args:
        df: pandas DataFrame to persist.
        path: Destination of the Parquet file.
    """
    # NOTE(review): presumably creates the destination folder when it is
    # missing -- the helper is defined in ``helpers``; confirm there.
    hlp.ensure_folder_exists(path)
    pq.write_table(pa.Table.from_pandas(df), path)

In [4]:
# Parse the first sheet of the workbook into a DataFrame.
screener_sheet = excel_file.sheet_names[0]
df_screener = excel_file.parse(screener_sheet)

# Remove newline characters from the column names.
clean_columns = df_screener.columns.str.replace('\n', '')
df_screener.columns = clean_columns

# Persist the cleaned frame as Parquet.
save_to_parquet(df_screener, cfg.PATH_SCREENER_PARQUET)

# Print a summary of the columns, non-null counts and dtypes.
df_screener.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148 entries, 0 to 147
Data columns (total 23 columns):
 #   Column                                                          Non-Null Count  Dtype  
---  ------                                                          --------------  -----  
 0   Identifier (RIC)                                                148 non-null    object 
 1   Company Name                                                    148 non-null    object 
 2   Country of Exchange                                             148 non-null    object 
 3   TRBC Industry Name                                              146 non-null    object 
 4   Market Capitalization(FY0, BRL)                                 146 non-null    float64
 5   Average Daily Value Traded - 52 Weeks(BRL)                      146 non-null    float64
 6   Price to EPS Diluted excl Exord, Common - Total, 5 Yr Avg(FY0)  146 non-null    float64
 7   Price to Book Value per Share, 5 Yr Avg(FY0)         

In [5]:
# Parse the second sheet of the workbook into a DataFrame.
portfolios_sheet = excel_file.sheet_names[1]
df_portfolios = excel_file.parse(portfolios_sheet)

# Remove newline characters from the column names.
clean_columns = df_portfolios.columns.str.replace('\n', '')
df_portfolios.columns = clean_columns

# Persist the cleaned frame as Parquet.
save_to_parquet(df_portfolios, cfg.PATH_PORTFOLIO_PARQUET)

# Print a summary of the columns, non-null counts and dtypes.
df_portfolios.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 24 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Blue Chips       15 non-null     object
 1   Mid Caps         15 non-null     object
 2   Small Caps       15 non-null     object
 3   Micro Caps       15 non-null     object
 4   High Return      15 non-null     object
 5   Low Return       15 non-null     object
 6   High Volatility  15 non-null     object
 7   Low Volatility   15 non-null     object
 8   High Liquidity   15 non-null     object
 9   Low Liquidity    15 non-null     object
 10  High Beta        15 non-null     object
 11  Market Beta      15 non-null     object
 12  Low Beta         15 non-null     object
 13  High Alpha       15 non-null     object
 14  Low Alpha        15 non-null     object
 15  High Sharpe      15 non-null     object
 16  Low Sharpe       15 non-null     object
 17  High P/E         15 non-null     obje

In [6]:
# Parse the third sheet of the workbook into a DataFrame.
prelast_sheet = excel_file.sheet_names[2]
df_prelast = excel_file.parse(prelast_sheet)

# Remove newline characters from the column names.
clean_columns = df_prelast.columns.str.replace('\n', '')
df_prelast.columns = clean_columns

# Persist the cleaned frame as Parquet.
save_to_parquet(df_prelast, cfg.PATH_PRELAST_PARQUET)

# Print a summary of the frame (index range, column count, dtypes, memory).
df_prelast.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 810 entries, 0 to 809
Columns: 149 entries, Data to DI_INDEX
dtypes: datetime64[ns](1), float64(148)
memory usage: 943.0 KB


In [7]:
# Parse the fourth sheet of the workbook into a DataFrame.
volume_sheet = excel_file.sheet_names[3]
df_volume = excel_file.parse(volume_sheet)

# Remove newline characters from the column names.
clean_columns = df_volume.columns.str.replace('\n', '')
df_volume.columns = clean_columns

# Persist the cleaned frame as Parquet.
save_to_parquet(df_volume, cfg.PATH_VOLUME_PARQUET)

# Print a summary of the frame (index range, column count, dtypes, memory).
df_volume.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 809 entries, 0 to 808
Columns: 148 entries, Data to .BVSP
dtypes: datetime64[ns](1), float64(144), int64(3)
memory usage: 935.5 KB
