# Project Real-time Finance

_(ENG)_
**Objective:** Analyze IBOVESPA stock data and build a time-series model that predicts stock prices in the short or medium term from historical data, such as:

1. Stocks' history.
2. Quarterly results (Q4).
3. News.

_(PT-BR)_
**Objetivo:** Analisar os dados de ativos da IBOVESPA e criar um modelo de séries temporais que possa prever, a curto ou médio prazo, o preço das ações com base em dados históricos da empresa, tais como:

1. Valores dos ativos historicamente.
2. Resultados trimestrais (TRI).
3. Notícias.

## 1.1. Imports

In [89]:
# Data Wrangling
import pandas as pd
import numpy as np
import os
import time
import datetime 

# datasets of finance
import yfinance as yf

# web scraping and HTTP requests
import requests
from bs4 import BeautifulSoup as BS

# Data visualization
import dash
from matplotlib import pyplot as plt
import seaborn

## 1.2. Functions

In [285]:
def parse_pt_date(date_string):
    """
    Description:
        Parses a date or date-time string written with Portuguese month names and
        returns it as a naive ``datetime`` object.

        Two layouts are recognised:
          * date only: 'dia mês ano', optionally with 'de' connectors and a leading
            'weekday,' prefix, e.g. '01 dezembro 1979' or 'Segunda, 21 de outubro de 2013';
          * timestamp: 'weekday, dia mês ano hora:minuto:segundo offset',
            e.g. 'Seg, 21 Out 2013 22:14:36 -0200'.

        Month names may be abbreviated ('out', 'out.') or written in full ('outubro')
        in both layouts; matching is case-insensitive.

    Args:
        date_string (str): The date or date-time text in one of the layouts above.
            The offset of a timestamp is '+HHMM' / '-HHMM', or the literal 'GMT' / 'UTC'.

    Returns:
        date_object (datetime): For date-only input, midnight of that day. For a
        timestamp, the given time with the offset subtracted (i.e. expressed in UTC).
        The result carries no tzinfo in either case.

    Raises:
        KeyError: If the month name is not a known Portuguese month.
        ValueError: If the string does not match either layout or holds invalid numbers.
    """

    MONTHS = {'jan': 1, 'fev': 2, 'mar': 3, 'abr': 4,  'mai': 5,  'jun': 6,
          'jul': 7, 'ago': 8, 'set': 9, 'out': 10, 'nov': 11, 'dez': 12}

    FULL_MONTHS = {'janeiro': 1,  'fevereiro': 2, u'março': 3,    'abril': 4,
               'maio': 5,     'junho': 6,     'julho': 7,     'agosto': 8,
               'setembro': 9, 'outubro': 10,  'novembro': 11, 'dezembro': 12}

    # One shared lookup so both layouts accept abbreviated and full month names.
    month_numbers = {**MONTHS, **FULL_MONTHS}

    date_info = date_string.lower().split()
    if date_info.count('de') == 2 or len(date_info) == 3:
        # Date-only layout: discard an optional 'weekday,' prefix first.
        if ',' in date_info[0]:
            date_string = date_string.split(',')[1]
        # Remove the 'de' connectors as whole tokens, never as substrings.
        day, month_pt, year = [token for token in date_string.lower().split()
                               if token != 'de']
        month = month_numbers[month_pt.rstrip('.')]
        date_iso = '{}-{:02d}-{:02d}'.format(year, month, int(day))
        return datetime.datetime.strptime(date_iso, '%Y-%m-%d')

    # Timestamp layout: weekday, day, month, year, time, offset.
    _, day, month_pt, year, hour_minute_second, offset = date_info

    if offset in ('gmt', 'utc'):
        offset = '+0000'
    offset_signal = int(offset[0] + '1')  # '+' -> 1, '-' -> -1
    offset_hours = int(offset[1:3])
    offset_minutes = int(offset[3:5])
    # Keep the offset in whole seconds; no float day fractions involved.
    utc_offset = datetime.timedelta(
        seconds=offset_signal * (offset_hours * 3600 + offset_minutes * 60))

    month = month_numbers[month_pt.rstrip('.')]
    datetime_iso = '{}-{:02d}-{:02d}T{}'.format(year, month, int(day),
                                                hour_minute_second)
    datetime_object = datetime.datetime.strptime(datetime_iso,
                                                 '%Y-%m-%dT%H:%M:%S')
    # Subtracting the offset converts the local wall-clock time to UTC.
    return datetime_object - utc_offset

## 2.1. Loading and scraping Datasets

### 2.1.1. Loading IPCA (Broad National Consumer Price Index)

In [284]:
##############################################################################################################################
# Web scraping of the SIDRA (IBGE) table 1737, 'https://sidra.ibge.gov.br/tabela/1737', reached from
# 'https://dados.gov.br/dados/conjuntos-dados', 'https://dados.gov.br/dados/conjuntos-dados/ia-indice-nacional-de-precos-ao-consumidor-amplo-ipca'
##############################################################################################################################

head = {'user-agent':'Mozilla/5.0'}
# The timeout keeps the cell from hanging forever if the server never answers.
answer = requests.get('https://sidra.ibge.gov.br/geratabela?format=br.csv&name=tabela1737.csv&terr=NCS&rank=-&query=t/1737/n1/all/v/2266/p/all/d/v2266%2013/l/v,p,t&measurecol=true',
                      headers = head, timeout = 30)
# Fail fast on HTTP errors so an error page is never parsed as if it were the export.
answer.raise_for_status()

# Decode the response body to plain text; `soup` ends up as a str, not a BeautifulSoup object.
soup = BS(answer.content, 'html.parser')
soup = soup.text

### 2.1.2. IPCA Pre-processing

In [287]:
# Break the downloaded text on ';' into a one-column frame (one cell per row),
# then discard the first 6 and the last 7 cells by position.
df_ipca = pd.DataFrame(soup.split(';')).iloc[6:-7]

In [288]:
# Clean the single raw column: name it, drop empty cells and the cells starting
# with '"Núm', strip the CSV artefacts from every cell, and renumber the rows.
df_ipca = df_ipca.rename(columns={0: 'month'})
df_ipca = df_ipca.replace('', np.nan).dropna()
df_ipca = df_ipca[~df_ipca['month'].str.startswith('"Núm')]
# Per cell: remove quotes, switch the decimal comma to a dot, drop the '\r\nBR' marker.
# NOTE(review): newer pandas releases deprecate `applymap` in favour of `DataFrame.map` - confirm the target version.
df_ipca = df_ipca.applymap(
    lambda cell: cell.replace('"', '').replace(',', '.').replace('\r\nBR', '')
)
# Cells that are exactly 'Brasil' or '1' are discarded as well.
df_ipca = df_ipca.replace('Brasil', np.nan).replace('1', np.nan).dropna()
df_ipca = df_ipca.reset_index(drop=True)

In [289]:
# The frame holds a single column whose first half is paired with its second half:
# the leading rows stay in the first column and the trailing rows become 'IPCA'.
# `len // 2` covers both parities; with an odd row count the middle row is left out.
half = len(df_ipca) // 2
# Build the two-column frame explicitly instead of assigning a shorter DataFrame
# through `iloc[:, 0]`, which relies on pandas' index-alignment fallbacks.
df_ipca = pd.DataFrame({
    df_ipca.columns[0]: df_ipca.iloc[:, 0].head(half).reset_index(drop=True),
    'IPCA': df_ipca.iloc[:, 0].tail(half).reset_index(drop=True),
})

df_ipca = df_ipca.dropna()

In [290]:
# Convert the IPCA column from text to numbers. Assign by column label: a full-column
# `iloc[:, 1] = ...` assignment may write in place and keep the old object dtype.
df_ipca['IPCA'] = pd.to_numeric(df_ipca['IPCA'])

  df_ipca.iloc[:,1] = pd.to_numeric(df_ipca.iloc[:,1])


In [291]:
# Turn every month label into a datetime on the first day of that month.
list_df_month = df_ipca.iloc[:,0].tolist()
# Assign by column label so the column is replaced by the new datetime values instead
# of being written in place through `iloc[:, 0]`.
df_ipca['month'] = [parse_pt_date('01 ' + month) for month in list_df_month]
df_ipca

  df_ipca.iloc[:,0] = [parse_pt_date('01 ' + month) for month in list_df_month]


Unnamed: 0,month,IPCA
0,1979-12-01,7.618300e-09
1,1980-01-01,8.122300e-09
2,1980-02-01,8.497300e-09
3,1980-03-01,9.010400e-09
4,1980-04-01,9.486700e-09
...,...,...
515,2022-11-01,6.434200e+03
516,2022-12-01,6.474090e+03
517,2023-01-01,6.508400e+03
518,2023-02-01,6.563070e+03


### 2.1.3. Loading stocks dataset

In [None]:
# Time the download so slow fetches show up in the cell output.
tic = time.perf_counter()

# Defining the stock tickers (symbols)
tickers = ["ITSA4.SA"]

# Obtaining 5 years of daily history for every ticker, keyed by its symbol
stocks = {}
for ticker in tickers:
    # Query the ticker of the current iteration, not always the first one in the list.
    stock = yf.Ticker(ticker)
    stocks[ticker] = stock.history(period="5y", interval="1d")

# # Print the stocks data
# for ticker, data in stocks.items():
#     print(f"{ticker}")
#     print(data.tail())

toc = time.perf_counter()
time_loading02 = toc - tic
print(f'Time loading stocks dataset: {time_loading02}')

stocks['ITSA4.SA']

Time loading stocks dataset: 0.8619523048400879


Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-04-30 00:00:00-03:00,8.322322,8.352673,8.213057,8.261619,14354109,0.0,0.0
2018-05-02 00:00:00-03:00,8.206990,8.206990,7.739581,7.769931,50261107,0.0,0.0
2018-05-03 00:00:00-03:00,7.800282,7.854914,7.545331,7.557471,46139605,0.0,0.0
2018-05-04 00:00:00-03:00,7.551402,7.587824,7.436068,7.436068,28024816,0.0,0.0
2018-05-07 00:00:00-03:00,7.472488,7.502839,7.411785,7.448207,17477633,0.0,0.0
...,...,...,...,...,...,...,...
2023-04-24 00:00:00-03:00,8.510000,8.670000,8.480000,8.570000,25977100,0.0,0.0
2023-04-25 00:00:00-03:00,8.530000,8.600000,8.520000,8.590000,27183900,0.0,0.0
2023-04-26 00:00:00-03:00,8.600000,8.600000,8.450000,8.480000,25655800,0.0,0.0
2023-04-27 00:00:00-03:00,8.500000,8.590000,8.480000,8.590000,17660400,0.0,0.0
