# Scrapy SELIC

In [265]:
import pandas as pd
import requests # for http requests
from bs4 import BeautifulSoup
import datetime

In [266]:
response = requests.get("https://www.bcb.gov.br/pec/copom/port/taxaselic.asp", timeout=60)
print("get page: ", response)

# criação do parser para navegação na árvore DOM
content = BeautifulSoup(response.content, "html.parser")
table = content.find_all("tr")
#HTML(str(table))

get page:  <Response [200]>


In [267]:
## Removing Multple spaces
def remove_multiple_spaces(string):
    """
    Se é String, então faz remoção de espaços em branco 
    trans_string: remove os valores textuais por espaços em branco 
    trans_n: / por espaço em branco AND (,) por (.) para facilitar a conversão de tipos
    """
    if type(string)==str:
        trans_string = string.replace('baixa',' ').replace('alta',' ').replace(' ex.','').replace('uso',' ')
        divisao = ' '.join(trans_string.split())
        trans_n = divisao.replace('/',' ').replace(',','.')
        return trans_n
    return string

In [268]:
# neste dataframe vou armazenar a data e a taxa SELIC
df_selic = pd.DataFrame(columns=['Ano', 'Mes', 'Taxa SELIC'])

# percorre table
for row in table:
    text = row.text
    text = remove_multiple_spaces(text)
    
    # insere dados diferentes a cada linha
    text_dados = text.split(sep=" ")
    
    # convert list to String
    ano = ''.join(text_dados[6:7])
    mes = ''.join(text_dados[5:6])
    taxa = ''.join(text_dados[13:])
                
    # variável p organizar a inserção no dataframe
    dados = pd.DataFrame([[ano, mes, taxa]], columns=['Ano', 'Mes', 'Taxa SELIC'])    
    df_selic = df_selic.append(dados)

In [269]:
# drop rows with Strings
df_selic = df_selic[3:]

# select 10 years
year = (datetime.date.today().year)
data_inicial = (f'{year-10}')
df_selic = df_selic[df_selic.Ano >= data_inicial]

In [270]:
# Conversão de tipos object
df_selic.Ano = df_selic.Ano.astype('int16') # int8 não aceitou
df_selic.Mes = df_selic.Mes.astype('int8')
df_selic['Taxa SELIC'] = df_selic['Taxa SELIC'].astype('float16')

print(df_selic.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81 entries, 0 to 0
Data columns (total 3 columns):
Ano           81 non-null int16
Mes           81 non-null int8
Taxa SELIC    81 non-null float16
dtypes: float16(1), int16(1), int8(1)
memory usage: 1.0 KB
None


In [272]:
# Salvamento do dataframe
df_selic.to_csv('df_selic.csv', 
                    index=False, sep=',', encoding='utf-8', decimal='.')
print('\n(df_selic.csv) salvo!')


(df_selic.csv) salvo!


In [284]:
import pymysql
from sqlalchemy import create_engine

user = 'yourUserName'
passw = 'password'
host =  'hostName'  # either localhost or ip e.g. '172.17.0.2' or hostname address 
port = 3306 
database = 'dataBaseName'

engine = create_engine('mysql+pymysql://' + user + ':' + passw + '@' + host + ':' + str(port) + '/' + database , echo=False)
df_selic.to_sql('df_selic', con=engine, if_exists='replace', index=False)

OperationalError: (pymysql.err.OperationalError) (2003, "Can't connect to MySQL server on 'hostName' ([Errno -2] Name or service not known)") (Background on this error at: http://sqlalche.me/e/e3q8)

In [280]:
engine = create_engine('sqlite://', echo=False)

engine.execute("SELECT * FROM df_selic").fetchall()

[(2018, 2, 6.6484375),
 (2017, 12, 6.8984375),
 (2017, 10, 7.3984375),
 (2017, 9, 8.1484375),
 (2017, 7, 9.1484375),
 (2017, 6, 10.1484375),
 (2017, 4, 11.1484375),
 (2017, 2, 12.1484375),
 (2017, 1, 12.8984375),
 (2016, 12, 13.6484375),
 (2016, 10, 13.8984375),
 (2016, 9, 14.1484375),
 (2016, 7, 14.1484375),
 (2016, 6, 14.1484375),
 (2016, 4, 14.1484375),
 (2016, 3, 14.1484375),
 (2016, 1, 14.1484375),
 (2015, 11, 14.1484375),
 (2015, 10, 14.1484375),
 (2015, 9, 14.1484375),
 (2015, 7, 14.1484375),
 (2015, 6, 13.6484375),
 (2015, 4, 13.1484375),
 (2015, 3, 12.6484375),
 (2015, 1, 12.1484375),
 (2014, 12, 11.6484375),
 (2014, 10, 11.1484375),
 (2014, 9, 10.8984375),
 (2014, 7, 10.8984375),
 (2014, 5, 10.8984375),
 (2014, 4, 10.8984375),
 (2014, 2, 10.6484375),
 (2014, 1, 10.3984375),
 (2013, 11, 9.8984375),
 (2013, 10, 9.3984375),
 (2013, 8, 8.8984375),
 (2013, 7, 8.3984375),
 (2013, 5, 7.8984375),
 (2013, 4, 7.3984375),
 (2013, 3, 7.16015625),
 (2013, 1, 7.12109375),
 (2012, 11, 7.140