# Web scraping code
Website: http://cnes2.datasus.gov.br/

In [1]:
# importing all relevant libraries

import numpy as np
import pandas as pd
import requests
from lxml import html
from selenium import webdriver
from selenium import webdriver as driver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select


In [2]:
# Load the CNES landing page twice: once as a static lxml tree (via requests) and
# once in a Selenium-controlled Chrome window that the following cells drive.
CNES_URL = 'http://cnes2.datasus.gov.br/'

# Loading initial page
page01 = requests.get(CNES_URL)
tree = html.fromstring(page01.content)

# Setting up Selenium
# The browser is created from the `webdriver` module name instead of `driver`:
# `driver` is rebound to the browser instance on this very line, so calling
# `driver.Chrome()` would fail with AttributeError as soon as the cell is re-run.
driver = webdriver.Chrome()
driver.get(CNES_URL)

The website has a drop-down menu that is only shown when the mouse hovers over it. The code below identifies the correct drop-down list, changes its visibility from 'hidden' to 'visible', and then selects the appropriate item (Equipments).

In [3]:
# Reveal the hover-only drop-down list and click one of its entries.

# The drop-down list is located by its absolute position: the 7th <div> under <body>
element = driver.find_element_by_xpath('/html/body/div[7]')

# JavaScript that overwrites the element's inline style; the relevant part is
# `visibility: visible`, which makes the drop-down list show up on the page
show_menu_js = "arguments[0].style='position: absolute; visibility: visible; background-color: rgb(0, 102, 153); width: 202px; height: 505px; font-family: sans-serif, verdana, \"comic sans ms\"; font-weight: normal; font-style: normal; font-size: 8pt; z-index: 102; top: 5px; left: 334px;'"
driver.execute_script(show_menu_js, element)

# Once the list is visible, its third child <div> can be located and clicked.
# login_form receives whatever click() returns, not the element itself.
menu_entry = driver.find_element_by_xpath("html/body/div[7]/div[3]")
login_form = menu_entry.click()

In [4]:
#  
# page02 = requests.get(driver.current_url)
# tree = html.fromstring(page02.content)

# Build a lookup table (code, name) of the States offered by the site.

# Obtaining all possible States (codes and names): every <option> that carries
# a `value` attribute under the element(s) with id="2"
all_states = driver.find_elements_by_xpath("//*[@id='2']/option[@value]")

# Codes come from the `value` attribute, names from the visible text
states_code = [state.get_attribute('value') for state in all_states]
states_name = [state.text for state in all_states]
print(states_code)
print(states_name)

# Converting both lists into pandas Series (states_name is iterated again when
# the user's answer is validated)
states_code = pd.Series(states_code)
states_name = pd.Series(states_name)

# One row per option; the column names are given at construction time instead
# of being patched into `columns.values` afterwards, which is not a reliable
# way to rename columns.
states_list = pd.DataFrame({'codes': states_code, 'states': states_name})

# Remove the entries that are not States: the list headers (empty code) and the
# 'TODOS' entry (code '00'). The result is assigned back -- DataFrame.drop and
# boolean filtering both return a new frame, so a bare call changes nothing.
# Filtering by code also avoids depending on the fixed row positions 0, 1 and 29.
states_list = states_list[~states_list['codes'].isin(['', '00'])]
# states_list   # make this line run to see the dataframe created

['', '00', '12', '27', '13', '16', '29', '23', '53', '32', '52', '21', '31', '50', '51', '15', '25', '26', '22', '41', '33', '24', '11', '14', '43', '42', '28', '35', '17', '']
['ESCOLHA ESTADO', 'TODOS', 'ACRE', 'ALAGOAS', 'AMAZONAS', 'AMAPA', 'BAHIA', 'CEARA', 'DISTRITO FEDERAL', 'ESPIRITO SANTO', 'GOIAS', 'MARANHAO', 'MINAS GERAIS', 'MATO GROSSO DO SUL', 'MATO GROSSO', 'PARA', 'PARAIBA', 'PERNAMBUCO', 'PIAUI', 'PARANA', 'RIO DE JANEIRO', 'RIO GRANDE DO NORTE', 'RONDONIA', 'RORAIMA', 'RIO GRANDE DO SUL', 'SANTA CATARINA', 'SERGIPE', 'SAO PAULO', 'TOCANTINS', '---MUNICÍPIO---']


Now, we ask the user to insert some information: 

1) The State they are interested in;


2) The City they want information about;


3) The period for which they need data.

The preliminary antitrust analysis is usually made using cross-sectional data, i.e., data on one specific city in one specific month. The analysis compares the share of each health care institution in terms of the number of pieces of equipment available for each procedure. The more equipment an institution has, the larger the proportion of the population it can serve.

In [5]:
# We ask for a State and keep asking until the answer is exactly one of the
# names offered by the site.
#
# The comparison is an exact match on purpose: a substring test accepts empty or
# partial answers (e.g. '' or 'SAO'), which select_by_visible_text() cannot find
# in the drop-down list afterwards.
# Entries whose code is empty are list headers (e.g. 'ESCOLHA ESTADO'), not
# States, so they are not accepted as answers.
valid_states = {name.strip() for code, name in zip(states_code, states_name) if code}

while True:
    state_input_temp = input('Choose a State (capital letters, no accents): ')
    candidate = state_input_temp.strip().upper()
    if candidate in valid_states:
        state_input = candidate
        print('State chosen:', state_input)
        break
    print('\033[1;41m Invalid State. Please insert a State name \033[1;m')

# Now we apply the given State to the drop-down list to open the cities options
select = Select(driver.find_element_by_name('ComboEstado'))
select.select_by_visible_text(state_input)


Choose a State (capital letters, no accents): sao paulo
State chosen: SAO PAULO


In [6]:
# Now we perform the same procedure at city level.

# Obtaining all possible Cities (codes and names).
# The options are read from the city drop-down list itself. The generic XPath
# "//*[@id='2']/option[@value]" is not specific to one list: when it was used
# for the States it also returned the '---MUNICÍPIO---' header of the city list,
# so using it here would mix State names into the Cities.
city_options = Select(driver.find_element_by_name('ComboMunicipio')).options

# Codes come from the `value` attribute, names from the visible text
cities_code = [city.get_attribute('value') for city in city_options]
cities_name = [city.text for city in city_options]

# Converting both lists into pandas Series
cities_code = pd.Series(cities_code)
cities_name = pd.Series(cities_name)

# One row per option, with the column names set at construction time
cities_list = pd.DataFrame({'codes': cities_code, 'cities': cities_name})

# Names that are accepted as an answer: entries with an empty code are list
# headers (e.g. '---MUNICÍPIO---'), not Cities.
valid_cities = {name.strip() for code, name in zip(cities_code, cities_name) if code}

# We ask for the input until it is exactly one of the offered names. A substring
# test would accept empty or partial answers that select_by_visible_text()
# cannot find afterwards.
while True:
    city_input_temp = input('Choose a City (capital letters, no accents): ')
    candidate = city_input_temp.strip().upper()
    if candidate in valid_cities:
        city_input = candidate
        print('City chosen:', city_input)
        break
    print('\033[1;41m Invalid City. Please insert a City name \033[1;m')

# Now we apply the given City to the city drop-down list
select = Select(driver.find_element_by_name('ComboMunicipio'))
select.select_by_visible_text(city_input)


Choose a City (capital letters, no accents): sao paulo
City chosen: SAO PAULO


In [13]:
# Now we perform the same procedure for the reference period.

# Obtaining all possible periods (codes and names).
# The individual <option> entries of the period drop-down list are read through
# Select.options. Looking the list up by id returns the <select> element itself,
# whose text is every period glued together in a single string.
period_options = Select(driver.find_element_by_name('cboCompetencia')).options

# Codes come from the `value` attribute, names from the visible text
periods_code = [period.get_attribute('value') for period in period_options]
periods_name = [period.text for period in period_options]

# Converting both lists into pandas Series
periods_code = pd.Series(periods_code)
periods_name = pd.Series(periods_name)

# One row per period, with the column names set at construction time
periods_list = pd.DataFrame({'codes': periods_code, 'periods': periods_name})
#periods_list.head(5)

# Periods that are accepted as an answer (blank entries are ignored)
valid_periods = {name.strip() for name in periods_name if name.strip()}

# We ask for the input until it is exactly one of the offered periods. A
# substring test would accept fragments such as '09' or an empty answer, which
# select_by_visible_text() cannot find afterwards.
while True:
    period_input_temp = input('Choose a year and month (format: MM/YYYY):')
    candidate = period_input_temp.strip()
    if candidate in valid_periods:
        period_input = candidate
        print('Period chosen:', period_input)
        break
    print('\033[1;41m Invalid period. Please choose a year and month (format: MM/YYYY): \033[1;m')

# Apply the chosen period to the period drop-down list
select = Select(driver.find_element_by_name('cboCompetencia'))
select.select_by_visible_text(period_input)

Choose a year and month (format: MM/YYYY):09/2017
Period chosen: 09/2017


In [17]:
# Download the page the browser is currently on and collect the text of every
# link; the equipment names are looked up among these link texts.
page03 = requests.get(driver.current_url)
tree = html.fromstring(page03.content)
equip_list = tree.xpath('//a/text()')

# Link texts with surrounding/repeated whitespace collapsed, so that they can be
# compared with what the user types. Empty texts are not valid answers.
equip_names = {' '.join(text.split()) for text in equip_list}
equip_names.discard('')

# Now we ask for the name of the health care equipment.
# The answer must be exactly one of the link texts, because it is later handed
# to find_element_by_link_text(), which looks for the complete link text: a
# substring test would accept partial or empty answers that cannot be clicked.
while True:
    equip_temp = input('Choose an Equipment (capital letters, no accents): ')
    typed = ' '.join(equip_temp.split())
    # Try the answer as typed, then in Title Case, then in UPPER CASE
    equip_input = next(
        (variant for variant in (typed, typed.title(), typed.upper()) if variant in equip_names),
        None,
    )
    if equip_input is not None:
        # Report the name as it appears on the site, i.e. the link that gets clicked
        print('Equipment chosen:', equip_input)
        break
    print('\033[1;41m Invalid equipment. Please choose an Equipment (capital letters, no accents): \033[1;m')

# Select health care equipment of interest
driver.find_element_by_link_text(equip_input).click()

Choose an Equipment (capital letters, no accents): tomografo computadorizado
[1;41m Invalid equipment. Please choose an Equipment (capital letters, no accents): [1;m
Choose an Equipment (capital letters, no accents): mamografo computadorizado
Equipment chosen: mamografo computadorizado


In [21]:
# Download the page the browser is currently on and turn its table into a DataFrame.
page04 = requests.get(driver.current_url)
tree = html.fromstring(page04.content)

# scraping data from table: link texts for the institution names, and the
# <font> text of the 1st, 3rd, 4th and 5th cell of each row for the other fields
# (reference cell path: /html/body/table/tbody/tr/td/p/table/tbody/tr[2]/td[5]/font)
institution = tree.xpath('//a/text()')
inst_code = tree.xpath('///tr[*]/td[1]/font/text()')
city = tree.xpath('///tr[*]/td[3]/font/text()')
equip = tree.xpath('///tr[*]/td[4]/font/text()')
equip_public = tree.xpath('///tr[*]/td[5]/font/text()')

# The first entry and the last two entries of the cell lists are left out
# (presumably header and footer rows -- TODO confirm against the page).
# The same bound, derived from inst_code, is applied to all three lists.
last_row = len(inst_code) - 2

# creating a database
# Column labels are passed as lists: a set is unordered and is not a reliable
# way to name columns (recent pandas versions refuse it).
data01 = pd.DataFrame(institution, columns=['Institution'])
data02 = pd.DataFrame(inst_code[1:last_row], columns=['CNES Code'])
data03 = pd.DataFrame(equip[1:last_row], columns=['Number of Equipments'])
# 'S' is recoded to 'Y'; replace() returns a new frame, so no in-place mutation is needed
data04 = pd.DataFrame(equip_public[1:last_row], columns=['Accept SUS patients?']).replace('S', 'Y')
data = pd.concat([data01, data02, data03, data04], axis=1)
data.head(5)

Unnamed: 0,Institution,CNES Code,Number of Equipments,Accept SUS patients?
0,A C CAMARGO CANCER CENTER,2077531,3,Y
1,A MAIS MEDICINA DIAGNOSTICA UNID AUGUSTO TOLLE,9227571,1,N
2,A MAIS MEDICINA DIAGNOSTICA UNIDADE CAMPO BELO,6441106,1,N
3,A MAIS MEDICINA DIAGNOSTICA UNIDADE MOEMA,5167612,1,N
4,AFIP VILA CLEMENTINO,3813517,1,Y


In [22]:
# Add a column holding the equipment name, taken from the first <font> text of the page.
all_text = tree.xpath('//font/text()')
equip_name = all_text[0]

# The name is everything that follows the first '-' found at or after
# position 18, skipping the '-' itself and the character right after it.
dash_at = equip_name.index('-', 18)
data['Equipment Name'] = equip_name[dash_at + 2:]
data.head(5)

Unnamed: 0,Institution,CNES Code,Number of Equipments,Accept SUS patients?,Equipment Name
0,A C CAMARGO CANCER CENTER,2077531,3,Y,MAMOGRAFO COMPUTADORIZADO
1,A MAIS MEDICINA DIAGNOSTICA UNID AUGUSTO TOLLE,9227571,1,N,MAMOGRAFO COMPUTADORIZADO
2,A MAIS MEDICINA DIAGNOSTICA UNIDADE CAMPO BELO,6441106,1,N,MAMOGRAFO COMPUTADORIZADO
3,A MAIS MEDICINA DIAGNOSTICA UNIDADE MOEMA,5167612,1,N,MAMOGRAFO COMPUTADORIZADO
4,AFIP VILA CLEMENTINO,3813517,1,Y,MAMOGRAFO COMPUTADORIZADO


In [24]:
# Write the scraped table to a CSV file.
# `filepath` is the destination folder; it ends with '/', so the file name is
# appended to it directly.
filepath = '/kolmogorov/OneDrive/Nerv/2_PhD/7_MachineLearning/95888_Python_Spring2018/Project/GitHub/DataFocusedPython_Group07/'
csv_target = '{}{}'.format(filepath, 'Example.csv')
data.to_csv(csv_target)