# **Data Acquisition**

---

## **Prepare Environment**

<br/>

### Imports

In [1]:
# Data analysis and data wrangling
import numpy as np
import pandas as pd

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno # missing values

# Wragling
from bs4 import BeautifulSoup

# Other
import configparser
import subprocess
import warnings
import pprint
import time
import os

<br/>

### Prepare Principal Directory

In [2]:
def path_to_work(end_directory: str='notebooks'):
    curr_dir = os.path.dirname(os.path.realpath ("__file__")) 
    
    if curr_dir.endswith(end_directory):
        os.chdir('..')
        return f'Change directory to: {curr_dir}'
    
    return f'Current working directory: {curr_dir}'

In [3]:
path_to_work('notebooks')

'Change directory to: /home/campos/projects/data-analysis-of-spending-by-brazilian-senators/notebooks'

<br/>

### Set Config

In [4]:
# Visualization inside the jupyter
%matplotlib inline

# Load the "autoreload" extension so that code can change
%load_ext autoreload

# ----------
# Plot
# ----------
# graph style
sns.set_style("darkgrid")
plt.style.use('fivethirtyeight')

# ----------
# Pandas
# ----------
# Floating point
pd.options.display.float_format = '{:.2f}'.format

# Print xxxx rows and all columns
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', None)

# ----------
# Python
# ----------
# pretty print
pp = pprint.PrettyPrinter(indent=4)

# Supress unnecessary warnings so that presentation looks clean
warnings.filterwarnings('ignore')

---

## **Collect Initial Data**
- The data were collected from [Portal da transparÃªncia](https://www12.senado.leg.br/transparencia/dados-abertos-transparencia/dados-abertos-ceaps)
- **Years**: 2008 until 2021
- **Format**: CSV

In [5]:
# Site to get csv
url = 'https://www12.senado.leg.br/transparencia/dados-abertos-transparencia/dados-abertos-ceaps'

In [6]:
# from <package>.<module> import <class>
from src.download_data import *

In [8]:
%%time

download_file_csv(url)

Try analysing page ...
https://www.senado.gov.br/transparencia/LAI/verba/despesa_ceaps_2021.csv
https://www.senado.gov.br/transparencia/LAI/verba/despesa_ceaps_2020.csv
https://www.senado.gov.br/transparencia/LAI/verba/despesa_ceaps_2019.csv
https://www.senado.gov.br/transparencia/LAI/verba/despesa_ceaps_2018.csv
https://www.senado.gov.br/transparencia/LAI/verba/despesa_ceaps_2017.csv
https://www.senado.gov.br/transparencia/LAI/verba/despesa_ceaps_2016.csv
https://www.senado.gov.br/transparencia/LAI/verba/despesa_ceaps_2015.csv
https://www.senado.gov.br/transparencia/LAI/verba/despesa_ceaps_2014.csv
https://www.senado.gov.br/transparencia/LAI/verba/despesa_ceaps_2013.csv
https://www.senado.gov.br/transparencia/LAI/verba/despesa_ceaps_2012.csv
https://www.senado.gov.br/transparencia/LAI/verba/despesa_ceaps_2011.csv
https://www.senado.gov.br/transparencia/LAI/verba/despesa_ceaps_2010.csv
https://www.senado.gov.br/transparencia/LAI/verba/despesa_ceaps_2009.csv
https://www.senado.gov.br/tr

In [9]:
!ls -lt data/raw

total 75584
-rw-rw-r-- 1 campos campos  437714 dez 21 08:25 despesa_ceaps_2008.csv
-rw-rw-r-- 1 campos campos 3370591 dez 21 08:25 despesa_ceaps_2009.csv
-rw-rw-r-- 1 campos campos 3771610 dez 21 08:25 despesa_ceaps_2010.csv
-rw-rw-r-- 1 campos campos 5578887 dez 21 08:24 despesa_ceaps_2011.csv
-rw-rw-r-- 1 campos campos 6740856 dez 21 08:24 despesa_ceaps_2012.csv
-rw-rw-r-- 1 campos campos 7830956 dez 21 08:23 despesa_ceaps_2013.csv
-rw-rw-r-- 1 campos campos 6272586 dez 21 08:22 despesa_ceaps_2014.csv
-rw-rw-r-- 1 campos campos 7466156 dez 21 08:22 despesa_ceaps_2015.csv
-rw-rw-r-- 1 campos campos 7611002 dez 21 08:21 despesa_ceaps_2016.csv
-rw-rw-r-- 1 campos campos 7610055 dez 21 08:20 despesa_ceaps_2017.csv
-rw-rw-r-- 1 campos campos 6951876 dez 21 08:19 despesa_ceaps_2018.csv
-rw-rw-r-- 1 campos campos 5896424 dez 21 08:19 despesa_ceaps_2019.csv
-rw-rw-r-- 1 campos campos 3725203 dez 21 08:18 despesa_ceaps_2020.csv
-rw-rw-r-- 1 campos campos 4110356 dez 21 08:18 despesa_ceaps_202

---