# Prepare environment and Data Acquisition

### Imports

In [1]:
# Data analysis and data wrangling
import numpy as np
import pandas as pd

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno # analise de variáveis vazia
from IPython.display import Image

# Wrangling
from bs4 import BeautifulSoup

# Other
import configparser
import subprocess
import warnings
import time
import os

### Prepare Principal Directory

In [2]:
def prepare_directory_work(end_directory: str='notebooks') -> str:
    """
    Move the working directory one level up when the kernel was started
    inside ``end_directory``.

    Parameters
    ----------
    end_directory : str
        Name of the folder (or trailing path, e.g. ``'project/notebooks'``)
        that the working directory must end with for the move to happen.
        Leading/trailing path separators are ignored. An empty value
        matches nothing.

    Returns
    -------
    str
        The directory the kernel was in *before* the move when the working
        directory was changed; otherwise the message
        ``'Current working directory: <path>'``.
    """
    # `__file__` is not defined inside a notebook kernel, so the (symlink
    # resolved) working directory stands in for the notebook's location.
    curr_dir = os.path.realpath(os.getcwd())

    # Match whole path components only: a plain `endswith('notebooks')`
    # would also accept sibling folders such as 'old_notebooks'.
    target = os.path.normpath(end_directory).strip(os.sep)

    if curr_dir.endswith(os.sep + target):
        # Explicit parent instead of a relative '..' so the destination is
        # tied to the directory that was just inspected.
        os.chdir(os.path.dirname(curr_dir))
        return curr_dir

    return f'Current working directory: {curr_dir}'

In [3]:
# If the kernel started inside notebooks/, step up to its parent directory so
# that relative paths used later (src/, data/) resolve from there.
prepare_directory_work(end_directory='notebooks')

'/home/campos/projetos/artificial_inteligence/projects/analise_despesas_senadores/notebooks'

### Prepare Environment

In [4]:
# from <package>.<module> import <class>
# NOTE(review): the wildcard import hides which names this cell brings into the
# notebook; `main` (called below) presumably comes from prepare_env — confirm
# and import it by name.
from src.environment.prepare_env import *


# Judging by the output below, this prints the pinned requirements, the
# OS / Python / pip / Jupyter versions, disk usage and the project tree.
main()

Requirements this project:

numpy==1.16.4
pandas==0.24.2
seaborn==0.9.0
missingno==0.4.1
matplotlib==3.1.1
ipython==7.6.1
beautifulsoup4==4.7.1
atlas==0.27.0

--------------------------------------------------
Configuration Environment:

OS:
Linux
Distributor ID:	Ubuntu
Description:	Ubuntu 19.04
Release:	19.04
Codename:	disco

Python Version:
Python 3.7.3

Pip Version:
pip 19.1.1 from /home/campos/projetos/artificial_inteligence/projects/analise_despesas_senadores/src/environment/venv/lib/python3.7/site-packages/pip (python 3.7)

Jupyter Version:
4.4.0

--------------------------------------------------

Disk Usage:

data:
65M	data/

virtual env:
377M	src/environment/venv/

all:
492M	.

--------------------------------------------------
Structure This Project:

.
├── data
│   ├── cleansing
│   │   ├── dados_limpos_ceaps_cleansing.csv
│   │   └── map_senadores.csv
│   └── dumps
│       ├── 2008.csv
│       ├── 2009.csv
│       ├── 2010.csv
│       ├── 2011.csv
│       ├── 2012.csv
│    

### Cell Format

In [5]:
# Guarantees visualization inside the jupyter
%matplotlib inline

# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

# Format the data os all table (float_format 3)
pd.set_option('display.float_format', '{:.6}'.format)

# Print xxxx rows and columns
pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', None)

# Supress unnecessary warnings so that presentation looks clean
warnings.filterwarnings('ignore')

# Graph style
sns.set(style='dark', palette='deep')

---

## Data Acquisition

- **Fonte**: Portal da Transparência do Senado Federal — https://www12.senado.leg.br/transparencia/dados-abertos-transparencia/dados-abertos-ceaps
- **Ano**: 2008 até 2019
- **Formato**: CSV

In [6]:
# Senate transparency-portal page (CEAPS open data); passed to the download
# step below, which lists the yearly CSV links it finds there.
url = 'https://www12.senado.leg.br/transparencia/dados-abertos-transparencia/dados-abertos-ceaps'

In [7]:
# from <package>.<module> import <class>
# NOTE(review): wildcard import placed mid-notebook; `dump_file_csv` (called in
# the next cell) presumably comes from here — confirm, import it by name and
# consider moving the import to the imports cell at the top.
from src.dump_data import *

In [8]:
# Fetch the yearly CSV files linked from `url`; per the output below they are
# written to data/dumps/<year>.csv.
# NOTE(review): appears to hit the network on every run — confirm whether
# already-downloaded files are skipped.
dump_file_csv(url)

Try analysing page ...
http://www.senado.gov.br/transparencia/LAI/verba/2019.csv
http://www.senado.gov.br/transparencia/LAI/verba/2018.csv
http://www.senado.gov.br/transparencia/LAI/verba/2017.csv
http://www.senado.gov.br/transparencia/LAI/verba/2016.csv
http://www.senado.gov.br/transparencia/LAI/verba/2015.csv
http://www.senado.gov.br/transparencia/LAI/verba/2014.csv
http://www.senado.gov.br/transparencia/LAI/verba/2013.csv
http://www.senado.gov.br/transparencia/LAI/verba/2012.csv
http://www.senado.gov.br/transparencia/LAI/verba/2011.csv
http://www.senado.gov.br/transparencia/LAI/verba/2010.csv
http://www.senado.gov.br/transparencia/LAI/verba/2009.csv
http://www.senado.gov.br/transparencia/LAI/verba/2008.csv
data/dumps/2019.csv downloaded!
data/dumps/2018.csv downloaded!
data/dumps/2017.csv downloaded!
data/dumps/2016.csv downloaded!
data/dumps/2015.csv downloaded!
data/dumps/2014.csv downloaded!
data/dumps/2013.csv downloaded!
data/dumps/2012.csv downloaded!
data/dumps/2011.csv downl

In [9]:
# List the downloaded dump files in long format, most recently modified first
!ls -lt data/dumps

total 62200
-rw-rw-r-- 1 campos campos  398514 jul  6 17:03 2008.csv
-rw-rw-r-- 1 campos campos 3217390 jul  6 17:03 2009.csv
-rw-rw-r-- 1 campos campos 3607479 jul  6 17:02 2010.csv
-rw-rw-r-- 1 campos campos 5348705 jul  6 17:02 2011.csv
-rw-rw-r-- 1 campos campos 6469271 jul  6 17:02 2012.csv
-rw-rw-r-- 1 campos campos 7566564 jul  6 17:01 2013.csv
-rw-rw-r-- 1 campos campos 6074552 jul  6 17:00 2014.csv
-rw-rw-r-- 1 campos campos 7206031 jul  6 17:00 2015.csv
-rw-rw-r-- 1 campos campos 7343514 jul  6 16:59 2016.csv
-rw-rw-r-- 1 campos campos 7339762 jul  6 16:58 2017.csv
-rw-rw-r-- 1 campos campos 6706729 jul  6 16:58 2018.csv
-rw-rw-r-- 1 campos campos 2390072 jul  6 16:57 2019.csv
