## read data: download states dataset from sql database

In [1]:
from sqlalchemy import create_engine
from sqlalchemy import text 
import pandas as pd

In [2]:
from dotenv import dotenv_values

config = dotenv_values()

pg_user = config['POSTGRES_USER']  
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [3]:
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}' #the same like version 1

engine = create_engine(url, echo=False) 

my_schema = 'team_jjat'

with engine.begin() as conn: 
    result = conn.execute(text(f'SET search_path TO {my_schema};'))

In [4]:
prep_states = pd.read_sql('SELECT * FROM states;', con=engine)
prep_states.head()

Unnamed: 0,UF,State,Capital,Region,Area,Population,Demographic Density,Cities count,GDP,GDP rate,Poverty,Latitude,Longitude
0,AC,Acre,Rio Branco,North,164123.73,881935,5.37,22,17201.95,0.5,0.189,-8.77,-70.55
1,AL,Alagoas,Maceió,Northeast,27843.295,3337357,119.86,102,15653.51,0.5,0.205,-9.62,-36.82
2,AM,Amazonas,Manaus,North,1559168.1,4144597,2.66,62,22936.28,0.7,0.193,-3.47,-65.1
3,AP,Amapá,Macapá,North,142470.77,845731,5.94,16,19405.11,0.6,0.128,1.41,-51.77
4,BA,Bahia,Salvador,Northeast,564722.6,14873064,26.34,417,17508.67,0.6,0.177,-13.29,-41.71


## Data inspection

In [5]:
prep_states.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   UF                   27 non-null     object 
 1   State                27 non-null     object 
 2   Capital              27 non-null     object 
 3   Region               27 non-null     object 
 4   Area                 27 non-null     float64
 5   Population           27 non-null     int64  
 6   Demographic Density  27 non-null     float64
 7   Cities count         27 non-null     int64  
 8   GDP                  27 non-null     float64
 9   GDP rate             27 non-null     float64
 10  Poverty              27 non-null     float64
 11  Latitude             27 non-null     float64
 12  Longitude            27 non-null     float64
dtypes: float64(7), int64(2), object(4)
memory usage: 2.9+ KB


In [6]:
prep_states.isnull().sum()

UF                     0
State                  0
Capital                0
Region                 0
Area                   0
Population             0
Demographic Density    0
Cities count           0
GDP                    0
GDP rate               0
Poverty                0
Latitude               0
Longitude              0
dtype: int64

In [7]:
prep_states.duplicated().sum()

np.int64(0)

In [8]:
prep_states['UF'].nunique()

27

## Data cleaning

In [9]:
# Convert all column names to lowercase
prep_states.columns = prep_states.columns.str.lower()

# Verify
print(prep_states.columns)

Index(['uf', 'state', 'capital', 'region', 'area', 'population',
       'demographic density', 'cities count', 'gdp', 'gdp rate', 'poverty',
       'latitude', 'longitude'],
      dtype='object')


In [11]:
prep_states = prep_states.applymap(
    lambda x: x.lower() if isinstance(x, str) else x
)
prep_states

  prep_states = prep_states.applymap(


Unnamed: 0,uf,state,capital,region,area,population,demographic density,cities count,gdp,gdp rate,poverty,latitude,longitude
0,ac,acre,rio branco,north,164123.73,881935,5.37,22,17201.95,0.5,0.189,-8.77,-70.55
1,al,alagoas,maceió,northeast,27843.295,3337357,119.86,102,15653.51,0.5,0.205,-9.62,-36.82
2,am,amazonas,manaus,north,1559168.1,4144597,2.66,62,22936.28,0.7,0.193,-3.47,-65.1
3,ap,amapá,macapá,north,142470.77,845731,5.94,16,19405.11,0.6,0.128,1.41,-51.77
4,ba,bahia,salvador,northeast,564722.6,14873064,26.34,417,17508.67,0.6,0.177,-13.29,-41.71
5,ce,ceará,fortaleza,northeast,148894.75,9132078,61.33,184,16394.99,0.5,0.184,-5.2,-39.53
6,df,distrito federal,brasília,center-west,5760.783,3015268,523.41,1,80502.47,2.5,0.019,-15.83,-47.86
7,es,espírito santo,vitória,southeast,46074.445,4018650,87.22,78,28222.56,0.9,0.043,-19.19,-40.34
8,go,goiás,goiânia,center-west,340125.72,7018354,20.63,246,28308.77,0.9,0.037,-15.98,-49.86
9,ma,maranhão,são luís,northeast,329642.16,7075181,21.46,217,12788.75,0.4,0.263,-5.42,-45.44


In [12]:
prep_states.groupby('region')['state'].unique()


region
center-west    [distrito federal, goiás, mato grosso do sul, ...
north          [acre, amazonas, amapá, pará, rondônia, roraim...
northeast      [alagoas, bahia, ceará, maranhão, paraíba, per...
south                [paraná, rio grande do sul, santa catarina]
southeast      [espírito santo, minas gerais, rio de janeiro,...
Name: state, dtype: object

In [13]:
prep_states= prep_states.sort_values("region", ascending=True)
prep_states

Unnamed: 0,uf,state,capital,region,area,population,demographic density,cities count,gdp,gdp rate,poverty,latitude,longitude
12,mt,mato grosso,cuiabá,center-west,903207.0,3484466,3.86,141,37914.0,1.2,0.059,-12.64,-55.42
11,ms,mato grosso do sul,campo grande,center-west,357145.53,2778986,7.78,79,35520.45,1.1,0.05,-20.51,-54.54
8,go,goiás,goiânia,center-west,340125.72,7018354,20.63,246,28308.77,0.9,0.037,-15.98,-49.86
6,df,distrito federal,brasília,center-west,5760.783,3015268,523.41,1,80502.47,2.5,0.019,-15.83,-47.86
0,ac,acre,rio branco,north,164123.73,881935,5.37,22,17201.95,0.5,0.189,-8.77,-70.55
21,rr,roraima,boa vista,north,224273.83,605761,2.7,15,23158.06,0.7,0.179,1.99,-61.33
20,ro,rondônia,porto velho,north,237765.23,1777225,7.47,52,24092.81,0.8,0.079,-10.83,-63.34
13,pa,pará,belém,north,1245759.2,8602865,6.91,144,18549.33,0.6,0.192,-3.79,-52.48
26,to,tocantins,palmas,north,277720.4,1572866,5.66,139,21998.34,0.7,0.119,-9.46,-48.26
2,am,amazonas,manaus,north,1559168.1,4144597,2.66,62,22936.28,0.7,0.193,-3.47,-65.1


## Save data