## Read data: pop_municipalities_2018

In [19]:
import pandas as pd

df_population = pd.read_csv('../data/brazilian_e-commerce/pop_municipalities_2018.csv')

df_population.head()

Unnamed: 0,ibge_code,city_name_ibge,city_population_2018
0,1100015,Alta Floresta D'Oeste - RO,23167.0
1,1100023,Ariquemes - RO,106168.0
2,1100031,Cabixi - RO,5438.0
3,1100049,Cacoal - RO,84813.0
4,1100056,Cerejeiras - RO,16444.0


## Data inspection

In [20]:
df_population.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5571 entries, 0 to 5570
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ibge_code             5571 non-null   int64  
 1   city_name_ibge        5571 non-null   object 
 2   city_population_2018  5570 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 130.7+ KB


In [21]:
df_population.isnull().sum()

ibge_code               0
city_name_ibge          0
city_population_2018    1
dtype: int64

In [22]:
df_population.duplicated().sum()

np.int64(0)

## Data cleaning:

In [23]:
df_population[['city_name', 'state_code']] = df_population['city_name_ibge'].str.split(' - ', expand=True)

df_population

Unnamed: 0,ibge_code,city_name_ibge,city_population_2018,city_name,state_code
0,1100015,Alta Floresta D'Oeste - RO,23167.0,Alta Floresta D'Oeste,RO
1,1100023,Ariquemes - RO,106168.0,Ariquemes,RO
2,1100031,Cabixi - RO,5438.0,Cabixi,RO
3,1100049,Cacoal - RO,84813.0,Cacoal,RO
4,1100056,Cerejeiras - RO,16444.0,Cerejeiras,RO
...,...,...,...,...,...
5566,5222005,Vianópolis - GO,13746.0,Vianópolis,GO
5567,5222054,Vicentinópolis - GO,8611.0,Vicentinópolis,GO
5568,5222203,Vila Boa - GO,6026.0,Vila Boa,GO
5569,5222302,Vila Propício - GO,5758.0,Vila Propício,GO


In [24]:
import unicodedata
import re

def clean_city(name):
    name = name.lower().strip()
    name = ''.join(c for c in unicodedata.normalize('NFD', name) if unicodedata.category(c) != 'Mn')
    name = re.sub(r'[^a-z0-9\s]', '', name)
    name = re.sub(r'\s+', ' ', name)
    return name

In [25]:
df_population['city_name_clean'] = df_population['city_name'].apply(clean_city)

df_population

Unnamed: 0,ibge_code,city_name_ibge,city_population_2018,city_name,state_code,city_name_clean
0,1100015,Alta Floresta D'Oeste - RO,23167.0,Alta Floresta D'Oeste,RO,alta floresta doeste
1,1100023,Ariquemes - RO,106168.0,Ariquemes,RO,ariquemes
2,1100031,Cabixi - RO,5438.0,Cabixi,RO,cabixi
3,1100049,Cacoal - RO,84813.0,Cacoal,RO,cacoal
4,1100056,Cerejeiras - RO,16444.0,Cerejeiras,RO,cerejeiras
...,...,...,...,...,...,...
5566,5222005,Vianópolis - GO,13746.0,Vianópolis,GO,vianopolis
5567,5222054,Vicentinópolis - GO,8611.0,Vicentinópolis,GO,vicentinopolis
5568,5222203,Vila Boa - GO,6026.0,Vila Boa,GO,vila boa
5569,5222302,Vila Propício - GO,5758.0,Vila Propício,GO,vila propicio


In [26]:
# List of unnecessary columns from states table
columns_to_drop = [
    'city_name_ibge',
    'city_name',
]

# Drop them
df_population.drop(columns=columns_to_drop, inplace=True)

df_population.head()

Unnamed: 0,ibge_code,city_population_2018,state_code,city_name_clean
0,1100015,23167.0,RO,alta floresta doeste
1,1100023,106168.0,RO,ariquemes
2,1100031,5438.0,RO,cabixi
3,1100049,84813.0,RO,cacoal
4,1100056,16444.0,RO,cerejeiras


## City classification

In [27]:
def classify_city(pop):
    if pop < 20000:
        return 'remote'
    elif pop < 100000:
        return 'small_city'
    elif pop < 500000:
        return 'medium_city'
    else:
        return 'large_urban'  

df_population['city_size_category']=df_population['city_population_2018'].apply(classify_city)
df_population.head()

Unnamed: 0,ibge_code,city_population_2018,state_code,city_name_clean,city_size_category
0,1100015,23167.0,RO,alta floresta doeste,small_city
1,1100023,106168.0,RO,ariquemes,medium_city
2,1100031,5438.0,RO,cabixi,remote
3,1100049,84813.0,RO,cacoal,small_city
4,1100056,16444.0,RO,cerejeiras,remote


In [28]:
df_population['city_name_clean'].value_counts()

city_name_clean
sao domingos            5
bom jesus               5
santa helena            4
santa terezinha         4
bonito                  4
                       ..
coracao de maria        1
contendas do sincora    1
condeuba                1
conceicao do jacuipe    1
brasilia                1
Name: count, Length: 5290, dtype: int64

In [29]:
df_population[df_population['city_name_clean'] == 'sao paulo']


Unnamed: 0,ibge_code,city_population_2018,state_code,city_name_clean,city_size_category
3829,3550308,12176866.0,SP,sao paulo,large_urban


In [30]:
df_population['city_size_category'].value_counts()

city_size_category
remote         3808
small_city     1445
medium_city     271
large_urban      47
Name: count, dtype: int64

In [31]:
df_population = df_population.apply(lambda x: x.str.lower() if x.dtype == "object" else x)

df_population.head()

Unnamed: 0,ibge_code,city_population_2018,state_code,city_name_clean,city_size_category
0,1100015,23167.0,ro,alta floresta doeste,small_city
1,1100023,106168.0,ro,ariquemes,medium_city
2,1100031,5438.0,ro,cabixi,remote
3,1100049,84813.0,ro,cacoal,small_city
4,1100056,16444.0,ro,cerejeiras,remote


## read data: states dataset

In [32]:
df_states=pd.read_csv('../data/brazilian_e-commerce/states.csv')
df_states.head()

Unnamed: 0,UF,State,Capital,Region,Area,Population,Demographic Density,Cities count,GDP,GDP rate,Poverty,Latitude,Longitude
0,AC,Acre,Rio Branco,North,164123.73,881935,5.37,22,17201.95,0.5,0.189,-8.77,-70.55
1,AL,Alagoas,Maceió,Northeast,27843.295,3337357,119.86,102,15653.51,0.5,0.205,-9.62,-36.82
2,AM,Amazonas,Manaus,North,1559168.1,4144597,2.66,62,22936.28,0.7,0.193,-3.47,-65.1
3,AP,Amapá,Macapá,North,142470.77,845731,5.94,16,19405.11,0.6,0.128,1.41,-51.77
4,BA,Bahia,Salvador,Northeast,564722.6,14873064,26.34,417,17508.67,0.6,0.177,-13.29,-41.71


## Data cleaning: states dataset

In [33]:
df_states = df_states.apply(lambda x: x.str.lower() if x.dtype == "object" else x)
df_states.head()

Unnamed: 0,UF,State,Capital,Region,Area,Population,Demographic Density,Cities count,GDP,GDP rate,Poverty,Latitude,Longitude
0,ac,acre,rio branco,north,164123.73,881935,5.37,22,17201.95,0.5,0.189,-8.77,-70.55
1,al,alagoas,maceió,northeast,27843.295,3337357,119.86,102,15653.51,0.5,0.205,-9.62,-36.82
2,am,amazonas,manaus,north,1559168.1,4144597,2.66,62,22936.28,0.7,0.193,-3.47,-65.1
3,ap,amapá,macapá,north,142470.77,845731,5.94,16,19405.11,0.6,0.128,1.41,-51.77
4,ba,bahia,salvador,northeast,564722.6,14873064,26.34,417,17508.67,0.6,0.177,-13.29,-41.71


## Merge data: population dataset + states dataset

In [34]:
prep_population_states = pd.merge(
    df_population,
    df_states,
    how="left",
    left_on="state_code",
    right_on="UF"
)
prep_population_states.head()

Unnamed: 0,ibge_code,city_population_2018,state_code,city_name_clean,city_size_category,UF,State,Capital,Region,Area,Population,Demographic Density,Cities count,GDP,GDP rate,Poverty,Latitude,Longitude
0,1100015,23167.0,ro,alta floresta doeste,small_city,ro,rondônia,porto velho,north,237765.23,1777225,7.47,52,24092.81,0.8,0.079,-10.83,-63.34
1,1100023,106168.0,ro,ariquemes,medium_city,ro,rondônia,porto velho,north,237765.23,1777225,7.47,52,24092.81,0.8,0.079,-10.83,-63.34
2,1100031,5438.0,ro,cabixi,remote,ro,rondônia,porto velho,north,237765.23,1777225,7.47,52,24092.81,0.8,0.079,-10.83,-63.34
3,1100049,84813.0,ro,cacoal,small_city,ro,rondônia,porto velho,north,237765.23,1777225,7.47,52,24092.81,0.8,0.079,-10.83,-63.34
4,1100056,16444.0,ro,cerejeiras,remote,ro,rondônia,porto velho,north,237765.23,1777225,7.47,52,24092.81,0.8,0.079,-10.83,-63.34


## Data cleaning: merged dataset

In [35]:
prep_population_states = prep_population_states.drop (columns = [
    'UF', 
    'Area',
    'Population',
    'Demographic Density',
    'Cities count',
    'GDP',
    'GDP rate',
    'Poverty',
    'Latitude',
    'Longitude'
    ])
prep_population_states.head()

Unnamed: 0,ibge_code,city_population_2018,state_code,city_name_clean,city_size_category,State,Capital,Region
0,1100015,23167.0,ro,alta floresta doeste,small_city,rondônia,porto velho,north
1,1100023,106168.0,ro,ariquemes,medium_city,rondônia,porto velho,north
2,1100031,5438.0,ro,cabixi,remote,rondônia,porto velho,north
3,1100049,84813.0,ro,cacoal,small_city,rondônia,porto velho,north
4,1100056,16444.0,ro,cerejeiras,remote,rondônia,porto velho,north


In [36]:
prep_population_states.columns= prep_population_states.columns.str.lower()
prep_population_states.head()

Unnamed: 0,ibge_code,city_population_2018,state_code,city_name_clean,city_size_category,state,capital,region
0,1100015,23167.0,ro,alta floresta doeste,small_city,rondônia,porto velho,north
1,1100023,106168.0,ro,ariquemes,medium_city,rondônia,porto velho,north
2,1100031,5438.0,ro,cabixi,remote,rondônia,porto velho,north
3,1100049,84813.0,ro,cacoal,small_city,rondônia,porto velho,north
4,1100056,16444.0,ro,cerejeiras,remote,rondônia,porto velho,north


In [37]:
prep_population_states = prep_population_states.rename(columns={
    'city_population_2018':'city_population'
})
prep_population_states.head()

Unnamed: 0,ibge_code,city_population,state_code,city_name_clean,city_size_category,state,capital,region
0,1100015,23167.0,ro,alta floresta doeste,small_city,rondônia,porto velho,north
1,1100023,106168.0,ro,ariquemes,medium_city,rondônia,porto velho,north
2,1100031,5438.0,ro,cabixi,remote,rondônia,porto velho,north
3,1100049,84813.0,ro,cacoal,small_city,rondônia,porto velho,north
4,1100056,16444.0,ro,cerejeiras,remote,rondônia,porto velho,north


In [38]:
new_order = [
     'ibge_code',
     'city_name_clean',
     'city_population',
     'city_size_category',
     'state_code',
     'state',
     'capital',
     'region'
 ]
prep_population_states = prep_population_states[new_order]

prep_population_states.head()

Unnamed: 0,ibge_code,city_name_clean,city_population,city_size_category,state_code,state,capital,region
0,1100015,alta floresta doeste,23167.0,small_city,ro,rondônia,porto velho,north
1,1100023,ariquemes,106168.0,medium_city,ro,rondônia,porto velho,north
2,1100031,cabixi,5438.0,remote,ro,rondônia,porto velho,north
3,1100049,cacoal,84813.0,small_city,ro,rondônia,porto velho,north
4,1100056,cerejeiras,16444.0,remote,ro,rondônia,porto velho,north


## Save dataset

In [39]:
prep_population_states.to_csv('prep_city_state_population.csv', index=False)

In [40]:
from sqlalchemy import create_engine, types
from sqlalchemy import text # to be able to pass string

In [41]:
# Let's load values from the .env file
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']

In [42]:
# updating the url
url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}' #the same like version 1

# let's switch the logging off again. .
engine = create_engine(url, echo=False) #the same like version 1


# writing dataframe to DB : Pandas Dataframe to DB Table in my own Schema
prep_population_states.to_sql(name = 'prep_city_state_population', 
                       con = engine, 
                       schema = pg_schema, # pandas is allowing to specify, in which schema the table shall be created
                       if_exists='replace', 
                       index=False
                      )

571