## Read data: geolocation dataset

In [93]:
import pandas as pd

# Load the raw geolocation CSV into a DataFrame
df_geolocation = pd.read_csv(
    '../../data/brazilian_e-commerce/olist_geolocation_dataset.csv'
)

## Data inspection: geolocation dataset

In [94]:
df_geolocation.head()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP


In [95]:
df_geolocation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   geolocation_zip_code_prefix  1000163 non-null  int64  
 1   geolocation_lat              1000163 non-null  float64
 2   geolocation_lng              1000163 non-null  float64
 3   geolocation_city             1000163 non-null  object 
 4   geolocation_state            1000163 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 38.2+ MB


In [96]:
df_geolocation.isnull().sum()

geolocation_zip_code_prefix    0
geolocation_lat                0
geolocation_lng                0
geolocation_city               0
geolocation_state              0
dtype: int64

In [97]:
df_geolocation.duplicated().sum()

np.int64(261831)

In [98]:
# Keep only the rows that exactly repeat an earlier row (first occurrences are not flagged)
duplicates = df_geolocation.loc[df_geolocation.duplicated(keep='first')]
print(duplicates)


         geolocation_zip_code_prefix  geolocation_lat  geolocation_lng  \
15                              1046       -23.546081       -46.644820   
44                              1046       -23.546081       -46.644820   
65                              1046       -23.546081       -46.644820   
66                              1009       -23.546935       -46.636588   
67                              1046       -23.546081       -46.644820   
...                              ...              ...              ...   
1000153                        99970       -28.343273       -51.873734   
1000154                        99950       -28.070493       -52.011342   
1000159                        99900       -27.877125       -52.224882   
1000160                        99950       -28.071855       -52.014716   
1000162                        99950       -28.070104       -52.018658   

        geolocation_city geolocation_state  
15             sao paulo                SP  
44             sao pa

In [99]:
df_geolocation['geolocation_state'].unique()

array(['SP', 'RN', 'AC', 'RJ', 'ES', 'MG', 'BA', 'SE', 'PE', 'AL', 'PB',
       'CE', 'PI', 'MA', 'PA', 'AP', 'AM', 'RR', 'DF', 'GO', 'RO', 'TO',
       'MT', 'MS', 'RS', 'PR', 'SC'], dtype=object)

In [100]:
df_geolocation['geolocation_state'].nunique()


27

In [101]:
df_geolocation['geolocation_city'].unique()

array(['sao paulo', 'são paulo', 'sao bernardo do campo', ..., 'ciríaco',
       'estação', 'vila lângaro'], shape=(8011,), dtype=object)

In [102]:
df_geolocation['geolocation_city'].nunique()

8011

In [131]:
df_geolocation['geolocation_zip_code_prefix'].nunique()

19015

In [103]:
# Collect the distinct raw city spellings recorded under each state code
cities_per_state = (
    df_geolocation
    .groupby('geolocation_state')['geolocation_city']
    .unique()
)
for state, cities in cities_per_state.items():
    # Only the first 10 spellings per state are printed to keep the output short
    preview = cities[:10]
    print(state, preview)

AC ['sao paulo' 'rio de janeiro' 'sena madureira' 'rio branco' 'feijo'
 'senador guiomard' 'cruzeiro do sul' 'xapuri' 'feijó' 'manoel urbano']
AL ['maceio' 'maceió' 'maceia³' 'barra de sao miguel' 'rio largo'
 'marechal deodoro' 'pilar' 'satuba' 'santa luzia do norte'
 'barra de são miguel']
AM ['manaus' 'parintins' 'itacoatiara' 'silves' 'rio preto da eva'
 'urucurituba' 'maues' 'nhamunda' 'boa vista do ramos' 'maués']
AP ['macapa' 'serra do navio' 'laranjal do jari' 'macapá' 'santana'
 'oiapoque' 'pedra branca do amapari' 'vitoria do jari' 'mazagao'
 'clevelândia do norte']
BA ['salvador' 'salvador ' 'lauro de freitas' 'madre de deus' 'abrantes'
 "dias d'avila" 'camacari' 'camaçari' "dias d'ávila" 'monte gordo']
CE ['fortaleza' 'caucaia' 'eusebio' 'eusébio' 'aquiraz' 'pacatuba'
 'itaitinga' 'guaiuba' 'maracanau' 'maranguape']
DF ['brasilia' 'brasília' 'cruzeiro' 'guara' 'guará' 'paranoa' 'lago norte'
 'lago sul' 'sao sebastiao' 'são sebastião']
ES ['vitória' 'vitoria' 'vila velha' 's

## Data cleaning: geolocation dataset

In [104]:
# Turn the ZIP prefix into text and left-pad it with zeros to a width of 5 characters
zip_prefix_text = df_geolocation['geolocation_zip_code_prefix'].astype(str)
df_geolocation['geolocation_zip_code_prefix'] = zip_prefix_text.str.zfill(5)
df_geolocation.head()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP


In [105]:
# Sort ascending by state, then city, then ZIP prefix, and renumber the rows from 0
sort_keys = ['geolocation_state', 'geolocation_city', 'geolocation_zip_code_prefix']
df_sorted = df_geolocation.sort_values(by=sort_keys, ignore_index=True)

df_sorted

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,69945,-10.074669,-67.055162,acrelandia,AC
1,69945,-10.080001,-67.053719,acrelandia,AC
2,69945,-9.937884,-66.928913,acrelandia,AC
3,69945,-10.077416,-67.052816,acrelandia,AC
4,69945,-10.077757,-67.049242,acrelandia,AC
...,...,...,...,...,...
1000158,77880,-6.412219,-48.531582,xambioá,TO
1000159,77880,-6.413853,-48.538046,xambioá,TO
1000160,77880,-6.407600,-48.534747,xambioá,TO
1000161,77880,-6.417654,-48.535332,xambioá,TO


In [106]:
import unicodedata

def has_accent(text):
    """Tell whether ``text`` contains at least one accented character.

    The string is decomposed with Unicode NFD normalization, which splits an
    accented letter into its base letter plus combining marks; any character
    in the "Mn" (nonspacing mark) category counts as an accent.

    Returns False for anything that is not a ``str``.
    """
    if isinstance(text, str):
        for ch in unicodedata.normalize("NFD", text):
            if unicodedata.category(ch) == "Mn":
                return True
    return False

# Flag every row whose raw city name carries an accent mark
df_geolocation['has_accent'] = df_geolocation['geolocation_city'].map(has_accent)

# Tally rows with and without accents in the city name
df_geolocation['has_accent'].value_counts()


has_accent
False    926726
True      73437
Name: count, dtype: int64

In [107]:
import unicodedata

def remove_accents(text):
    """Return ``text`` with its accent marks stripped.

    The string is decomposed with Unicode NFD normalization and every
    character in the "Mn" (nonspacing mark) category is dropped, leaving the
    base letters. Values that are not ``str`` are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    decomposed = unicodedata.normalize('NFD', text)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)

# Store an accent-free copy of every raw city name next to the original column
df_geolocation['geolocation_city_noaccent'] = (
    df_geolocation['geolocation_city'].map(remove_accents)
)
df_geolocation


Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state,has_accent,geolocation_city_noaccent
0,01037,-23.545621,-46.639292,sao paulo,SP,False,sao paulo
1,01046,-23.546081,-46.644820,sao paulo,SP,False,sao paulo
2,01046,-23.546129,-46.642951,sao paulo,SP,False,sao paulo
3,01041,-23.544392,-46.639499,sao paulo,SP,False,sao paulo
4,01035,-23.541578,-46.641607,sao paulo,SP,False,sao paulo
...,...,...,...,...,...,...,...
1000158,99950,-28.068639,-52.010705,tapejara,RS,False,tapejara
1000159,99900,-27.877125,-52.224882,getulio vargas,RS,False,getulio vargas
1000160,99950,-28.071855,-52.014716,tapejara,RS,False,tapejara
1000161,99980,-28.388932,-51.846871,david canabarro,RS,False,david canabarro


In [108]:
# Lowercase the accent-free city names and trim leading/trailing whitespace
city_noaccent = df_geolocation['geolocation_city_noaccent']
df_geolocation['geolocation_city_clean'] = city_noaccent.str.lower().str.strip()
df_geolocation

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state,has_accent,geolocation_city_noaccent,geolocation_city_clean
0,01037,-23.545621,-46.639292,sao paulo,SP,False,sao paulo,sao paulo
1,01046,-23.546081,-46.644820,sao paulo,SP,False,sao paulo,sao paulo
2,01046,-23.546129,-46.642951,sao paulo,SP,False,sao paulo,sao paulo
3,01041,-23.544392,-46.639499,sao paulo,SP,False,sao paulo,sao paulo
4,01035,-23.541578,-46.641607,sao paulo,SP,False,sao paulo,sao paulo
...,...,...,...,...,...,...,...,...
1000158,99950,-28.068639,-52.010705,tapejara,RS,False,tapejara,tapejara
1000159,99900,-27.877125,-52.224882,getulio vargas,RS,False,getulio vargas,getulio vargas
1000160,99950,-28.071855,-52.014716,tapejara,RS,False,tapejara,tapejara
1000161,99980,-28.388932,-51.846871,david canabarro,RS,False,david canabarro,david canabarro


In [109]:
# Collapse every run of whitespace inside a city name into a single space
df_geolocation['geolocation_city_clean'] = (
    df_geolocation['geolocation_city_clean']
    .str.replace(r'\s+', ' ', regex=True)
)
df_geolocation

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state,has_accent,geolocation_city_noaccent,geolocation_city_clean
0,01037,-23.545621,-46.639292,sao paulo,SP,False,sao paulo,sao paulo
1,01046,-23.546081,-46.644820,sao paulo,SP,False,sao paulo,sao paulo
2,01046,-23.546129,-46.642951,sao paulo,SP,False,sao paulo,sao paulo
3,01041,-23.544392,-46.639499,sao paulo,SP,False,sao paulo,sao paulo
4,01035,-23.541578,-46.641607,sao paulo,SP,False,sao paulo,sao paulo
...,...,...,...,...,...,...,...,...
1000158,99950,-28.068639,-52.010705,tapejara,RS,False,tapejara,tapejara
1000159,99900,-27.877125,-52.224882,getulio vargas,RS,False,getulio vargas,getulio vargas
1000160,99950,-28.071855,-52.014716,tapejara,RS,False,tapejara,tapejara
1000161,99980,-28.388932,-51.846871,david canabarro,RS,False,david canabarro,david canabarro


In [132]:
df_geolocation['geolocation_city_clean'].value_counts().head(20)

geolocation_city_clean
sao paulo                3183
brasilia                  509
rio de janeiro            401
salvador                  276
goiania                   236
belo horizonte            204
fortaleza                 175
curitiba                  167
porto alegre              142
guarulhos                 139
sao bernardo do campo     139
recife                    130
osasco                     90
belem                      86
campo grande               85
nova iguacu                83
sao goncalo                75
teresina                   74
campinas                   73
manaus                     73
Name: count, dtype: int64

In [110]:
from rapidfuzz import process

# Candidate spellings: every distinct cleaned city name in the geolocation table
reference_cities = df_geolocation['geolocation_city_clean'].unique()

def correct_city(city, min_score=85):
    """Return the entry of ``reference_cities`` that best matches ``city``.

    Parameters
    ----------
    city : city name to look up.
    min_score : minimum similarity score for the best match to be accepted.
        The default of 85 is the threshold this helper has always used.

    Returns
    -------
    The best-matching reference name when its score is at least
    ``min_score``; otherwise ``city`` unchanged. ``city`` is also returned
    unchanged when ``extractOne`` yields no match at all, where indexing the
    result used to raise ``TypeError``.

    ``reference_cities`` is read from the notebook namespace at call time.

    NOTE(review): the reference list is built from the same column this helper
    would correct, so each name is presumably matched to itself. This cell only
    defines the helper and does not apply it. Confirm whether it is still needed.
    """
    # With score_cutoff, extractOne returns None when nothing reaches the threshold
    match = process.extractOne(city, reference_cities, score_cutoff=min_score)
    return city if match is None else match[0]
df_geolocation

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state,has_accent,geolocation_city_noaccent,geolocation_city_clean
0,01037,-23.545621,-46.639292,sao paulo,SP,False,sao paulo,sao paulo
1,01046,-23.546081,-46.644820,sao paulo,SP,False,sao paulo,sao paulo
2,01046,-23.546129,-46.642951,sao paulo,SP,False,sao paulo,sao paulo
3,01041,-23.544392,-46.639499,sao paulo,SP,False,sao paulo,sao paulo
4,01035,-23.541578,-46.641607,sao paulo,SP,False,sao paulo,sao paulo
...,...,...,...,...,...,...,...,...
1000158,99950,-28.068639,-52.010705,tapejara,RS,False,tapejara,tapejara
1000159,99900,-27.877125,-52.224882,getulio vargas,RS,False,getulio vargas,getulio vargas
1000160,99950,-28.071855,-52.014716,tapejara,RS,False,tapejara,tapejara
1000161,99980,-28.388932,-51.846871,david canabarro,RS,False,david canabarro,david canabarro


In [111]:
def _most_frequent(values):
    """Return the value that occurs most often in ``values``."""
    return values.value_counts().idxmax()

# Most frequent raw city, cleaned city and state for each ZIP prefix.
# Each of the three columns is reduced independently of the others.
city_state_mode = (
    df_geolocation
    .groupby('geolocation_zip_code_prefix')
    .agg(dict.fromkeys(
        ['geolocation_city', 'geolocation_city_clean', 'geolocation_state'],
        _most_frequent,
    ))
)


In [112]:
# Average latitude and longitude of all rows sharing a ZIP prefix
coords_mean = (
    df_geolocation
    .groupby('geolocation_zip_code_prefix')[['geolocation_lat', 'geolocation_lng']]
    .mean()
)


In [113]:
# Combine the per-ZIP mean coordinates with the per-ZIP most frequent city/state.
# Both frames are indexed by ZIP prefix; the prefix becomes a regular column again.
# From here on `df_geolocation` holds this aggregated frame instead of the raw rows.
df_geolocation = coords_mean.join(city_state_mode, how='inner').reset_index()


In [114]:
df_geolocation

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_city_clean,geolocation_state
0,01001,-23.550190,-46.634024,sao paulo,sao paulo,SP
1,01002,-23.548146,-46.634979,sao paulo,sao paulo,SP
2,01003,-23.548994,-46.635731,sao paulo,sao paulo,SP
3,01004,-23.549799,-46.634757,sao paulo,sao paulo,SP
4,01005,-23.549456,-46.636733,sao paulo,sao paulo,SP
...,...,...,...,...,...,...
19010,99960,-27.953722,-52.025511,charrua,charrua,RS
19011,99965,-28.183372,-52.039850,agua santa,agua santa,RS
19012,99970,-28.343766,-51.874689,ciriaco,ciriaco,RS
19013,99980,-28.389129,-51.843836,david canabarro,david canabarro,RS


## Read data: customers

In [115]:
# Load the prepared customers table
df_customers = pd.read_csv(
    '../../data/brazilian_e-commerce/prep_customers.csv'
)
df_customers.head()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,customer_city_clean
0,7ae2a9337aa4bc799723511faa1d6830,0c1a20644f0dc126c3eaff8dbc1bd12c,1003,sao paulo,SP,sao paulo
1,a09edf8c1e842e94805a206b3d73eed5,968f6d2f674977d88a4b445a5117ccd8,1004,sao paulo,SP,sao paulo
2,ee9b73e88afb4904ee2322cfc89cf638,095e7c124c5c1ccb1eb9f731152eae6a,1004,sao paulo,SP,sao paulo
3,15d7dbcd027b5b24866db33e2b819021,ddab5650ba76e2fa2d7e25ed3343bd92,1005,sao paulo,SP,sao paulo
4,f3e31afdae80581be48ce94e7b0f3366,ded4351942c7fc292b88e5b090af2b46,1005,sao paulo,SP,sao paulo


In [116]:
# How often each cleaned city name occurs
city_counts = df_geolocation['geolocation_city_clean'].value_counts()

# Names that occur exactly once (likely typos); show the first 20
rare_cities = city_counts.loc[city_counts.eq(1)]
print(rare_cities.head(20))


geolocation_city_clean
pium                  1
caroebe               1
sao luiz              1
paranaiguara          1
rorainopolis          1
curuca                1
nazario               1
sao joao da baliza    1
novo jardim           1
moipora               1
amatura               1
jandaia               1
ouroana               1
montividiu            1
amorinopolis          1
manaquiri             1
anori                 1
anama                 1
codajas               1
caldas novas          1
Name: count, dtype: int64


## Data cleaning: Correct geolocation city using customer city

In [117]:
from rapidfuzz import process

# Reference city list from customers
reference_cities = df_customers['customer_city_clean'].unique()

def correct_geolocation_city(city, min_score=85):
    """Replace a geolocation city name with its closest customer city name.

    Parameters
    ----------
    city : cleaned geolocation city name to look up.
    min_score : minimum similarity score for the best match to be accepted.
        The default of 85 is the threshold this helper has always used.

    Returns
    -------
    The best-matching entry of ``reference_cities`` when its score is at
    least ``min_score``; otherwise ``city`` unchanged. ``city`` is also
    returned unchanged when ``extractOne`` yields no match at all, where
    indexing the result used to raise ``TypeError``.

    ``reference_cities`` is read from the notebook namespace at call time.

    NOTE(review): matching uses the name only, not the state, so a city could
    presumably be replaced by a similarly named city in another state. Confirm
    this is acceptable.
    """
    # With score_cutoff, extractOne returns None when nothing reaches the threshold
    match = process.extractOne(city, reference_cities, score_cutoff=min_score)
    return city if match is None else match[0]

# Overwrite the cleaned city column with the fuzzy-corrected names
df_geolocation['geolocation_city_clean'] = (
    df_geolocation['geolocation_city_clean'].map(correct_geolocation_city)
)
df_geolocation.head()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_city_clean,geolocation_state
0,1001,-23.55019,-46.634024,sao paulo,sao paulo,SP
1,1002,-23.548146,-46.634979,sao paulo,sao paulo,SP
2,1003,-23.548994,-46.635731,sao paulo,sao paulo,SP
3,1004,-23.549799,-46.634757,sao paulo,sao paulo,SP
4,1005,-23.549456,-46.636733,sao paulo,sao paulo,SP


In [120]:
df_geolocation[df_geolocation.duplicated(['geolocation_zip_code_prefix'], keep=False)]


Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_city_clean,geolocation_state


## Read data: states dataset

In [133]:
# Load the state-level reference table
df_states = pd.read_csv(
    '../../data/brazilian_e-commerce/states.csv'
)
df_states.head()

Unnamed: 0,UF,State,Capital,Region,Area,Population,Demographic Density,Cities count,GDP,GDP rate,Poverty,Latitude,Longitude
0,AC,Acre,Rio Branco,North,164123.73,881935,5.37,22,17201.95,0.5,0.189,-8.77,-70.55
1,AL,Alagoas,Maceió,Northeast,27843.295,3337357,119.86,102,15653.51,0.5,0.205,-9.62,-36.82
2,AM,Amazonas,Manaus,North,1559168.1,4144597,2.66,62,22936.28,0.7,0.193,-3.47,-65.1
3,AP,Amapá,Macapá,North,142470.77,845731,5.94,16,19405.11,0.6,0.128,1.41,-51.77
4,BA,Bahia,Salvador,Northeast,564722.6,14873064,26.34,417,17508.67,0.6,0.177,-13.29,-41.71


## Merge data: geolocation dataset + states dataset

In [122]:
# Attach the state-level columns to each geolocation row via the state code.
# A left join keeps every geolocation row, including any without a matching "UF".
df_geolocation_states = df_geolocation.merge(
    df_states,
    how="left",
    left_on="geolocation_state",
    right_on="UF",
)
df_geolocation_states.head()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_city_clean,geolocation_state,UF,State,Capital,Region,Area,Population,Demographic Density,Cities count,GDP,GDP rate,Poverty,Latitude,Longitude
0,1001,-23.55019,-46.634024,sao paulo,sao paulo,SP,SP,São Paulo,São Paulo,Southeast,248219.48,45919049,184.99,645,47008.77,1.5,0.027,-22.19,-48.79
1,1002,-23.548146,-46.634979,sao paulo,sao paulo,SP,SP,São Paulo,São Paulo,Southeast,248219.48,45919049,184.99,645,47008.77,1.5,0.027,-22.19,-48.79
2,1003,-23.548994,-46.635731,sao paulo,sao paulo,SP,SP,São Paulo,São Paulo,Southeast,248219.48,45919049,184.99,645,47008.77,1.5,0.027,-22.19,-48.79
3,1004,-23.549799,-46.634757,sao paulo,sao paulo,SP,SP,São Paulo,São Paulo,Southeast,248219.48,45919049,184.99,645,47008.77,1.5,0.027,-22.19,-48.79
4,1005,-23.549456,-46.636733,sao paulo,sao paulo,SP,SP,São Paulo,São Paulo,Southeast,248219.48,45919049,184.99,645,47008.77,1.5,0.027,-22.19,-48.79


## Data cleaning: geolocation_states dataset

In [123]:
# State-level columns that are not carried into the prepared table
state_columns_to_drop = [
    'UF',
    'Area',
    'Population',
    'Demographic Density',
    'Cities count',
    'GDP',
    'GDP rate',
    'Poverty',
    'Latitude',
    'Longitude',
]
df_geolocation_states = df_geolocation_states.drop(columns=state_columns_to_drop)
df_geolocation_states.head()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_city_clean,geolocation_state,State,Capital,Region
0,1001,-23.55019,-46.634024,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast
1,1002,-23.548146,-46.634979,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast
2,1003,-23.548994,-46.635731,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast
3,1004,-23.549799,-46.634757,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast
4,1005,-23.549456,-46.636733,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast


In [124]:
# Lowercase all column names
df_geolocation_states = df_geolocation_states.rename(columns=str.lower)
df_geolocation_states.head()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_city_clean,geolocation_state,state,capital,region
0,1001,-23.55019,-46.634024,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast
1,1002,-23.548146,-46.634979,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast
2,1003,-23.548994,-46.635731,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast
3,1004,-23.549799,-46.634757,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast
4,1005,-23.549456,-46.636733,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast


In [125]:
# Old column name -> new column name. The two-letter code moves to
# `geolocation_state_code`, and the full state name takes over `geolocation_state`.
column_renames = {
    'geolocation_city': 'geolocation_city_old',
    'geolocation_state': 'geolocation_state_code',
    'state': 'geolocation_state',
    'capital': 'geolocation_capital',
    'region': 'geolocation_region',
}
df_geolocation_states = df_geolocation_states.rename(columns=column_renames)
df_geolocation_states.head()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city_old,geolocation_city_clean,geolocation_state_code,geolocation_state,geolocation_capital,geolocation_region
0,1001,-23.55019,-46.634024,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast
1,1002,-23.548146,-46.634979,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast
2,1003,-23.548994,-46.635731,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast
3,1004,-23.549799,-46.634757,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast
4,1005,-23.549456,-46.636733,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast


In [127]:
# Column order of the prepared table
new_order = [
    'geolocation_zip_code_prefix',
    'geolocation_lat',
    'geolocation_lng',
    'geolocation_city_old',
    'geolocation_city_clean',
    'geolocation_state_code',
    'geolocation_state',
    'geolocation_capital',
    'geolocation_region',
]
df_geolocation_states = df_geolocation_states.loc[:, new_order]

df_geolocation_states.head()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city_old,geolocation_city_clean,geolocation_state_code,geolocation_state,geolocation_capital,geolocation_region
0,1001,-23.55019,-46.634024,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast
1,1002,-23.548146,-46.634979,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast
2,1003,-23.548994,-46.635731,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast
3,1004,-23.549799,-46.634757,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast
4,1005,-23.549456,-46.636733,sao paulo,sao paulo,SP,São Paulo,São Paulo,Southeast


In [128]:
# Lowercase the values of every text column and leave other columns untouched.
# The dtype is tested with pandas' own helper instead of `dtype == "object"`, so
# columns stored in a dedicated pandas string dtype are lowercased too, rather
# than being skipped silently.
df_geolocation_states = df_geolocation_states.apply(
    lambda col: col.str.lower() if pd.api.types.is_string_dtype(col.dtype) else col
)
df_geolocation_states.head()

Unnamed: 0,geolocation_zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city_old,geolocation_city_clean,geolocation_state_code,geolocation_state,geolocation_capital,geolocation_region
0,1001,-23.55019,-46.634024,sao paulo,sao paulo,sp,são paulo,são paulo,southeast
1,1002,-23.548146,-46.634979,sao paulo,sao paulo,sp,são paulo,são paulo,southeast
2,1003,-23.548994,-46.635731,sao paulo,sao paulo,sp,são paulo,são paulo,southeast
3,1004,-23.549799,-46.634757,sao paulo,sao paulo,sp,são paulo,são paulo,southeast
4,1005,-23.549456,-46.636733,sao paulo,sao paulo,sp,são paulo,são paulo,southeast


## Save data: geolocation_states dataset

In [130]:
# Write the prepared table to 'prep_geolocation.csv' in the current working
# directory, without the DataFrame index.
# NOTE(review): the ZIP prefix is zero-padded text at this point; a plain
# pd.read_csv will presumably re-infer it as an integer unless a dtype is given.
# Confirm how consumers read this file.
df_geolocation_states.to_csv('prep_geolocation.csv', index=False)

In [None]:
from sqlalchemy import create_engine, types
from sqlalchemy import text # to be able to pass string

In [None]:
# Let's load values from the .env file
from dotenv import dotenv_values

config = dotenv_values()

# Login variables, looked up in this order (align the key labels with your .env file!)
pg_user, pg_host, pg_port, pg_db, pg_schema, pg_pass = (
    config[env_key]
    for env_key in (
        'POSTGRES_USER',
        'POSTGRES_HOST',
        'POSTGRES_PORT',
        'POSTGRES_DB',
        'POSTGRES_SCHEMA',
        'POSTGRES_PASS',
    )
)

In [None]:
# Build the connection URL from its parts rather than by string formatting.
# URL.create escapes special characters (such as '@', ':', '/' or '%') in the
# user name and password, which would otherwise produce a malformed URL.
from sqlalchemy.engine import URL

url = URL.create(
    drivername='postgresql',
    username=pg_user,
    password=pg_pass,
    host=pg_host,
    port=int(pg_port) if pg_port else None,  # .env values are strings
    database=pg_db,
)

# echo=False keeps SQL statement logging switched off
engine = create_engine(url, echo=False)

# Write the DataFrame to table `prep_geolocation` in the configured schema.
# An existing table of that name is replaced; the DataFrame index is not written.
df_geolocation_states.to_sql(
    name='prep_geolocation',
    con=engine,
    schema=pg_schema,
    if_exists='replace',
    index=False,
)