This script builds a lookup table (df_lookup) that maps postal codes to cities and uses it to fill in the missing city values in df (based on the users table of bigquery-public-data.thelook_ecommerce, loaded here from users_cleaned.csv). The completed table, df_final, is saved as users_cleaned2.csv.

In [25]:
#library import
import pandas as pd
import requests
import time
# Lift pandas' display limits so printed frames show every row and column.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

In [26]:
#upload and dataset visualization
# postal_code is pinned to str: the cells below compare it against string
# literals (e.g. '29142') and use it inside tuple keys made of strings, so it
# must never be left to pandas' type inference, which can yield numeric or
# mixed-type values for a column that holds both "69980-000" and "41130".
df = pd.read_csv("users_cleaned.csv", dtype={"postal_code": str})
df.head()

Unnamed: 0,user_id,first_name,last_name,email,age,gender,state,street_address,postal_code,city,country,traffic_source,created_at,user_geom
0,75822,Janet,Boyd,janetboyd@example.com,43,F,Acre,91843 Mcdaniel Ways,69980-000,,Brasil,Facebook,2019-06-08 11:39:00,POINT(-72.87094866 -8.065346116)
1,36479,Christopher,Hardy,christopherhardy@example.org,33,M,Acre,69363 Myers Junctions,69980-000,,Brasil,Search,2019-07-18 01:03:00,POINT(-72.87094866 -8.065346116)
2,35495,Eric,Shelton,ericshelton@example.com,41,M,Acre,178 Paul Row,69980-000,,Brasil,Search,2019-10-01 02:06:00,POINT(-72.87094866 -8.065346116)
3,97574,Audrey,Patterson,audreypatterson@example.com,57,F,Acre,608 Rachel Orchard,69980-000,,Brasil,Search,2024-02-12 15:00:00,POINT(-72.87094866 -8.065346116)
4,82326,Lisa,Alexander,lisaalexander@example.org,59,F,Acre,189 Rebecca Ferry Suite 092,69980-000,,Brasil,Search,2025-05-14 09:21:00,POINT(-72.87094866 -8.065346116)


In [27]:
#one-off data correction: Brazilian rows with the bare postal code '29142'
#get the city 'Cariacica' and the postal code '29142-000'
is_bare_29142 = df['postal_code'].eq('29142') & df['country'].eq('Brasil')
df.loc[is_bare_29142, ['city', 'postal_code']] = ['Cariacica', '29142-000']

In [28]:
#dictionary keyed by (postal_code, country) tuples for problematic and
#duplicated postal codes; the country is part of the key because the same
#code (e.g. "29140") appears under more than one country
_POSTAL_CITY_OVERRIDES = [
    # (postal_code, country, city)
    ("29140", "France", "Rosporden"),
    ("29140", "Spain", "Málaga"),
    ("38300", "France", "Bourgoin-Jallieu"),
    ("38300", "Spain", "La Orotava"),
    ("13090", "France", "Aix-en-Provence"),
    ("13090", "United States", "Liverpool"),
    ("3581", "Spain", "L'Alfàs del Pi"),
    ("30045", "China", "Xiqing"),
    ("300384", "China", "Xiqing"),
    ("46394", "Spain", "Riba-roja de Túria"),
]
# NOTE(review): "3581" and "30045" are shorter than the other codes listed for
# the same countries — confirm they match the raw postal_code values.
fix_map_country = {
    (postal_code, country): city
    for postal_code, country, city in _POSTAL_CITY_OVERRIDES
}

In [29]:
#dictionary application: rows whose (postal_code, country) pair is a key of
#fix_map_country take the mapped city, every other row keeps its current one
df["city"] = [
    fix_map_country.get((postal_code, country), current_city)
    for postal_code, country, current_city in zip(
        df["postal_code"], df["country"], df["city"]
    )
]

In [30]:
#distinct postal codes of the rows that still have no city
city_is_missing = df['city'].isna()
missing_codes = df.loc[city_is_missing, 'postal_code'].unique()
print(missing_codes)


['69980-000' '57360-000' '68924-000' '69630-000' '69880-000' '69600-000'
 '69830-000' '69460-000' '69800-000' '69280-000' '69435-000' '69250-000'
 '69735-000' '69200-000' '69230-000' '69190-000' '41130' '50197' '50191'
 '50059' '50016' '47240-000' '47350-000' '48760-000' '44330-000'
 '48790-000' '48475-000' '38434' '25191' '62150-000' '63610-000'
 '63560-000' '63540-000' '46988' '28946' '28947' '28909' '28524' '19803'
 '72016-190' '72110-600' '72025-065' '32060' '33579' '34482' '34476'
 '32092' '32826' '32828' '32940' '33437' '15174' '30102' '30028' '30093'
 '30044' '30016' '30045' '75370-000' '73900-000' '631-830' '631-820'
 '631-810' '630-850' '630-857' '630-856' '630-040' '630-520' '630-492'
 '630-490' '630-500' '642-370' '641-465' '641-860' '642-315' '641-920'
 '70706' '65380-000' '65393-000' '65365-000' '65278-000' '65725-000'
 '65415-000' '65690-000' '65530-000' '78850-000' '48044' '39260-000'
 '88201' '12065' '33129' '27537' '84600-000' '68250-000' '68230-000'
 '68330-000' '6830

In [31]:
#looking for city by postal code and country using OpenStreetMap API function definition
def get_city_osm(postal_code, country):
    """Look up the locality of a postal code through the Nominatim search API.

    Parameters
    ----------
    postal_code : value sent as the ``postalcode`` query parameter.
    country : value sent as the ``country`` query parameter.

    Returns
    -------
    The first truthy value among the ``city``, ``town``, ``village`` and
    ``municipality`` fields of the first match's address. ``None`` when the
    request fails, the status is not 200, the body is not usable JSON, or
    there is no match.
    """
    url = "https://nominatim.openstreetmap.org/search"
    params = {
        "postalcode": postal_code,
        "country": country,
        "format": "json",
        "addressdetails": 1,
        "limit": 1
    }

    headers = {
        "User-Agent": "AlessandroPostalLookup/1.0"
    }

    try:
        r = requests.get(url, params=params, headers=headers, timeout=10)
        if r.status_code != 200:
            return None
        matches = r.json()  # decode the body once and reuse it
    except (requests.RequestException, ValueError):
        # Only network/HTTP errors and undecodable JSON are treated as
        # "no result"; KeyboardInterrupt and programming errors propagate.
        return None

    # The code below expects a list of match objects; anything else is a miss.
    if not isinstance(matches, list) or not matches:
        return None
    first_match = matches[0]
    addr = first_match.get("address") if isinstance(first_match, dict) else None
    if not isinstance(addr, dict):
        return None

    return (
        addr.get("city") or
        addr.get("town") or
        addr.get("village") or
        addr.get("municipality")
    )

In [32]:
#query OpenStreetMap once per distinct (postal_code, country) pair without a city
results = []

missing_rows = df.loc[df['city'].isna(), ['postal_code', 'country']].drop_duplicates()

for pc, country in missing_rows.itertuples(index=False, name=None):
    city = get_city_osm(pc, country)
    results.append({"postal_code": pc, "city": city})
    print(f"{pc} ({country}) → {city}")
    time.sleep(1)  #pause between calls to respect the OSM rate limit

69980-000 (Brasil) → Cruzeiro do Sul
57360-000 (Brasil) → Girau do Ponciano
68924-000 (Brasil) → Vitória do Jari
69630-000 (Brasil) → Benjamin Constant
69880-000 (Brasil) → Eirunepé
69600-000 (Brasil) → São Paulo de Olivença
69830-000 (Brasil) → Lábrea
69460-000 (Brasil) → Coari
69800-000 (Brasil) → Humaitá
69280-000 (Brasil) → None
69435-000 (Brasil) → None
69250-000 (Brasil) → Careiro
69735-000 (Brasil) → Presidente Figueiredo
69200-000 (Brasil) → Borba
69230-000 (Brasil) → None
69190-000 (Brasil) → Jacareacanga
41130 (Spain) → La Puebla del Río
50197 (Spain) → Zaragoza
50191 (Spain) → Zaragoza
50059 (Spain) → Zaragoza
50016 (Spain) → Zaragoza
47240-000 (Brasil) → None
47350-000 (Brasil) → Sento Sé
48760-000 (Brasil) → Araci
44330-000 (Brasil) → São Gonçalo dos Campos
48790-000 (Brasil) → Tucano
48475-000 (Brasil) → Itapicuru
38434 (Spain) → Icod de los Vinos
25191 (Spain) → Lleida
62150-000 (Brasil) → None
63610-000 (Brasil) → Mombaça
63560-000 (Brasil) → Acopiara
63540-000 (Brasil)

In [33]:
#collect the lookup records (postal_code, city) into a dataframe
df_lookup = pd.DataFrame(data=results)

In [34]:
#merging the lookup dataframe with the original dataframe
# `results` was appended in the same order as the rows of `missing_rows`, so
# each lookup row is given the country it was actually queried with. Joining
# on postal_code alone would pair a city with every country that shares the
# code. Rebuilding from `results` also makes this cell safe to re-run.
lookup_keys = missing_rows.reset_index(drop=True)
df_lookup = pd.DataFrame(results)
if len(df_lookup) != len(lookup_keys):
    raise ValueError(
        "results and missing_rows are out of sync; re-run the OSM lookup cell"
    )
df_lookup['country'] = lookup_keys['country']

# One state per (postal_code, country) so the merge cannot multiply lookup rows.
state_by_key = (
    df[['postal_code', 'country', 'state']]
    .drop_duplicates(subset=['postal_code', 'country'])
)
df_lookup = df_lookup.merge(
    state_by_key,
    on=['postal_code', 'country'],
    how='left'
)

In [13]:
# Plain-text dump of the lookup table built so far.
# NOTE(review): this cell's execution counter (13) is out of sequence with the
# surrounding cells, so its saved output may come from an earlier run —
# confirm with Restart & Run All.
print(df_lookup)

    postal_code                        city        country  \
0     69980-000             Cruzeiro do Sul         Brasil   
1     57360-000           Girau do Ponciano         Brasil   
2     68924-000             Vitória do Jari         Brasil   
3     69630-000           Benjamin Constant         Brasil   
4     69880-000                    Eirunepé         Brasil   
5     69600-000       São Paulo de Olivença         Brasil   
6     69830-000                      Lábrea         Brasil   
7     69460-000                       Coari         Brasil   
8     69800-000                     Humaitá         Brasil   
9     69280-000                        None         Brasil   
10    69435-000                        None         Brasil   
11    69250-000                     Careiro         Brasil   
12    69735-000       Presidente Figueiredo         Brasil   
13    69200-000                       Borba         Brasil   
14    69230-000                        None         Brasil   
15    69

In [35]:
#lookup rows whose city is still unresolved, plus how many there are
missing_info = df_lookup.loc[df_lookup['city'].isna()].copy()
print(f"{missing_info}\nLength: {len(missing_info)}")

    postal_code  city        country             state
9     69280-000  None         Brasil          Amazonas
10    69435-000  None         Brasil          Amazonas
14    69230-000  None         Brasil          Amazonas
21    47240-000  None         Brasil             Bahia
29    62150-000  None         Brasil             Ceará
38        19803  None  United States          Delaware
42        32060  None  United States           Florida
43        33579  None  United States           Florida
44        34482  None  United States           Florida
45        34476  None  United States           Florida
46        32092  None  United States           Florida
48        32828  None  United States           Florida
49        32940  None  United States           Florida
50        33437  None  United States           Florida
52        30102  None  United States           Georgia
53        30028  None  United States           Georgia
54        30093  None  United States           Georgia
55        

In [36]:
#keep the unresolved postal codes that belong to the United States or Brazil
retry_countries = ['United States', 'Brasil', 'Brazil']
in_retry_country = missing_info['country'].isin(retry_countries)
to_retry = missing_info.loc[in_retry_country, 'postal_code']

In [37]:
#function for city extraction from brasilapi
def get_city_br(postal_code):
    """Return the ``city`` field that brasilapi reports for a postal code.

    Hyphens are stripped from ``postal_code`` before it is placed in the URL.
    Returns ``None`` when the request fails, the status is not 200, the body
    is not a JSON object, or the object has no ``city`` key.
    """
    pc = str(postal_code).replace("-", "")
    url = f"https://brasilapi.com.br/api/cep/v1/{pc}"
    try:
        r = requests.get(url, timeout=5)
        if r.status_code != 200:
            return None
        payload = r.json()
    except (requests.RequestException, ValueError):
        # Network/HTTP failures and undecodable JSON count as "no result";
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None
    return payload.get("city") if isinstance(payload, dict) else None

#function for city extraction from zippopotam
def get_city_us(postal_code):
    """Return the first place name that zippopotam reports for a US postal code.

    Returns ``None`` when the request fails, the status is not 200, or the
    JSON body does not have the ``places[0]["place name"]`` entry.
    """
    url = f"https://api.zippopotam.us/us/{postal_code}"
    try:
        r = requests.get(url, timeout=5)
        if r.status_code != 200:
            return None
        return r.json()["places"][0]["place name"]
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Explicit list instead of a bare except: transport errors, bad JSON
        # and an unexpected payload shape mean "no result"; KeyboardInterrupt
        # and unrelated errors propagate.
        return None

In [39]:
#functions application
retry_results = []

# Take each postal code's country from its own row in missing_info (to_retry
# keeps that frame's index) rather than from the first row that happens to
# share the code, and query every code only once so the later merge on
# postal_code cannot multiply rows.
retry_targets = (
    missing_info.loc[to_retry.index, ['postal_code', 'country']]
    .drop_duplicates(subset='postal_code')
)

for pc, country in retry_targets.itertuples(index=False, name=None):
    if country in ['Brasil', 'Brazil']:
        city = get_city_br(pc)
    elif country == 'United States':
        city = get_city_us(pc)
    else:
        # No fallback API for this country.
        city = None

    retry_results.append({
        "postal_code": pc,
        "city_retry": city
    })

    print(f"{pc} ({country}) → {city}")
    time.sleep(0.2)  # short pause between API calls

69280-000 (Brasil) → Manicoré
69435-000 (Brasil) → Manaquiri
69230-000 (Brasil) → Nova Olinda do Norte
47240-000 (Brasil) → Pilão Arcado
62150-000 (Brasil) → Santana do Acaraú
19803 (United States) → Wilmington
32060 (United States) → Live Oak
33579 (United States) → Riverview
34482 (United States) → Ocala
34476 (United States) → Ocala
32092 (United States) → Saint Augustine
32828 (United States) → Orlando
32940 (United States) → Melbourne
33437 (United States) → Boynton Beach
30102 (United States) → Acworth
30028 (United States) → Cumming
30093 (United States) → Norcross
30044 (United States) → Lawrenceville
30016 (United States) → Covington
70706 (United States) → Denham Springs
68330-000 (Brasil) → Porto de Moz
68617-000 (Brasil) → Cachoeira do Piriá
79706 (United States) → Midland
77379 (United States) → Spring
77389 (United States) → Spring
23236 (United States) → Richmond
22407 (United States) → Fredericksburg
22406 (United States) → Fredericksburg
22556 (United States) → Staffor

In [40]:
#fill the cities still missing in df_lookup with the retry results
df_retry = pd.DataFrame(retry_results)
df_lookup = (
    df_lookup
    .merge(df_retry, how='left', on='postal_code')
    .assign(city=lambda frame: frame['city'].fillna(frame['city_retry']))
    .drop(columns=['city_retry'])
)


In [41]:
#lookup rows that still have no city after the retry pass
still_missing = df_lookup['city'].isna()
missing_info2 = df_lookup.loc[still_missing].copy()
print(missing_info2)

   postal_code city      country             state
61     631-830  NaN  South Korea  Gyeongsangnam-do
62     631-820  NaN  South Korea  Gyeongsangnam-do
63     631-810  NaN  South Korea  Gyeongsangnam-do
64     630-850  NaN  South Korea  Gyeongsangnam-do
65     630-857  NaN  South Korea  Gyeongsangnam-do
66     630-856  NaN  South Korea  Gyeongsangnam-do
67     630-040  NaN  South Korea  Gyeongsangnam-do
68     630-520  NaN  South Korea  Gyeongsangnam-do
69     630-492  NaN  South Korea  Gyeongsangnam-do
70     630-490  NaN  South Korea  Gyeongsangnam-do
71     630-500  NaN  South Korea  Gyeongsangnam-do
72     642-370  NaN  South Korea  Gyeongsangnam-do
73     641-465  NaN  South Korea  Gyeongsangnam-do
74     641-860  NaN  South Korea  Gyeongsangnam-do
75     642-315  NaN  South Korea  Gyeongsangnam-do
76     641-920  NaN  South Korea  Gyeongsangnam-do


In [42]:
# dictionary creation for missing values: every listed postal code maps to "Changwon"
korea_map = dict.fromkeys(
    [
        "631-830", "631-820", "631-810",
        "630-850", "630-857", "630-856",
        "630-040", "630-520", "631-840",
        "630-490", "641-870", "630-500",
        "642-370", "641-465", "641-860",
        "642-315", "641-920", "630-492",
    ],
    "Changwon",
)
#df_lookup data enrichment: South Korea rows take the city from korea_map when
#their postal code is listed there; every other row keeps its current city
df_lookup['city'] = [
    korea_map.get(postal_code, current_city) if country == 'South Korea' else current_city
    for postal_code, country, current_city in zip(
        df_lookup['postal_code'], df_lookup['country'], df_lookup['city']
    )
]

In [43]:
#lookup rows left without a city after the South Korea mapping
unresolved = df_lookup['city'].isna()
missing_info3 = df_lookup.loc[unresolved].copy()
print(missing_info3)

Empty DataFrame
Columns: [postal_code, city, country, state]
Index: []


In [44]:
# Plain-text dump of the lookup table after the retry and South Korea fills,
# to eyeball the resolved cities.
print(df_lookup)

    postal_code                        city        country  \
0     69980-000             Cruzeiro do Sul         Brasil   
1     57360-000           Girau do Ponciano         Brasil   
2     68924-000             Vitória do Jari         Brasil   
3     69630-000           Benjamin Constant         Brasil   
4     69880-000                    Eirunepé         Brasil   
5     69600-000       São Paulo de Olivença         Brasil   
6     69830-000                      Lábrea         Brasil   
7     69460-000                       Coari         Brasil   
8     69800-000                     Humaitá         Brasil   
9     69280-000                    Manicoré         Brasil   
10    69435-000                   Manaquiri         Brasil   
11    69250-000                     Careiro         Brasil   
12    69735-000       Presidente Figueiredo         Brasil   
13    69200-000                       Borba         Brasil   
14    69230-000        Nova Olinda do Norte         Brasil   
15    69

In [45]:
#df_final creation by merging df with df_lookup
# Join on (postal_code, country): the same postal code exists in several
# countries, and joining on postal_code alone would duplicate user rows
# whenever df_lookup holds more than one row for a code. The lookup is reduced
# to one row per key first, so every user row matches at most one lookup row.
lookup_cities = (
    df_lookup[['postal_code', 'country', 'city']]
    .drop_duplicates(subset=['postal_code', 'country'])
    .rename(columns={'city': 'city_lookup'})
)

df_final = df.merge(
    lookup_cities,
    on=['postal_code', 'country'],
    how='left',
    validate='many_to_one'
)

# Only rows with no city take the looked-up value; existing cities are kept.
df_final['city'] = df_final['city'].fillna(df_final['city_lookup'])

df_final = df_final.drop(columns=['city_lookup'])

# The enrichment must never add or remove user rows.
assert len(df_final) == len(df), "lookup merge changed the number of rows"


In [46]:
print(df_final[['postal_code', 'city', 'country', 'state']].head(60))

   postal_code               city country  state
0    69980-000    Cruzeiro do Sul  Brasil   Acre
1    69980-000    Cruzeiro do Sul  Brasil   Acre
2    69980-000    Cruzeiro do Sul  Brasil   Acre
3    69980-000    Cruzeiro do Sul  Brasil   Acre
4    69980-000    Cruzeiro do Sul  Brasil   Acre
5    69980-000    Cruzeiro do Sul  Brasil   Acre
6    69980-000    Cruzeiro do Sul  Brasil   Acre
7    69980-000    Cruzeiro do Sul  Brasil   Acre
8    69980-000    Cruzeiro do Sul  Brasil   Acre
9    69980-000    Cruzeiro do Sul  Brasil   Acre
10   69980-000    Cruzeiro do Sul  Brasil   Acre
11   69970-000           Tarauacá  Brasil   Acre
12   69970-000           Tarauacá  Brasil   Acre
13   69970-000           Tarauacá  Brasil   Acre
14   69970-000           Tarauacá  Brasil   Acre
15   69970-000           Tarauacá  Brasil   Acre
16   69970-000           Tarauacá  Brasil   Acre
17   69970-000           Tarauacá  Brasil   Acre
18   69970-000           Tarauacá  Brasil   Acre
19   69970-000      

In [47]:
#one-off data correction: set the city for postal code '480-0102'
# The name carries no leading/trailing whitespace, so exact comparisons and
# group-bys on the city column treat it like any other value.
df_final.loc[df_final['postal_code'] == '480-0102', 'city'] = 'Fusō'

In [48]:
#final check: rows of df_final that still have no city
no_city = df_final['city'].isna()
missing_info4 = df_final.loc[no_city]
print(missing_info4)

Empty DataFrame
Columns: [user_id, first_name, last_name, email, age, gender, state, street_address, postal_code, city, country, traffic_source, created_at, user_geom]
Index: []


In [None]:
#write the completed dataframe to users_cleaned2.csv, without the row index
df_final.to_csv(path_or_buf="users_cleaned2.csv", index=False)