In [12]:
import os

import pandas as pd
import requests

# Geocoding
### Convert coordinates to an address using Google's API

#### Requirements:

A dataset with fields called `latitude` and `longitude` without nulls or incorrect values.

#### Dataset:

Currently, the notebook uses a dataset of real-estate listings distributed by Properati ([Kaggle, Properati 2020-2021](https://www.kaggle.com/datasets/jluza92/argentina-properati-listings-dataset-20202021))

In [17]:
# Function to obtain an address, zipcode and city based on a latitude and longitude, using Google's API

def get_address_from_coords(lat, lon):
    url = f'https://maps.googleapis.com/maps/api/geocode/json?latlng={lat},{lon}&key={API_KEY}'
    response = requests.get(url)
    
    if response.status_code == 200:
        result = response.json()
        if 'results' in result and len(result['results']) > 0:
            return result['results'][0]['formatted_address']
        else:
            return "No address found"
        
    else:
        return f"Error: {response.status_code}"

In [119]:
# Import a file already cleaned that has columns latitude and longitude

data = pd.read_csv("./ProperattiDataset.csv")

In [43]:
# Keep only necesarry columns

df = data[["latitude", "longitude"]]

In [114]:
# API Key of Google API
# https://cloud.google.com/apis/?utm_source=google&utm_medium=cpc&utm_campaign=latam-AR-all-es-dr-SKWS-all-all-trial-b-dr-1707800-LUAC0020059&utm_content=text-ad-none-any-DEV_c-CRE_649255894226-ADGP_Hybrid+%7C+SKWS+-+BRO+%7C+Txt_API+Management-General-KWID_43700075322947659-kwd-23207200&utm_term=KW_api-ST_API&gad_source=1&gclid=Cj0KCQjwrp-3BhDgARIsAEWJ6Sww4IHGtBSG-iK1ejCfnjjEAzJt56JNPNfEsp5COZAO5f7iiKh7cuwaAnyHEALw_wcB&gclsrc=aw.ds

API_KEY = "API-KEY"

In [18]:
# Applies function 'get_address_from_coords' to each row and adds it on a new columns named address

data['address'] = df.apply(lambda row: get_address_from_coords(row['latitude'], row['longitude']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['address'] = df.apply(lambda row: get_address_from_coords(row['latitude'], row['longitude']), axis=1)


In [79]:
# REGEX function to keep only the zipcode from the address field.
# It keeps all characthers that starts with a capital letter, an optional space and 4 digits

data['zipcode'] = data['address'].str.extract(r'([A-Z]\s?\d{4})', expand=False)

In [80]:
# Keep only the direction from the address field.
# All characters after before the first comma

data['address'] = data['address'].str.split(',', n=1, expand=True)[0].str.strip()

In [None]:
# Delete nulls and prints how many nulls each column has

data.dropna(axis=0, how="any", inplace=True)
data.isnull().sum()

In [117]:
# Export the data

data.to_csv('datasetFinalissima_.csv', index=False)