In [1]:
!pip install openai



In [12]:
# importing the necessary module.
import pandas as pd
from openai import OpenAI
from tqdm import tqdm
import os
import time
import getpass
from functools import lru_cache
import concurrent.futures

In [5]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [6]:
# reading the data and then storing the returned DataFrame
df = pd.read_csv('/content/drive/MyDrive/Copy of bq-results-20241031-072408-1730359465771.csv')

# printing the DataFrame
df

Unnamed: 0,id,user_id,city_name
0,48,75,BIMA
1,87,125,SIDOARJO
2,338,445,JEPARA
3,635,755,BEKASI
4,649,769,BEKASI
...,...,...,...
15532,34107,34821,BANGKA SELATAN
15533,34113,34827,BANDUNG
15534,34115,34829,DEPOK
15535,34116,34830,JAKARTA BARAT


In [7]:
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

OpenAI API Key:··········


In [13]:
@lru_cache(maxsize=1000)
def find_zip_code(city_name):
    client = OpenAI()

    # Create a chat message for the new API format
    response = client.chat.completions.create(
      model="gpt-4o",  # Use the GPT-4o
      messages=[
        {"role": "system", "content": "You are an expert in geography, addresses, and locations of Indonesia. Your task is to identify the zip code for the specified city name. Provide the zip code of the city name in Indonesia, and ensure that the output contains only the zip code without any additional information. The zip code (or kode pos) for Indonesia consists of 5 digits only."},
        {"role": "user", "content": city_name}
      ],
      temperature=1,
      max_tokens=50,
      top_p=1,
      frequency_penalty=0,
      presence_penalty=0,
    )

    # Extract the response text (which should contain the zip code)
    zip_code = response.choices[0].message.content.strip()
    time.sleep(1)  # Add delay to avoid hitting API rate limits
    return zip_code

In [14]:
# Define a function to process the list of city names in parallel
def fetch_zip_codes(city_names):
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        results = list(tqdm(executor.map(find_zip_code, city_names), total=len(city_names)))
    return results

In [15]:
# Apply the function to the "city_name" column with progress bar
df['zip_code'] = fetch_zip_codes(df['city_name'])

100%|██████████| 15537/15537 [01:16<00:00, 201.97it/s]


In [16]:
df

Unnamed: 0,id,user_id,city_name,zip_code
0,48,75,BIMA,84111
1,87,125,SIDOARJO,61215
2,338,445,JEPARA,59417
3,635,755,BEKASI,17111
4,649,769,BEKASI,17111
...,...,...,...,...
15532,34107,34821,BANGKA SELATAN,33711
15533,34113,34827,BANDUNG,40111
15534,34115,34829,DEPOK,16449
15535,34116,34830,JAKARTA BARAT,11730


In [17]:
df.to_csv('/content/drive/MyDrive/df_with_zip_code.csv')