In [6]:
import pandas as pd
import openai
import os
import gspread
import numpy as np
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.getenv('OPENAIKEY')

In [9]:
def read_url(url, SHEET_NAME):
    SHEET_ID = url.split("/")[5]
    spreadsheet = gc.open_by_key(SHEET_ID)
    worksheet = spreadsheet.worksheet(SHEET_NAME)
    rows = worksheet.get_all_records()
    df_spread = pd.DataFrame(rows)
    return df_spread, worksheet

In [10]:
serviceaccount = "../../google_drive_personal.json"
gc = gspread.service_account(filename=serviceaccount)
GC_URL = "https://docs.google.com/spreadsheets/d/1o5gFmZPUoDwrrbfE6M26uJF3HnEZll02ivnOxP6K6Xw/edit?usp=sharing"

In [11]:
SHEETNAME = "select_city_classifier"
city_meta, other_worksheet = read_url(GC_URL, SHEETNAME)
city_meta = city_meta[city_meta['City']!=''].reset_index(drop = True)
city_meta["country_clean"] = np.where(city_meta["Country"].isin(["USA", "United States"]), 
                                      "United States of America",
                                      city_meta["Country"]
                                     )

In [7]:

def get_completion(messages, model="gpt-3.5-turbo"): # Andrew mentioned that the prompt/ completion paradigm is preferable for this class
    # messages = [{"role": "user", "content": prompt}]
    response = openai.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0, # this is the degree of randomness of the model's output
    )
    return response.choices[0].message.content

In [22]:
city_meta_str = city_meta.apply(lambda x: f"{x['City']}, {x['country_clean']}", axis=1).tolist()

input_city = ";".join(city_meta_str[:2])
input_city

'Hindupur, India;Vijayawada, India'

In [42]:
prompt = """
Your task is to:
Read a city and country combinations below inside the triple backquote. \
The city and country are separated by a comma. \
Input examples is: "city_1, country_1".
Given the input city and country combination, you should return the state or province of the city and country combination in the format like:\
"state: returned state or province".\
Input city and country combinations:```{input_city}```
"""
# messages = [{"role": "user", "content": prompt}]
# answer = get_completion(messages)
# # # messages.append({"role": "assistant", "content": answer})
# answer

In [47]:
from tqdm import tqdm
allstates = []
for input_city in tqdm(city_meta_str):
    # prompt.format(input_city = input_city)
    messages = [{"role": "user", "content": prompt.format(input_city = input_city)}]
    answer = get_completion(messages)
    print(answer)
    allstates.append(answer)

  1%|          | 1/127 [00:00<01:42,  1.22it/s]

Output: state: Andhra Pradesh


  2%|▏         | 2/127 [00:01<01:43,  1.21it/s]

Output: state: Andhra Pradesh


  2%|▏         | 3/127 [00:02<01:27,  1.41it/s]

state: Rangpur Division


  3%|▎         | 4/127 [00:02<01:16,  1.61it/s]

state: British Columbia


  4%|▍         | 5/127 [00:03<01:05,  1.85it/s]

state: Santiago


  5%|▍         | 6/127 [00:03<01:03,  1.91it/s]

state: Cesar


  6%|▌         | 7/127 [00:04<01:11,  1.68it/s]

state: Perm Krai


  6%|▋         | 8/127 [00:04<01:09,  1.72it/s]

state: Rivne


  7%|▋         | 9/127 [00:05<01:02,  1.88it/s]

state: West Java


  8%|▊         | 10/127 [00:05<01:00,  1.92it/s]

state: West Java


  9%|▊         | 11/127 [00:06<01:11,  1.61it/s]

Output: state: Tel Aviv District


  9%|▉         | 12/127 [00:07<01:05,  1.75it/s]

state: Bahia


 10%|█         | 13/127 [00:07<01:01,  1.85it/s]

Output:
state: Metro Manila


 11%|█         | 14/127 [00:09<01:53,  1.01s/it]

Output: state: Gombe


 12%|█▏        | 15/127 [00:10<01:38,  1.14it/s]

Output:
state: Guatemala


 13%|█▎        | 16/127 [00:10<01:24,  1.32it/s]

state: Maharashtra


 13%|█▎        | 17/127 [00:11<01:29,  1.23it/s]

state: Pays de la Loire


 14%|█▍        | 18/127 [00:12<01:17,  1.41it/s]

state: Cochabamba


 15%|█▍        | 19/127 [00:12<01:08,  1.57it/s]

state: Bahia


 16%|█▌        | 20/127 [00:13<01:06,  1.61it/s]

Output: state: Nizhny Novgorod Oblast


 17%|█▋        | 21/127 [00:13<01:00,  1.74it/s]

Output: state: Uttar Pradesh


 17%|█▋        | 22/127 [00:14<00:56,  1.85it/s]

state: Astrakhan Oblast


 18%|█▊        | 23/127 [00:14<00:53,  1.94it/s]

Output: state: Overijssel


 19%|█▉        | 24/127 [00:15<00:53,  1.91it/s]

Output: state: Tyumen Oblast


 20%|█▉        | 25/127 [00:15<00:50,  2.01it/s]

state: Texas


 20%|██        | 26/127 [00:15<00:47,  2.14it/s]

state: California


 21%|██▏       | 27/127 [00:16<00:49,  2.04it/s]

state: Negros Occidental


 22%|██▏       | 28/127 [00:17<00:49,  1.99it/s]

Output: state: Sicily


 23%|██▎       | 29/127 [00:17<00:48,  2.03it/s]

Output: state: Cebu


 24%|██▎       | 30/127 [00:17<00:45,  2.13it/s]

state: Florida


 24%|██▍       | 31/127 [00:18<00:55,  1.72it/s]

Output: state: Central Region


 25%|██▌       | 32/127 [00:19<00:54,  1.73it/s]

Output: state: Florida


 26%|██▌       | 33/127 [00:19<00:52,  1.79it/s]

state: Kaunas County


 27%|██▋       | 34/127 [00:20<00:49,  1.87it/s]

state: California


 28%|██▊       | 35/127 [00:20<00:52,  1.75it/s]

state: Greater Manchester


 28%|██▊       | 36/127 [00:21<00:49,  1.85it/s]

state: Île-de-France


 29%|██▉       | 37/127 [00:22<01:15,  1.20it/s]

state: Jalisco


 30%|██▉       | 38/127 [00:23<01:06,  1.34it/s]

state: Tamaulipas


 31%|███       | 39/127 [00:23<00:59,  1.49it/s]

Output: state: Belgrade


 31%|███▏      | 40/127 [00:24<00:54,  1.58it/s]

State: Minnesota


 32%|███▏      | 41/127 [00:24<00:48,  1.78it/s]

state: Antwerp


 33%|███▎      | 42/127 [00:25<00:54,  1.57it/s]

state: Greater Accra


 34%|███▍      | 43/127 [00:26<00:51,  1.63it/s]

Output:
state: Tocantins


 35%|███▍      | 44/127 [00:27<00:55,  1.50it/s]

Output: state: Sinaloa


 35%|███▌      | 45/127 [00:27<00:50,  1.61it/s]

Output:
state: Ohio


 36%|███▌      | 46/127 [00:28<00:45,  1.77it/s]

state: Maharashtra


 37%|███▋      | 47/127 [00:28<00:41,  1.94it/s]

Output: state: Maharashtra


 38%|███▊      | 48/127 [00:29<00:50,  1.56it/s]

Output: state: Rajshahi


 39%|███▊      | 49/127 [00:29<00:44,  1.74it/s]

Output: state: Kerala


 39%|███▉      | 50/127 [00:30<00:40,  1.89it/s]

Output: state: Ohio


 40%|████      | 51/127 [00:30<00:42,  1.80it/s]

Output: state: Santa Catarina


 41%|████      | 52/127 [00:31<00:44,  1.68it/s]

Output: state: Wellington


 42%|████▏     | 53/127 [00:32<00:49,  1.51it/s]

state: Massachusetts


 43%|████▎     | 54/127 [00:33<00:52,  1.40it/s]

Output: state: England


 43%|████▎     | 55/127 [00:33<00:51,  1.41it/s]

Output:
state: Sao Paulo


 44%|████▍     | 56/127 [00:34<00:48,  1.45it/s]

Output: state: North Sumatra


 45%|████▍     | 57/127 [00:35<00:49,  1.41it/s]

state: Bavaria


 46%|████▌     | 58/127 [00:35<00:45,  1.50it/s]

state: South-East


 46%|████▋     | 59/127 [00:36<00:44,  1.54it/s]

state: Jerusalem District


 47%|████▋     | 60/127 [00:36<00:39,  1.68it/s]

state: South Sumatra


 48%|████▊     | 61/127 [00:37<00:37,  1.74it/s]

Output: state: Maharashtra


 49%|████▉     | 62/127 [00:37<00:35,  1.83it/s]

state: Madrid


 50%|████▉     | 63/127 [00:38<00:36,  1.75it/s]

Output: state: Stockholm County


 50%|█████     | 64/127 [00:39<00:35,  1.78it/s]

Output: state: Kyiv City


 51%|█████     | 65/127 [00:39<00:32,  1.92it/s]

state: Taipei City


 52%|█████▏    | 66/127 [00:40<00:32,  1.88it/s]

state: Uttar Pradesh


 53%|█████▎    | 67/127 [00:40<00:34,  1.74it/s]

state: Capital Region


 54%|█████▎    | 68/127 [00:41<00:33,  1.78it/s]

Output:
state: Minas Gerais


 54%|█████▍    | 69/127 [00:41<00:31,  1.83it/s]

state: Kigali


 55%|█████▌    | 70/127 [00:42<00:28,  2.02it/s]

state: Colorado


 56%|█████▌    | 71/127 [00:42<00:27,  2.06it/s]

state: Dubai


 57%|█████▋    | 72/127 [00:43<00:26,  2.08it/s]

state: Vienna


 57%|█████▋    | 73/127 [00:43<00:25,  2.12it/s]

state: Oregon


 58%|█████▊    | 74/127 [00:44<00:25,  2.06it/s]

Output: state: North Carolina


 59%|█████▉    | 75/127 [00:44<00:27,  1.92it/s]

Output: state: Pichincha


 60%|█████▉    | 76/127 [00:45<00:24,  2.07it/s]

state: Karnataka


 61%|██████    | 77/127 [00:45<00:24,  2.03it/s]

state: Quebec


 61%|██████▏   | 78/127 [00:46<00:23,  2.09it/s]

state: Maharashtra


 62%|██████▏   | 79/127 [00:46<00:24,  1.99it/s]

state: Brussels Capital Region


 63%|██████▎   | 80/127 [00:47<00:23,  2.01it/s]

Output: state: Rajasthan


 64%|██████▍   | 81/127 [00:52<01:33,  2.04s/it]

Output: state: Cundinamarca


 65%|██████▍   | 82/127 [00:53<01:10,  1.56s/it]

state: Michigan


 65%|██████▌   | 83/127 [00:53<00:54,  1.23s/it]

state: N/A


 66%|██████▌   | 84/127 [00:54<00:47,  1.11s/it]

state: Parana


 67%|██████▋   | 85/127 [00:55<00:41,  1.00it/s]

Output: state: Central Macedonia


 68%|██████▊   | 86/127 [00:55<00:34,  1.18it/s]

state: Budapest


 69%|██████▊   | 87/127 [00:56<00:30,  1.33it/s]

state: Masovian Voivodeship


 69%|██████▉   | 88/127 [00:57<00:31,  1.23it/s]

state: N/A


 70%|███████   | 89/127 [00:57<00:29,  1.27it/s]

Output: state: Delhi


 71%|███████   | 90/127 [00:58<00:24,  1.49it/s]

state: Lazio


 72%|███████▏  | 91/127 [00:58<00:22,  1.57it/s]

state: Berlin


 72%|███████▏  | 92/127 [00:59<00:20,  1.74it/s]

state: Seoul


 73%|███████▎  | 93/127 [00:59<00:18,  1.81it/s]

Output: state: Yamaguchi


 74%|███████▍  | 94/127 [01:00<00:16,  1.97it/s]

state: Western Cape


 75%|███████▍  | 95/127 [01:00<00:16,  1.97it/s]

state: Nairobi


 76%|███████▌  | 96/127 [01:01<00:15,  2.02it/s]

state: Lagos


 76%|███████▋  | 97/127 [01:01<00:14,  2.01it/s]

Output: state: Telangana


 77%|███████▋  | 98/127 [01:02<00:14,  1.94it/s]

state: New York


 78%|███████▊  | 99/127 [01:02<00:14,  1.95it/s]

state: Attica


 79%|███████▊  | 100/127 [01:03<00:15,  1.74it/s]

state: Mexico City


 80%|███████▉  | 101/127 [01:04<00:14,  1.75it/s]

Output: state: Saint Petersburg


 80%|████████  | 102/127 [01:04<00:15,  1.59it/s]

state: Auckland


 81%|████████  | 103/127 [01:05<00:13,  1.80it/s]

state: Moscow


 82%|████████▏ | 104/127 [01:05<00:12,  1.91it/s]

state: California


 83%|████████▎ | 105/127 [01:06<00:10,  2.02it/s]

Output:
state: Istanbul


 83%|████████▎ | 106/127 [01:06<00:12,  1.68it/s]

Output: state: Tokyo


 84%|████████▍ | 107/127 [01:07<00:11,  1.69it/s]

Output: state: New South Wales


 85%|████████▌ | 108/127 [01:07<00:10,  1.80it/s]

state: England


 86%|████████▌ | 109/127 [01:08<00:09,  1.92it/s]

Output: state: Okayama


 87%|████████▋ | 110/127 [01:08<00:09,  1.88it/s]

Output: state: Gauteng


 87%|████████▋ | 111/127 [01:09<00:08,  1.91it/s]

Output: state: Fukuoka Prefecture


 88%|████████▊ | 112/127 [01:09<00:07,  1.94it/s]

state: Dhaka


 89%|████████▉ | 113/127 [01:10<00:07,  1.93it/s]

state: Federal Territory of Kuala Lumpur


 90%|████████▉ | 114/127 [01:11<00:10,  1.21it/s]

state: Texas


 91%|█████████ | 115/127 [01:12<00:08,  1.42it/s]

Output: state: Buenos Aires


 91%|█████████▏| 116/127 [01:12<00:06,  1.58it/s]

state: North Holland


 92%|█████████▏| 117/127 [01:13<00:05,  1.71it/s]

state: Pennsylvania


 93%|█████████▎| 118/127 [01:14<00:05,  1.63it/s]

Output:
state: Metro Manila


 94%|█████████▎| 119/127 [01:14<00:04,  1.80it/s]

state: Ontario


 94%|█████████▍| 120/127 [01:15<00:03,  1.76it/s]

Output: state: Sao Paulo


 95%|█████████▌| 121/127 [01:15<00:03,  1.78it/s]

Output: state: Lima


 96%|█████████▌| 122/127 [01:16<00:02,  1.84it/s]

Output:
state: Lombardy


 97%|█████████▋| 123/127 [01:16<00:02,  1.89it/s]

Output: state: Aichi


 98%|█████████▊| 124/127 [01:17<00:01,  1.95it/s]

state: Rio de Janeiro


 98%|█████████▊| 125/127 [01:17<00:00,  2.00it/s]

Output:
state: Jakarta


 99%|█████████▉| 126/127 [01:18<00:00,  2.02it/s]

Output: state: Bangkok


100%|██████████| 127/127 [01:18<00:00,  1.61it/s]

state: Illinois





In [49]:
allstates_clean = [x.split(":")[-1].strip() for x in allstates]
allstates_clean

['Andhra Pradesh',
 'Andhra Pradesh',
 'Rangpur Division',
 'British Columbia',
 'Santiago',
 'Cesar',
 'Perm Krai',
 'Rivne',
 'West Java',
 'West Java',
 'Tel Aviv District',
 'Bahia',
 'Metro Manila',
 'Gombe',
 'Guatemala',
 'Maharashtra',
 'Pays de la Loire',
 'Cochabamba',
 'Bahia',
 'Nizhny Novgorod Oblast',
 'Uttar Pradesh',
 'Astrakhan Oblast',
 'Overijssel',
 'Tyumen Oblast',
 'Texas',
 'California',
 'Negros Occidental',
 'Sicily',
 'Cebu',
 'Florida',
 'Central Region',
 'Florida',
 'Kaunas County',
 'California',
 'Greater Manchester',
 'Île-de-France',
 'Jalisco',
 'Tamaulipas',
 'Belgrade',
 'Minnesota',
 'Antwerp',
 'Greater Accra',
 'Tocantins',
 'Sinaloa',
 'Ohio',
 'Maharashtra',
 'Maharashtra',
 'Rajshahi',
 'Kerala',
 'Ohio',
 'Santa Catarina',
 'Wellington',
 'Massachusetts',
 'England',
 'Sao Paulo',
 'North Sumatra',
 'Bavaria',
 'South-East',
 'Jerusalem District',
 'South Sumatra',
 'Maharashtra',
 'Madrid',
 'Stockholm County',
 'Kyiv City',
 'Taipei City',
 

In [51]:
city_meta["State/Province"] = allstates_clean

In [52]:
other_worksheet.update([city_meta.columns.values.tolist()] + city_meta.values.tolist())

{'spreadsheetId': '1o5gFmZPUoDwrrbfE6M26uJF3HnEZll02ivnOxP6K6Xw',
 'updatedRange': 'select_city_classifier!A1:V128',
 'updatedRows': 128,
 'updatedColumns': 22,
 'updatedCells': 2816}