In [69]:
import ast
import json
import os
from random import sample

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

# Pull variables from a local .env file into the process environment.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
# Hand the key to the client explicitly so the lookup above is actually used.
# NOTE(review): per the openai-python docs, api_key=None falls back to the
# OPENAI_API_KEY environment variable, so behaviour matches a bare OpenAI().
client = OpenAI(api_key=api_key)


In [81]:
# Load the raw field-notes table from disk.
df = pd.read_csv(filepath_or_buffer='../data/raw/field_notes_raw.csv')

In [5]:
# Distinct species strings exactly as written in the notes, in sorted order.
# Missing values are dropped first: NaN is a float and cannot be ordered
# against str, so sorted() would raise TypeError on a column with blanks.
names = sorted(df['Bird Species'].dropna().unique())
print(f"Number of unique strings: {len(names)}")

Number of unique strings: 277


In [6]:
# Show five randomly chosen entries from the unique species strings.
print(sample(names, k=5))

['white bellied fishing eagle', 'green reed warbler', 'little egret', 'lesser white throat', 'egret']


#### Context-aware spell check 

In [57]:

def get_corrected_names(names_list, model="gpt-3.5-turbo"):
    """Ask the chat model for spelling- and case-corrected bird names.

    Parameters
    ----------
    names_list : iterable of str
        Bird names exactly as they appear in the data.
    model : str, optional
        Chat-completions model to query. Defaults to ``"gpt-3.5-turbo"``.

    Returns
    -------
    The chat-completion response object returned by the module-level
    ``client``. ``response.choices[0].message.content`` is requested to be a
    JSON object mapping each input name to its corrected form.
    """
    # Adjacent string literals instead of backslash continuations: a
    # continuation inside a literal keeps the next line's indentation, which
    # put long runs of spaces into the prompt.
    system_prompt = (
        "You are a helpful spelling assistant helping correct incorrectly "
        "spelled bird names. Given a list of bird names, return a JSON object "
        "with the input bird names as keys and the spelling and case corrected "
        "names as values. Every key must be copied exactly from the input. "
        "The output JSON object should have the form "
        '{"input_name1": "corrected_name1", "input_name2": "corrected_name2", ...}'
    )
    # Send the names as a JSON array so input and output share one quoting
    # convention (names containing apostrophes stay unambiguous).
    user_payload = json.dumps(list(names_list), ensure_ascii=False)

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": [{"type": "text", "text": system_prompt}],
            },
            {
                "role": "user",
                "content": [{"type": "text", "text": user_payload}],
            },
        ],
        # Spelling correction should be repeatable, so sample greedily.
        temperature=0,
        max_tokens=2000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        # JSON mode: the reply is constrained to a single valid JSON object.
        response_format={"type": "json_object"},
    )
    return response

In [70]:
def _parse_name_mapping(content):
    """Parse one model reply into an ``{input_name: corrected_name}`` dict.

    Strict JSON is tried first; a Python dict literal (single-quoted strings)
    is accepted as a fallback. Any text outside the outermost braces, such as
    a Markdown code fence, is ignored.

    Raises ValueError when the reply contains no braces or does not parse to
    a dict; ``ast.literal_eval`` may also raise SyntaxError on malformed text.
    """
    start, end = content.find("{"), content.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"No mapping found in model reply: {content!r}")
    payload = content[start:end + 1]
    try:
        mapping = json.loads(payload)
    except json.JSONDecodeError:
        # Blindly swapping ' for " breaks names that contain an apostrophe;
        # literal_eval reads the Python literal as written and only accepts
        # literals, so it does not execute anything from the reply.
        mapping = ast.literal_eval(payload)
    if not isinstance(mapping, dict):
        raise ValueError(f"Model reply is not a mapping: {content!r}")
    return mapping


responses = []
chunk_size = 25

# One request per chunk of names keeps each prompt and reply small.
for i in tqdm(range(0, len(names), chunk_size)):
    chunk = names[i:i + chunk_size]
    responses.append(get_corrected_names(chunk))

corrected_names = {}
for response in responses:
    corrected_names.update(
        _parse_name_mapping(response.choices[0].message.content)
    )

# Names the model left out (or returned under an altered key) would silently
# become None in later lookups, so report them here.
missing = [name for name in names if name not in corrected_names]
if missing:
    print(f"{len(missing)} input names have no correction, e.g. {missing[:5]}")

100%|██████████| 12/12 [00:50<00:00,  4.22s/it]


In [82]:
# Look each raw species string up in the correction table; dict.get yields
# None for any string that has no entry.
df['corrected_name'] = df['Bird Species'].map(corrected_names.get)
df.sample(5)

Unnamed: 0,Date,Time,Location,Weather,Bird Species,Notes,Count,Sex,Comments,corrected_name
998,,,,,purple sunbird,,,,,Purple Sunbird
916,,,,,indian tree pipet,,,,,Indian Tree Pipit
1,,,,,White Browed Bulbul,Heard,,,,White-browed bulbul
694,,,,,loten sunbird,,,,,Loten Sunbird
883,,,,,coucal,,,,,coucal


In [83]:
# Write the table, including the new corrected_name column, to the processed
# data folder; index=False leaves the DataFrame index out of the file.
df.to_csv(
    path_or_buf='../data/processed/field_notes_with_corrected_names.csv',
    index=False,
)