# Demystifying APIs

In [20]:
import pandas as pd  
from openai import OpenAI  
from dotenv import load_dotenv
import os 
from groq import Groq

In [21]:
# Load the metadata workbook and draw a fixed, reproducible sample of rows.
# The project folder defaults to the original location but can be overridden
# with the DEMYSTIFYING_APIS_DIR environment variable, so the notebook can run
# on another machine without editing this cell.
PROJECT_DIR = (
    os.environ.get('DEMYSTIFYING_APIS_DIR')
    or r'C:\Users\beperron\Documents\Projects\demystifying-APIs'
)
METADATA_FILE = r'LBC_MetaData.xlsx'
SAMPLE_SIZE = 50   # number of rows kept for the rest of the notebook
SAMPLE_SEED = 20   # fixed seed so the same rows are drawn on every run

# The working directory itself is changed (rather than only the read path),
# so relative paths used after this cell resolve inside the project folder.
os.chdir(PROJECT_DIR)
df = pd.read_excel(METADATA_FILE).sample(SAMPLE_SIZE, random_state=SAMPLE_SEED)

In [22]:
# ---------------------------------------------------------------
# OpenAI (ChatGPT) client configuration
# ---------------------------------------------------------------

# Load variables from a local .env file into the process environment
# before the key is looked up below.
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")  # None when the variable is not set
model = "gpt-4o"  # model name sent with each chat-completion request

# NOTE(review): this is the full chat-completions endpoint URL and it is not
# passed to the client below. The SDK's own base_url option presumably
# expects the API root instead -- confirm before wiring it in.
base_url = "https://api.openai.com/v1/chat/completions"

# Passing the key explicitly is equivalent to the SDK's default lookup of
# OPENAI_API_KEY, but makes the dependency on the value above visible.
client = OpenAI(api_key=openai_api_key)

In [23]:
# System message: sets the assistant persona used for every request.
system_extraction_prompt = "You are a helpful assistant with expertise in social work research and Chinese geography."

# User message: the extraction instructions. The abstract text is appended
# after these instructions when a request is built.
# Output format requested from the model:
#   - English names of first-level administrative divisions (provinces,
#     municipalities, autonomous regions); a city or county is reported as
#     its parent division
#   - multiple locations separated by semicolons
#   - NONE when no specific location is named or it would have to be inferred
user_extraction_prompt = """Carefully read the following scientific abstract.
Identify and extract any Chinese locations
that are first-level administrative divisions under China's central government in mainland China.
These include:

Provinces,
Municipalities directly under the central government,
Autonomous regions.

If a city or county is mentioned instead of a province or autonomous region,
return corresponding province, municipality, or autonomous region in English.
For example, if '广州市' is mentioned, return 'Guangdong Province'. Only extract locations at this administrative level.
If multiple locations are present in a single abstract,
separate each location with a semicolon.
Return NONE if no specific province, city, or administrative region is mentioned.
For unspecified mentions (e.g., "10 provinces", "several regions","southwestern China","western China"), return 'NONE'.
Do not make any assumptions or inferences about unspecified locations.
Return NONE if no specific name mentioned, or need guess or infer any locations based on context or general knowledge."""


In [24]:
# Extract Chinese first-level administrative divisions from an abstract
# using the configured chat model via the OpenAI API
def extract_location(abstract):
    """Ask the chat model which first-level Chinese divisions an abstract names.

    Uses the module-level ``client``, ``model``, ``system_extraction_prompt``
    and ``user_extraction_prompt``.

    Parameters
    ----------
    abstract : str
        Abstract text. Missing values (``None``, ``NaN`` from an empty
        spreadsheet cell, or any other non-string) and blank strings are
        accepted and short-circuit to ``"NONE"``.

    Returns
    -------
    str
        The model's reply with surrounding whitespace removed. ``"NONE"`` is
        returned without calling the API when there is no abstract text; an
        empty string is returned if the reply carries no text content.
    """
    # A missing abstract cannot name a location: skip the API call and return
    # the same sentinel the prompt asks the model to use. Previously a NaN
    # cell raised TypeError on string concatenation.
    if not isinstance(abstract, str) or not abstract.strip():
        return "NONE"

    # Separate the instructions from the abstract explicitly; plain
    # concatenation ran the last instruction sentence straight into the
    # first word of the abstract.
    user_message = user_extraction_prompt + "\n\nAbstract:\n" + abstract.strip()

    response = client.chat.completions.create(
        model=model,  # Model name configured at module level
        messages=[
            {"role": "system", "content": system_extraction_prompt},
            {"role": "user", "content": user_message},
        ],
        temperature=0,  # Minimize sampling randomness
        max_tokens=2500,  # Upper bound on the length of the reply
        top_p=1,  # No nucleus truncation: the full token distribution is considered
        frequency_penalty=0,  # No penalty for frequently repeated tokens
        presence_penalty=0,  # No penalty for tokens that already appeared
    )

    # The reply text can be absent; treat that as an empty answer instead of
    # failing on None.strip().
    reply = response.choices[0].message.content
    return (reply or "").strip()

# Call extract_location once per row of the 'Abstract' column and keep each
# reply in a new 'Location' column of the same DataFrame.
location_by_row = df['Abstract'].apply(extract_location)
df['Location'] = location_by_row

In [25]:
# Show the extracted value for every sampled row, including the rows where
# the reply was NONE.
# Optional export of the annotated sample (left disabled):
#df.to_excel('LBC_MetaData_Locations_Extracted.xlsx', index=False)

locations = df['Location']
print(locations)

117                       Anhui Province
80                                  NONE
198                                 NONE
44                                  NONE
108               Chongqing Municipality
13                      Guangxi Province
62                    Guangdong Province
188                                 NONE
28                                  NONE
90                                  NONE
139                                 NONE
132                                 NONE
111                                 NONE
20                                  NONE
197                                 NONE
169                                 NONE
168                                 NONE
34                      Shaanxi Province
84                        Hebei Province
135                                 NONE
72                                  NONE
63                                  NONE
98                                  NONE
143                                 NONE
196             