# Demystifying APIs

In [1]:
import pandas as pd  
from openai import OpenAI  
from dotenv import load_dotenv
import os 

In [2]:
# Load the Excel file into a DataFrame.
# The project folder defaults to the original author's location; set the
# API_PAPER_DIR environment variable to point at your own copy of the project
# instead of editing this cell.
project_dir = os.environ.get(
    "API_PAPER_DIR",
    r'C:\Users\Zixua\OneDrive\Data Projects 2024\api_paper',
)
# Work from inside the project folder so the relative file names used in this
# notebook (the workbook below, any exported results) resolve there.
os.chdir(project_dir)
df = pd.read_excel('LBC_MetaData.xlsx')

In [3]:
# Read the .env file so the keys it defines become environment variables
load_dotenv()

# Fetch the DeepSeek key; the result is None when the variable is not set.
# (When targeting OpenAI instead, read "OPENAI_API_KEY" the same way.)
deepseek_api_key = os.environ.get("DEEPSEEK_API_KEY")

In [12]:
# Alternative client setup for OpenAI's GPT models, kept disabled inside a string literal.
# To use DeepSeek, run the next code cell instead.
# NOTE(review): `base_url` below is only assigned, never passed to OpenAI(), and it holds a
# full endpoint path rather than an API root - confirm before re-enabling this snippet.
'''
client = OpenAI()
base_url = "https://api.openai.com/v1/chat/completions"
'''

In [4]:
# Initialize the OpenAI-compatible client against the DeepSeek endpoint.
# Fail fast when the key is missing: given api_key=None the OpenAI SDK falls back to the
# OPENAI_API_KEY environment variable, which would then be sent to DeepSeek's server.
if not deepseek_api_key:
    raise RuntimeError(
        "DEEPSEEK_API_KEY is not set; add it to your .env file or environment."
    )
client = OpenAI(api_key=deepseek_api_key, base_url="https://api.deepseek.com")

In [5]:
# System message sent with every extraction request.
system_extraction_prompt = "You are a helpful assistant with expertise in social work research."

# User instruction; extract_location() appends the abstract text directly after it.
# NOTE: the line breaks and the continuation-line indentation sit inside the
# triple-quoted literal, so they are sent to the model as part of the prompt.
user_extraction_prompt = """Carefully read the following scientific abstract. You must identify 
                            any Chinese province, city, and special administrative region noted 
                            in the abstract. If multiple locations are present in a single abstract, 
                            separate each location with a semicolon. Return NONE if no specific 
                            province, city, or administrative region is mentioned. You must 
                            perform this task accurately. Do not make up or guess any information."""


In [6]:
# Define a function that extracts Chinese locations from an abstract via the chat-completions API
def extract_location(abstract):
    """Ask the chat model which Chinese locations an abstract mentions.

    Parameters
    ----------
    abstract : str
        Text of one scientific abstract. Missing values (e.g. the NaN that
        pandas uses for empty cells, or None) and blank strings are accepted.

    Returns
    -------
    str
        The model's reply with surrounding whitespace removed. Per the prompt
        this is a semicolon-separated list of locations, or "NONE".
        "NONE" is returned without calling the API when the abstract is
        missing or blank; an empty string is returned if the API reply
        carries no text.
    """
    # A missing or blank abstract cannot mention a location. Skipping it also
    # avoids a TypeError from concatenating the prompt with a non-string value.
    if not isinstance(abstract, str) or not abstract.strip():
        return "NONE"

    response = client.chat.completions.create(
        model="deepseek-chat",  # Model to use (e.g. "gpt-4o" when the client targets OpenAI)
        messages=[
            {
                "role": "system",
                "content": system_extraction_prompt
            },
            {
                "role": "user",
                # Separate the instructions from the abstract so the two do not run together
                "content": user_extraction_prompt + "\n\nAbstract:\n" + abstract
            }
        ],
        temperature=0,  # Lowest randomness setting, for the most repeatable answers
        max_tokens=2500,  # Upper bound on the length of the reply, in tokens
        top_p=1,  # Nucleus sampling threshold (1 keeps the full token distribution)
        frequency_penalty=0,  # No penalty for frequent tokens
        presence_penalty=0  # No penalty for new topic introduction
    )
    # The reply text may be absent; treat that as an empty answer instead of crashing on .strip()
    content = response.choices[0].message.content
    return content.strip() if content else ""

# Send every abstract through extract_location, one row at a time, and keep each
# API reply in a new 'Location' column
df['Location'] = df['Abstract'].map(extract_location)

In [7]:
# Show the extracted location for every row (rows without a location read NONE).
# To save the results to a workbook, uncomment the next line:
#df.to_excel('LBC_MetaData_Locations_Extracted.xlsx', index=False)

print(df.loc[:, 'Location'])

0                    NONE
1                    NONE
2                    NONE
3                    NONE
4      Guangdong Province
              ...        
196                  NONE
197                  NONE
198                  NONE
199                  NONE
200                  NONE
Name: Location, Length: 201, dtype: object
