In [None]:
import requests
import pandas as pd
import time
import os
from dotenv import load_dotenv
import json

In [None]:
# Load the cleaned movie sample CSV into a DataFrame.
# NOTE: the path is relative, so it resolves against the current working directory.
sample_df = pd.read_csv('../data/local/clean/movie_sample.csv')

Request to extract ID and topics

In [None]:
# Load the API key from the .env file
# (load_dotenv makes the variables from .env readable through os.getenv)
load_dotenv()
# os.getenv returns None when API_KEY_DOESTHEDOGDIE is not set
API_KEY_DOESTHEDOGDIE = os.getenv("API_KEY_DOESTHEDOGDIE")

# Define the URL for the API request; the search text is appended after "q="
BASE_URL = "https://www.doesthedogdie.com/dddsearch?q="

# Function to extract topics of interest from the 'stats' field
def extract_topics(stats_str):
    """Return the IDs of topics voted "yes" more often than "no".

    Args:
        stats_str: The 'stats' value of a search result. Normally a JSON
            string; an already-decoded dict is accepted as well.

    Returns:
        A comma-separated string of topic IDs whose 'definitelyYes' count is
        strictly greater than their 'definitelyNo' count, in the order they
        appear in the input. Returns "" when the input cannot be decoded, is
        not a JSON object, or has no usable 'topics' mapping.

    Note:
        No minimum vote count is applied; only the yes > no comparison is
        enforced. A missing or null count is treated as 0, and a topic whose
        counts are not numeric is skipped instead of aborting the whole call.
    """
    if isinstance(stats_str, dict):
        stats = stats_str
    else:
        try:
            stats = json.loads(stats_str)
        except (json.JSONDecodeError, TypeError):
            # TypeError covers None and other non-string inputs
            print("Error decoding stats:", stats_str)
            return ""

    # Valid JSON is not necessarily an object (e.g. "null" or "[]")
    if not isinstance(stats, dict) or 'topics' not in stats:
        print("No 'topics' field in response:", stats)  # Print full response to investigate
        return ""

    topics = stats['topics']
    if not isinstance(topics, dict):
        print("No 'topics' field in response:", stats)
        return ""

    topics_of_interest = []
    for topic_id, data in topics.items():
        try:
            definitely_yes = int(data.get('definitelyYes') or 0)
            definitely_no = int(data.get('definitelyNo') or 0)
        except (AttributeError, TypeError, ValueError):
            # Malformed entry: skip this topic rather than lose all the others
            continue

        if definitely_yes > definitely_no:
            topics_of_interest.append(str(topic_id))  # Store topic ID as a string

    return ",".join(topics_of_interest)

# Helper: query the search endpoint for one title and return its first hit
def _fetch_first_item(title, headers, timeout):
    """Return the first search result for *title*, or None if unavailable.

    Network errors, non-200 responses and non-JSON bodies are reported with
    print() and yield None so that one bad title cannot abort the whole run.
    """
    from urllib.parse import quote  # local import keeps this block self-contained

    # Percent-encode the title so characters such as '&', '#' or '?' stay
    # inside the query value instead of altering the URL structure.
    url = f"{BASE_URL}{quote(str(title), safe='')}"

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error with request for {title}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error with request for {title}: {response.status_code}")
        return None

    try:
        payload = response.json()
    except ValueError:
        print(f"Error with request for {title}: response is not valid JSON")
        return None

    items = payload.get('items') if isinstance(payload, dict) else None
    if not items or not isinstance(items[0], dict):
        return None
    return items[0]


# Function to process the DataFrame in batches
def process_in_batches(df, batch_size=50, backup_file="backup_topics_and_ids.csv",
                       pause_seconds=2, timeout=30):
    """Look up every title in *df* and record its topics and site ID.

    The DataFrame is modified in place: 'topics' and 'doesthedog_id' columns
    are (re)created as None and filled from the first search hit of each row's
    'title'. Rows whose lookup fails or returns no items keep None.

    Args:
        df: DataFrame with a 'title' column.
        batch_size: Number of rows handled between backups/pauses.
        backup_file: CSV path that receives each finished batch. Rows are
            appended when the file already exists; otherwise the file is
            created with a header.
        pause_seconds: Delay between batches, to stay under the rate limit.
        timeout: Per-request timeout in seconds passed to requests.get.

    Returns:
        The same DataFrame object, with the two columns populated.
    """
    # Add the 'topics' and 'doesthedog_id' columns to the DataFrame
    df['topics'] = None
    df['doesthedog_id'] = None

    # Write by position so results land on the right row even when the
    # DataFrame index is not the default 0..n-1 range.
    topics_col = df.columns.get_loc('topics')
    id_col = df.columns.get_loc('doesthedog_id')

    # The headers are identical for every request, so build them once
    headers = {
        "Accept": "application/json",
        "X-API-KEY": API_KEY_DOESTHEDOGDIE,
    }

    total_rows = len(df)
    for start in range(0, total_rows, batch_size):
        end = min(start + batch_size, total_rows)

        # Each result is written straight to the row it was requested for, so
        # a failed request can never shift later results onto the wrong rows.
        for position, title in enumerate(df['title'].iloc[start:end], start=start):
            if pd.isna(title):
                continue  # nothing to search for

            item = _fetch_first_item(title, headers, timeout)
            if item is None:
                continue

            df.iat[position, topics_col] = extract_topics(item.get('stats', '{}'))
            df.iat[position, id_col] = item.get('id', None)

        # Save the batch to CSV, appending (without a header) if the file exists
        file_exists = os.path.exists(backup_file)
        df.iloc[start:end].to_csv(
            backup_file,
            mode='a' if file_exists else 'w',
            header=not file_exists,
            index=False,
        )

        # Sleep to prevent rate limiting; pointless after the final batch
        if end < total_rows:
            time.sleep(pause_seconds)

    return df

# # Process the entire DataFrame in batches of 50 rows
# sample_df = process_in_batches(sample_df, batch_size=50)

# # Display the final DataFrame with 'topics' and 'doesthedog_id' columns added
# print(sample_df[['title', 'topics', 'doesthedog_id']])