In [1]:
import json
import os
import time
from urllib.parse import quote

import pandas as pd
import requests
from dotenv import load_dotenv

In [2]:
# Input: the list of pending Letterboxd films to look up.
# (An alternative input used previously: '../data/local/clean/movie_sample_pre19.csv')
sample_df = pd.read_csv(
    filepath_or_buffer='../data/local/raw/pending_letterboxd_films_for_request.csv',
)

Request the DoesTheDogDie ID and topics for each film

In [None]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()
API_KEY_DOESTHEDOGDIE = os.getenv("API_KEY_DOESTHEDOGDIE")

# Fail fast: without a key every request below would be sent unauthenticated,
# and the run would only discover that after working through the whole frame.
if not API_KEY_DOESTHEDOGDIE:
    raise RuntimeError(
        "API_KEY_DOESTHEDOGDIE is not set; add it to your environment or .env file"
    )

# Search endpoint; the (URL-encoded) film title is appended after 'q='.
BASE_URL = "https://www.doesthedogdie.com/dddsearch?q="

# extract topics of interest from the 'stats' field
def extract_topics(stats_str):
    """Return the IDs of topics with more "yes" than "no" votes, comma-joined.

    Args:
        stats_str: The ``stats`` value of a search result. Normally a JSON
            string; an already-parsed dict is accepted as well.

    Returns:
        str: Comma-separated topic IDs, in the order they appear under
        ``stats['topics']``, for which ``definitelyYes`` is strictly greater
        than ``definitelyNo``. Returns ``""`` when the input cannot be decoded,
        has no usable ``topics`` mapping, or no topic qualifies.
    """
    if isinstance(stats_str, dict):
        stats = stats_str
    else:
        try:
            stats = json.loads(stats_str)  # stats is a string, so we parse it into a dictionary
        except (TypeError, ValueError):
            # ValueError covers json.JSONDecodeError; TypeError covers
            # non-string input such as None or NaN.
            print("Error decoding stats:", stats_str)
            return ""

    # Valid JSON is not necessarily an object (could be a list, number, null).
    if not isinstance(stats, dict) or not isinstance(stats.get('topics'), dict):
        print("No 'topics' field in response:", stats)  # response stats
        return ""

    # Keep topics where 'definitelyYes' > 'definitelyNo'.
    # NOTE: no minimum vote count is enforced; a single "yes" vote against
    # zero "no" votes is enough for a topic to be included.
    topics_of_interest = []

    for topic_id, data in stats['topics'].items():
        if not isinstance(data, dict):
            continue  # malformed entry: nothing to compare
        try:
            # A missing or null counter is treated as zero votes.
            definitely_yes = int(data.get('definitelyYes') or 0)
            definitely_no = int(data.get('definitelyNo') or 0)
        except (TypeError, ValueError):
            continue  # non-numeric counter: skip this topic rather than abort

        if definitely_yes > definitely_no:
            topics_of_interest.append(str(topic_id))

    return ",".join(topics_of_interest)  # join topics w comma

def _search_doesthedogdie(title, headers, timeout):
    """Query the search endpoint for one title; return the parsed JSON or None on any failure."""
    # Percent-encode the title so characters such as '&', '#', '+' or '?'
    # stay inside the 'q' parameter instead of altering the URL structure.
    url = f"{BASE_URL}{quote(str(title), safe='')}"

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error with request for {title}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error with request for {title}: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError:
        print(f"Error with request for {title}: response body is not valid JSON")
        return None


# process in batches
def process_in_batches(df, batch_size=50,
                       backup_path="letterboxd_request_backup.csv",
                       request_timeout=10):
    """Look up every title in ``df`` and record its topics and site ID.

    The frame is modified in place: the columns ``topics`` and
    ``doesthedog_id`` are (re)created as ``None`` and then filled for each row
    whose search returned at least one item (the first item is used). Rows
    whose request failed, returned no items, or have a missing title keep
    ``None`` in both columns.

    After each batch the rows of that batch are appended to ``backup_path``
    (the header is written only when the file does not exist yet), and the
    function sleeps for two seconds before starting the next batch.

    Args:
        df: DataFrame with a ``title`` column. Any index is supported; rows
            are addressed by position.
        batch_size: Number of rows processed between two backup writes.
        backup_path: CSV file that receives each finished batch. Note that an
            existing file is appended to, so re-running adds duplicate rows.
        request_timeout: Seconds to wait for each HTTP request.

    Returns:
        The same ``df`` object, with the two result columns populated.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    df['topics'] = None
    df['doesthedog_id'] = None

    # Resolve column positions once so rows can be written purely by position;
    # label-based writes would hit the wrong rows on a non-default index.
    topics_col = df.columns.get_loc('topics')
    id_col = df.columns.get_loc('doesthedog_id')

    # Identical for every request, so build it once.
    headers = {
        "Accept": "application/json",
        "X-API-KEY": API_KEY_DOESTHEDOGDIE,
    }

    for start in range(0, len(df), batch_size):
        end = min(start + batch_size, len(df))  # define batch end

        # Each result is written to the row it was requested for, so a failed
        # request can never shift later results onto the wrong film.
        for pos in range(start, end):
            title = df['title'].iloc[pos]
            if pd.isna(title):
                continue  # nothing to search for

            payload = _search_doesthedogdie(title, headers, request_timeout)
            if not isinstance(payload, dict):
                continue

            items = payload.get('items') or []
            if not items or not isinstance(items[0], dict):
                continue

            best_match = items[0]
            stats_str = best_match.get('stats', '{}')  # get the 'stats' field
            df.iloc[pos, topics_col] = extract_topics(stats_str)
            df.iloc[pos, id_col] = best_match.get('id', None)

        # save to CSV after each batch, appending if the file exists
        write_header = not os.path.exists(backup_path)
        df.iloc[start:end].to_csv(backup_path, mode='a', header=write_header, index=False)

        time.sleep(2)  # prevent rate limiting

    return df

# Run the lookup over the whole frame, 50 rows per batch, then print the
# title alongside the two columns the lookup filled in.
sample_df = process_in_batches(df=sample_df, batch_size=50)

print(sample_df.loc[:, ['title', 'topics', 'doesthedog_id']])