Scrape missing data from doesthedogdie.com

In [None]:
import json
import os
import requests
import pandas as pd
from dotenv import load_dotenv

In [None]:
import sys
sys.path.append('../utils')
import functions

In [None]:
# Read the movie sample CSV into `sample_df`; the cells below add columns to it.
sample_df = pd.read_csv(
    filepath_or_buffer='../data/local/clean/movie_sample.csv',
)

In [None]:
# Preview the first five rows of the loaded sample.
sample_df.head(n=5)

In [None]:
from urllib.parse import quote_plus

# Load the API key from the .env file
load_dotenv()
API_KEY_DOESTHEDOGDIE = os.getenv("API_KEY_DOESTHEDOGDIE")
if not API_KEY_DOESTHEDOGDIE:
    # Without a key the X-API-KEY header is empty, so the requests below are likely to be rejected.
    print("Warning: API_KEY_DOESTHEDOGDIE is not set; requests will be sent without an API key")

# NOTE: `sample_df` must already be loaded (see the cell that reads the CSV).

# Define the URL for the API request
BASE_URL = "https://www.doesthedogdie.com/dddsearch?q="

# Upper bound (seconds) for each request, so a stalled connection cannot hang the notebook
REQUEST_TIMEOUT_SECONDS = 10

# The headers are the same for every request, so build them once
headers = {
    "Accept": "application/json",
    "X-API-KEY": API_KEY_DOESTHEDOGDIE,
}

# One entry per title, in the same order as the rows that were queried.
# A failed lookup is stored as an empty dict instead of being dropped, so that
# responses[i] always belongs to the i-th queried row.
responses = []

# Loop through the first 5 rows of 'clean_title' in the sample_df
for clean_name in sample_df['clean_title'][:5]:  # First 5 rows
    # URL-encode the title: spaces, '&', '#', '+', ... would otherwise corrupt the query string
    url = f"{BASE_URL}{quote_plus(str(clean_name))}"
    payload = {}

    try:
        # Make the GET request to the API
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # Connection errors, timeouts, etc.: report and keep going with the next title
        print(f"Error with request for {clean_name}: {exc}")
    else:
        if response.status_code == 200:
            try:
                payload = response.json()
            except ValueError:
                # The body was not valid JSON
                print(f"Error with request for {clean_name}: response is not valid JSON")
        else:
            print(f"Error with request for {clean_name}: {response.status_code}")

    # Downstream code calls .get('items') on each entry, so only keep JSON objects
    responses.append(payload if isinstance(payload, dict) else {})

# Function to extract topics of interest from the 'stats' field
def extract_topics(stats_str):
    """Return the IDs of topics that received more "yes" than "no" votes.

    Args:
        stats_str: The 'stats' value of a search result item. Normally a JSON
            string; an already-parsed dict is accepted as well.

    Returns:
        A comma-separated string of topic IDs, in payload order, whose
        'definitelyYes' count is strictly greater than their 'definitelyNo'
        count. Returns "" when the payload cannot be decoded, has no usable
        'topics' mapping, or when no topic qualifies.

    Note:
        No minimum vote threshold is applied; only the yes > no comparison
        decides whether a topic is kept.
    """
    if isinstance(stats_str, dict):
        # Already decoded, nothing to parse
        stats = stats_str
    else:
        try:
            stats = json.loads(stats_str)
        except (TypeError, ValueError):
            # TypeError: input is not str/bytes (e.g. None).
            # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
            print("Error decoding stats:", stats_str)
            return ""

    # The decoded JSON may be any JSON value; only an object can carry 'topics'
    topics = stats.get('topics') if isinstance(stats, dict) else None
    if not isinstance(topics, dict):
        print("No 'topics' field in response:", stats)  # Print full response to investigate
        return ""

    topics_of_interest = []

    for topic_id, data in topics.items():
        try:
            # Counts may arrive as numbers or numeric strings, so normalise to int
            definitely_yes = int(data['definitelyYes'])
            definitely_no = int(data['definitelyNo'])
        except (KeyError, TypeError, ValueError):
            # A single malformed topic should not abort the whole extraction
            print("Skipping topic with malformed vote counts:", topic_id, data)
            continue

        # Keep the topic only when "yes" votes strictly outnumber "no" votes
        if definitely_yes > definitely_no:
            topics_of_interest.append(str(topic_id))  # Store topic ID as a string

    return ",".join(topics_of_interest)  # Join the topics into a comma-separated string

# Create the two result columns, initially empty for every row
for new_column in ('topics', 'doesthedog_id'):
    sample_df[new_column] = None

# Fill in the result columns from the collected API responses.
# The position of each entry in `responses` is used as the row label in
# `sample_df`, so the two are expected to be in the same order.
for row_label, payload in enumerate(responses):
    matches = payload.get('items', [])
    if not matches:
        # The search returned nothing for this title; leave the row empty
        continue

    # Only the first search hit is used
    first_match = matches[0]

    # Topic IDs derived from the hit's 'stats' field
    sample_df.loc[row_label, 'topics'] = extract_topics(first_match.get('stats', '{}'))
    # The hit's own 'id' field (None when absent)
    sample_df.loc[row_label, 'doesthedog_id'] = first_match.get('id')

# Show the titles next to the newly added 'topics' and 'doesthedog_id' columns
print(sample_df[['clean_title', 'topics', 'doesthedog_id']])


In [None]:
# Preview the first five rows again, now including the added columns.
sample_df.head(n=5)

In [None]:
# Assuming sample_df is your DataFrame and 'topics' is the column of interest

# Create a new column 'topics_length' holding the number of comma-separated IDs in 'topics'.
# Rows without topics count as 0: that covers non-string values (None/NaN) and
# the empty string. The empty string needs its own check because
# "".split(',') returns [''], which would otherwise be counted as one topic.
sample_df['topics_length'] = sample_df['topics'].apply(
    lambda x: len(x.split(',')) if isinstance(x, str) and x else 0
)

# Display the DataFrame with the new 'topics_length' column
print(sample_df[['topics', 'topics_length']])
