Scrape missing data from doesthedogdie.com

In [8]:
import json
import os
import requests
import pandas as pd
from dotenv import load_dotenv

In [9]:
import sys
sys.path.append('../utils')
import functions

In [10]:
# Load the movie sample from the `clean` data folder (path is relative to the notebook's working directory).
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a saved index — consider
# reading with index_col=0 once downstream cells are confirmed not to rely on that column.
sample_df = pd.read_csv('../data/local/clean/movie_sample.csv')

In [11]:
# Preview the first five rows to check which columns were loaded (notably 'clean_title', used for the search below).
sample_df.head(5)

Unnamed: 0,title,clean_title,original_title,genres,director,release_year,runtime,budget,revenue,popularity,tmdb_rating,tmdb_votes,imdb_rating,imdb_votes,language,tmdb_id,imdb_id
0,Bad Boys for Life,bad boys for life,Bad Boys for Life,"thriller, action, crime","Bilall Fallah, Adil El Arbi",2020,124,90000000,426505244,67.7,7.1,8251,6.5,186716,English,38700,tt1502397
1,Avatar: The Way of Water,avatar the way of water,Avatar: The Way of Water,"science fiction, adventure, action",James Cameron,2022,192,460000000,2320250281,158.5,7.6,11946,7.5,518001,English,76600,tt1630029
2,Pet Sematary,pet sematary,Pet Sematary,"horror, thriller","Dennis Widmyer, Kevin Kölsch",2019,100,21000000,113118226,26.1,5.7,3186,5.7,100618,English,157433,tt0837563
3,How to Train Your Dragon: The Hidden World,how to train your dragon the hidden world,How to Train Your Dragon: The Hidden World,"animation, family, adventure",Dean DeBlois,2019,104,129000000,524580592,207.5,7.8,6381,7.4,158269,English,166428,tt2386490
4,Star Wars: The Rise of Skywalker,star wars the rise of skywalker,Star Wars: The Rise of Skywalker,"adventure, action, science fiction",J.J. Abrams,2019,142,416000000,1074144248,63.2,6.3,9886,6.4,507321,English,181812,tt2527338


In [None]:
# Load the API key from the .env file
load_dotenv()
API_KEY_DOESTHEDOGDIE = os.getenv("API_KEY_DOESTHEDOGDIE")
if not API_KEY_DOESTHEDOGDIE:
    # os.getenv returns None when the variable is unset; say so up front instead of
    # leaving the reader to decode a series of per-title request errors.
    print("Warning: API_KEY_DOESTHEDOGDIE is not set; check your .env file")

# Search URL prefix (kept under its original name for any code that still builds URLs by hand)
BASE_URL = "https://www.doesthedogdie.com/dddsearch?q="
# Bare endpoint: the query is passed through `params` so requests URL-encodes it
# (titles containing '&', '#', '+', ... would otherwise corrupt the query string).
SEARCH_URL = BASE_URL.split("?", 1)[0]

# Seconds to wait for the server before giving up on one title
REQUEST_TIMEOUT = 10

# The headers are identical for every request, so build them once
headers = {
    "Accept": "application/json",
    "X-API-KEY": API_KEY_DOESTHEDOGDIE,
}

# One entry per queried title, in the same order as the rows of sample_df.
# A failed lookup is stored as an empty dict so positions stay aligned with the rows;
# dropping failures would shift every later response onto the wrong movie.
responses = []

# Loop through the first 5 rows of 'clean_title' in the sample_df
for clean_name in sample_df['clean_title'][:5]:  # First 5 rows
    payload = {}
    try:
        response = requests.get(
            SEARCH_URL,
            params={"q": clean_name},
            headers=headers,
            timeout=REQUEST_TIMEOUT,
        )
    except requests.RequestException as exc:
        # Connection errors and timeouts: report and carry on with the next title
        print(f"Error with request for {clean_name}: {exc}")
    else:
        if response.status_code == 200:
            try:
                payload = response.json()
            except ValueError:
                # A 200 response whose body is not JSON
                print(f"Error with request for {clean_name}: response body is not valid JSON")
        else:
            print(f"Error with request for {clean_name}: {response.status_code}")

    # Downstream code calls .get() on each entry, so only keep dict payloads
    responses.append(payload if isinstance(payload, dict) else {})

# Function to extract topics of interest from the 'stats' field
def extract_topics(stats_str):
    """Return the IDs of topics whose "yes" votes outnumber their "no" votes.

    Parameters
    ----------
    stats_str : str | bytes | dict | None
        The 'stats' field of a search result. Normally a JSON string; an
        already-parsed dict is accepted as-is. The parsed value is expected to
        look like ``{"topics": {topic_id: {"definitelyYes": n, "definitelyNo": m}}}``.

    Returns
    -------
    str
        Comma-separated topic IDs, in the order they appear in the input, for
        which ``definitelyYes > definitelyNo``. Returns ``""`` when the input
        cannot be parsed or carries no usable 'topics' mapping (a diagnostic
        line is printed in those cases).

    Notes
    -----
    No minimum vote count is applied: a topic with 1 yes and 0 no qualifies.
    A missing or null vote count is treated as 0; a topic entry that is not a
    dict, or whose counts are not numeric, is skipped.
    """
    if isinstance(stats_str, dict):
        # Already parsed upstream; nothing to decode
        stats = stats_str
    else:
        try:
            stats = json.loads(stats_str)
        except (TypeError, ValueError):
            # TypeError: None or another non-string value; ValueError: malformed JSON
            # (json.JSONDecodeError is a ValueError subclass)
            print("Error decoding stats:", stats_str)
            return ""

    # Valid JSON is not necessarily an object with a 'topics' mapping
    if not isinstance(stats, dict) or not isinstance(stats.get('topics'), dict):
        print("No 'topics' field in response:", stats)  # Print full response to investigate
        return ""

    topics_of_interest = []

    for topic_id, data in stats['topics'].items():
        if not isinstance(data, dict):
            continue
        try:
            # `or 0` maps a missing key or explicit null to zero votes
            definitely_yes = int(data.get('definitelyYes') or 0)
            definitely_no = int(data.get('definitelyNo') or 0)
        except (TypeError, ValueError):
            # Non-numeric vote count: this topic cannot be compared, skip it
            continue

        # Keep the topic only when "yes" votes strictly outnumber "no" votes
        if definitely_yes > definitely_no:
            topics_of_interest.append(str(topic_id))  # Store topic ID as a string

    return ",".join(topics_of_interest)  # Join the topics into a comma-separated string

# Add the 'topics' column to the DataFrame (None marks rows with no topics assigned)
sample_df['topics'] = None

# Assign topics row by row. responses[i] is taken to belong to the i-th row of sample_df,
# i.e. the order in which the titles were queried.
# Pairing with sample_df.index turns each position into the row's actual label, so the
# assignment is correct for any index and can never append new rows by accident.
for row_label, response in zip(sample_df.index, responses):
    # `or []` also covers an explicit null under 'items'
    items = response.get('items') or [] if isinstance(response, dict) else []
    if not items:
        continue

    # NOTE(review): only the first search hit is used; for titles shared by several
    # films this may not be the intended release — confirm against the API's result fields.
    stats_str = items[0].get('stats') or '{}'  # a missing or null 'stats' becomes an empty object
    sample_df.loc[row_label, 'topics'] = extract_topics(stats_str)

# Display the DataFrame with the 'topics' column added (using 'clean_title' column)
print(sample_df[['clean_title', 'topics']])


                                    clean_title  \
0                             bad boys for life   
1                       avatar the way of water   
2                                  pet sematary   
3     how to train your dragon the hidden world   
4               star wars the rise of skywalker   
...                                         ...   
1162                            the last breath   
1163                                    moana 2   
1164     sound of hope the story of possum trot   
1165                                      waves   
1166                                am i racist   

                                                 topics  
0     167,177,180,184,188,192,193,198,208,211,228,23...  
1     158,161,164,167,168,177,181,188,189,190,191,19...  
2     153,158,161,164,165,168,171,176,177,180,184,18...  
3     190,196,211,222,229,233,243,244,252,270,289,35...  
4     161,164,167,168,181,184,188,198,202,207,214,23...  
...                                    

In [14]:
# Re-inspect the frame: 'topics' should now be filled in for the rows that were queried.
sample_df.head()

Unnamed: 0,title,clean_title,original_title,genres,director,release_year,runtime,budget,revenue,popularity,tmdb_rating,tmdb_votes,imdb_rating,imdb_votes,language,tmdb_id,imdb_id,topics
0,Bad Boys for Life,bad boys for life,Bad Boys for Life,"thriller, action, crime","Bilall Fallah, Adil El Arbi",2020,124,90000000,426505244,67.7,7.1,8251,6.5,186716,English,38700,tt1502397,"167,177,180,184,188,192,193,198,208,211,228,23..."
1,Avatar: The Way of Water,avatar the way of water,Avatar: The Way of Water,"science fiction, adventure, action",James Cameron,2022,192,460000000,2320250281,158.5,7.6,11946,7.5,518001,English,76600,tt1630029,"158,161,164,167,168,177,181,188,189,190,191,19..."
2,Pet Sematary,pet sematary,Pet Sematary,"horror, thriller","Dennis Widmyer, Kevin Kölsch",2019,100,21000000,113118226,26.1,5.7,3186,5.7,100618,English,157433,tt0837563,"153,158,161,164,165,168,171,176,177,180,184,18..."
3,How to Train Your Dragon: The Hidden World,how to train your dragon the hidden world,How to Train Your Dragon: The Hidden World,"animation, family, adventure",Dean DeBlois,2019,104,129000000,524580592,207.5,7.8,6381,7.4,158269,English,166428,tt2386490,"190,196,211,222,229,233,243,244,252,270,289,35..."
4,Star Wars: The Rise of Skywalker,star wars the rise of skywalker,Star Wars: The Rise of Skywalker,"adventure, action, science fiction",J.J. Abrams,2019,142,416000000,1074144248,63.2,6.3,9886,6.4,507321,English,181812,tt2527338,"161,164,167,168,181,184,188,198,202,207,214,23..."
