IMDB Scraping

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import time

In [2]:
import sys
sys.path.append('../utils')
import functions

In [3]:
# Load the cleaned 2018 film table from the local data folder and show it.
# NOTE(review): the rendered output lists two leftover index columns
# ('Unnamed: 0.1', 'Unnamed: 0') - presumably from earlier saves that kept the
# index; confirm and consider dropping them upstream.
df = pd.read_csv(filepath_or_buffer='../data/local/clean/2018_films.csv')
display(df)

Unnamed: 0.1,Unnamed: 0,title,original_title,clean_title,release_year,imdb_id,imdb_rating,imdb_votes,genres,director,revenue,budget,runtime,original_language,popularity,language
0,0,Krystal,Krystal,krystal,2018,tt0835802,5.5,1773.0,"drama, comedy",William H. Macy,0,0,90,en,4.9,English
1,1,Monkey Magic,大闹西游,monkey magic,2018,tt10443316,6.4,31.0,"animation, fantasy, adventure",Jihai Ma,0,0,66,zh,20.4,Chinese
2,2,20 Seconds of Joy,20 Seconds of Joy,20 seconds of joy,2018,tt1202339,7.5,152.0,documentary,Jens Hoffmann,0,337200,60,de,2.7,German
3,3,Oliver Tambo: Have You Heard From Johannesburg,Oliver Tambo: Have You Heard From Johannesburg,oliver tambo have you heard from johannesburg,2018,tt7954272,,,documentary,Connie Field,0,0,97,en,0.6,English
4,4,Super Troopers 2,Super Troopers 2,super troopers 2,2018,tt0859635,6.0,28791.0,"comedy, crime, mystery",Jay Chandrasekhar,31626386,13500000,100,en,10.2,English
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33176,33176,Mold,Mold,mold,2018,,,,,Ryan Wyer,0,0,3,en,0.6,English
33177,33177,Janson Directive,Janson Directive,janson directive,2018,,,,,,0,0,108,en,0.7,English
33178,33178,O Fim da Noite,O Fim da Noite,o fim da noite,2018,,,,,"Julia De Simone, Ricardo Pretti",0,0,21,pt,0.6,Portuguese
33179,33179,The 12th House,The 12th House,the 12th house,2018,,,,"drama, music",Stephanie Coffey,0,0,5,en,0.6,English


In [4]:
# Report how many values are missing in each column of df
# (helper from the local `functions` module made importable via ../utils).
functions.show_missing_values(df)


Missing Values in Columns:
Unnamed: 0               0
title                    0
original_title           0
clean_title              3
release_year             0
imdb_id              14118
imdb_rating          19180
imdb_votes           19180
genres                7875
director              4612
revenue                  0
budget                   0
runtime                  0
original_language        0
popularity               0
language                 0
dtype: int64


Treat a 'runtime' of 0 as missing, then build a DataFrame of the rows where 'imdb_rating' or 'imdb_votes' is empty.

In [5]:
# A 'runtime' of 0 means "unknown": replace it with NaN.
# Series.mask returns a new column (upcast to float where NaN is inserted), so
# NaN is never written in place into the integer column - recent pandas
# versions warn about that kind of implicit upcast on .loc assignment.
df['runtime'] = df['runtime'].mask(df['runtime'] == 0)

In [6]:
# Keep only the rows in which the IMDb rating or the vote count is missing.
nan_df = df.loc[df[['imdb_rating', 'imdb_votes']].isna().any(axis=1)]

display(nan_df)

Unnamed: 0.1,Unnamed: 0,title,original_title,clean_title,release_year,imdb_id,imdb_rating,imdb_votes,genres,director,revenue,budget,runtime,original_language,popularity,language
3,3,Oliver Tambo: Have You Heard From Johannesburg,Oliver Tambo: Have You Heard From Johannesburg,oliver tambo have you heard from johannesburg,2018,tt7954272,,,documentary,Connie Field,0,0,97.0,en,0.6,English
5,5,Frances: Bedtime for Frances,Frances: Bedtime for Frances,frances bedtime for frances,2018,,,,"animation, family",,0,0,46.0,en,0.6,English
9,9,2018 Dream Concert,2018 Dream Concert,2018 dream concert,2018,,,,music,,0,0,110.0,en,0.5,English
10,10,Led Zeppelin: How the West Was Won,Led Zeppelin: How the West Was Won,led zeppelin how the west was won,2018,,,,music,,0,0,150.0,en,0.9,English
14,14,Cortázar in Love,Cortázar in Love,cortázar in love,2018,tt8219584,,,drama,James Alejandro,0,0,88.0,en,1.5,English
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33175,33175,マクロスΔ LIVE2018“ワルキューレは裏切らない“at 横浜アリーナ＜Day-2,マクロスΔ LIVE2018“ワルキューレは裏切らない“at 横浜アリーナ＜Day-2,マクロスδ live2018ワルキューレは裏切らないat 横浜アリーナday2,2018,,,,music,,0,0,,ja,0.7,Japanese
33176,33176,Mold,Mold,mold,2018,,,,,Ryan Wyer,0,0,3.0,en,0.6,English
33177,33177,Janson Directive,Janson Directive,janson directive,2018,,,,,,0,0,108.0,en,0.7,English
33178,33178,O Fim da Noite,O Fim da Noite,o fim da noite,2018,,,,,"Julia De Simone, Ricardo Pretti",0,0,21.0,pt,0.6,Portuguese


Get rating, votes, runtime, genres and director(s) from IMDb

In [7]:
def _parse_number(element, convert):
    """Return ``convert(text of element)``; None if the tag is absent or unparsable."""
    if element is None:
        return None
    try:
        return convert(element.text.strip())
    except (ValueError, IndexError):
        # One malformed field must not throw away the other scraped values.
        return None


def get_imdb_data(imdb_id):
    """Scrape rating, votes, runtime, genres and name links from an IMDb title page.

    Parameters
    ----------
    imdb_id : str
        IMDb title id such as ``'tt0835802'``. Anything that is not a
        non-empty string (``None``, ``NaN``, ``''``) is treated as "no id" and
        no request is sent.

    Returns
    -------
    tuple
        ``(imdb_rating, imdb_votes, imdb_runtime, genres, directors)`` where
        rating is a float, votes and runtime are ints, and genres/directors
        are lists of strings. Every element is ``None`` when it was not found
        on the page; all five are ``None`` when the id is missing or the
        request fails (the error is printed, not raised).
    """
    no_data = (None, None, None, None, None)

    # Rows without an id would otherwise be fetched as ".../title/nan/".
    if not isinstance(imdb_id, str) or not imdb_id.strip():
        return no_data

    imdb_url = f'https://www.imdb.com/title/{imdb_id.strip()}/'
    # Browser-like User-Agent: IMDb tends to reject the default
    # python-requests one (NOTE(review): confirm against the live site).
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/120.0 Safari/537.36'}
    try:
        # timeout: never hang the notebook on a stalled connection.
        response = requests.get(imdb_url, headers=headers, timeout=10)
        # Do not parse 4xx/5xx error pages as if they were title pages.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Scrape rating, votes, runtime, genres, and director(s)
        # NOTE(review): these itemprop / generated-class selectors depend on
        # IMDb's page markup and may be outdated - verify on a current page.
        rating = soup.find('span', itemprop='ratingValue')
        votes = soup.find('span', itemprop='ratingCount')
        runtime = soup.find('time', itemprop='duration')
        genre_elements = soup.find_all('span', class_='sc-16ede0c-2 iVxvPb')
        # NOTE(review): this matches every link under /name/ (cast, writers,
        # ...), not only directors - narrow the selector once the director
        # markup is confirmed.
        director_elements = soup.find_all('a', href=lambda x: x and x.startswith('/name/'))

        # Drop empty link texts and repeated names, keeping first-seen order.
        genre_names = [genre.text.strip() for genre in genre_elements]
        genres = list(dict.fromkeys(name for name in genre_names if name)) or None

        director_names = [director.text.strip() for director in director_elements]
        directors = list(dict.fromkeys(name for name in director_names if name)) or None

        imdb_rating = _parse_number(rating, float)
        imdb_votes = _parse_number(votes, lambda text: int(text.replace(',', '')))
        # Runtime text is expected to start with the number of minutes.
        imdb_runtime = _parse_number(runtime, lambda text: int(text.split()[0]))

        return imdb_rating, imdb_votes, imdb_runtime, genres, directors

    except Exception as e:
        # Best-effort scraping: report the failure and let the caller move on.
        print(f"Error fetching data for {imdb_id}: {e}")
        return no_data

# Function to process a batch of rows
def process_batch(df_batch, target_df=None, delay=2.0):
    """Fill missing IMDb fields for the rows of ``df_batch`` by scraping IMDb.

    For every row that has an ``imdb_id`` and at least one missing value among
    ``imdb_rating``, ``imdb_votes``, ``runtime``, ``genres`` and ``director``,
    the title page is fetched with ``get_imdb_data`` and each missing cell is
    filled when a value was scraped. Existing values are never overwritten.

    Parameters
    ----------
    df_batch : pandas.DataFrame
        Rows to process; its index labels select the rows to update.
    target_df : pandas.DataFrame, optional
        Frame that receives the scraped values. Defaults to the notebook-level
        ``df``, which is what this function always wrote to.
    delay : float, optional
        Seconds to wait between two consecutive IMDb requests (default 2.0),
        so the site is not hit in a tight loop. Use 0 to disable.

    Returns
    -------
    None
        ``target_df`` is updated in place.
    """
    if target_df is None:
        target_df = df  # notebook-level frame loaded from the CSV

    fill_columns = ('imdb_rating', 'imdb_votes', 'runtime', 'genres', 'director')
    requests_sent = 0

    for index, row in df_batch.iterrows():
        imdb_id = row['imdb_id']

        # Without an id there is no page to fetch.
        if pd.isna(imdb_id):
            continue

        # Check for missing values
        missing = [column for column in fill_columns if pd.isna(row[column])]
        if not missing:
            continue

        # Throttle: pause before every request except the first one.
        if requests_sent and delay and delay > 0:
            time.sleep(delay)
        requests_sent += 1

        imdb_rating, imdb_votes, imdb_runtime, genres, directors = get_imdb_data(imdb_id)
        scraped = {
            'imdb_rating': imdb_rating,
            'imdb_votes': imdb_votes,
            'runtime': imdb_runtime,
            # Lists are stored as one comma-separated string, like the CSV columns.
            'genres': ", ".join(genres) if genres is not None else None,
            'director': ", ".join(directors) if directors is not None else None,
        }

        # Update DataFrame if necessary
        for column in missing:
            if scraped[column] is not None:
                target_df.at[index, column] = scraped[column]

# Process a batch of 20 movie titles (first 20 rows)
batch_size = 20
df_batch = df.iloc[:batch_size]  # Select first 20 rows (adjust as needed)

# Process the current batch (scraped values are written into df)
process_batch(df_batch)

# Display the updated rows straight from df: whether a slice taken before the
# update reflects the new values depends on pandas' copy/view behaviour.
display(df.loc[df_batch.index])

# Pause before a following batch is started. This single sleep runs after the
# whole batch and does not space out the requests inside it.
time.sleep(2)  # Adjust the sleep time as necessary

    Unnamed: 0                                           title  \
0            0                                         Krystal   
1            1                                    Monkey Magic   
2            2                               20 Seconds of Joy   
3            3  Oliver Tambo: Have You Heard From Johannesburg   
4            4                                Super Troopers 2   
5            5                    Frances: Bedtime for Frances   
6            6                 Elvis: The '68 Comeback Special   
7            7                    Iruttu Araiyil Murattu Kuthu   
8            8                                   Annanukku Jey   
9            9                              2018 Dream Concert   
10          10              Led Zeppelin: How the West Was Won   
11          11                                            Edge   
12          12                                    The Outsider   
13          13                                     Thugocratie   
14        

In [8]:
# Show the batch rows as they are in df now; the df_batch slice was taken
# before the scraped values were written and may not reflect them.
display(df.loc[df_batch.index])

Unnamed: 0.1,Unnamed: 0,title,original_title,clean_title,release_year,imdb_id,imdb_rating,imdb_votes,genres,director,revenue,budget,runtime,original_language,popularity,language
0,0,Krystal,Krystal,krystal,2018,tt0835802,5.5,1773.0,"drama, comedy",William H. Macy,0,0,90.0,en,4.9,English
1,1,Monkey Magic,大闹西游,monkey magic,2018,tt10443316,6.4,31.0,"animation, fantasy, adventure",Jihai Ma,0,0,66.0,zh,20.4,Chinese
2,2,20 Seconds of Joy,20 Seconds of Joy,20 seconds of joy,2018,tt1202339,7.5,152.0,documentary,Jens Hoffmann,0,337200,60.0,de,2.7,German
3,3,Oliver Tambo: Have You Heard From Johannesburg,Oliver Tambo: Have You Heard From Johannesburg,oliver tambo have you heard from johannesburg,2018,tt7954272,,,documentary,Connie Field,0,0,97.0,en,0.6,English
4,4,Super Troopers 2,Super Troopers 2,super troopers 2,2018,tt0859635,6.0,28791.0,"comedy, crime, mystery",Jay Chandrasekhar,31626386,13500000,100.0,en,10.2,English
5,5,Frances: Bedtime for Frances,Frances: Bedtime for Frances,frances bedtime for frances,2018,,,,"animation, family",,0,0,46.0,en,0.6,English
6,6,Elvis: The '68 Comeback Special,Elvis: The '68 Comeback Special,elvis the 68 comeback special,2018,tt0285063,8.9,2333.0,"music, tv movie",Steve Binder,0,0,74.0,en,4.8,English
7,7,Iruttu Araiyil Murattu Kuthu,இருட்டு அறையில் முரட்டு குத்து,iruttu araiyil murattu kuthu,2018,tt7510220,4.2,937.0,"comedy, horror",Santhosh P. Jayakumar,0,0,118.0,ta,3.7,Tamil
8,8,Annanukku Jey,அண்ணனுக்கு ஜே,annanukku jey,2018,tt7023452,5.7,187.0,"comedy, drama",La. Rajkumar,0,0,111.0,ta,2.6,Tamil
9,9,2018 Dream Concert,2018 Dream Concert,2018 dream concert,2018,,,,music,,0,0,110.0,en,0.5,English
