In [1]:
!pip install beautifulsoup4 # Installing the beautiful soup package
!pip install html_parser # Installing html parser for use with the beautiful soup package 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import requests
from bs4 import BeautifulSoup as bs

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

[0mCollecting html_parser
  Downloading html-parser-0.2.tar.gz (904 bytes)
  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting ply
  Downloading ply-3.11-py2.py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: html_parser
  Building wheel for html_parser (setup.py) ... [?25l- \ done
[?25h  Created wheel for html_parser: filename=html_parser-0.2-py3-none-any.whl size=1329 sha256=1328fbb071eca1d9f17aecc32549f47b086df64f5f5938163b76c31794e7adc4
  Stored in directory: /root/.cache/pip/wheels/11/86/38/0554afea46105c70bae8d223c427bba371aa0c83ce88d57b27
Successfully built html_parser
Installing collected packages: ply, html_parser
Successfully installed html_parser-0.2 ply-3.11
[0m

In [2]:
# # This is the link to cycle through the pages on imdb
# url = 'https://www.imdb.com/search/title/?title_type=video_game&sort=user_rating,asc&start={}&ref_=adv_nxt'
# # Total pages to cycle through
# pages = np.arange(1,number_of_pages_int,50)

In [3]:
# # Get total entries in the database
# page_url = 'https://www.imdb.com/search/title/?title_type=video_game&sort=user_rating,desc'
# pages_html = requests.get(page_url)
# pages_html = bs(pages_html.text, 'html.parser')
# current_entry = int(pages_html.find(class_='desc').span.get_text().strip().split(' ')[0].split('-')[1])
# total_entries = int(pages_html.find(class_='desc').span.get_text().strip().split(' ')[2].replace(',',''))
# current_entry

In [4]:
imdb_video_games_dict = {}
imdb_video_games_df = pd.DataFrame(columns=['title','year','genre','rating','votes','directors','plot'])



# Get total entries in the database
page_url = 'https://www.imdb.com/search/title/?title_type=video_game&sort=user_rating,desc'
pages_html = requests.get(page_url)
pages_html = bs(pages_html.text, 'html.parser')
current_entry = int(pages_html.find(class_='desc').span.get_text().strip().split(' ')[0].split('-')[1])
total_entries = int(pages_html.find(class_='desc').span.get_text().strip().split(' ')[2].replace(',',''))
current_entry

while current_entry < total_entries:
    imdb_html = requests.get(page_url)
    imdb = bs(imdb_html.text, 'html.parser')

    # Skip pages not read successfully
    if imdb_html.status_code!=200:
        continue
        
    for game_row in imdb.findAll(class_ = 'lister-item-content'):
       
        # Retrieve the title of the game
        title = game_row.h3.a.text
        
        # Retrive launching year
        year = game_row.find(class_ = 'lister-item-year text-muted unbold').text
        year = year.split('(') # The year is a string. Separate it using an opening parenthesis
        year = str(year).split(' ')[1].replace("'",'') # Split the string to retrieve the year
        
        # Obtain the genre
        try:
            genre = game_row.find('span', class_ = 'genre').get_text().strip()
        except:
            genre = 'missing'
            continue
        
        # Retrieve the rating
        try:
            rating = game_row.find(class_='inline-block ratings-imdb-rating').strong.get_text()
        except:
            rating = np.nan
            continue
        # Retrieve the plot
        plot = game_row.findAll('p')[1].get_text().replace('\n','')
        plot = plot.replace('Add a Plot','Missing') # Insert missing for blank plots
        
        # Retrieve the number of voters
        votes = game_row.find('p', class_='sort-num_votes-visible').find('span',attrs={'name':'nv'}).get_text()
        
        # Directors
        directors = 'Missing' # Set the default value for entries without director details
        director = game_row.findAll('p')[2].get_text()
        #print(director)
        for word in director.split('|'): # Split directors and stars using |
            word = word.strip() 
            if word.startswith('Dire'): 
                ind = word.index(':') + 1 
                directors =(word[ind:]).strip().replace('\n','') # Remove empty lines
        
        # Create a dictionary of all attributes of the game
        imdb_video_games_dict = dict([('title',title),('year',year),('genre',genre),('rating',rating),('votes',votes),('directors',directors),('plot',plot)])
        
        # Create a dataframe to store the game details
        game_row_df = pd.DataFrame(imdb_video_games_dict.values(),index=imdb_video_games_dict.keys()).T
        
        # Append the game details to the current dataframe
        imdb_video_games_df = pd.merge(imdb_video_games_df,game_row_df,on=['title','year','genre','rating','votes','directors','plot'],how='outer')
        
    # Obtain the url for the next page
    next_page_url_rel = imdb.find(class_='lister-page-next next-page')['href']
    next_page_url_abs = 'https://www.imdb.com'
    page_url = next_page_url_abs + next_page_url_rel + '&ref_=adv_nxt'
    current_entry += 50


In [5]:
# Let's check for duplicates
print(imdb_video_games_df.duplicated(subset=['title','year']).sum())

# Drop duplictes
imdb_video_games_df.drop_duplicates(keep='first',subset=['title','year'],inplace=True)

3


In [6]:
# Save to csv file 
imdb_video_games_df.to_csv('imdb_video_game_rating.csv')

In [7]:
# Preview the dataframe
imdb_video_games_df

Unnamed: 0,title,year,genre,rating,votes,directors,plot
0,The Last of Us: Part I,2022,"Action, Adventure, Drama",9.8,598,"Matthew Gallant, Bruce Straley",Experience the emotional storytelling and unfo...
1,Red Dead Redemption II,2018,"Action, Adventure, Crime",9.7,36432,Missing,Amidst the decline of the Wild West at the tur...
2,The Witcher 3: Wild Hunt - Blood and Wine,2016,"Action, Adventure, Drama",9.7,7610,Konrad Tomaszkiewicz,Geralt is in the southern province of Toussain...
3,The Witcher 3: Wild Hunt,2015,"Action, Adventure, Drama",9.7,26327,Konrad Tomaszkiewicz,A monster hunter for hire embarks on an epic j...
4,The Last of Us,2013,"Action, Adventure, Drama",9.7,61100,"Neil Druckmann, Bruce Straley","In a hostile, post-pandemic world, Joel and El..."
...,...,...,...,...,...,...,...
12632,Superman,1999,"Action, Adventure, Family",1.4,646,Missing,The first 3D Superman game. Your friends Lois ...
12633,Action 52,1991,"Action, Family, Fantasy",1.3,135,"Raul Gomila, Vince Perri",Play the action at your fingertips with 52 gam...
12634,Plumbers Don't Wear Ties,1994,"Comedy, Romance",1.3,338,Michael Anderson,"John, an unattached plumber, meets and falls i..."
12635,Animal Soccer World,2005,"Animation, Sport",1.2,125,Roswitha Haas,Everybody is busy with the preparations for th...
