<a href="https://www.kaggle.com/code/nyagami/web-scraping-video-game-ratings-on-imdb?scriptVersionId=115218030" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<p style="font-size:28px; text-align:center; font-family:verdana; line-height:1.7em"><b>Web Scraping Video Game Details</b></p>

<p style="font-family:verdana;"><b>This notebook walks through the process of scraping video game details from <a href="https://www.imdb.com/search/title/?title_type=video_game">IMDb</a>.</b></p>
  
The data collected for each game includes:
- Game title
- Launch year
- Game genre
- Game rating
- Number of voters
- Director
- Game description or plot



# Install and import required packages

In [None]:
!pip install beautifulsoup4 # Installing the beautiful soup package
!pip install html_parser # NOTE(review): likely unnecessary - the 'html.parser' backend passed to BeautifulSoup in this notebook is the parser bundled with Python's standard library; confirm and drop

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import requests
from bs4 import BeautifulSoup as bs

import os
# Gather the full path of every file found under the Kaggle input directory,
# then print one path per line.
input_file_paths = [
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
]
for input_file_path in input_file_paths:
    print(input_file_path)

# Scraping steps

In [None]:
# Containers used by the scraping loop: the attributes of the most recently
# parsed game, and the table that accumulates every game.
imdb_video_games_dict = {}
imdb_video_games_df = pd.DataFrame(columns=['title','year','genre','rating','votes','directors','plot'])

# Fetch the first results page (sorted by user rating, descending) to learn how
# many entries the search returns.
page_url = 'https://www.imdb.com/search/title/?title_type=video_game&sort=user_rating,desc'
first_page_response = requests.get(page_url, timeout=30)
# Fail here with a clear HTTP error rather than later with an opaque
# AttributeError raised while parsing an error page.
first_page_response.raise_for_status()
pages_html = bs(first_page_response.text, 'html.parser')

# The results summary is expected to look like "<first>-<last> of <total> ...":
# word 0 holds the range shown on this page and word 2 holds the total.
summary_tag = pages_html.find(class_='desc')
if summary_tag is None or summary_tag.span is None:
    raise RuntimeError('Could not find the results summary on ' + page_url)
summary_words = summary_tag.span.get_text().strip().split(' ')

# Strip thousands separators from both numbers before converting them.
current_entry = int(summary_words[0].split('-')[1].replace(',', ''))
total_entries = int(summary_words[2].replace(',', ''))
current_entry

import re
import time

# Resilience settings for the page requests.
MAX_PAGE_ATTEMPTS = 3          # tries per page before the scrape stops
RETRY_DELAY_SECONDS = 5        # pause between failed attempts
REQUEST_TIMEOUT_SECONDS = 30   # upper bound on a single request


def _parse_year(game_row):
    """Return the four-digit launch year of a listing, or 'Missing'.

    A regular expression is used so that labels carrying extra parenthesised
    parts, or no year at all, do not yield a non-year fragment.
    """
    year_tag = game_row.find(class_='lister-item-year text-muted unbold')
    if year_tag is None:
        return 'Missing'
    year_match = re.search(r'\d{4}', year_tag.get_text())
    return year_match.group() if year_match else 'Missing'


def _parse_directors(paragraphs):
    """Return the director credit from a listing's paragraphs, or 'Missing'.

    The third paragraph holds credits separated by '|'; the one whose label
    starts with 'Dire' is the director credit.
    """
    if len(paragraphs) < 3:
        return 'Missing'
    for credit in paragraphs[2].get_text().split('|'):
        credit = credit.strip()
        if credit.startswith('Dire') and ':' in credit:
            # Keep only the names that follow the label and drop line breaks.
            return credit.partition(':')[2].strip().replace('\n', '')
    return 'Missing'


def _parse_game_row(game_row):
    """Return a dict with the scraped attributes of one listing.

    Keys: title, year, genre, rating, votes, directors, plot. Absent optional
    fields become 'missing' (genre), 'Missing' (year, directors, plot) or NaN
    (rating, votes).
    """
    paragraphs = game_row.find_all('p')

    genre_tag = game_row.find('span', class_='genre')
    genre = genre_tag.get_text().strip() if genre_tag is not None else 'missing'

    rating_box = game_row.find(class_='inline-block ratings-imdb-rating')
    rating_tag = rating_box.strong if rating_box is not None else None
    rating = rating_tag.get_text() if rating_tag is not None else np.nan

    if len(paragraphs) > 1:
        plot = paragraphs[1].get_text().replace('\n', '')
        plot = plot.replace('Add a Plot', 'Missing')  # placeholder for blank plots
    else:
        plot = 'Missing'

    votes_line = game_row.find('p', class_='sort-num_votes-visible')
    votes_tag = votes_line.find('span', attrs={'name': 'nv'}) if votes_line is not None else None
    votes = votes_tag.get_text() if votes_tag is not None else np.nan

    return {
        'title': game_row.h3.a.text,
        'year': _parse_year(game_row),
        'genre': genre,
        'rating': rating,
        'votes': votes,
        # Computed fresh for every row so a game without a director credit
        # never inherits the previous game's director.
        'directors': _parse_directors(paragraphs),
        'plot': plot,
    }


# Rows are collected in a list and turned into a DataFrame once at the end,
# instead of merging a one-row frame into the table for every game.
game_rows = []
failed_attempts = 0

# Follow the "next page" link until there is none, so the final page is
# scraped too (a counter-based condition stopped one page early).
while True:
    try:
        imdb_html = requests.get(page_url, timeout=REQUEST_TIMEOUT_SECONDS)
        page_ok = imdb_html.status_code == 200
    except requests.RequestException:
        page_ok = False

    # Retry a failed page a bounded number of times, pausing between tries,
    # rather than hammering the same URL forever.
    if not page_ok:
        failed_attempts += 1
        if failed_attempts >= MAX_PAGE_ATTEMPTS:
            print('Stopping: could not read', page_url, 'after', failed_attempts, 'attempts')
            break
        time.sleep(RETRY_DELAY_SECONDS)
        continue
    failed_attempts = 0

    imdb = bs(imdb_html.text, 'html.parser')

    for game_row in imdb.find_all(class_='lister-item-content'):
        # Dictionary of all attributes of the current game
        imdb_video_games_dict = _parse_game_row(game_row)
        game_rows.append(imdb_video_games_dict)

    # Progress counter only; it no longer decides when the loop ends.
    current_entry += 50

    # Obtain the url for the next page; the last page has no such link.
    next_page_link = imdb.find(class_='lister-page-next next-page')
    if next_page_link is None:
        break
    next_page_url_rel = next_page_link['href']
    next_page_url_abs = 'https://www.imdb.com'
    page_url = next_page_url_abs + next_page_url_rel + '&ref_=adv_nxt'

# Append the scraped games to the table, keeping the order they were read in.
scraped_games_df = pd.DataFrame(game_rows, columns=imdb_video_games_df.columns)
if imdb_video_games_df.empty:
    imdb_video_games_df = scraped_games_df
else:
    imdb_video_games_df = pd.concat([imdb_video_games_df, scraped_games_df], ignore_index=True)


# Data Cleaning

In [None]:
# Two rows count as the same game when both title and year match.
duplicate_key_columns = ['title', 'year']

# Report how many rows repeat an earlier (title, year) pair.
duplicate_row_count = imdb_video_games_df.duplicated(subset=duplicate_key_columns).sum()
print(duplicate_row_count)

# Drop the repeats in place, keeping the first occurrence of each game.
imdb_video_games_df.drop_duplicates(subset=duplicate_key_columns, keep='first', inplace=True)

# Saving output

In [None]:
# Write the games table to a CSV file in the current working directory.
output_csv_path = 'imdb_video_game_rating.csv'
imdb_video_games_df.to_csv(output_csv_path)

# Preview the clean dataset

In [None]:
# Show the scraped games table in the notebook output.
display(imdb_video_games_df)