# DATA102 Homework 1: Web Scraping

**Group Number**: 3
**Members**:
- Jose Maria Angelo Guerra
- Kyle Carlo Lasala
- Katrina Bianca Roco
- Antonio Jose Maria Lorenzo
- Josh Angelo Theodore Borro
- Charles Joseph Hinolan

**Section**: S11

**Instructor**: Mr. Jude Michael Teves

### Import Libraries

In [2]:
import requests
import pandas as pd
import time

### Parse Using BeautifulSoup

In [4]:
from bs4 import BeautifulSoup
# Listing page that the static (requests + BeautifulSoup) scrape targets.
page = "https://itch.io/games"

# Identify the scraper to the site; the contact address is a group member's
# personal e-mail.
headers = {
    "User-Agent": "EducationalScraper/1.0 (contact: hinolancj@gmail.com)",
}

# Disallowed path prefixes based on itch.io/robots.txt
disallowed_paths = ["/embed/", "/embed-upload/", "/search", "/checkout/", "/game/download/", "/bundle/download/", "/register-for-purchase/", "/email-feedback/"]

# Function to check if URL is allowed
def is_allowed(url):
    """Return True when *url* does not fall under any disallowed path prefix.

    robots.txt rules are matched against the beginning of the URL path, so
    only the path component is compared. A rule that merely appears in the
    query string or in the middle of a longer path (for example
    ``/games/search-party``) does not block the URL.

    Args:
        url: Absolute URL, scheme-less ``host/path`` string, or bare path.

    Returns:
        False if the URL path starts with an entry of ``disallowed_paths``,
        True otherwise.
    """
    from urllib.parse import urlparse

    # Without a scheme urlparse would treat "itch.io/search" as one relative
    # path; prefixing "//" makes it split host and path correctly.
    parsed = urlparse(url if "://" in url else "//" + url)
    return not parsed.path.startswith(tuple(disallowed_paths))


# Fetch the listing page once (if robots.txt permits) and dump the parsed HTML.
url = page
if is_allowed(url):
    try:
        # Send the identifying User-Agent defined above, and bound the wait so
        # a stalled connection cannot hang the notebook.
        response = requests.get(url, headers=headers, timeout=30)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print("Error: Unable to fetch the page.")
        print(f"Reason: {e}")
    else:
        soup = BeautifulSoup(response.text, "html.parser")
        print(soup.prettify())
        # Short pause after the request before anything else hits the site.
        time.sleep(2)
else:
    print(f"Skipping disallowed URL: {url}")

<!DOCTYPE HTML>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="IBM HomePage Builder 2001 V5.0.0 for Windows" name="GENERATOR"/>
  <meta content="3BB4D18369B9C21326AF7A99FCCC5A09" name="msvalidate.01">
   <meta content="537395183072744" property="fb:app_id"/>
   <title>
    Top games - itch.io
   </title>
   <meta name="csrf_token" value="WyIzZEdvIiwxNzM3ODkyNzEwLCJqUHNudUVRNmEyc1FERGkiXQ==.OYgu/IWvslwVK5jK7m8H2O6WKJQ="/>
   <meta content="Top games" property="og:title"/>
   <meta content="itch.io" property="og:site_name"/>
   <meta content="4503599627724030" property="twitter:account_id"/>
   <link href="?page=2" rel="next"/>
   <link href="/static/manifest.json" rel="manifest"/>
   <meta content="@itchio" name="twitter:creator"/>
   <meta content="Top games" name="twitter:title"/>
   <meta content="Explore games on itch.io" name="twitter:description"/>
   <meta content="@itchio" name="twitter:site"/>
   <meta content="summary_large_image" name="twitter:card"/>
   

### Setup Browser Automation

In [102]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service  # Chrome driver service; use selenium.webdriver.edge.service instead for Microsoft Edge

# Optional: explicit path to a locally downloaded driver binary (disabled; edit the path if you enable it).
#driver_path = "C:/Users/Charles/Desktop/COLLEGE/DATA102/edgedriver_win64/msedgedriver.exe" #edit your driver's path
url = "https://itch.io/games" # listing page to open in the browser; edit as needed

# Optional: wrap the driver path in a Service object (disabled together with driver_path above).
#service = Service(driver_path)
# Start a Chrome session with default settings (no explicit Service is passed).
driver = webdriver.Chrome()

# Load the listing page; the cells below read elements from this live session.
driver.get(url)
# Debug aid: uncomment to dump the rendered HTML.
#print(driver.page_source)

### Extracting the Data

Data to extract (for now):
1. game id (`data-game_id` attribute of class=game_cell has_cover)
2. game title (class=title game_link)
3. genre (class=game_genre)
4. author (class=game_author)
5. game text (class=game_text)

-------------- need to click on the game to get the following data below--------------

6. status
7. average rating
8. rating count
9. tags 
10. average session time 
11. platforms 
12. inputs (not scraped in this notebook)

#### Auto Scrolling Algorithm 

In [103]:
# Auto-scrolling algorithm: keep scrolling to the bottom of the listing until
# enough game cells are present or the page stops growing.
#NOTE: max_game_count limits the number of games to parse through -> limits both the auto scrolling and the games list
pause = 0.5               # seconds to wait after each scroll
max_stalled_scrolls = 5   # unchanged-height scrolls tolerated before giving up
max_game_count = 50

# Defined up front so later cells can slice it even if the loop body never runs.
game_list = []
length = 0
stalled = 0
lastHeight = driver.execute_script("return document.body.scrollHeight")

while length < max_game_count:
    game_list = driver.find_elements(By.XPATH, "//div[@class='game_cell has_cover']")
    length = len(game_list)

    if length >= max_game_count:
        break

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(pause)
    newHeight = driver.execute_script("return document.body.scrollHeight")

    if newHeight == lastHeight:
        # An unchanged height after one short pause may only mean the next
        # batch has not loaded yet, so retry a few times before treating it
        # as the end of the list.
        stalled += 1
        if stalled >= max_stalled_scrolls:
            break
        continue

    stalled = 0
    lastHeight = newHeight

#### Retrieving Game Info from Homepage

In [104]:
# Keep at most max_game_count cells from the scrolled listing.
games = game_list[:max_game_count]

info = []
link_list = []

# XPaths, relative to one game cell, for the optional fields in output-column
# order: title, genre, author, short text.
field_xpaths = (
    ".//a[@class='title game_link']",
    ".//div[@class='game_genre']",
    ".//div[@class='game_author']",
    ".//div[@class='game_text']",
)

# Build one row per game: [game id, title, genre, author, text].
# NOTE: not every game has every field; a missing field becomes "N/A" so that
# all rows have the same length.
for game in games:
    # The id is read from the cell's data-game_id attribute (assumed to be
    # present on every cell).
    data = [game.get_attribute("data-game_id")]

    for xpath in field_xpaths:
        matches = game.find_elements(By.XPATH, xpath)
        data.append(matches[0].text if matches else "N/A")

    info.append(data)

print(info)


[['2955066', 'Incredibox - Sprunki', 'N/A', 'wolf_hal', 'N/A'], ['3224595', 'Little Bartmares', 'Adventure', 'David Mills', 'WHY YOU LITTLE!!!'], ['3216520', 'The Apartment 57', 'Adventure', 'Infinity Entertainment', 'is a psychological horror game set in an abandoned hotel. Players explore dark hallways and uncover dark secrets.'], ['3148668', "Pretend it's not There", 'Adventure', 'Dreadloom', "Pretend that you can't see the monster, that may be the only way to survive."], ['2513640', 'Ignited Entry', 'Adventure', 'JordiBoi', 'The corpse is alive.'], ['3170979', 'Unbothered', 'Adventure', 'Yegboi', 'N/A'], ['3104891', 'The Dark Queen of Mortholme', 'Interactive Fiction', 'Mosu', "The final boss's side of the story"], ['323822', 'Death Trips', 'Interactive Fiction', 'Forameuss', 'Extremely short horror game.'], ['2553372', 'Fallacy Quiz', 'Educational', 'Tim Krief', 'Sharpen your critical thinking skills in this quiz game that challenges you to spot rhetorical fallacies.'], ['2742018'

In [105]:
# Report how many game rows were collected from the listing page.
print(f"Number of game info: {len(info)}")

Number of game info: 50


#### Accessing Info From Each Game Link

In [106]:
# Collect each game's page URL from the href of its title link, in the same
# order as the rows gathered from the listing page.
link_list = [
    game.find_element(By.XPATH, ".//a[@class='title game_link']").get_attribute('href')
    for game in games
]

# print(f"First five: {link_list[:5]}")
print(f"Number of links: {len(link_list)}")

Number of links: 50


In [107]:
## NOTE: exploratory cell (delete before submission) -> just checking that the right info is being scraped
# open the first game's page
driver.get(link_list[0])
# scroll the 'more information' toggle into view, then click it to expand the details table
info_button = driver.find_element(By.XPATH, "//a[@class='toggle_info_btn']")
driver.execute_script("arguments[0].scrollIntoView();", info_button)
info_button.click()


In [108]:
# Exploratory read of the expanded info table on the currently open game page.
# Each lookup picks the second cell of the row whose first cell holds the label.
status = driver.find_element(By.XPATH, "//tr[td[text()='Status']]/td[2]")
print(status.text)

rating_row = driver.find_element(By.XPATH, "//tr[td[text()='Rating']]/td[2]")
# Leading "." keeps the search inside rating_row; a bare "//" searches the
# whole document even when called on an element.
rating = rating_row.find_element(By.XPATH, ".//div[@class='star_value']").get_attribute("content")
print(rating)
rating_count = rating_row.find_element(By.XPATH, ".//span[@class='rating_count']").get_attribute("content")
print(rating_count)

tags = driver.find_element(By.XPATH, "//tr[td[text()='Tags']]/td[2]")
print(tags.text)

#first link doesnt have average session time
#sesh_time = driver.find_element(By.XPATH, "//tr[td[text()='Average session']]/td[2]")
#print(sesh_time.text)

platforms = driver.find_element(By.XPATH, "//tr[td[text()='Platforms']]/td[2]")
print(platforms.text)


4.6
1172

HTML5, Windows, macOS, Linux, Android


In [109]:
#NOTE: by default len(link_list) equals max_game_count; this slice is kept so the list can be cut down further while testing
link_list = link_list[:50]
# Show how many links will be visited.
len(link_list)

50

In [110]:
#NOTE: it takes around 3.5 min for 50 links

def _value_cells(label):
    """Return the value cells (second <td>) of info-table rows whose first <td> text equals *label*."""
    return driver.find_elements(By.XPATH, f"//tr[td[text()='{label}']]/td[2]")


def _first_text(cells):
    """Return the text of the first element in *cells*, or "N/A" when *cells* is empty."""
    return cells[0].text if cells else "N/A"


# One row per link: [status, average rating, rating count, tags, average session, platforms].
more_info = []

for i, link in enumerate(link_list):

    driver.get(link)

    # The lookup itself is what raises NoSuchElementException, so it has to
    # sit inside the try for the fallback row to be produced.
    try:
        info_button = driver.find_element(By.XPATH, "//a[@class='toggle_info_btn']")
    except NoSuchElementException:
        print(f"Link {i+1}: 'More Information' button not found")
        more_info.append(["N/A"] * 6)
        continue

    # scroll and click 'more information' button
    driver.execute_script("arguments[0].scrollIntoView();", info_button)
    info_button.click()
    time.sleep(2) # pause for it load a bit

    # A game may have no Rating row at all; use find_elements throughout so
    # such a page yields "N/A" instead of aborting the whole run.
    rating = None
    rating_count = None
    rating_cells = _value_cells("Rating")
    if rating_cells:
        # Leading "." scopes the search to the rating cell rather than the
        # whole document.
        stars = rating_cells[0].find_elements(By.XPATH, ".//div[@class='star_value']")
        counts = rating_cells[0].find_elements(By.XPATH, ".//span[@class='rating_count']")
        if stars:
            rating = stars[0].get_attribute("content")
        if counts:
            rating_count = counts[0].get_attribute("content")

    more_info.append([
        _first_text(_value_cells("Status")),
        rating if rating else "N/A",
        rating_count if rating_count else "N/A",
        _first_text(_value_cells("Tags")),
        _first_text(_value_cells("Average session")),
        _first_text(_value_cells("Platforms")),
    ])


### Loading the Data into the DataFrame

In [111]:
# Turn the listing-page rows (one list per game) into a labelled DataFrame.
# `info` is rebound from the list of rows to the DataFrame.
columns = ['Game ID', 'Title', 'Genre', 'Author', 'Text']
info = pd.DataFrame(info, columns=columns)
info

Unnamed: 0,Game ID,Title,Genre,Author,Text
0,2955066,Incredibox - Sprunki,,wolf_hal,
1,3224595,Little Bartmares,Adventure,David Mills,WHY YOU LITTLE!!!
2,3216520,The Apartment 57,Adventure,Infinity Entertainment,is a psychological horror game set in an aband...
3,3148668,Pretend it's not There,Adventure,Dreadloom,"Pretend that you can't see the monster, that m..."
4,2513640,Ignited Entry,Adventure,JordiBoi,The corpse is alive.
5,3170979,Unbothered,Adventure,Yegboi,
6,3104891,The Dark Queen of Mortholme,Interactive Fiction,Mosu,The final boss's side of the story
7,323822,Death Trips,Interactive Fiction,Forameuss,Extremely short horror game.
8,2553372,Fallacy Quiz,Educational,Tim Krief,Sharpen your critical thinking skills in this ...
9,2742018,Cursed Apartment,Adventure,Horror Ponds,Anomalies + Survival Horror


In [112]:
# Turn the per-game detail rows into a labelled DataFrame.
# `more_info` is rebound from the list of rows to the DataFrame.
columns = ['Status', 'Average Rating', 'Rating Count', 'Tags', 'Average Session Time', 'Platforms']
more_info = pd.DataFrame(more_info, columns=columns)
more_info.head()

Unnamed: 0,Status,Average Rating,Rating Count,Tags,Average Session Time,Platforms
0,Released,4.6,1172,"Cute, Fangame, Horror, Incredibox, minigames, ...",,"HTML5, Windows, macOS, Linux, Android"
1,Released,4.6,27,"3D, Atmospheric, Horror, PSX (PlayStation), Ps...",,"Windows, macOS, Linux"
2,Released,4.1,84,"Creepy, Dark, Horror, Indie, Multiple Endings,...",,Windows
3,Released,4.4,265,"3D, Atmospheric, First-Person, Horror, PSX (Pl...",A few seconds,"Windows, macOS"
4,Released,4.8,298,"Atmospheric, Horror, Low-poly, PSX (PlayStatio...",About an hour,Windows


In [113]:
# NOTE: `columns` is assigned here but not used in this cell.
columns = ['Game ID', 'Title', 'Genre', 'Author', 'Text', 'Status', 'Average Rating', 'Rating Count', 'Tags', 'Average Session Time', 'Platforms']
# Place the listing-page columns and the detail-page columns side by side (axis=1).
result = pd.concat([info, more_info], axis=1)
df = pd.DataFrame(result)
df

Unnamed: 0,Game ID,Title,Genre,Author,Text,Status,Average Rating,Rating Count,Tags,Average Session Time,Platforms
0,2955066,Incredibox - Sprunki,,wolf_hal,,Released,4.6,1172,"Cute, Fangame, Horror, Incredibox, minigames, ...",,"HTML5, Windows, macOS, Linux, Android"
1,3224595,Little Bartmares,Adventure,David Mills,WHY YOU LITTLE!!!,Released,4.6,27,"3D, Atmospheric, Horror, PSX (PlayStation), Ps...",,"Windows, macOS, Linux"
2,3216520,The Apartment 57,Adventure,Infinity Entertainment,is a psychological horror game set in an aband...,Released,4.1,84,"Creepy, Dark, Horror, Indie, Multiple Endings,...",,Windows
3,3148668,Pretend it's not There,Adventure,Dreadloom,"Pretend that you can't see the monster, that m...",Released,4.4,265,"3D, Atmospheric, First-Person, Horror, PSX (Pl...",A few seconds,"Windows, macOS"
4,2513640,Ignited Entry,Adventure,JordiBoi,The corpse is alive.,Released,4.8,298,"Atmospheric, Horror, Low-poly, PSX (PlayStatio...",About an hour,Windows
5,3170979,Unbothered,Adventure,Yegboi,,Released,4.3,52,"Aliens, First-Person, Funny, Horror, storygame",,
6,3104891,The Dark Queen of Mortholme,Interactive Fiction,Mosu,The final boss's side of the story,Released,4.9,431,"2D, Atmospheric, Boss battle, Dark Fantasy, Fa...",,"Windows, macOS, Linux"
7,323822,Death Trips,Interactive Fiction,Forameuss,Extremely short horror game.,Released,4.6,616,"3D, Dark, First-Person, Halloween, Horrible, H...",A few seconds,"Windows, macOS, Linux"
8,2553372,Fallacy Quiz,Educational,Tim Krief,Sharpen your critical thinking skills in this ...,Released,4.8,23,"Brain Training, Casual, fallacy, Hand-drawn, I...",About a half-hour,"Windows, macOS, Linux"
9,2742018,Cursed Apartment,Adventure,Horror Ponds,Anomalies + Survival Horror,Released,4.3,29,"3D, anomaly, Creepy, First-Person, Horror, Mul...",About a half-hour,"Windows, macOS"


In [114]:
# Count null values per column in the first 50 rows.
# Missing fields were stored as the string "N/A" during scraping, so they are not counted as null here.
df[:50].isnull().sum()

Game ID                 0
Title                   0
Genre                   0
Author                  0
Text                    0
Status                  0
Average Rating          0
Rating Count            0
Tags                    0
Average Session Time    0
Platforms               0
dtype: int64

In [115]:
# Count the "N/A" placeholder strings per column in the first 50 rows.
na_count = (df[:50] == "N/A").sum()
na_count

Game ID                  0
Title                    0
Genre                    3
Author                   0
Text                     3
Status                   0
Average Rating           0
Rating Count             0
Tags                     1
Average Session Time    21
Platforms                2
dtype: int64

### Exporting the DataFrame to CSV

In [116]:
# Write the combined table to a CSV file; index=False leaves out the row index.
df.to_csv("Homework 1 - itch.io Game List.csv",index=False)