# DATA102 Homework 1: Web Scraping

**Group Number**: 3
**Members**:
- Jose Maria Angelo Guerra
- Kyle Carlo Lasala
- Katrina Bianca Roco
- Antonio Jose Maria Lorenzo
- Josh Angelo Theodore Borro
- Charles Joseph Hinolan

**Section**: S11

**Instructor**: Mr. Jude Michael Teves

### Import Libraries

In [303]:
import requests
import pandas as pd
import time

### Parse Using BeautifulSoup

In [None]:
from bs4 import BeautifulSoup
# Listing page that the scraper starts from
page = "https://itch.io/games"
# Identify the scraper to the site; the contact address is a personal email
headers = {
    "User-Agent": "EducationalScraper/1.0 (contact: hinolancj@gmail.com)",
}

# Path fragments listed as disallowed in itch.io/robots.txt
disallowed_paths = [
    "/embed/",
    "/embed-upload/",
    "/search",
    "/checkout/",
    "/game/download/",
    "/bundle/download/",
    "/register-for-purchase/",
    "/email-feedback/",
]


def is_allowed(url):
    """Return True unless *url* contains one of the disallowed path fragments.

    The check is a plain substring test against every entry of
    ``disallowed_paths``.
    """
    return not any(fragment in url for fragment in disallowed_paths)


# Fetch the listing page once (only if robots.txt permits it) and parse it.
url = page
if is_allowed(url):
    # Send the identifying User-Agent from `headers`; without it the request
    # goes out with the library's default agent and the contact info is lost.
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    print(soup.prettify())
    time.sleep(2)  # be polite: pause before any further request
else:
    print(f"Skipping disallowed URL: {url}")

### Setup Browser Automation

In [305]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service  # Chrome driver service; for Microsoft Edge, import Service from selenium.webdriver.edge.service instead

# Path to the local WebDriver executable -- edit this for your own machine.
# NOTE(review): this path points at an Edge driver (msedgedriver.exe) while the
# code below uses the Chrome Service/driver classes -- confirm which browser is intended.
driver_path = "C:/Users/Charles/Desktop/COLLEGE/DATA102/edgedriver_win64/msedgedriver.exe" #edit your driver's path
# Page the automated browser opens first -- edit as needed.
url = "https://itch.io/games" # edit me

# NOTE(review): `service` is created here but never passed to webdriver.Chrome()
# below, so `driver_path` currently has no effect on which driver is launched.
service = Service(driver_path)
driver = webdriver.Chrome()

# Load the listing page in the automated browser.
driver.get(url)
#print(driver.page_source)

### Extracting the Data

data to extract (for now):
1. game id (`data-game_id` attribute of the element with class=game_cell has_cover)
2. game title (class=game_title)
3. genre (class=game_genre)
4. author (class=game_author)
5. game text (class=game_text)

-------------- need to click on the game to get the following data below--------------

6. status
7. average rating
8. rating count
9. tags 
10. average session time 
11. platforms 
12. inputs -> not scraped yet (forgotten in this pass)

#### Auto Scrolling Algorithm 

In [306]:
# Auto-scrolling: jump to the bottom of the page repeatedly and stop as soon
# as the document height no longer changes between two consecutive scrolls.
# TODO: make it retrieve 1000 games
pause = 0.5
lastHeight = driver.execute_script("return document.body.scrollHeight")

keep_scrolling = True
while keep_scrolling:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(pause)  # wait before measuring the page height again
    newHeight = driver.execute_script("return document.body.scrollHeight")
    # An unchanged height ends the loop; otherwise remember the new height.
    keep_scrolling = newHeight != lastHeight
    lastHeight = newHeight

#### Retrieving Game Info from Homepage

In [None]:
# Grab every game card currently present on the listing page
games = driver.find_elements(By.XPATH, "//div[@class='game_cell has_cover']")

# XPaths (relative to one game card) of the optional fields, in column order
_CARD_FIELD_XPATHS = (
    ".//a[@class='title game_link']",  # title
    ".//div[@class='game_genre']",     # genre
    ".//div[@class='game_author']",    # author
    ".//div[@class='game_text']",      # short description text
)


def _text_or_na(elements):
    """Return the text of the first element, or "N/A" when nothing matched."""
    return elements[0].text if elements else "N/A"


# One row per game: [game_id, title, genre, author, text].
# Missing fields are filled with "N/A" so every row has the same length.
info = []
for game in games:
    row = [game.get_attribute("data-game_id")]
    for field_xpath in _CARD_FIELD_XPATHS:
        row.append(_text_or_na(game.find_elements(By.XPATH, field_xpath)))
    info.append(row)

print(info)


#### Accessing Info From Each Game Link

In [308]:
# Collect the detail-page URL (href of the title link) of every game card
link_list = [
    game.find_element(By.XPATH, ".//a[@class='title game_link']").get_attribute('href')
    for game in games
]

# print(f"First five: {link_list[:5]}")
print(f"Number of links: {len(link_list)}")

Number of links: 1523


In [309]:
## delete before submission -> sanity check that the right info is being scraped
# open the first game's page
first_link = link_list[0]
driver.get(first_link)
# find the 'more information' toggle, scroll it into view, then click it
info_button = driver.find_element(By.XPATH, "//a[@class='toggle_info_btn']")
driver.execute_script("arguments[0].scrollIntoView();", info_button)
info_button.click()


In [310]:
# Probe the info table of the currently open game page. Each XPath selects the
# second <td> of the row whose <td> label text matches the given name.
status = driver.find_element(By.XPATH, "//tr[td[text()='Status']]/td[2]")
print(status.text)

rating_row = driver.find_element(By.XPATH, "//tr[td[text()='Rating']]/td[2]")
# An XPath starting with "//" is evaluated against the whole document even when
# called on an element; ".//" restricts the lookup to the rating cell itself.
rating = rating_row.find_element(By.XPATH, ".//div[@class='star_value']").get_attribute("content")
print(rating)
rating_count = rating_row.find_element(By.XPATH, ".//span[@class='rating_count']").get_attribute("content")
print(rating_count)

tags = driver.find_element(By.XPATH, "//tr[td[text()='Tags']]/td[2]")
print(tags.text)

# The first link has no "Average session" row, so this lookup stays disabled:
#sesh_time = driver.find_element(By.XPATH, "//tr[td[text()='Average session']]/td[2]")
#print(sesh_time.text)

platforms = driver.find_element(By.XPATH, "//tr[td[text()='Platforms']]/td[2]")
print(platforms.text)


4.6
1166
Cute, Fangame, Horror, Incredibox, minigames, Sprunki
HTML5, Windows, macOS, Linux, Android


In [311]:
# NOTE: only the first 50 links are scraped for now, just for checking
sample_size = 50
link_list = link_list[:sample_size]
len(link_list)

50

In [312]:
def _cell_text_or_na(elements):
    """Return the text of the first element in *elements*, or "N/A" if empty."""
    return elements[0].text if elements else "N/A"


def _content_attr_or_na(parents, xpath):
    """Return the ``content`` attribute of the first *xpath* match under parents[0].

    Yields "N/A" when *parents* is empty, when nothing matches *xpath*, or when
    the attribute is missing/empty.
    """
    if not parents:
        return "N/A"
    matches = parents[0].find_elements(By.XPATH, xpath)
    if not matches:
        return "N/A"
    return matches[0].get_attribute("content") or "N/A"


# One row per visited link:
# [status, average rating, rating count, tags, average session time, platforms]
more_info = []

for i, link in enumerate(link_list):
    driver.get(link)

    # The lookup of the 'more information' toggle must sit inside the try:
    # find_element is the call that raises NoSuchElementException.
    try:
        info_button = driver.find_element(By.XPATH, "//a[@class='toggle_info_btn']")
        driver.execute_script("arguments[0].scrollIntoView();", info_button)
        info_button.click()
        time.sleep(2)  # pause for it to load a bit
    except NoSuchElementException:
        print(f"Link {i+1}: 'More Information' button not found")
        more_info.append(["N/A"] * 6)
        continue

    # find_elements returns an empty list instead of raising, so a page
    # without a Rating row yields "N/A" rather than aborting the whole run.
    rating_cell = driver.find_elements(By.XPATH, "//tr[td[text()='Rating']]/td[2]")

    more_info.append([
        _cell_text_or_na(driver.find_elements(By.XPATH, "//tr[td[text()='Status']]/td[2]")),
        # ".//" keeps these lookups inside the rating cell; a leading "//"
        # would search the entire document.
        _content_attr_or_na(rating_cell, ".//div[@class='star_value']"),
        _content_attr_or_na(rating_cell, ".//span[@class='rating_count']"),
        _cell_text_or_na(driver.find_elements(By.XPATH, "//tr[td[text()='Tags']]/td[2]")),
        _cell_text_or_na(driver.find_elements(By.XPATH, "//tr[td[text()='Average session']]/td[2]")),
        _cell_text_or_na(driver.find_elements(By.XPATH, "//tr[td[text()='Platforms']]/td[2]")),
    ])


### Loading the Data into the DataFrame

In [313]:
# Turn the homepage rows into a DataFrame with one named column per field
columns = [
    'Game ID',
    'Title',
    'Genre',
    'Author',
    'Text',
]
info = pd.DataFrame(data=info, columns=columns)
info

Unnamed: 0,Game ID,Title,Genre,Author,Text
0,2955066,Incredibox - Sprunki,,wolf_hal,
1,3216520,The Apartment 57,Adventure,Infinity Entertainment,is a psychological horror game set in an aband...
2,3148668,Pretend it's not There,Adventure,Dreadloom,"Pretend that you can't see the monster, that m..."
3,2077106,A Date with Death,Visual Novel,Two and a Half Studios,Enter a week long bet against the Grim Reaper ...
4,3106450,Crusty Proto,Action,3DI70R,Glitchy PS1 horror slop straight from 1997
...,...,...,...,...,...
1518,1731910,HACK THE PLANET,Puzzle,Jumpgate Games,hack the cops and free the world. a ludum dare...
1519,196758,Onirism,Adventure,onirism,You like great old school adventure titles lik...
1520,934627,Blind Drive,Action,lofipeople,"Audio-based, black comedy arcade action game."
1521,47515,Slash Quest,Action,Big Green Pillow,Your sword grows after each kill


In [314]:
# Turn the per-game detail rows into a DataFrame and preview the first rows
columns = [
    'Status',
    'Average Rating',
    'Rating Count',
    'Tags',
    'Average Session Time',
    'Platforms',
]
more_info = pd.DataFrame(data=more_info, columns=columns)
more_info.head()

Unnamed: 0,Status,Average Rating,Rating Count,Tags,Average Session Time,Platforms
0,Released,4.6,1166,"Cute, Fangame, Horror, Incredibox, minigames, ...",,"HTML5, Windows, macOS, Linux, Android"
1,Released,4.2,76,"Creepy, Dark, Horror, Indie, Multiple Endings,...",,Windows
2,Released,4.3,255,"3D, Atmospheric, First-Person, Horror, PSX (Pl...",A few seconds,"Windows, macOS"
3,Released,4.9,2877,"Amare, chat-sim, chat-simulator, Gay, LGBT, LG...",About an hour,"Windows, macOS, Linux"
4,Released,4.8,326,"Creepy, Funny, glitch, Horror, PSX (PlayStatio...",A few minutes,"HTML5, Windows"


In [315]:
# NOTE(review): `columns` is assigned here but not used by the statements below.
columns = [
    'Game ID', 'Title', 'Genre', 'Author', 'Text',
    'Status', 'Average Rating', 'Rating Count', 'Tags',
    'Average Session Time', 'Platforms',
]
# Put the homepage fields and the detail fields side by side, column-wise
result = pd.concat(objs=[info, more_info], axis="columns")
df = pd.DataFrame(data=result)
df

Unnamed: 0,Game ID,Title,Genre,Author,Text,Status,Average Rating,Rating Count,Tags,Average Session Time,Platforms
0,2955066,Incredibox - Sprunki,,wolf_hal,,Released,4.6,1166,"Cute, Fangame, Horror, Incredibox, minigames, ...",,"HTML5, Windows, macOS, Linux, Android"
1,3216520,The Apartment 57,Adventure,Infinity Entertainment,is a psychological horror game set in an aband...,Released,4.2,76,"Creepy, Dark, Horror, Indie, Multiple Endings,...",,Windows
2,3148668,Pretend it's not There,Adventure,Dreadloom,"Pretend that you can't see the monster, that m...",Released,4.3,255,"3D, Atmospheric, First-Person, Horror, PSX (Pl...",A few seconds,"Windows, macOS"
3,2077106,A Date with Death,Visual Novel,Two and a Half Studios,Enter a week long bet against the Grim Reaper ...,Released,4.9,2877,"Amare, chat-sim, chat-simulator, Gay, LGBT, LG...",About an hour,"Windows, macOS, Linux"
4,3106450,Crusty Proto,Action,3DI70R,Glitchy PS1 horror slop straight from 1997,Released,4.8,326,"Creepy, Funny, glitch, Horror, PSX (PlayStatio...",A few minutes,"HTML5, Windows"
...,...,...,...,...,...,...,...,...,...,...,...
1518,1731910,HACK THE PLANET,Puzzle,Jumpgate Games,hack the cops and free the world. a ludum dare...,,,,,,
1519,196758,Onirism,Adventure,onirism,You like great old school adventure titles lik...,,,,,,
1520,934627,Blind Drive,Action,lofipeople,"Audio-based, black comedy arcade action game.",,,,,,
1521,47515,Slash Quest,Action,Big Green Pillow,Your sword grows after each kill,,,,,,


In [327]:
# Per-column count of null values in the first 50 rows
# ("N/A" placeholder strings are not null, so they are not counted here)
df.iloc[:50].isna().sum()

Game ID                 0
Title                   0
Genre                   0
Author                  0
Text                    0
Status                  0
Average Rating          0
Rating Count            0
Tags                    0
Average Session Time    0
Platforms               0
dtype: int64

In [324]:
# Per-column count of "N/A" placeholder strings in the first 50 rows
na_count = df.iloc[:50].eq("N/A").sum()
na_count

Game ID                  0
Title                    0
Genre                    7
Author                   0
Text                     4
Status                   0
Average Rating           0
Rating Count             0
Tags                     0
Average Session Time    23
Platforms                0
dtype: int64

### Exporting the DataFrame to CSV