# DATA102 Homework 1: Web Scraping

**Group Number**: 7 <br/>
**Members**:
- Jose Maria Angelo Guerra
- Kyle Carlo Lasala
- Katrina Bianca Roco
- Antonio Jose Maria Lorenzo
- Josh Angelo Theodore Borro
- Charles Joseph Hinolan

**Section**: S11

**Instructor**: Mr. Jude Michael Teves

### Import Libraries

In [14]:
import requests
import numpy as np
import pandas as pd
import time
from threading import Thread
from multiprocessing import Pool
from IPython.display import clear_output

# Selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service #microsft edge, change to webdriver.chrome.service for chrome

### Parse Using BeautifulSoup

In [2]:
from bs4 import BeautifulSoup
page = "https://itch.io/games"
#contact is my personal email
headers = {"User-Agent": "EducationalScraper/1.0 (contact: hinolancj@gmail.com)"}

# Disallowed paths based on itch.io/robots.txt
disallowed_paths = ["/embed/", "/embed-upload/", "/search", "/checkout/", "/game/download/", "/bundle/download/", "/register-for-purchase/", "/email-feedback/"]

# Function to check if URL is allowed
def is_allowed(url):
    for path in disallowed_paths:
        if path in url:
            return False
    return True


url = f"{page}"
if is_allowed(url):
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        print(soup.prettify())
        time.sleep(2)
    except requests.exceptions.RequestException as e:
        print("Error: Unable to fetch the page.")
else:
    print(f"Skipping disallowed URL: {url}")

<!DOCTYPE HTML>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="IBM HomePage Builder 2001 V5.0.0 for Windows" name="GENERATOR"/>
  <meta content="3BB4D18369B9C21326AF7A99FCCC5A09" name="msvalidate.01">
   <meta content="537395183072744" property="fb:app_id"/>
   <title>
    Top games - itch.io
   </title>
   <meta name="csrf_token" value="WyJWQ3Z0IiwxNzM4MDA1MjI4LCJMcWhRYVc2SUlycGRMaVQiXQ==.cL8RsNi29dKahzGg2hViMJvN84I="/>
   <meta content="Top games" property="og:title"/>
   <meta content="itch.io" property="og:site_name"/>
   <meta content="4503599627724030" property="twitter:account_id"/>
   <link href="?page=2" rel="next"/>
   <link href="/static/manifest.json" rel="manifest"/>
   <meta content="@itchio" name="twitter:creator"/>
   <meta content="Top games" name="twitter:title"/>
   <meta content="Explore games on itch.io" name="twitter:description"/>
   <meta content="@itchio" name="twitter:site"/>
   <meta content="summary_large_image" name="twitter:card"/>
   

### Setup Browser Automation

In [27]:
driver_path = "C:/Users/Kyle Carlo C. Lasala/Documents/CODING/Python/DATA102/driver/chromedriver.exe" #edit your driver's path
url = "https://itch.io/games" 

#service = Service(driver_path)
driver = webdriver.Chrome()

driver.get(url)

### Extracting the Data

data to extract (for now):
1. game id (class=game_cell has_cover)
2. game title (class=game_title)
3. genre (class=game_genre)
4. author (class=game_author)
5. game text (class=game_text)

-------------- need to click on the game to get the following data below--------------

6. status
7. average rating
8. rating count
9. tags 
10. average session time 
11. platforms 
12. inputs -> i forgot to scrape heh

#### Auto Scrolling Algorithm 

In [4]:
#auto scrolling algorithm 
#NOTE: max_game_count limits the number of games to parse through -> limits both the auto scrolling and the games list
pause = 0.5
lastHeight = driver.execute_script("return document.body.scrollHeight")

length = 0
max_game_count = 1500

while length < max_game_count:
    game_list = driver.find_elements(By.XPATH,"//div[@class='game_cell has_cover']")
    length = len(game_list)
    
    # checking progress
    clear_output(wait=True)
    print('Games Loaded:', length)
    
    if length >= max_game_count:
        break
        
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(pause)
    newHeight = driver.execute_script("return document.body.scrollHeight")
    if newHeight == lastHeight:
        break
    lastHeight = newHeight
print('DONE!')

Games Loaded: 1508
DONE!


#### Retrieving Game Info from Homepage

In [6]:
# truncate based on max_game_count
games = game_list[:max_game_count]

#extract game data
#NOTE: some games do not have all the data, so we need to account for that by adding N/A if the data is not present in order for the lists to be the same length
#NOTE: moved the code for obtaining game links here because for some reason it always got different lengths of game lists
def append_to_data(*args, game_id, link, data_list):
    data_list.append(game_id)
    for each in args:
        # check for null data
        data_list.append("N/A" if not each else each[0].text)
    data_list.append(link)
        
def retrieve_games_info(start_index, end_index, games_info):
    for game in games[start_index:end_index]:
        data = []
        # all games are guaranteed to have a game_id
        game_id = game.get_attribute("data-game_id")    
        title = game.find_elements(By.XPATH, ".//a[@class='title game_link']")
        genre = game.find_elements(By.XPATH, ".//div[@class='game_genre']")
        author = game.find_elements(By.XPATH, ".//div[@class='game_author']")
        text = game.find_elements(By.XPATH, ".//div[@class='game_text']")
        link = game.find_element(By.XPATH, ".//a[@class='title game_link']").get_attribute('href')
        
        # append the game_id, title, genre, author, and text to data array
        append_to_data(title, genre, author, text, game_id=game_id, link=link, data_list=data)
    
        # append the data array to games_info numpy array
        games_info = np.vstack((games_info, data))
    return games_info

In [7]:
# create a thread to retrieve the game info from scraped games
class RetrieveThread(Thread):
    def __init__(self, start_index, end_index): 
        Thread.__init__(self) 
        self.start_index = start_index
        self.end_index = end_index
        self.games_info = np.empty(shape=[0,6])

    def run(self):
        self.games_info = retrieve_games_info(self.start_index, self.end_index, games_info=self.games_info)

In [8]:
num_threads = 2
index_interval = max_game_count // num_threads
threads = []

# initializing the threads
for each in range(num_threads):
    start_index = each * index_interval
    # making sure the end_index is the last index
    end_index = max_game_count if each == num_threads - 1 else each * index_interval + index_interval
    t = RetrieveThread(start_index, end_index)
    t.start()
    threads.append(t)

# sync
for each in threads:
    each.join()

In [11]:
# combine all data from threads
games_info = np.empty(shape=[0,6])
for each in threads:
    games_info = np.vstack((games_info, each.games_info))

games_info_df = pd.DataFrame(games_info)
games_info_df.columns = ['game_id', 'title', 'genre', 'author', 'text', 'link']
games_info_df

Unnamed: 0,game_id,title,genre,author,text,link
0,2955066,Incredibox - Sprunki,,wolf_hal,,https://wolf-hal.itch.io/incredibox-sprunki
1,3224595,Little Bartmares,Adventure,David Mills,WHY YOU LITTLE!!!,https://shadowband.itch.io/little-bartmares
2,3216520,The Apartment 57,Adventure,Infinity Entertainment,is a psychological horror game set in an aband...,https://infinity-entertainment.itch.io/the-apa...
3,3148668,Pretend it's not There,Adventure,Dreadloom,"Pretend that you can't see the monster, that m...",https://dreadloom-studios.itch.io/pretend-its-...
4,2513640,Ignited Entry,Adventure,JordiBoi,The corpse is alive.,https://jordiboi.itch.io/ignited-entry
...,...,...,...,...,...,...
1495,2978259,Raldi Crackhouse Android Edition (Beta),,BMAS2,,https://bmas2.itch.io/raldi-crack-house-androi...
1496,980953,DISMEMBER MIND 1,Adventure,ReveVoodoo,,https://revevoodoo.itch.io/dismember-mind
1497,88189,I Suddenly Became A Magical Girl To Fight Alie...,Visual Novel,Mado,"A visual novel about magical girls, aliens, an...",https://madocactus.itch.io/isuddenlybecameamag...
1498,976655,Astral Green,Shooter,skr33t,"An FPS inspired by mid 90s shooters like DOOM,...",https://skr33t.itch.io/astral-green


In [12]:
# close the current instance of driver
driver.quit()

In [17]:
link_list = games_info_df['link'][:50]
print(len(link_list))

50


In [30]:
import worker # for jupyter notebook, the code should be in a separate file for multiprocessing to work!

if __name__ == "__main__": # necessary for main process
    # WARNING: Please limit the Pool equal to your physical processors!!! Ready your PC fans!
    p = Pool(4)
    output = p.map(worker.scrape_more_info, link_list[:4])
    print(output)

[['Released', '4.6', '1176', 'Cute, Fangame, Horror, Incredibox, minigames, Sprunki', 'N/A', 'HTML5, Windows, macOS, Linux, Android'], ['Released', '4.5', '29', '3D, Atmospheric, Horror, PSX (PlayStation), Psychological Horror, Retro, Singleplayer, Survival Horror', 'N/A', 'Windows, macOS, Linux'], ['Released', '4.1', '90', 'Creepy, Dark, Horror, Indie, Multiple Endings, Psychological Horror, Retro, Short, Spooky, Unity', 'N/A', 'Windows'], ['Released', '4.3', '273', '3D, Atmospheric, First-Person, Horror, PSX (PlayStation), Psychological Horror, Short, Singleplayer, Unity', 'A few seconds', 'Windows, macOS']]


#### Accessing Info From Each Game Link

In [13]:
# getting the links 
link_list = games_info_df['link']
print(f"Number of links: {len(link_list)}")

Number of links: 1500


In [15]:
## delete before submission -> just checking if the im scrapping the right info
# accessing first game
driver.get(link_list[0])
# scroll and click 'more information' button 
info_button = driver.find_element(By.XPATH, "//a[@class='toggle_info_btn']")
driver.execute_script("arguments[0].scrollIntoView();", info_button)
info_button.click()

In [19]:
status = driver.find_element(By.XPATH, "//tr[td[text()='Status']]/td[2]")
print(status.text)

rating_row = driver.find_element(By.XPATH, "//tr[td[text()='Rating']]/td[2]")
rating = rating_row.find_element(By.XPATH, "//div[@class='star_value']").get_attribute("content")
print(rating)
rating_count = rating_row.find_element(By.XPATH, "//span[@class='rating_count']").get_attribute("content")
print(rating_count)

tags = driver.find_element(By.XPATH, "//tr[td[text()='Tags']]/td[2]")
print(tags.text)

#first link doesnt have average session time
#sesh_time = driver.find_element(By.XPATH, "//tr[td[text()='Average session']]/td[2]")
#print(sesh_time.text)

platforms = driver.find_element(By.XPATH, "//tr[td[text()='Platforms']]/td[2]")
print(platforms.text)

Released
4.6
1175
Cute, Fangame, Horror, Incredibox, minigames, Sprunki
HTML5, Windows, macOS, Linux, Android


In [102]:
# class MoreRetrieveThread(Thread):
#     def __init__(self, start_index, end_index): 
#         Thread.__init__(self) 
#         self.start_index = start_index
#         self.end_index = end_index
#         self.games_info = np.empty(shape=[0,6])

#     def run(self):
#         self.games_info = retrieve_games_info(self.start_index, self.end_index, games_info=self.games_info)

'https://wolf-hal.itch.io/incredibox-sprunki'

In [28]:
#NOTE: it takes around 3.5 min for 50 links

more_info = []

for i in range(len(link_list[:4])):

    data = []

    driver.get(link_list[i])

    info_button = driver.find_element(By.XPATH, "//a[@class='toggle_info_btn']")

    # scroll and click 'more information' button 
    try:
        driver.execute_script("arguments[0].scrollIntoView();", info_button)
        info_button.click()
        time.sleep(2) # pause for it load a bit

    except NoSuchElementException:
        print(f"Link {i+1}: 'More Information' button not found")
        data.extend(["N/A", "N/A", "N/A", "N/A", "N/A", "N/A"])
        more_info.append(data)
        continue

    status = driver.find_elements(By.XPATH, "//tr[td[text()='Status']]/td[2]")

    rating_row = driver.find_element(By.XPATH, "//tr[td[text()='Rating']]/td[2]")
    rating = rating_row.find_element(By.XPATH, "//div[@class='star_value']").get_attribute("content")
    rating_count = rating_row.find_element(By.XPATH, "//span[@class='rating_count']").get_attribute("content")

    tags = driver.find_elements(By.XPATH, "//tr[td[text()='Tags']]/td[2]")

    sesh_time = driver.find_elements(By.XPATH, "//tr[td[text()='Average session']]/td[2]")
    platforms = driver.find_elements(By.XPATH, "//tr[td[text()='Platforms']]/td[2]")

    if status:
        data.append(status[0].text)
    else:
        data.append("N/A")

    if rating:
        data.append(rating)
    else:
        data.append("N/A")

    if rating_count:
        data.append(rating_count)
    else:
        data.append("N/A")

    if tags:
        data.append(tags[0].text)
    else:
        data.append("N/A")

    if sesh_time:
        data.append(sesh_time[0].text)
    else:
        data.append("N/A")

    if platforms:
        data.append(platforms[0].text)
    else:
        data.append("N/A")
    
    more_info.append(data)


### Loading the Data into the DataFrame

In [22]:
columns = ['Game ID', 'Title', 'Genre', 'Author', 'Text']
info = pd.DataFrame(info, columns=columns)
info

Unnamed: 0,Game ID,Title,Genre,Author,Text
0,2955066,Incredibox - Sprunki,,wolf_hal,
1,3224595,Little Bartmares,Adventure,David Mills,WHY YOU LITTLE!!!
2,3216520,The Apartment 57,Adventure,Infinity Entertainment,is a psychological horror game set in an aband...
3,3148668,Pretend it's not There,Adventure,Dreadloom,"Pretend that you can't see the monster, that m..."
4,2513640,Ignited Entry,Adventure,JordiBoi,The corpse is alive.
5,3170979,Unbothered,Adventure,Yegboi,
6,566214,Last Train Home,Adventure,hby,A short game about a guy stuck on a train.
7,2553372,Fallacy Quiz,Educational,Tim Krief,Sharpen your critical thinking skills in this ...
8,1969893,TOUCHSTARVED: A Dark Romance Visual Novel,Visual Novel,Red Spring Studio,Find a cure for your curse by entrusting your ...
9,3264682,FATAL CHOICE,Adventure,kyezdev,Don't make the wrong choice.. think before cho...


In [23]:
columns = ['Status', 'Average Rating', 'Rating Count', 'Tags', 'Average Session Time', 'Platforms']
more_info = pd.DataFrame(more_info, columns=columns)
more_info.head()

Unnamed: 0,Status,Average Rating,Rating Count,Tags,Average Session Time,Platforms
0,Released,4.6,1175,"Cute, Fangame, Horror, Incredibox, minigames, ...",,"HTML5, Windows, macOS, Linux, Android"
1,Released,4.5,29,"3D, Atmospheric, Horror, PSX (PlayStation), Ps...",,"Windows, macOS, Linux"
2,Released,4.1,87,"Creepy, Dark, Horror, Indie, Multiple Endings,...",,Windows
3,Released,4.4,271,"3D, Atmospheric, First-Person, Horror, PSX (Pl...",A few seconds,"Windows, macOS"
4,Released,4.8,300,"Atmospheric, Horror, Low-poly, PSX (PlayStatio...",About an hour,Windows


In [24]:
columns = ['Game ID', 'Title', 'Genre', 'Author', 'Text', 'Status', 'Average Rating', 'Rating Count', 'Tags', 'Average Session Time', 'Platforms']
result = pd.concat([info, more_info], axis=1)
df = pd.DataFrame(result)
df

Unnamed: 0,Game ID,Title,Genre,Author,Text,Status,Average Rating,Rating Count,Tags,Average Session Time,Platforms
0,2955066,Incredibox - Sprunki,,wolf_hal,,Released,4.6,1175,"Cute, Fangame, Horror, Incredibox, minigames, ...",,"HTML5, Windows, macOS, Linux, Android"
1,3224595,Little Bartmares,Adventure,David Mills,WHY YOU LITTLE!!!,Released,4.5,29,"3D, Atmospheric, Horror, PSX (PlayStation), Ps...",,"Windows, macOS, Linux"
2,3216520,The Apartment 57,Adventure,Infinity Entertainment,is a psychological horror game set in an aband...,Released,4.1,87,"Creepy, Dark, Horror, Indie, Multiple Endings,...",,Windows
3,3148668,Pretend it's not There,Adventure,Dreadloom,"Pretend that you can't see the monster, that m...",Released,4.4,271,"3D, Atmospheric, First-Person, Horror, PSX (Pl...",A few seconds,"Windows, macOS"
4,2513640,Ignited Entry,Adventure,JordiBoi,The corpse is alive.,Released,4.8,300,"Atmospheric, Horror, Low-poly, PSX (PlayStatio...",About an hour,Windows
5,3170979,Unbothered,Adventure,Yegboi,,Released,4.2,56,"Aliens, First-Person, Funny, Horror, storygame",,
6,566214,Last Train Home,Adventure,hby,A short game about a guy stuck on a train.,Released,4.5,2225,"2D, Atmospheric, Horror, Pixel Art",,"HTML5, Windows"
7,2553372,Fallacy Quiz,Educational,Tim Krief,Sharpen your critical thinking skills in this ...,Released,4.8,23,"Brain Training, Casual, fallacy, Hand-drawn, I...",About a half-hour,"Windows, macOS, Linux"
8,1969893,TOUCHSTARVED: A Dark Romance Visual Novel,Visual Novel,Red Spring Studio,Find a cure for your curse by entrusting your ...,In development,4.9,2066,"Dating Sim, Fantasy, Horror, Indie, Monsters, ...",A few hours,"Windows, macOS"
9,3264682,FATAL CHOICE,Adventure,kyezdev,Don't make the wrong choice.. think before cho...,Released,5.0,3,"3D, Atmospheric, Creepy, First-Person, Horror,...",,Windows


In [114]:
df[:50].isnull().sum()

Game ID                 0
Title                   0
Genre                   0
Author                  0
Text                    0
Status                  0
Average Rating          0
Rating Count            0
Tags                    0
Average Session Time    0
Platforms               0
dtype: int64

In [115]:
na_count = (df[:50] == "N/A").sum()
na_count

Game ID                  0
Title                    0
Genre                    3
Author                   0
Text                     3
Status                   0
Average Rating           0
Rating Count             0
Tags                     1
Average Session Time    21
Platforms                2
dtype: int64

### Exporting the DataFrame to CSV

In [116]:
df.to_csv("Homework 1 - itch.io Game List.csv",index=False)