In [None]:
"""This notebook will run on cloud notebooks only. For running 
this locally, see the comments below in the cell after the 
selenium installation cell."""
# Print a one-line banner as this cell's output.
print("Final Notebook")

Final Notebook


In [None]:
#Importing the required libraries

#Importing the requests library to make the HTTP get() requests.
import requests

#To create and work with DataFrame
import pandas as pd

#Library to scrape the HTML content
from bs4 import BeautifulSoup

#Importing time library, (to be used much later)
import time

#Importing the regular expression library
import re
from re import search

In [None]:
#Setting the main url
main_url = "https://store.steampowered.com/games/"

#To get a response object from the URL. The timeout keeps the cell from
#hanging forever if the server never answers.
response = requests.get(main_url, timeout=30)

#Fail fast on an HTTP error status (4xx/5xx) instead of silently parsing an
#error page, which would leave the later cells with empty results.
response.raise_for_status()

#Return the response object(html) as text
website_html = response.text

#Creating a BeautifulSoup object, and using 'html.parser' to work with html text.
soup = BeautifulSoup(website_html, 'html.parser')

In [None]:
"""Getting all the names of the games and their respective links present on 
the main_url, However it contains all the html tags present in between"""

# Collect every game-title <div> and every game-link <a> found on the main page.
# Both results hold bs4 tags (markup included), not plain strings.
names_get = soup.find_all("div", class_="tab_item_name")
links_get = soup.find_all("a", class_="tab_item")

In [None]:
#Creating empty lists of names and links
names = []
links = []

#Several links on the main page are for bundles and do not contain
#reviews and other data, so they are filtered out.
#These bundles are not individual games, rather a bundle of a game's new
#and previous versions along with MODs and packs.
#For individual games ".com/app" is present in the link.
substr = ".com/app"

#Keep the names and links of individual games only.
#A plain substring test on the href is used rather than a regex search over
#the whole tag markup: in a regex the "." would match any character, and
#markup other than the link target could produce a false match.
for name, link in zip(names_get, links_get):
  #A tag without an href is treated as "not a game link"
  href = link.get("href") or ""
  if substr in href:
    names.append(name.getText())
    links.append(href)

In [None]:
# Per the project requirements only the first five games are kept.
top_5_names, top_5_links = names[:5], links[:5]

In [None]:
#Creating empty lists for number_of_positive_reviews, developers and publishers
num_pos_rev = []
developers = []
publishers = []

#game_req maps each game name to its own requirements dictionary.
#requirements holds the dictionary of the game currently being processed
#and is rebuilt from scratch for every game.
requirements = {}
game_req = {}


def _link_text(tag):
  """Return the text of the first <a> inside tag.

  Returns "" when tag is None or contains no link, so that games with
  missing data still get an (empty) entry."""
  anchor = tag.find("a") if tag is not None else None
  return anchor.getText() if anchor is not None else ""


def _review_count(game_soup):
  """Return the digits of the first span.responsive_hidden as a string.

  The number is enclosed in parentheses on the page, so only the digit
  characters are kept. Returns "" when the span is missing."""
  rev_loc = game_soup.find(name = "span", class_ = "responsive_hidden")
  if rev_loc is None:
    return ""
  return "".join(ch for ch in rev_loc.getText() if ch.isdigit())


def _publisher_row(game_soup):
  """Return the div.dev_row that holds the publisher, or None.

  The first div.dev_row on the page is not necessarily the publisher row,
  so the row whose label mentions "publisher" is preferred.
  NOTE(review): assumes each row carries a div.subtitle label and that the
  page is served in English - confirm against the live markup. When no
  labelled row is found, the first div.dev_row is used as a fallback."""
  dev_rows = game_soup.find_all(name = "div", class_ = "dev_row")
  for row in dev_rows:
    label = row.find(name = "div", class_ = "subtitle")
    if label is not None and "publisher" in label.getText().lower():
      return row
  return dev_rows[0] if dev_rows else None


def _parse_requirements(game_soup):
  """Return {category: value} read from the first <ul> in div.sysreq_contents.

  Every game lists different categories (OS, DirectX, sound card,
  additional notes, ...). Each <li> is split at its first ":" only, so a
  value that itself contains ":" is kept whole; an item without ":" gets
  "" as its value. Returns {} when the page has no requirements list."""
  parsed = {}
  req_loc = game_soup.find(name = "div", class_ = "sysreq_contents")
  req_find = req_loc.find("ul") if req_loc is not None else None
  if req_find is None:
    return parsed
  for req_text in req_find.find_all("li"):
    category, _sep, value = req_text.text.partition(":")
    parsed[category] = value
  return parsed


for game_name, link in zip(top_5_names, top_5_links):

  #Opening each link in the top_5_links and creating a
  #BS object for each of the links
  game_response = requests.get(link)
  game_page_html = game_response.text
  game_soup = BeautifulSoup(game_page_html, 'html.parser')

  #Number of positive reviews ("" when absent)
  num_pos_rev.append(_review_count(game_soup))

  #Developer name ("" when absent)
  dev_loc = game_soup.find(name = "div", class_ = "summary column", id = "developers_list")
  developers.append(_link_text(dev_loc))

  #Publisher name ("" when absent)
  publishers.append(_link_text(_publisher_row(game_soup)))

  #Requirements: a fresh dictionary per game, so a game without a
  #requirements list never inherits the values of the previous game
  requirements = _parse_requirements(game_soup)

  #Updating requirements for each game into game_req dictionary
  game_req[game_name] = requirements

In [None]:
"""Creating an empty dataframe and populating it with the data
scraped till now"""
# Start from an empty frame and attach the scraped lists as columns,
# in the order they should appear.
temp_data_1 = pd.DataFrame().assign(**{
    "Name": top_5_names,
    "Link": top_5_links,
    "Developer": developers,
    "Publisher": publishers,
    "Number of Positive Reviews": num_pos_rev,
})

In [None]:
"""Created a dataframe for game_req and saved its transpose in game_req_df.
Transposed the dataframe to be able to merge with the previously created 
dataframe(temp_data_1)"""
# One row per game: transpose, then move the game-name index into a
# regular "Name" column so it can serve as the merge key.
game_req_df = (
    pd.DataFrame(game_req)
    .T
    .reset_index()
    .rename(columns={"index": "Name"})
)

# Left-join the requirements onto temp_data_1; the result is temp_data_2
temp_data_2 = pd.merge(temp_data_1, game_req_df, how="left", on="Name")

In [None]:
#Data scraped so far (name, link, developer, publisher, review count and
#requirements), without the review texts
temp_data_2

Unnamed: 0,Name,Link,Developer,Publisher,Number of Positive Reviews,OS,Processor,Memory,Graphics,DirectX,Network,Storage,Sound Card,Additional Notes
0,Hunter's Arena: Legends,https://store.steampowered.com/app/1061100/Hun...,Mantisco,Mantisco,28,"64-bit Windows 7, Windows 8.1, Windows 10",Intel Core i5-4430 / AMD FX-6300,8 GB RAM,NVIDIA GeForce GTX 770 / AMD Radeon R7 370 2GB,Version 11,Broadband Internet connection,30 GB available space,,
1,Olaguna Chronicles,https://store.steampowered.com/app/985650/Olag...,SELeft Studio,SELeft Studio,43,WIN7/WIN10,Pentium Dual Core级以上,2 GB RAM,Geforce FX5600级/ATI Radeon9600以上 (支持Shader 2....,Version 9.0c,,700 MB available space,DirectX 可互换声卡,WINDOWS XP以下不能运行（含XP)
2,Who Stole My Beard?,https://store.steampowered.com/app/1141270/Who...,Cleardot Games,Cleardot Games,10,7,Intel 3 or higher,1024 MB RAM,Basic (2GB or higher),,,1024 MB available space,16-bit,
3,Creatures of Aether,https://store.steampowered.com/app/1593750/Cre...,Tako Boy Studios,Tako Boy Studios,88,Windows 7 / 8 / 10,2.0 Ghz,512 MB RAM,512 MB Video Memory,Version 9.0c,Broadband Internet connection,400 MB available space,,
4,GRIME,https://store.steampowered.com/app/1123050/GRI...,Clover Bite,Clover Bite,191,Windows 7,Intel i5 3470 or AMD equivelent,4 GB RAM,nVidia GeForce 960 or AMD equivelent,Version 10,,10 GB available space,,


In [None]:
#Installing selenium, chromium and its driver
"""The reason for using selenium is that scraping the reviews is not
possible using BeautifulSoup only, as BS can only get static content from the 
websites. On the otherhand, Selenium is provides a way to scrape the dynamic 
content. Here reviews are pulled from the profiles of each user and displayed 
on the game page"""
!apt update
!apt install chromium-chromedriver
!pip install selenium

from selenium import webdriver

# Setting option to headless to be able to use from Colab
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

[33m0% [Working][0m            Ign:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
[33m0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (91.18[0m                                                                               Ign:2 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
[33m0% [Connecting to archive.ubuntu.com] [Connecting to security.ubuntu.com (91.18[0m                                                                               Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [697 B]
Hit:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Get:6 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Get:7 http://security.ubuntu.com/ubuntu 

In [None]:
#To run locally, set executable_path to the path of chrome driver on local machine
#Creates the Chrome webdriver using the `options` object configured earlier.
driver = webdriver.Chrome(options=options)

In [None]:
#Creating a dictionary to contain reviews for each game
game_reviews = {}
for name, link in zip(top_5_names,top_5_links):

  #Using selenium driver to obtain the webpage content
  driver.get(link)

  #Scroll to the end of the document so that the reviews get loaded, then
  #wait: loading them takes a bit of time.
  driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
  time.sleep(5)

  #Creating a BS object from the rendered page source. The parser is named
  #explicitly so the result does not depend on which parsers happen to be
  #installed, and a separate variable keeps the main-page `soup` intact.
  review_soup = BeautifulSoup(driver.page_source, 'html.parser')

  #Finding the reviews that are located in review_box; only the first 10
  #boxes are used.
  reviews = []
  for box in review_soup.find_all(name = "div", class_ = "review_box")[:10]:
    rev = box.find(name = "div", class_ = "content")
    if rev is None:
      #A review box without a content div is recorded as "Null"
      reviews.append("Null")
    else:
      reviews.append(rev.text.replace("\t","").replace("\n",""))

  #Updating reviews for each game in the game_reviews dictionary
  game_reviews[name] = reviews

In [None]:
"""Since some games contain less than 10 reviews, 
 Null is imputed for the rest, to make dataframe creation easy.
For example, if a game has 4 reviews, six Nulls are imputed, and so on."""

# Top up every review list in place so that each one ends with 10 entries.
# A non-positive shortfall multiplies to an empty list, i.e. nothing is added.
for key, val in game_reviews.items():
  missing_num = 10 - len(val)
  val.extend(["Null"] * missing_num)

In [None]:
"""Creating a dataframe temp_data_3 which contains reviews for each 
game. Each review constitutes one column."""
# Positional column labels 0..9 become "Review 1".."Review 10"
review_columns = {position: "Review " + str(position + 1) for position in range(10)}

# One row per game, with the game name moved from the index into "Name"
temp_data_3 = (
    pd.DataFrame(game_reviews)
    .T
    .reset_index()
    .rename(columns={"index": "Name", **review_columns})
)

In [None]:
"""The dataframe created in previous step is merged with temp_data_2
which was created earlier containing the rest of the required data.
The 'data' dataframe is the final dataframe required"""
# Left-join the review columns onto the rest of the scraped data via "Name"
data = pd.merge(temp_data_2, temp_data_3, how="left", on="Name")

In [None]:
"""If "Requires a 64-bit processor and operating system" is present in the columns,
it is an additional requirement for some games, If a game has no such requirement,
NaN is present in the rows for such games. For games which have this requirement,
there is an empty cell because this is only a text written in the requirement
box with no value. In the data cleaning steps, we can replace empty cell with 1
and NaNs with 0s"""
# Show the final merged dataframe as this cell's output
data

Unnamed: 0,Name,Link,Developer,Publisher,Number of Positive Reviews,OS,Processor,Memory,Graphics,DirectX,Network,Storage,Sound Card,Additional Notes,Review 1,Review 2,Review 3,Review 4,Review 5,Review 6,Review 7,Review 8,Review 9,Review 10
0,Hunter's Arena: Legends,https://store.steampowered.com/app/1061100/Hun...,Mantisco,Mantisco,28,"64-bit Windows 7, Windows 8.1, Windows 10",Intel Core i5-4430 / AMD FX-6300,8 GB RAM,NVIDIA GeForce GTX 770 / AMD Radeon R7 370 2GB,Version 11,Broadband Internet connection,30 GB available space,,,"Needs more players, but the actual game is fun...",They need OCE servers so its playable for us,"When I queued for 10 mins and found no match, ...","Don't bother, no player base, you can't get a ...",Played the beta on PS4. Thought the game was a...,"If only this game had players, an also the sam...",Game's combat is currently better than Naraka:...,"0 players, developpers got kidnapped maybe ?",people must be on some next level hive mind ♥♥...,Good Games Yes Yes
1,Olaguna Chronicles,https://store.steampowered.com/app/985650/Olag...,SELeft Studio,SELeft Studio,43,WIN7/WIN10,Pentium Dual Core级以上,2 GB RAM,Geforce FX5600级/ATI Radeon9600以上 (支持Shader 2....,Version 9.0c,,700 MB available space,DirectX 可互换声卡,WINDOWS XP以下不能运行（含XP),Null,Null,Null,Null,Null,Null,Null,Null,Null,Null
2,Who Stole My Beard?,https://store.steampowered.com/app/1141270/Who...,Cleardot Games,Cleardot Games,10,7,Intel 3 or higher,1024 MB RAM,Basic (2GB or higher),,,1024 MB available space,16-bit,,Fun game with great artwork and bubbly music t...,A friend of mine recommended this game to me d...,This game has been so much fun. Very reminisce...,Been playing this game nonstop for over an hou...,I love how this game combines nostalgia and qu...,Fun game with a real indie vibe - tones of Sta...,Beautiful nostalgic retro RPG game with charm ...,Started playing this and the first thing I not...,"I just tried this out because I like RGPSs, an...","Still playing it, but so far it's been the bes..."
3,Creatures of Aether,https://store.steampowered.com/app/1593750/Cre...,Tako Boy Studios,Tako Boy Studios,88,Windows 7 / 8 / 10,2.0 Ghz,512 MB RAM,512 MB Video Memory,Version 9.0c,Broadband Internet connection,400 MB available space,,,"I wanted to like this game, because I love the...",furry hearthstone,funny card game,played this on mobile a long while back and re...,Have you ever ACTUALLY tried to play Triple Tr...,A fast paced card game featuring some of the b...,A surprisingly complex and fun card game for f...,I've been playing this game since the very fir...,The game is pay to win paired with the small p...,Leans toward paying players with the level up ...
4,GRIME,https://store.steampowered.com/app/1123050/GRI...,Clover Bite,Clover Bite,191,Windows 7,Intel i5 3470 or AMD equivelent,4 GB RAM,nVidia GeForce 960 or AMD equivelent,Version 10,,10 GB available space,,,I remember when the word Metroidvania actually...,"Ignore other reviews, this is 100% a MV, its o...","So far, so very good. (sidenote: if you're not...","An incredible surprise!Admittedly, I was not e...",This game rocks.,This review is further detailed in the video b...,This is one of my favorite games in recent mem...,Great game so far. Reminds me of Hollow Knight...,This game is the Hellpoint of 2021. No one saw...,"Like a ♥♥♥♥♥♥♥♥, less interesting version of H..."


In [None]:
# Save the final dataframe as CSV. to_csv also writes the row index as the
# first column by default.
# NOTE(review): "/content" looks like the Colab working directory - adjust the
# path when running elsewhere.
data.to_csv("/content/game_data.csv")

In [None]:
"""Additional Notes:
The code is scalable and I've successfully scraped the data for all the 
games (around 60) on the page, we can use a loop to iterate through more 
pages."""


"Additional Notes:\nThe code is scalable and I've successfully scraped the data for all the \ngames (around 60) on the page, we can use a loop to iterate through more \npages."