# Having fun with Web Scraping 🍹

In [2]:
import requests as re
import json as json 
import os as os
from bs4 import BeautifulSoup as bs 
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
import pandas as pd
import time

# Initial feasibility test and fun 😜

In [3]:
url = "https://blog.playstation.com"

In [4]:
def get_blog_posts(url):
    response = re.get(url)
    soup = bs(response.content, 'html.parser')
    posts = soup.find_all('a', class_='post-card__title-link')
    titles = [post.get_text(strip=True) for post in posts]
    return titles

In [5]:
posts_titles = get_blog_posts(url)
for title in posts_titles:
    print(title)

PS.Blog Game of the Year 2023: The Winners
The Holiday Sale promotion comes to PlayStation Store
Seasons greetings 2023 from PlayStation.Blog and friends
PlayStation 2023 Wrap-Up launches today, with a personalized look at your 2023 gaming achievements
PlayStation Plus Game Catalog for December: Grand Theft Auto V, Stranger of Paradise: Final Fantasy Origin, Metal: Hellsinger and more
Tekken 8: hands-on with Arcade Quest, Super Ghost Battles, and Tekken Ball
Alan Wake 2 gets New Game Plus mode titled The Final Draft, available today
Cooking Simulator VR launches Dec 15 on PS VR2
The new PS5 owners’ guide to great gaming experiences

Shuhei Yoshida’s favorite PlayStation indie games of 2023
How Suicide Squad: Kill the Justice League loadout options enable customizable chaos


Share of the Week: Cyberpunk 2077
Ronald D. Moore interview: an alternate reality Q&A For All Mankind

PS Blog Game of the Year Awards 2023: voting is now open


Play with the Champions finals stream this week – tu

# The good stuff here 😉

            The scrape page function opens the PS5 blog site and iterates over the pages to extract information from each article posted. 

            Manually adjust N to scrape more pages. 

**Notes**

- PS5 URL for articles dating 1 year back is N = 36

In [6]:
def title_scraper(page_number):
    url = f'https://blog.playstation.com/category/ps5/page/{page_number}/'
    response = re.get(url)
    soup = bs(response.content, 'html.parser')

    articles = soup.find_all('div', class_='post-card__content')
    data = []
    for article in articles:
        title_element = article.find('h2', class_='post-card__title')
        writer_element = article.find('p', class_='post-card__author-name')
        date_element = article.find('span', class_='post-card__meta-date')
        comments_element = article.find('span', class_='post-card__meta-comments-count')
        likes_element = article.find('span', class_='post-card__likes')
        link_element = article.find('a', class_='post-card__title-link')

        title = title_element.get_text(strip=True) if title_element else None
        writer = writer_element.get_text(strip=True) if writer_element else None
        date = date_element.get_text(strip=True) if date_element else None
        comments = comments_element.get_text(strip=True) if comments_element else '0'
        likes = likes_element.get_text(strip=True) if likes_element else '0'
        link = link_element['href'] if link_element else None
        
        data.append({'Title': title, 'Writer': writer, 'Date Written': date, 
                     'Number of Comments': comments, 'Number of Likes': likes, 
                     'Link to Article': link})
    return data

N = 36  # Manually adjust number of pages to scrape 🥷
all_data = []
for page in range(1, N + 1):
    print(f"Scraping page {page}...")
    all_data.extend(title_scraper(page))

df = pd.DataFrame(all_data)

print(df)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
                                                 Title  \
0    The new PS5 owners’ guide to great gaming expe...   
1    How Suicide Squad: Kill the Justice League loa...   
2    Seasons greetings 2023 from PlayStation.Blog a...   
3    PS Blog Game of the Year Awards 2023: voting i...   

In [7]:
def comments_spider(df):
    comments_data = []
    driver = webdriver.Chrome()  

    for index, link in enumerate(df['Link to Article'], start=1):
        print(f"Processing article {index}/{len(df)}: {link}")
        driver.get(link)

        try:
            comments_area = driver.find_element(By.CLASS_NAME, "comments__area")
            loading_indicator = comments_area.find_element(By.CLASS_NAME, "loading-indicator")
            actions = ActionChains(driver)
            actions.move_to_element(loading_indicator).perform()
            time.sleep(5)  
        except Exception as e:
            print(f"Loading indicator in comments area not found or hover action failed: {e}")

        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollBy(0, 1000);") 
            time.sleep(3)  

            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        soup = bs(driver.page_source, 'html.parser')
        comments = soup.find_all('li', class_='comment')
        print(f"Found {len(comments)} comments in this article.")

        for comment in comments:
            author = comment.find('span', class_='comment-author').get_text(strip=True) if comment.find('span', class_='comment-author') else None
            timestamp_link = comment.find('a', class_='comment-permalink')
            timestamp = timestamp_link.get_text(strip=True) if timestamp_link else None
            body_parts = comment.find_all('p')
            body = ' '.join(p.get_text(strip=True) for p in body_parts) if body_parts else None

            comments_data.append({
                'Author': author,
                'Timestamp': timestamp,
                'Comment': body,
                'Article Link': link
            })

    driver.quit()
    return pd.DataFrame(comments_data)

comments_df = comments_spider(df)

print(comments_df)

Processing article 1/432: https://blog.playstation.com/2023/12/26/the-new-ps5-owners-guide-to-great-gaming-experiences/
Loading indicator in comments area not found or hover action failed: Message: no such element: Unable to locate element: {"method":"css selector","selector":".loading-indicator"}
  (Session info: chrome=120.0.6099.129); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x00000001050c84dc chromedriver + 4162780
1   chromedriver                        0x00000001050c0664 chromedriver + 4130404
2   chromedriver                        0x0000000104d17bc0 chromedriver + 293824
3   chromedriver                        0x0000000104d5e040 chromedriver + 581696
4   chromedriver                        0x0000000104d53250 chromedriver + 537168
5   chromedriver                        0x0000000104d9dab0 chromedriver + 842416
6   chromedrive

In [8]:
# turn df into csv (uncomment to run)
# comments_df.to_csv('RAW_ps5_blog_comments.csv')

In [10]:
# testing the output csv file 

raw_comments_df = pd.read_csv('RAW_ps5_blog_comments.csv')

print(raw_comments_df.head(10))

   Unnamed: 0           Author                          Timestamp  \
0           0     x-rogue_55-x  December 20, 2023 at 11:38 am PST   
1           1  namelessking627  December 20, 2023 at 12:42 pm PST   
2           2           xDD90x   December 20, 2023 at 1:35 pm PST   
3           3         RasoGOVI   December 20, 2023 at 5:16 pm PST   
4           4        unknownfa   December 21, 2023 at 9:07 pm PST   
5           5       CB7Tuner91   December 25, 2023 at 9:33 pm PST   
6           6           cusman   December 15, 2023 at 8:26 am PST   
7           7        Valyrious   December 15, 2023 at 8:36 am PST   
8           8       Ravenous74   December 15, 2023 at 9:11 am PST   
9           9   ThePotatoElite   December 15, 2023 at 9:20 am PST   

                                             Comment  \
0  So hype for this to release next year, my favo...   
1  Yeaaha Deadshot  slaps harder than my dad afte...   
2  The gameplay looks fun and I have faith in Roc...   
3  This has offi