In [1]:
import requests as re
import json as json 
import os as os
from bs4 import BeautifulSoup as bs 
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
import pandas as pd
import time

**Notes**

- PS Store URL for articles dating 1 year back is N = 3 but doing 4 to not miss out on some hanging fruit from early jan 23 and late dec 22

In [2]:
def title_scraper(page_number):
    url = f'https://blog.playstation.com/category/ps-plus/page/{page_number}/'
    response = re.get(url)
    soup = bs(response.content, 'html.parser')

    articles = soup.find_all('div', class_='post-card__content')
    data = []
    for article in articles:
        title_element = article.find('h2', class_='post-card__title')
        writer_element = article.find('p', class_='post-card__author-name')
        date_element = article.find('span', class_='post-card__meta-date')
        comments_element = article.find('span', class_='post-card__meta-comments-count')
        likes_element = article.find('span', class_='post-card__likes')
        link_element = article.find('a', class_='post-card__title-link')

        title = title_element.get_text(strip=True) if title_element else None
        writer = writer_element.get_text(strip=True) if writer_element else None
        date = date_element.get_text(strip=True) if date_element else None
        comments = comments_element.get_text(strip=True) if comments_element else '0'
        likes = likes_element.get_text(strip=True) if likes_element else '0'
        link = link_element['href'] if link_element else None
        
        data.append({'Title': title, 'Writer': writer, 'Date Written': date, 
                     'Number of Comments': comments, 'Number of Likes': likes, 
                     'Link to Article': link})
    return data

N = 4  # Manually adjust number of pages to scrape 🥷
all_data = []
for page in range(1, N + 1):
    print(f"Scraping page {page}...")
    all_data.extend(title_scraper(page))

df = pd.DataFrame(all_data)

print(df)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
                                                Title  \
0   PlayStation Plus Game Catalog for December: Gr...   
1   PlayStation Plus Monthly Games for December: L...   
2   PlayStation Plus Game Catalog for November: Te...   
3   PlayStation Plus Monthly Games for November – ...   
4   PS5 Streaming for PlayStation Plus Premium mem...   
5   PlayStation Plus Game Catalog for October: Got...   
6   PS5 cloud streaming launches this month for Pl...   
7   PlayStation Plus Monthly Games for October: Th...   
8   PlayStation Plus Game Catalog for September: N...   
9   PlayStation Plus Monthly Games for September: ...   
10  PlayStation Plus Game Catalog for August: Sea ...   
11  PlayStation Plus Monthly Games for August: PGA...   
12  PlayStation Plus Game Catalog  & Classics for ...   
13  PlayStation Plus Monthly Games for July: Call ...   
14  PlayStation Plus Game Catalog for June + PS5 G...   
15  11 unmis

In [5]:
def comments_spider(df):
    comments_data = []
    driver = webdriver.Chrome()  

    for index, link in enumerate(df['Link to Article'], start=1):
        print(f"Processing article {index}/{len(df)}: {link}")
        driver.get(link)

        try:
            comments_area = driver.find_element(By.CLASS_NAME, "comments__area")
            loading_indicator = comments_area.find_element(By.CLASS_NAME, "loading-indicator")
            actions = ActionChains(driver)
            actions.move_to_element(loading_indicator).perform()
            time.sleep(5)  
        except Exception as e:
            print(f"Loading indicator in comments area not found or hover action failed: {e}")

        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollBy(0, 1500);") 
            time.sleep(3)  

            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        soup = bs(driver.page_source, 'html.parser')
        comments = soup.find_all('li', class_='comment')
        print(f"Found {len(comments)} comments in this article.")

        for comment in comments:
            author = comment.find('span', class_='comment-author').get_text(strip=True) if comment.find('span', class_='comment-author') else None
            timestamp_link = comment.find('a', class_='comment-permalink')
            timestamp = timestamp_link.get_text(strip=True) if timestamp_link else None
            body_parts = comment.find_all('p')
            body = ' '.join(p.get_text(strip=True) for p in body_parts) if body_parts else None

            comments_data.append({
                'Author': author,
                'Timestamp': timestamp,
                'Comment': body,
                'Article Link': link
            })

    driver.quit()
    return pd.DataFrame(comments_data)

comments_df = comments_spider(df)

print(comments_df)

Processing article 1/48: https://blog.playstation.com/2023/12/13/playstation-plus-game-catalog-for-december-grand-theft-auto-v-stranger-of-paradise-final-fantasy-origin-metal-hellsinger-and-more/
Found 78 comments in this article.
Processing article 2/48: https://blog.playstation.com/2023/11/29/playstation-plus-monthly-games-for-december-lego-2k-drive-powerwash-simulator-sable/
Found 109 comments in this article.
Processing article 3/48: https://blog.playstation.com/2023/11/15/playstation-plus-game-catalog-for-november-teardown-dragons-dogma-dark-arisen-superliminal-and-more/
Found 108 comments in this article.
Processing article 4/48: https://blog.playstation.com/2023/11/01/playstation-plus-monthly-games-for-november-mafia-ii-definitive-edition-dragon-ball-the-breakers-aliens-fireteam-elite/
Found 79 comments in this article.
Processing article 5/48: https://blog.playstation.com/2023/10/16/ps5-streaming-for-playstation-plus-premium-members-launches-starting-today-in-japan-europe-and-n

In [6]:
# turn df into csv (uncomment to run)
comments_df.to_csv('RAW_psplus_blog_comments.csv')

In [7]:
# testing the output csv file 

raw_comments_df = pd.read_csv('RAW_psplus_blog_comments.csv')

print(raw_comments_df.head(10))
print(raw_comments_df.shape)

   Unnamed: 0        Author                          Timestamp  \
0           0      engobruh   December 13, 2023 at 8:49 am PST   
1           1     Tim102597   December 13, 2023 at 8:53 am PST   
2           2    cyberpangu   December 13, 2023 at 8:54 am PST   
3           3     StingrayX   December 13, 2023 at 8:55 am PST   
4           4  Prosopopoico   December 13, 2023 at 8:56 am PST   
5           5    AceofVenum   December 13, 2023 at 9:02 am PST   
6           6   Sushi_Combo   December 13, 2023 at 9:17 am PST   
7           7    Orpheus79V   December 13, 2023 at 9:26 am PST   
8           8       Tusiczz  December 13, 2023 at 11:50 am PST   
9           9     FaZeRovix  December 13, 2023 at 12:59 pm PST   

                                             Comment  \
0                                  Finally, quality.   
1  Anyone else had buzz listing year star command...   
2  I would be happier if it was Ride 5 instead of...   
3                                       Solid mon