In [1]:
import requests as re
import json as json 
import os as os
from bs4 import BeautifulSoup as bs 
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
import pandas as pd
import time

**Notes**

- The PS Store category listing reaches articles dating one year back at N = 2 pages, but N = 3 is used so as not to miss low-hanging fruit from late Dec 2022 and early Jan 2023.

In [6]:
def title_scraper(page_number, timeout=30):
    """Scrape the article cards of one PS Store category listing page.

    Args:
        page_number: 1-based page index inserted into the listing URL.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list with one dict per 'post-card__content' element, using the
        keys 'Title', 'Writer', 'Date Written', 'Number of Comments',
        'Number of Likes' and 'Link to Article'. Missing text fields are
        None, missing comment/like counts are the string '0', and a missing
        link (or an anchor without an href) is None.

    Raises:
        Whatever the HTTP client raises on a timeout or connection error,
        and its HTTP error when the server answers with an error status.
    """
    url = f'https://blog.playstation.com/category/ps-store/page/{page_number}/'
    # NOTE: `re` is this file's alias for the `requests` package, not the
    # standard-library regex module.
    response = re.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty/partial result.
    response.raise_for_status()
    soup = bs(response.content, 'html.parser')

    def text_of(card, tag, css_class, default=None):
        """Return the stripped text of the first matching child, or `default`."""
        element = card.find(tag, class_=css_class)
        return element.get_text(strip=True) if element is not None else default

    data = []
    for article in soup.find_all('div', class_='post-card__content'):
        link_element = article.find('a', class_='post-card__title-link')
        data.append({
            'Title': text_of(article, 'h2', 'post-card__title'),
            'Writer': text_of(article, 'p', 'post-card__author-name'),
            'Date Written': text_of(article, 'span', 'post-card__meta-date'),
            'Number of Comments': text_of(
                article, 'span', 'post-card__meta-comments-count', default='0'),
            'Number of Likes': text_of(
                article, 'span', 'post-card__likes', default='0'),
            # .get() avoids a KeyError when the anchor carries no href.
            'Link to Article': (
                link_element.get('href') if link_element is not None else None),
        })
    return data

N = 3  # Manually adjust number of pages to scrape 🥷

# Gather the rows of every listing page (1..N) into one flat list.
all_data = []
for page in range(1, N + 1):
    print(f"Scraping page {page}...")
    page_rows = title_scraper(page)
    all_data += page_rows

# One row per scraped article card.
df = pd.DataFrame(all_data)

print(df)

Scraping page 1...
Scraping page 2...
Scraping page 3...
                                                Title  \
0   The Holiday Sale promotion comes to PlayStatio...   
1   PlayStation Indies promotion comes to PlayStat...   
2   Essential Picks promotion comes to PlayStation...   
3   Extended Play promotion comes to PlayStation S...   
4   Big Games Big deals promotion comes to PlaySta...   
5   PlayStation Plus Double Discounts promotion co...   
6            August Savings come to PlayStation Store   
7        PlayStation Store: July 2023’s top downloads   
8   Mid-Year Deals promotion comes to PlayStation ...   
9   PlayStation Plus Double Discounts promotion co...   
10         Days of Play 2023 sale kicks off on June 2   
11  The Planet of the Discounts promotion comes to...   
12  Big Games Big Deals promotion comes to PlaySta...   
13   May Savings promotion comes to PlayStation Store   
14                                                      
15      PlayStation Store: Marc

In [None]:
def _text_or_none(element):
    """Return the stripped text of a parsed element, or None when it is missing."""
    return element.get_text(strip=True) if element is not None else None


def _scroll_to_page_end(driver, step, pause):
    """Scroll down `step` pixels at a time until the page neither moves nor grows."""
    previous_state = None
    while True:
        driver.execute_script("window.scrollBy(0, arguments[0]);", step)
        time.sleep(pause)
        # [current scroll offset, total document height]
        state = driver.execute_script(
            "return [window.pageYOffset, document.body.scrollHeight];"
        )
        # Comparing the height alone would stop after the first step on any
        # page taller than one step; also require that the offset is stuck.
        if state == previous_state:
            break
        previous_state = state


def comments_spider(df, scroll_step=1000, scroll_pause=3, load_wait=5):
    """Scrape the comments of every article listed in `df`.

    Each article is opened in a Selenium-driven Chrome session. The function
    hovers over the loading indicator inside the comments area (presumably
    to trigger loading of the comments), scrolls to the end of the page in
    steps, and parses the rendered HTML for `li` elements of class 'comment'.

    Args:
        df: Table with a 'Link to Article' column of article URLs. Rows whose
            link is not a non-empty string (e.g. None/NaN) are skipped.
        scroll_step: Pixels scrolled per step.
        scroll_pause: Seconds to wait after each scroll step.
        load_wait: Seconds to wait after hovering over the loading indicator.

    Returns:
        A DataFrame with one row per comment and the columns 'Author',
        'Timestamp', 'Comment' and 'Article Link'. The columns are present
        even when no comment was found.

    Raises:
        Errors from navigation or script execution propagate to the caller;
        the browser session is closed before they do.
    """
    columns = ['Author', 'Timestamp', 'Comment', 'Article Link']
    comments_data = []
    driver = webdriver.Chrome()

    try:
        for index, link in enumerate(df['Link to Article'], start=1):
            if not isinstance(link, str) or not link.strip():
                print(f"Skipping article {index}/{len(df)}: no usable link ({link!r})")
                continue

            print(f"Processing article {index}/{len(df)}: {link}")
            driver.get(link)

            # Best effort: a missing indicator is reported but does not stop
            # the article from being parsed.
            try:
                comments_area = driver.find_element(By.CLASS_NAME, "comments__area")
                loading_indicator = comments_area.find_element(By.CLASS_NAME, "loading-indicator")
                ActionChains(driver).move_to_element(loading_indicator).perform()
                time.sleep(load_wait)
            except Exception as e:
                print(f"Loading indicator in comments area not found or hover action failed: {e}")

            _scroll_to_page_end(driver, scroll_step, scroll_pause)

            soup = bs(driver.page_source, 'html.parser')
            comments = soup.find_all('li', class_='comment')
            print(f"Found {len(comments)} comments in this article.")

            for comment in comments:
                body_parts = comment.find_all('p')
                comments_data.append({
                    'Author': _text_or_none(comment.find('span', class_='comment-author')),
                    'Timestamp': _text_or_none(comment.find('a', class_='comment-permalink')),
                    'Comment': (
                        ' '.join(p.get_text(strip=True) for p in body_parts)
                        if body_parts else None
                    ),
                    'Article Link': link,
                })
    finally:
        # Always release the browser, even when an article fails midway.
        driver.quit()

    return pd.DataFrame(comments_data, columns=columns)

# Collect the comments of every article gathered in `df`.
comments_df = comments_spider(df=df)
print(comments_df)

In [5]:
# Persist the scraped comments to CSV (this overwrites any existing file).
# index=False keeps the positional row index out of the file, so reading it
# back does not produce a stray 'Unnamed: 0' column.
comments_df.to_csv('RAW_psstore_blog_comments.csv', index=False)

In [None]:
# Sanity check: load the exported CSV back and inspect it.

raw_comments_df = pd.read_csv('RAW_psstore_blog_comments.csv')

# First ten rows, then the (rows, columns) shape.
preview = raw_comments_df.head(10)
print(preview)
print(raw_comments_df.shape)