In [1]:
import requests as re
import json as json 
import os as os
from bs4 import BeautifulSoup as bs 
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
import pandas as pd
import time

**Notes**

- To cover PS VR2 articles dating back about one year, scrape the first N = 9 pages of the category listing.

In [3]:
def title_scraper(page_number, category='playstation-vr2', timeout=30):
    """Scrape the article cards from one listing page of a PlayStation.Blog category.

    Args:
        page_number: Page number inserted into the listing URL.
        category: Category slug inserted into the listing URL. The default
            reproduces the original PS VR2 listing.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list with one dict per ``div.post-card__content`` element, keyed by
        'Title', 'Writer', 'Date Written', 'Number of Comments',
        'Number of Likes' and 'Link to Article'. Missing text fields and
        links are ``None``; missing comment/like counts are the string '0'.
        An empty list is returned (after printing a warning) when the server
        answers with an error status.

    Raises:
        Whatever ``requests.get`` raises, e.g. on a timeout or connection
        failure.
    """
    url = f'https://blog.playstation.com/category/{category}/page/{page_number}/'
    # `re` is this notebook's alias for `requests`, not the stdlib regex module.
    # Without a timeout a stalled connection would block the scrape forever.
    response = re.get(url, timeout=timeout)
    if not response.ok:
        # Do not parse an error page as if it were a listing.
        print(f"Skipping page {page_number}: HTTP {response.status_code} for {url}")
        return []

    soup = bs(response.content, 'html.parser')

    def text_of(card, tag, css_class, default=None):
        # Stripped text of the first matching child, or `default` when absent.
        element = card.find(tag, class_=css_class)
        return element.get_text(strip=True) if element is not None else default

    data = []
    for article in soup.find_all('div', class_='post-card__content'):
        link_element = article.find('a', class_='post-card__title-link')
        data.append({
            'Title': text_of(article, 'h2', 'post-card__title'),
            'Writer': text_of(article, 'p', 'post-card__author-name'),
            'Date Written': text_of(article, 'span', 'post-card__meta-date'),
            'Number of Comments': text_of(article, 'span', 'post-card__meta-comments-count', '0'),
            'Number of Likes': text_of(article, 'span', 'post-card__likes', '0'),
            # .get() tolerates an anchor that has no href attribute.
            'Link to Article': link_element.get('href') if link_element is not None else None,
        })
    return data

# Number of listing pages to walk; adjust by hand to widen or narrow the scrape.
N = 9

all_data = []
for page in range(1, N + 1):
    print(f"Scraping page {page}...")
    # Each call returns that page's list of article dicts.
    all_data += title_scraper(page)

# One row per article card collected above.
df = pd.DataFrame(all_data)
print(df)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
                                                 Title  \
0    Seasons greetings 2023 from PlayStation.Blog a...   
1       Cooking Simulator VR launches Dec 15 on PS VR2   
2    PS Blog Game of the Year Awards 2023: voting i...   
3    Resident Evil 4 VR Mode interview: new gamepla...   
4        Among Us VR launches on PlayStation VR2 today   
..                                                 ...   
103  Build and manage your dream city in Cities: VR...   
104  The Last Clockwinder brings clever automation ...   
105   Kayak VR: Mirage joins the PS VR2 launch line-up   
106  PlayStation VR2: 13 new titles and launch line...   
107                    PlayStation in 2023: Highlights   

                                                Writer  \
0    O’Dell Harmon Jr. (he/him)Specialist, Content ...   
1            Kac

In [4]:
def _extract_comments(page_source, link):
    """Parse rendered article HTML into one record per ``li.comment`` element."""
    soup = bs(page_source, 'html.parser')
    records = []
    for comment in soup.find_all('li', class_='comment'):
        author_element = comment.find('span', class_='comment-author')
        timestamp_link = comment.find('a', class_='comment-permalink')
        # NOTE(review): find_all('p') would also match paragraphs of any reply
        # nested inside this <li>; confirm against the page markup.
        body_parts = comment.find_all('p')

        records.append({
            'Author': author_element.get_text(strip=True) if author_element else None,
            'Timestamp': timestamp_link.get_text(strip=True) if timestamp_link else None,
            'Comment': ' '.join(p.get_text(strip=True) for p in body_parts) if body_parts else None,
            'Article Link': link
        })
    return records


def comments_spider(df):
    """Collect the reader comments of every article listed in ``df``.

    Opens each URL from the 'Link to Article' column in a Chrome WebDriver,
    hovers the comments loading indicator, scrolls the page, then parses the
    rendered HTML.

    Args:
        df: DataFrame with a 'Link to Article' column of article URLs. Rows
            whose link is missing (None/NaN) are skipped with a message.

    Returns:
        A DataFrame with one row per comment and the columns 'Author',
        'Timestamp', 'Comment' and 'Article Link'.

    Raises:
        Any WebDriver error raised while loading or scrolling a page; the
        browser is closed before the error propagates.
    """
    comments_data = []
    driver = webdriver.Chrome()

    # try/finally guarantees the browser process is shut down even when a
    # page load fails part-way through the list.
    try:
        for index, link in enumerate(df['Link to Article'], start=1):
            if pd.isna(link):
                # Articles scraped without a link cannot be opened.
                print(f"Skipping article {index}/{len(df)}: no link available")
                continue

            print(f"Processing article {index}/{len(df)}: {link}")
            driver.get(link)

            # Best effort: hover the loading indicator inside the comments area.
            try:
                comments_area = driver.find_element(By.CLASS_NAME, "comments__area")
                loading_indicator = comments_area.find_element(By.CLASS_NAME, "loading-indicator")
                actions = ActionChains(driver)
                actions.move_to_element(loading_indicator).perform()
                time.sleep(5)
            except Exception as e:
                print(f"Loading indicator in comments area not found or hover action failed: {e}")

            # Scroll in 1000px steps until the document height stops changing.
            # NOTE(review): this stops as soon as one step leaves the height
            # unchanged, which may be before the bottom of the page is
            # reached; confirm this is enough to load all comments.
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                driver.execute_script("window.scrollBy(0, 1000);")
                time.sleep(3)

                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            records = _extract_comments(driver.page_source, link)
            print(f"Found {len(records)} comments in this article.")
            comments_data.extend(records)
    finally:
        driver.quit()

    return pd.DataFrame(comments_data)

# Visit every article listed in df with Selenium and gather the comments into one DataFrame.
comments_df = comments_spider(df)

# Plain-text dump of the scraped comments.
print(comments_df)

Processing article 1/108: https://blog.playstation.com/2023/12/15/seasons-greetings-2023-from-playstation-blog-and-friends/
Found 17 comments in this article.
Processing article 2/108: https://blog.playstation.com/2023/12/14/cooking-simulator-vr-launches-dec-15-on-ps-vr2/
Found 10 comments in this article.
Processing article 3/108: https://blog.playstation.com/2023/12/13/ps-blog-game-of-the-year-awards-2023-voting-is-now-open/
Found 69 comments in this article.
Processing article 4/108: https://blog.playstation.com/2023/12/06/resident-evil-4-vr-mode-interview-new-gameplay-features-learnings-from-re-village-and-more/
Found 22 comments in this article.
Processing article 5/108: https://blog.playstation.com/2023/12/05/among-us-vr-launches-on-playstation-vr2-today/
Found 3 comments in this article.
Processing article 6/108: https://blog.playstation.com/2023/12/05/how-arizona-sunshine-2-brings-its-arsenal-and-buddy-the-dog-to-life-with-ps-vr2-out-dec-7/
Found 2 comments in this article.
Pro

In [5]:
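# Write the scraped comments to the CSV file that the next cell reads back.
# Left commented out; uncomment to (re)generate the file.
# to_csv also writes the DataFrame index, which is why the file read back has an 'Unnamed: 0' column.
# comments_df.to_csv('RAW_psvr2_blog_comments.csv')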

In [6]:
# Sanity check: load the exported CSV back and show its first ten rows and its shape.

raw_comments_df = pd.read_csv('RAW_psvr2_blog_comments.csv')

print(raw_comments_df.head(10), raw_comments_df.shape, sep='\n')

   Unnamed: 0            Author                          Timestamp  \
0           0            cusman   December 15, 2023 at 8:26 am PST   
1           1         Valyrious   December 15, 2023 at 8:36 am PST   
2           2        Ravenous74   December 15, 2023 at 9:11 am PST   
3           3    ThePotatoElite   December 15, 2023 at 9:20 am PST   
4           4  The_0neAbove_AIl  December 15, 2023 at 10:19 am PST   
5           5    X---Subtle---X   December 15, 2023 at 1:21 pm PST   
6           6           yentair   December 15, 2023 at 2:53 pm PST   
7           7   namelessking627   December 15, 2023 at 3:52 pm PST   
8           8   namelessking627   December 15, 2023 at 3:53 pm PST   
9           9        babaykasus   December 15, 2023 at 5:22 pm PST   

                                             Comment  \
0  Wow, so many. Happy Holidays to all of you rig...   
1  Rain Code huh? Is that a bit of an oopsy, Spik...   
2  Merry Christmas/Happy Holidays to everyone at ...   
3    