# Having fun with Web Scraping 🍹

In [20]:
import requests as re
import json as json 
import os as os
from bs4 import BeautifulSoup as bs 
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
import pandas as pd
import time

In [2]:
# Landing page of the PlayStation Blog; passed to get_blog_posts() below.
url = 'https://blog.playstation.com'

In [3]:
def get_blog_posts(url, timeout=10):
    """Return the headline text of every post card linked from a blog page.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        A list of non-empty title strings, in document order.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with an HTTP error status.
    """
    # `re` is this notebook's alias for the requests package (not the regex module).
    response = re.get(url, timeout=timeout)
    # Fail loudly instead of silently parsing an error page.
    response.raise_for_status()

    soup = bs(response.content, 'html.parser')
    posts = soup.find_all('a', class_='post-card__title-link')
    titles = (post.get_text(strip=True) for post in posts)
    # Some matching anchors yield no text (the recorded run printed blank
    # lines), so drop empty strings rather than returning blank titles.
    return [title for title in titles if title]

In [4]:
# Fetch the headlines from the blog landing page and print them one per line.
posts_titles = get_blog_posts(url)
for title in posts_titles:
    print(title)

PS.Blog Game of the Year 2023: The Winners
The Holiday Sale promotion comes to PlayStation Store
Seasons greetings 2023 from PlayStation.Blog and friends
PlayStation 2023 Wrap-Up launches today, with a personalized look at your 2023 gaming achievements
PlayStation Plus Game Catalog for December: Grand Theft Auto V, Stranger of Paradise: Final Fantasy Origin, Metal: Hellsinger and more
Tekken 8: hands-on with Arcade Quest, Super Ghost Battles, and Tekken Ball
Alan Wake 2 gets New Game Plus mode titled The Final Draft, available today
Cooking Simulator VR launches Dec 15 on PS VR2


Share of the Week: Cyberpunk 2077
Ronald D. Moore interview: an alternate reality Q&A For All Mankind

PS Blog Game of the Year Awards 2023: voting is now open


Play with the Champions finals stream this week – tune in Dec 11 to 14

Share of the Week: Fortnite
PlayStation Store: November 2023’s top downloads
The Finals launches on PS5 today
Final Fantasy XVI: two new story DLCs announced, first launches toda

# Level 2: scraping article details and comments 😉

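The `scrape_page` function downloads one page of the PS5 category on the PlayStation Blog and extracts the title, writer, date, comment count, like count and link of each article listed on it. The loop below the function calls it for pages 1 to `N` and collects the results in a DataFrame.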

Manually adjust `N` to scrape more pages.

In [6]:
def _element_text(element, default=None):
    """Return the stripped text of a parsed element, or `default` if it is missing."""
    return element.get_text(strip=True) if element else default


def scrape_page(page_number, timeout=10):
    """Collect the metadata of every article card on one PS5 category page.

    Args:
        page_number: Page index inserted into the category URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one dict per article card, keyed by 'Title', 'Writer',
        'Date Written', 'Number of Comments', 'Number of Likes' and
        'Link to Article'. Missing text fields are None; missing comment and
        like counts default to the string '0'. The list is empty when the
        page has no article cards or does not exist (HTTP 404).

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with an HTTP error other than 404.
    """
    url = f'https://blog.playstation.com/category/ps5/page/{page_number}/'
    # `re` is this notebook's alias for the requests package.
    response = re.get(url, timeout=timeout)
    if response.status_code == 404:
        # Presumably page_number is past the last page; report "no articles"
        # so a caller looping over page numbers is not interrupted.
        return []
    # Any other HTTP error would otherwise be parsed as if it were a listing.
    response.raise_for_status()

    soup = bs(response.content, 'html.parser')
    articles = soup.find_all('div', class_='post-card__content')

    data = []
    for article in articles:
        link_element = article.find('a', class_='post-card__title-link')
        data.append({
            'Title': _element_text(article.find('h2', class_='post-card__title')),
            'Writer': _element_text(article.find('p', class_='post-card__author-name')),
            'Date Written': _element_text(article.find('span', class_='post-card__meta-date')),
            'Number of Comments': _element_text(
                article.find('span', class_='post-card__meta-comments-count'), default='0'),
            'Number of Likes': _element_text(
                article.find('span', class_='post-card__likes'), default='0'),
            # .get() avoids a KeyError on an anchor that has no href attribute.
            'Link to Article': link_element.get('href') if link_element else None,
        })
    return data

N = 3  # Number of category pages to scrape; adjust manually 🥷
# Accumulates the article dicts returned by scrape_page() for every page.
all_data = []
for page in range(1, N + 1):  # pages are requested starting from 1
    print(f"Scraping page {page}...")
    all_data.extend(scrape_page(page))

# One row per article; the columns are the dict keys built in scrape_page().
df = pd.DataFrame(all_data)

print(df)

Scraping page 1...
Scraping page 2...
Scraping page 3...
                                                Title  \
0   Seasons greetings 2023 from PlayStation.Blog a...   
1   PS Blog Game of the Year Awards 2023: voting i...   
2   Tekken 8: hands-on with Arcade Quest, Super Gh...   
3   Alan Wake 2 gets New Game Plus mode titled The...   
4                         Share of the Week: Fortnite   
5                    The Finals launches on PS5 today   
6   Final Fantasy XVI: two new story DLCs announce...   
7      Rise of the Ronin arrives only on PS5 March 22   
8   God of War Ragnarök: Valhalla DLC revealed, co...   
9   First hands-on details: Fortnite Festival, Roc...   
10  Expanding the Avatar franchise with Avatar: Fr...   
11  Celebrating inclusivity: Access controller for...   
12  The Last of Us Part II Remastered: Exploring t...   
13      The Access controller for PS5 starter’s guide   
14                        Share of the Week: Emotions   
15  Players’ Choice: Like a Dra

In [16]:
def _comment_record(comment, link):
    """Build the output row for one parsed comment element of the article at `link`."""
    author_element = comment.find('span', class_='comment-author')
    timestamp_link = comment.find('a', class_='comment-permalink')
    body_parts = comment.find_all('p')
    return {
        'Author': author_element.get_text(strip=True) if author_element else None,
        'Timestamp': timestamp_link.get_text(strip=True) if timestamp_link else None,
        'Comment': ' '.join(p.get_text(strip=True) for p in body_parts) if body_parts else None,
        'Article Link': link,
    }


def extract_comments_with_selenium(df, hover_wait=5, scroll_pause=3, scroll_step=1000):
    """Open every article in a Chrome browser and collect its comments.

    For each link the page is loaded, the element with class
    "loading-indicator" is hovered (best effort), the page is scrolled until
    its height stops changing, and every `<li class="comment">` in the
    resulting HTML is turned into a row.

    Args:
        df: DataFrame with a 'Link to Article' column holding article URLs.
            Entries that are not non-empty strings (e.g. None) are skipped.
        hover_wait: Seconds to wait after hovering the loading indicator.
        scroll_pause: Seconds to wait after each scroll step.
        scroll_step: Pixels scrolled per step.

    Returns:
        A DataFrame with the columns 'Author', 'Timestamp', 'Comment' and
        'Article Link', one row per comment found. The columns are present
        even when no comment was collected.
    """
    columns = ['Author', 'Timestamp', 'Comment', 'Article Link']
    comments_data = []
    driver = webdriver.Chrome()

    # The finally clause guarantees the browser is closed even if a page load
    # or script call raises; otherwise the Chrome process would be leaked.
    try:
        for index, link in enumerate(df['Link to Article'], start=1):
            print(f"Processing article {index}/{len(df)}: {link}")
            if not isinstance(link, str) or not link:
                # Articles scraped without a title link carry None here.
                print("Skipping article without a link.")
                continue
            driver.get(link)

            # Hovering the indicator is best effort: a missing indicator must
            # not stop the scrape, so the failure is only reported.
            try:
                loading_indicator = driver.find_element(By.CLASS_NAME, "loading-indicator")
                ActionChains(driver).move_to_element(loading_indicator).perform()
                time.sleep(hover_wait)
            except Exception as e:
                print(f"Loading indicator not found or hover action failed: {e}")

            # Scroll step by step until the document height stops changing.
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                # Pass the step as a script argument rather than formatting it into the JS source.
                driver.execute_script("window.scrollBy(0, arguments[0]);", scroll_step)
                time.sleep(scroll_pause)

                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            soup = bs(driver.page_source, 'html.parser')
            comments = soup.find_all('li', class_='comment')
            print(f"Found {len(comments)} comments in this article.")

            comments_data.extend(_comment_record(comment, link) for comment in comments)
    finally:
        driver.quit()

    return pd.DataFrame(comments_data, columns=columns)

# Visit every article link collected in `df` and gather its comments.
# This opens a Chrome window and sleeps between scroll steps, so it is slow.
comments_df = extract_comments_with_selenium(df)

print(comments_df)

Processing article 1/36: https://blog.playstation.com/2023/12/15/seasons-greetings-2023-from-playstation-blog-and-friends/
Found 17 comments in this article.
Processing article 2/36: https://blog.playstation.com/2023/12/13/ps-blog-game-of-the-year-awards-2023-voting-is-now-open/
Found 69 comments in this article.
Processing article 3/36: https://blog.playstation.com/2023/12/12/tekken-8-hands-on-with-arcade-quest-super-ghost-battles-and-tekken-ball/
Found 6 comments in this article.
Processing article 4/36: https://blog.playstation.com/2023/12/11/alan-wake-2-gets-new-game-plus-mode-titled-the-final-draft-available-today/
Found 4 comments in this article.
Processing article 5/36: https://blog.playstation.com/2023/12/08/share-of-the-week-fortnite/
Found 2 comments in this article.
Processing article 6/36: https://blog.playstation.com/2023/12/07/the-finals-launches-on-ps5-today/
Found 4 comments in this article.
Processing article 7/36: https://blog.playstation.com/2023/12/07/final-fantasy