# Reddit 🕷️ Spider for PlayStation Comments 🎮

**https://www.reddit.com/robots.txt**

- https://www.reddit.com/r/playstation/
- https://www.reddit.com/r/PlaystationPortal/
- https://www.reddit.com/r/PlayStationPlus/
- https://www.reddit.com/r/PS4/
- https://www.reddit.com/r/PS5/
- https://www.reddit.com/r/PSVR/

**Note:** I decided to use the `/top/?t=year` URL parameter to filter out the noise, and then take the top 1,000 posts from the past year.

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementNotInteractableException, NoSuchElementException, TimeoutException
from bs4 import BeautifulSoup
import pandas as pd
import time

In [2]:
from io import StringIO
from dotenv import load_dotenv
from dotenv import dotenv_values
from dotenv import find_dotenv
import os

In [3]:
# Locate a .env file and load its variables into the process environment.
# The cell output is load_dotenv's return value.
load_dotenv(find_dotenv())

True

In [4]:
# Reddit credentials are read from environment variables rather than being
# hard-coded in the notebook; either value is None when its variable is unset.
# NOTE(review): load_dotenv does not override variables that already exist in
# the environment by default, and Windows pre-defines USERNAME -- confirm the
# value read here is the Reddit account name and not the OS login.
username = os.environ.get("USERNAME")
password = os.environ.get("PASSWORD")

In [5]:
def login_to_reddit(driver, username, password, timeout=10):
    """Log the given Selenium driver into Reddit through the web login form.

    Args:
        driver: Selenium WebDriver used to load and drive the login page.
        username: Reddit account name typed into the username field.
        password: Reddit password typed into the password field.
        timeout: Maximum number of seconds to wait for each form element
            to become usable before giving up.

    Raises:
        ValueError: If ``username`` or ``password`` is missing or empty
            (for example when the environment variable was never set).
        TimeoutException: If a form element is not usable within ``timeout``.
    """
    # Fail fast with a readable message instead of an obscure error from
    # send_keys(None) after the page has already been loaded.
    if not username or not password:
        raise ValueError(
            "Reddit username and password are required; "
            "check that USERNAME and PASSWORD are set."
        )

    driver.get("https://www.reddit.com/login/")

    # Explicit waits replace the fixed sleep: they continue as soon as the
    # form is ready and tolerate slow connections up to `timeout` seconds.
    wait = WebDriverWait(driver, timeout)

    username_field = wait.until(EC.visibility_of_element_located((By.ID, 'loginUsername')))
    username_field.send_keys(username)

    password_field = wait.until(EC.visibility_of_element_located((By.ID, 'loginPassword')))
    password_field.send_keys(password)

    login_button = wait.until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'AnimatedForm__submitButton')]"))
    )
    login_button.click()

    # Give the post-login redirect time to finish; 1 second proved too short.
    time.sleep(2)

In [10]:
def expand_comments(driver, max_clicks=200, pause=3):
    """Click 'more replies' links on the current page until none are left.

    Expansion is best-effort: any unexpected error is printed and swallowed
    so the caller can still extract whatever comments are visible.

    Args:
        driver: Selenium WebDriver already pointed at a Reddit post.
        max_clicks: Upper bound on how many links are clicked. The XPath
            matches any element whose text contains 'more replies', so a
            comment containing that phrase (or a link that never disappears
            after being clicked) would otherwise keep the loop running forever.
        pause: Seconds to sleep after each click so the new replies can load.
    """
    try:
        # Make sure the page has produced at least some DOM before searching it.
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, "//div")))

        for _ in range(max_clicks):
            try:
                more_replies_button = driver.find_element(By.XPATH, "//*[contains(text(), 'more replies')]")
                ActionChains(driver).move_to_element(more_replies_button).click().perform()

                time.sleep(pause)
            except (NoSuchElementException, TimeoutException):
                # No further link found (or the driver timed out): done.
                break
        else:
            # The loop ran out of attempts without the links disappearing.
            print(f"Stopped expanding after {max_clicks} clicks; some replies may still be collapsed.")

    except Exception as e:
        print(f"Error in expanding comments: {e}")

In [11]:
def extract_comments(driver, article):
    """Parse the driver's current page and return one record per comment.

    Args:
        driver: Selenium WebDriver whose ``page_source`` is parsed.
        article: Value stored under the 'Article' key of every record
            (the caller passes the post URL).

    Returns:
        A list of dicts with the keys 'Article', 'Author', 'Timestamp' and
        'Comment'. Any piece that cannot be found is recorded as the string
        'Null'.
    """
    def text_or_null(node):
        # Stripped text of a parsed element, or the 'Null' placeholder.
        return node.get_text(strip=True) if node else 'Null'

    page = BeautifulSoup(driver.page_source, 'html.parser')

    records = []
    for comment_div in page.find_all('div', attrs={'data-testid': 'comment'}):
        # Author and timestamp are looked up in a preceding sibling header div.
        header = comment_div.find_previous_sibling('div', attrs={'data-testid': 'post-comment-header'})

        author_link = header.find('a', attrs={'data-testid': 'comment_author_link'}) if header else None
        timestamp_link = header.find('a', attrs={'data-testid': 'comment_timestamp'}) if header else None
        body = comment_div.find('div', class_='RichTextJSON-root')

        records.append({
            'Article': article,
            'Author': text_or_null(author_link),
            'Timestamp': text_or_null(timestamp_link),
            'Comment': text_or_null(body),
        })

    return records

In [12]:
def scrape_reddit_comments(file_path, start_row, batch_size):
    """Scrape the comments of a batch of Reddit posts listed in a CSV file.

    Uses the notebook-level ``driver``, ``username`` and ``password``
    variables, which must exist before this function is called.

    Args:
        file_path: Path to a CSV file with a 'Link' column of post URLs.
        start_row: Zero-based position of the first row to process.
        batch_size: Maximum number of rows to process from ``start_row``.

    Returns:
        A DataFrame with the columns 'Article', 'Author', 'Timestamp' and
        'Comment'. The columns are present even when nothing was scraped.
    """
    comment_columns = ['Article', 'Author', 'Timestamp', 'Comment']

    df_links = pd.read_csv(file_path)
    # Clamp to the file length so the progress total is accurate when the
    # requested batch runs past the last row.
    end_row = min(start_row + batch_size, len(df_links))
    batch = df_links.iloc[start_row:end_row]

    # Nothing to scrape: skip the login round-trip entirely.
    if batch.empty:
        return pd.DataFrame(columns=comment_columns)

    login_to_reddit(driver, username, password)

    all_comments = []
    for index, row in enumerate(batch.itertuples(), start=start_row):
        link = row.Link
        print(f"Processing {index + 1}/{end_row}: {link}")

        try:
            driver.get(link)
            expand_comments(driver)
            all_comments.extend(extract_comments(driver, link))
        except Exception as e:
            # One bad post should not abort the whole batch.
            print(f"Error processing {link}: {e}")
            continue

    return pd.DataFrame(all_comments, columns=comment_columns)

# The Main Spider 🕷️

In [46]:
# Launch the browser session used by scrape_reddit_comments.
driver = webdriver.Chrome()

# Slice of the links file to process in this run.
start_row = 0
batch_size = 255

file_path = '../../Data/Raw Pulls /RAW_rPSVR_reddit_titles.csv'

# r/Playstation - Done
# r/PlaystationPlus - Done
# r/PlaystationPortal - Done
# r/PS4 - Done
# r/PS5 - Done 
# r/PSVR - Done

try:
    df_comments = scrape_reddit_comments(file_path, start_row, batch_size)
    print(df_comments)
finally:
    # Always close the browser, even when scraping fails part-way through,
    # so no orphaned Chrome process is left behind.
    driver.quit()

Processing 1/255: https://www.reddit.com/r/PSVR/comments/11jfi3q/the_vr2_controller_charging_station_just_burned/
Processing 2/255: https://www.reddit.com/r/PSVR/comments/11mvewm/i_cant_see_any_improvements_at_all/
Processing 3/255: https://www.reddit.com/r/PSVR/comments/11bt9n8/the_only_negative_thing_about_these_controllers/
Processing 4/255: https://www.reddit.com/r/PSVR/comments/115iiqf/im_ecstatic_cant_believe_it_i_got_it_this_early/
Processing 5/255: https://www.reddit.com/r/PSVR/comments/141rtr4/the_psvr2s_price_isnt_looking_so_bad_now/
Processing 6/255: https://www.reddit.com/r/PSVR/comments/11kctxj/_/
Processing 7/255: https://www.reddit.com/r/PSVR/comments/11umest/its_getting_a_bit_annoying_at_this_point/
Processing 8/255: https://www.reddit.com/r/PSVR/comments/1142rch/life_can_change_in_an_instant_a_couple_of_weeks/
Processing 9/255: https://www.reddit.com/r/PSVR/comments/11b7y2a/playing_air_guitar_on_vr_what_i_see_x_what_she/
Processing 10/255: https://www.reddit.com/r/PSVR

---
---
---

# Testing with itables 🧪

In [13]:
from itables import show

In [47]:
# Quick size check of the scraped table: (rows, columns).
print(df_comments.shape)

(25935, 4)


In [49]:
# Render the scraped comments as an interactive table via itables.
show(df_comments)

Article,Author,Timestamp,Comment
Loading... (need help?),,,


In [48]:
# df_comments.to_csv('RAW_rPSVR_reddit_comments_batch0-end.csv', index=False)