# IGN Spider 🕷️ 🕸️ for PlayStation Posts 🎮

**https://www.ign.com/robots.txt**

In [1]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import Keys
from pyshadow.main import Shadow
import pandas as pd
import time

In [2]:
def expand_comments(driver):
    try:
        print("Scrolling to the bottom of the page...")
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)  

        print("Looking for the comments section...")
        shadow = Shadow(driver)
        comments_section = shadow.find_element("div[data-spotim-module='conversation']")
        print("Comments section found, scrolling into view...")
        driver.execute_script("arguments[0].scrollIntoView();", comments_section)

        try:
            show_more_button = shadow.find_element("div[data-spotim-module='conversation'] button[data-spot-im-class='load-more-messages']")
            print("Expanding comments...")
            show_more_button.click()
            time.sleep(3)
            print("Comments expanded.")
        except NoSuchElementException:
            print("No 'Show More' button found, or it's not clickable.")

    except TimeoutException:
        print("Timed out waiting for comments to load.")
        print(driver.page_source[:1500]) 
    except NoSuchElementException:
        print("Could not find the comments container on the page.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [3]:
def extract_comments(driver, article_title):
    shadow = Shadow(driver)
    comments = shadow.find_elements("div[data-spotim-module='conversation'] li[aria-label='Comment']")
    comments_data = []

    for comment in comments:
        try:
            author = comment.find_element(By.CSS_SELECTOR, 'span[data-spot-im-class="message-username"]').text
        except NoSuchElementException:
            author = "Null"
        try:
            timestamp = comment.find_element(By.CSS_SELECTOR, 'time[data-spot-im-class="message-timestamp"]').text
        except NoSuchElementException:
            timestamp = "Null"
        try:
            commentTextDiv = comment.find_element(By.CSS_SELECTOR, 'div[data-spot-im-class="message-text"]')
            comment_text = commentTextDiv.text if commentTextDiv else ''
        except NoSuchElementException:
            comment_text = "Null"

        comments_data.append({'Author': author, 'Timestamp': timestamp, 'Comment': comment_text, 'Article': article_title})
        
    print(f"✅ Extracted {len(comments_data)} comments from {article_title}")
    
    return comments_data

In [4]:
def scrape_ign_comments(file_path, start_row=0, batch_size=None): # 🚥 # -- Batch out of curtesy -- 🚥
    df_links = pd.read_csv(file_path)
    if batch_size is not None:
        df_links = df_links[start_row:start_row+batch_size]
    total_links = len(df_links)
    
    options = Options()
    options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    
    driver = webdriver.Chrome(options=options)
    all_comments = []

    for index, link in enumerate(df_links['Link'], start=start_row):
        try:
            print(f"{index+1}/{total_links+start_row} Processing URL: {link}")
            article_title = link.split('/')[-1]  
            driver.get(link)
            time.sleep(2)
            body = driver.find_element(By.CSS_SELECTOR, 'body')
            body.send_keys(Keys.PAGE_DOWN)
            print("Quick manual scroll done")
            time.sleep(1)  
            expand_comments(driver)
            comments_data = extract_comments(driver, article_title)
            all_comments.extend(comments_data)
        except Exception as e:
            print(f'An error occured while processing {link}: {e}')
            continue # I was tired at this point... 🤷‍♂️

    driver.quit()
    return pd.DataFrame(all_comments)

In [47]:
file_path = '../../Data/Raw Pulls /RAW_ign_PS_titles.csv' 
df_comments = scrape_ign_comments(file_path, start_row=3218, batch_size=200)  # 🚥 # -- Batch out of curtesy -- 🚥
print(df_comments)

3219/3363 Processing URL: https://www.ign.com/articles/what-dead-space-gets-right-that-the-callisto-protocol-got-wrong
Quick manual scroll done
Scrolling to the bottom of the page...
Looking for the comments section...
Comments section found, scrolling into view...
Expanding comments...
An error occurred: Message: element click intercepted: Element <button role="button" data-spmark="show-more" data-spot-im-class="load-more-messages" aria-label="Show more comments" data-openweb-allow-amp="true" type="button" class="Button__button--11-4-15 Button__primary--11-4-15 Button__isEllipsis--11-4-15 Button__hoverBackground--11-4-15 spcv_load-more-messages">...</button> is not clickable at point (421, 843). Other element would receive the click: <div id="google_ads_iframe_/5691/ign_desktop_web_display/article_4__container__" style="border: 0pt none; margin: auto; text-align: center; width: 728px; height: 66px;"></div>
  (Session info: chrome=120.0.6099.129)
Stacktrace:
0   chromedriver           

---
---
---

# 🔍 Inspection Area 🥼 🤓

In [6]:
from itables import show 

In [48]:
print(df_comments.head(2))

print(df_comments.shape)

           Author         Timestamp  \
0  Loneranger2323  27 January, 2023   
1         tenken8  27 January, 2023   

                                             Comment  \
0  I think IGN needs to stop comparing TCP to the...   
1  I liked Callisto. But it is fundamentally a ve...   

                                             Article  
0  what-dead-space-gets-right-that-the-callisto-p...  
1  what-dead-space-gets-right-that-the-callisto-p...  
(1138, 4)


In [49]:
show(df_comments)

Author,Timestamp,Comment,Article
Loading... (need help?),,,


In [50]:
# to csv
# df_comments.to_csv('RAW_ign_PS_comments_batch3220-end.csv', index=False)