In [1]:
# ===== Standard Python libraries =====
import json # json = save scraped data into a structured file
import os # os.environ = read optional settings (e.g. CHROMEDRIVER_PATH) from the environment
import time # time.sleep() = pause execution (used to wait for page loading)
from pathlib import Path # Path = check whether a file exists on disk
from urllib.parse import urlsplit # urlsplit = split a URL into parts (scheme, host, path, query)


# ===== Selenium core browser automation =====
from selenium import webdriver # webdriver = controls a real browser (Chrome, Firefox, etc.)
from selenium.webdriver.common.by import By # By = tells Selenium HOW to locate elements (tag, class, id, css, xpath)
from selenium.webdriver.chrome.service import Service # Service = connects Selenium to ChromeDriver (the browser bridge)
from selenium.webdriver.common.keys import Keys # Keys = simulate keyboard presses (END, ENTER, PAGE_DOWN)


In [None]:
# ===== 1. Setup ChromeDriver =====
# ChromeDriver is the bridge that lets Python control Google Chrome.
# Manual download (only needed when you want to pin a specific driver):
# https://googlechromelabs.github.io/chrome-for-testing/
#
# Driver lookup order:
#   1. the CHROMEDRIVER_PATH environment variable, if it is set
#   2. otherwise the original hard-coded Windows download location
#   3. if that file does not exist -> Service() with no path, which lets Selenium
#      locate the driver itself (Selenium Manager in Selenium >= 4.6 fetches a
#      driver matching the installed Chrome; older 4.x searches PATH)
DEFAULT_CHROMEDRIVER_PATH = r"C:\Users\WINDOWS-10\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
chromedriver_path = Path(os.environ.get("CHROMEDRIVER_PATH", DEFAULT_CHROMEDRIVER_PATH))

if chromedriver_path.is_file():
    # An explicit driver binary is available: behave exactly as before.
    service = Service(str(chromedriver_path))
else:
    # No usable binary at the configured path: let Selenium resolve the driver.
    service = Service()

# Known problem: Chrome and ChromeDriver must be the same major version
# (a v144 browser did not work with a v145 driver). If that happens with a
# manually downloaded driver, remove/rename it or unset CHROMEDRIVER_PATH so
# that option 3 above picks a matching driver automatically.

# Launch a real Chrome browser controlled by Selenium
driver = webdriver.Chrome(service=service)



In [None]:
# ===== 2. Open the target website =====
url = "https://www.reddit.com/r/malaysia/"
PAGE_LOAD_WAIT_SECONDS = 3

# Point the Selenium-controlled browser at the subreddit.
driver.get(url)

# Reddit fills in its posts with JavaScript AFTER the page opens,
# so pause for a fixed interval before interacting with the page.
time.sleep(PAGE_LOAD_WAIT_SECONDS)
# Reddit loads posts AFTER the page opens

In [None]:
# ===== 3. Scroll the page to load more posts =====
# Reddit has no "next page": more posts only appear as you scroll down.
SCROLL_ROUNDS = 10         # how many times to jump to the bottom of the page
SCROLL_PAUSE_SECONDS = 2   # time given to Reddit to load new posts after each jump

for _ in range(SCROLL_ROUNDS):
    # Look up <body> again on every round, then press the END key on it
    page_body = driver.find_element(By.TAG_NAME, "body")
    page_body.send_keys(Keys.END)

    # Give the newly requested posts time to arrive before the next jump
    time.sleep(SCROLL_PAUSE_SECONDS)

In [None]:
# ===== 4. Prepare storage for scraped data =====
# Snapshot every <img> element that is in the page right now
images = driver.find_elements(by=By.TAG_NAME, value="img")

# Empty container for the scraped entries; each entry will be a dict shaped like
# {"post_title": "...", "image_url": "..."}
results = []

In [None]:

# ===== 5. Extract useful images only =====
# Substrings (case-sensitive) that mark an image URL as UI decoration
# rather than post content: emojis, icons, avatars, community/profile icons.
UI_IMAGE_MARKERS = ("emoji", "icon", "avatar", "communityIcon", "profileIcon")


def is_post_image(title, image_url):
    """Decide whether an <img> looks like post content rather than a UI element.

    Args:
        title: value of the image's ``alt`` attribute (may be None or empty).
        image_url: value of the image's ``src`` attribute (may be None or empty).

    Returns:
        True when both values are non-empty, the URL contains none of
        UI_IMAGE_MARKERS, and the URL does not point at an SVG file.
    """
    if not title or not image_url:
        return False
    if any(marker in image_url for marker in UI_IMAGE_MARKERS):
        return False
    # Look at the URL path only, so "icon.svg?v=2" or "LOGO.SVG" is still
    # recognised as an SVG even with a query string or upper-case extension.
    return not urlsplit(image_url).path.lower().endswith(".svg")


# Start from an empty list every time this cell runs: appending to the list
# created in an earlier cell would duplicate every entry on a re-run.
results = []
for img in images:
    # alt attribute often contains post title or description
    title = img.get_attribute("alt")

    # src attribute contains the image URL
    image_url = img.get_attribute("src")

    if is_post_image(title, image_url):
        # Store clean data in dictionary form
        results.append({
            "post_title": title,
            "image_url": image_url
        })



In [None]:
# ===== 6. Debug output =====
# Report how many entries passed the filter in step 5
print(f"Collected {len(results)} posts")

# Show the first 5 entries as a quick check of the data structure
preview = results[:5]
print(preview)

Collected 44 posts
[{'post_title': 'r/malaysia - Penerangan BUDI95', 'image_url': 'https://preview.redd.it/penerangan-budi95-v0-u15v4cra7qqf1.jpg?width=640&crop=smart&auto=webp&s=60a01081cea80948cd6ce1802ad89382d0ef63f4'}, {'post_title': 'r/malaysia - Penerangan BUDI95', 'image_url': 'https://preview.redd.it/penerangan-budi95-v0-9o2f57kb7qqf1.jpg?width=640&crop=smart&auto=webp&s=9b594798a1f769fec475d984f2a1fc590d7ae589'}, {'post_title': 'r/malaysia - Penerangan BUDI95', 'image_url': 'https://preview.redd.it/penerangan-budi95-v0-yzdf9xia7qqf1.jpg?width=640&crop=smart&auto=webp&s=e91e5882a37ffb988dafd7c4af3bfcf49f44f37a'}, {'post_title': 'r/malaysia - Penerangan BUDI95', 'image_url': 'https://preview.redd.it/penerangan-budi95-v0-pjv5zfva7qqf1.jpg?width=640&crop=smart&auto=webp&s=a3ba3fde4d7f2245c61b10d1dc4c9400827987c4'}, {'post_title': 'r/malaysia - [VIDEO] MBMB Shuts Down Melaka Café After TikTok of "Lonely Alpaca" Goes Viral', 'image_url': 'https://external-preview.redd.it/ESKisS63HlJ

In [None]:
# ===== 7. Save data to JSON file =====
# The resulting reddit_posts.json can be consumed by HTML / JS / APIs.
with open("reddit_posts.json", "w", encoding="utf-8") as out_file:
    # json.dumps turns the Python list into JSON text (4-space indent,
    # non-ASCII characters kept as-is), which is then written in one go
    out_file.write(json.dumps(results, indent=4, ensure_ascii=False))

In [None]:
# ===== 8. Close the browser =====
# Shut down the Selenium-controlled Chrome that was launched in step 1.
driver.quit()
# Always quit the browser when scraping is finished to free memory and system resources.
# NOTE: to scrape again afterwards, re-run the notebook from step 1 to get a fresh `driver`.