In [40]:
import sys, os
import time
import re
import urllib
from pprint import pprint
import pandas as pd
import bs4
from bs4 import BeautifulSoup
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
print(f'Carousell Scraping V1.0 by Billy Cao\nRunning on Python {sys.version}, Selenium {selenium.__version__}, BeautifulSoup {bs4.__version__}')

def request_page(url):
    """ Returns BeautifulSoup4 Objects (soup)"""
    driver.get(url)
    page = 1
    timeout = 5
    if page_limit:
        while page < page_limit:
            try:
                next_page_btn = WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.XPATH, '//main[1]/div/button[.="Load more"]')))  # wait max timeout sec for loading
                target_scroll_end = driver.execute_script("return document.body.scrollHeight")
                while scroll_pos <= target_scroll_end:
                    driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")  # scroll page to bottom to load images
                driver.execute_script("arguments[0].click();", next_page_btn)  # click the load more button through ads
                page += 1
            except TimeoutException as e:
                break
    else:
        while True:  # scrap all
            try:
                next_page_btn = WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.XPATH, '//main[1]/div/button[.="Load more"]')))  # wait max timeout sec for loading
                driver.execute_script("arguments[0].click();", next_page_btn)  # click the load more button through ads
                page += 1
            except TimeoutException as e:
                break
    time.sleep(timeout)
    print(f'All results loaded. Total: {page} pages.')
    return BeautifulSoup(driver.page_source, "html.parser")

def parse_info(item_div, mode=1):
    try:
        a = item_div.div.find_all('a', recursive=False)
        seller_divs = a[0].find_all('div', recursive=False)[1]
        item_p = a[1].find_all('p', recursive=False)
        img = a[1].div.find_all('div', recursive=False)[-1].img['src']
        likes = item_div.find_all('div', recursive=False)[1].button.span.get_text()
        if mode == 1:
            return {'seller_name': seller_divs.p.get_text(),
                    'seller_url': home+a[0]['href'],
                    'item_name': a[1].find_all('div', recursive=False)[1].p.get_text(),
                    'item_url': home+a[1]['href'],
                    'img': img,
                    'time_posted': seller_divs.div.p.get_text(),  # TODO: process into absolute datetime
                    'condition': item_p[1].get_text(),
                    'likes': likes,
                    'price': re.findall(r"\d+", item_p[0].get_text().replace(',', ''))[0]}  # 0 is discounted price, 1 is original price, if applicable
        else:
            return {'seller_name': seller_divs.p.get_text(),
                    'seller_url': home+a[0]['href'],
                    'item_name': item_p[0].get_text(),
                    'item_url': home+a[1]['href'],
                    'img': img,
                    'time_posted': seller_divs.div.p.get_text(),  # TODO: process into absolute datetime
                    'condition': item_p[3].get_text(),
                    'likes': likes,
                    'price': re.findall(r"\d+", item_p[1].get_text().replace(',', ''))[0]}  # 0 is discounted price, 1 is original price, if applicable
    except:
        pass

home = 'https://sg.carousell.com'
item = input('Enter item to scrape: ')
page_limit = int(input('Up to how many pages to scrap? Each page is 23-25 listings, enter 0 to scrap all: '))
extension = f'/search/{urllib.parse.quote(item)}'
opts = Options()
opts.add_experimental_option('prefs', {'intl.accept_languages': 'en,en_US'})
caps = DesiredCapabilities().CHROME
caps["pageLoadStrategy"] = "normal"  #  Waits for full page load
driver = webdriver.Chrome(desired_capabilities=caps, options=opts)
# driver.minimize_window()
print(f'Chrome Web Driver loaded. Version: {driver.capabilities["browserVersion"]}\n')  # use "version" on Linux
parse_mode = 1  # Carousell have 2 formats of their item divs. See below comment for more info.
tries = 1

while tries < 5:  # retrying loop as the div class position is random
    try:
        print(f'Retrieving search results on {item}...')
        search_results_soup = request_page(home+extension)
        # TODO: Find concrete way to locate correct class name, current work around works 99% of times.
        item_divs_class = ' '.join(search_results_soup.find('main').find('div').find('div').find('div')['class'])  # changes randomly but 99% of the time its the first div
        print(f'Detected item_divs class: {item_divs_class}')
        item_divs = search_results_soup.find('main').find('div').find('div').find_all('div', class_=item_divs_class, recursive=False)  # filter out ads divs
        print(f'Found {len(item_divs)} listings. Parsing...')
        items_list = [parse_info(item_div, parse_mode) for item_div in item_divs]
        break
    except AttributeError as e:  # no item_divs at all
        print(e)
        raise RuntimeError('The search has returned no result.')
    except IndexError as e:
        print(e)
        print(f'Parsing attempt {tries} failed due to class name error using parse mode {parse_mode}. Retrying with parse mode 2...\n')
        tries += 1
        parse_mode = 2
        continue
else:
    raise RuntimeError('Parsing failed as it still faces IndexError after 5 tries.')

driver.quit()
print(f'Parse success using mode {parse_mode}! Sample item parsed:')
pprint(items_list[0])
df = pd.DataFrame(items_list)
print(df.describe())
print(df)
df.to_csv(f'{item}.csv', index=False)
print(f'Results saved to {item}.csv')

'''
Two parse modes only differs in item divs 2nd a
Structure of Carousell HTML FORMAT 1 (parse_mode 1):
body > find main > 1st div > 1st div > divs of items
    in divs of items > parents of each item
        parent > 1st div > 1st a is seller, 2nd a is item page
            in 1st a: 2nd div > p is seller name, > div > p is time posted
            in 2nd a: 2nd div > p is item name but with ... if too long, directly under 2nd a first p is price, 2nd p is condition
        parent > 2nd div > button > span is number of likes
total 24 or 25 results loaded once.

Structure of Carousell HTML FORMAT 2 (parse_mode 2):
body > find main > 1st div > 1st div > divs of items
    in divs of items > parents of each item
        parent > 1st div > 1st a is seller, 2nd a is item page
            in 1st a: 2nd div > p is seller name, > div > p is time posted
            in 2nd a: 1st p is FULl NAME, 2nd p is price, 3rd p is description, 4th p is condition
        parent > 2nd div > button > span is number of likes
total 24 or 25 results loaded once.

body > find main > div > button to view more
view more button loads on top of existing, so can prob spam view more then gather all items at once
MAY NOT BE FIRST DIV! Temp workaround is to get class name of the correct item divs
'''

Carousell Scraping V1.0 by Billy Cao
Running on Python 3.9.9 (tags/v3.9.9:ccb0e6a, Nov 15 2021, 18:08:50) [MSC v.1929 64 bit (AMD64)], Selenium 4.1.0, BeautifulSoup 4.10.0
Chrome Web Driver loaded. Version: 97.0.4692.71

Retrieving search results on apple...
All results loaded. Total: 1 pages.
Detected item_divs class: D_wt D_tm D_tr
Found 19 listings. Parsing...
<div class="D_t_"><div class="D_tu" style="background-color: rgb(240, 241, 241);"><img alt="" class="D_lu D_lr D_tv" src="https://sl3-cdn.karousell.com/components/tag_icons/protection@xxxhdpi.png" title=""/><p class="D_hg D_fK D_hh D_hk D_hm D_hp D_hr D_hc">Protection</p></div><div class="D_rm D_tK D_rr D_rn"><p class="D_hg D_fN D_hh D_hk D_hn D_hp D_hr D_he">Spotlight</p></div><div class="D_yi D_yl"><img alt="Buy in  new,used,spoilt Macbooks &amp; laptops ,iMac " class="D_lu D_lr D_yp" src="https://media.karousell.com/media/photos/products/2020/10/7/buy_in_newusedspoilt_laptops___1602050065_479a4837_progressive_thumbnail.jpg"

AttributeError: 'NoneType' object has no attribute 'keys'

In [40]:
import requests
from bs4 import BeautifulSoup
import json
import time

def request_page(url):
    """ Returns BeautifulSoup4 Objects (soup)"""
    # driver.get(url)
    r = requests.get(url)
    return BeautifulSoup(r.text, "html.parser")

# opts = Options()
# opts.add_experimental_option('prefs', {'intl.accept_languages': 'en,en_US'})
# caps = DesiredCapabilities().CHROME
# caps["pageLoadStrategy"] = "normal"  #  Waits for full page load
# driver = webdriver.Chrome(desired_capabilities=caps, options=opts)
# # driver.minimize_window()
# print(f'Chrome Web Driver loaded. Version: {driver.capabilities["browserVersion"]}\n')  # use "version" on Linux

items = {}
prefix = 'https://www.carousell.sg/p/'
# id = 1134572107
failed_items = 0
time_taken = 0
for id in range(start_id, start_id-items_to_scrap, -1):
    start = time.time()
    url = prefix + str(id)
    item_soup = request_page(url)
    try:
        item_soup = item_soup.body.find('div').find('div').find_all('div', recursive=False)[2]
        top_section = item_soup.find('section').find('div').find('div').find('div')
        title = item_soup.find_all('div', recursive=False)[1].find('div').find('div').find('div').find('p').get_text().strip()
        likes = top_section.find_all('div', recursive=False)[1].find_all('button', recursive=False)[-1].find('p').get_text().split()[0]  # remove trailing 'likes'
        img = top_section.find_all('div', recursive=False)[-1].find('div').find('img')['src']
        item_details_soup = item_soup.find_all('div', recursive=False)[1].find('div').find('div').find_all('div', recursive=False)[1].find('section').find_all('div', recursive=False)[3]
        # item_details_soup = [soup for soup in item_details_soup if soup.get_text() == 'Description'][0]  # position of div may change so filter based on text
        item_details_divs = item_details_soup.find_all('div', recursive=False)
        attribute_divs = item_details_divs[0].find('div').find_all('div', recursive=False)
        attributes = {div.find_all('p', recursive=False)[0].get_text(): div.find_all('p', recursive=False)[1].get_text() for div in attribute_divs}
        attributes.pop('Posted')  # remove time posted, also act as a check, if got error, then scraping algo likely failed
        description = item_details_divs[1].get_text().replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').strip()
        items.update({id: {'title': title, 'url': url, 'img': img, 'attributes': attributes, 'desc': description, 'likes': likes}})
    except Exception as e:  # means product already delisted, attribute error confirm is, rest is other reasons
        failed_items += 1
        print(url, e)
    end = time.time()
    time_taken += end - start

print(f'Scraping completed. Successfully scraped {len(items)}, failed {failed_items}, total time taken: {time_taken}, average time taken: {round(time_taken / len(items_to_scrap), 5)}')

with open('data.json', 'w') as f:
    json.dump(items, f)

NameError: name 'start_id' is not defined

In [83]:
import requests
import json

count = 40  # 40 max
query = 'rtx 3080ti'
url = 'https://www.carousell.sg/api-service/search/cf/4.0/search/'
product_url_prefix = 'https://www.carousell.sg/p/'
header = {"bestMatchEnabled":'true',"canChangeKeyword":'true',"count":count,"countryCode":"SG","countryId":"1880251","filters":[],"includeSuggestions":'true',"locale":"en","prefill":{},"query": query}
r = requests.post(url, data=header)
r = json.loads(r.text)
total_results_in_db = r['data']['formattedTotal']  # string as it may include '10000+'
r = r['data']['results']
print('total results returned:', len(r))
for i in r:
    i = i['listingCard']
    print(i['seller']['username'])
    print(i['photoUrls'])
    print(i['price'])
    print(i['title'])
    print(i['likesCount'])
    link = product_url_prefix+i['id']
    print(link)
    print()

In [39]:
import json
import requests
import shutil

with open('data.json', 'r') as f:
    data = json.load(f)
for i, v in data.items():
    r = requests.get(v['img'], stream=True)
    r.raw.decode_content = True
    with open(f'imgs/{i}.jpg','wb') as f:
        shutil.copyfileobj(r.raw, f)