We need a catalog of available laptops to get started. 
* Kaggle has a dataset available, but it is sparse on the actual specs of each laptop.
* The Google Shopping API has apparently been deprecated; there is also `noteb.com`, but it didn't work reliably.

Product comparison websites do a good job of maintaining detailed specs. For example, the website https://www.91mobiles.com/laptopfinder.php seems to have a good collection of laptop specs. This notebook scrapes that data using Playwright and the BeautifulSoup library.

## Fetch the list of all available laptops (around 2k)

In [1]:
# Landing page of the laptop finder; the browser session is pointed here first.
BASE_URL = "https://www.91mobiles.com/laptopfinder.php"
# Accumulates one dict per scraped laptop snippet (keys: title, img, specs).
data = []

In [2]:
from playwright.async_api import async_playwright

In [3]:
# Start Playwright explicitly (no `async with`) so that the browser and page
# stay alive across the following notebook cells.
p = await async_playwright().start()
# headless=False opens a visible Firefox window.
browser = await p.firefox.launch(headless=False)
page = await browser.new_page()
# Last expression of the cell: the Response returned by goto() is displayed.
await page.goto(BASE_URL)

<Response url='https://www.91mobiles.com/laptopfinder.php' request=<Request url='https://www.91mobiles.com/laptopfinder.php' method='GET'>>

In [4]:
# Wait until the filter element exists in the DOM, then click it.
# NOTE(review): the element id suggests this is the "available in stores"
# market-status filter - confirm against the live page.
await page.wait_for_selector("#promarketstatusmarketstatusavastores")
await page.click("#promarketstatusmarketstatusavastores")

In [5]:
iterations = 0
while True:
    await page.evaluate(
        """async () => {
        for (let i = 0; i < document.body.scrollHeight; i += 100) {
          window.scrollTo(0, i);
        }
    }"""
    )
    await page.wait_for_selector(".title_ul > .finder_snipet_wrap")
    await page.wait_for_selector("#finder_pagination > div > div")

    nodes = await page.query_selector_all(".title_ul > .finder_snipet_wrap")
    for node in nodes:
        img = await node.query_selector(".finder-image-box img")
        img_src = await img.get_attribute("src")
        img_title = await img.get_attribute("title")

        el_link = await node.query_selector("a")
        ref = await el_link.get_attribute("href")
        specs_link = f"www.91mobiles.com/{ref}#specifications"

        data.append(
            {"title": img_title, "img": img_src.strip("//"), "specs": specs_link}
        )

    iterations += 1
    print(f"num iters: {iterations}")

    page_links = await page.query_selector_all("#finder_pagination > div > div")
    next_exists = False
    for link in page_links:
        text = await link.text_content()
        if text.strip("\n ").lower() == "next":
            next_exists = True
            break

    if next_exists:
        await page.evaluate(f"submitPage('next', '{iterations + 1}')")
        await page.wait_for_timeout(1000)
    else:
        break

num iters: 1
num iters: 2
num iters: 3
num iters: 4
num iters: 5
num iters: 6
num iters: 7
num iters: 8
num iters: 9
num iters: 10
num iters: 11
num iters: 12
num iters: 13
num iters: 14
num iters: 15
num iters: 16
num iters: 17
num iters: 18
num iters: 19
num iters: 20
num iters: 21
num iters: 22
num iters: 23
num iters: 24
num iters: 25
num iters: 26
num iters: 27
num iters: 28
num iters: 29
num iters: 30
num iters: 31
num iters: 32
num iters: 33
num iters: 34
num iters: 35
num iters: 36
num iters: 37
num iters: 38
num iters: 39
num iters: 40
num iters: 41
num iters: 42
num iters: 43
num iters: 44
num iters: 45
num iters: 46
num iters: 47
num iters: 48
num iters: 49
num iters: 50
num iters: 51
num iters: 52
num iters: 53
num iters: 54
num iters: 55
num iters: 56
num iters: 57
num iters: 58
num iters: 59
num iters: 60
num iters: 61
num iters: 62
num iters: 63
num iters: 64
num iters: 65
num iters: 66
num iters: 67
num iters: 68
num iters: 69
num iters: 70
num iters: 71
num iters: 72
n

In [6]:
await browser.close()

In [7]:
import json

# Persist the scraped listing as JSON Lines: one laptop record per line.
with open("data/list_laptop_urls.jsonl", "w") as listing_file:
    listing_file.writelines(json.dumps(record) + "\n" for record in data)

## Load the full specifications for each laptop now

In [1]:
import json

# Read the JSON Lines listing back in: one laptop record per line.
list_laptops = []
with open("data/list_laptop_urls.jsonl", "r") as listing_file:
    list_laptops.extend(json.loads(record) for record in listing_file)

In [2]:
import asyncio

import httpx
from bs4 import BeautifulSoup


async def fetch_specs(client, laptop):
    """Download one laptop's specification page and parse it into a record.

    Args:
        client: Object with an awaitable ``get(url)`` that returns the HTTP
            response (the notebook passes an ``httpx.AsyncClient``).
        laptop: Listing record with the keys ``"specs"`` and ``"img"``
            (URLs stored without a scheme) and ``"title"``.

    Returns:
        A dict with the keys ``url_specs``, ``url_img_small``,
        ``url_img_large``, ``title``, ``specs`` (category name mapped to a
        ``{spec name: value}`` dict), ``price`` (int), ``rating_value``
        (float) and ``rating_count`` (int).

        An empty dict is returned when the request fails or any expected
        element is missing from the page; the failure is printed together
        with its cause instead of being raised.
    """
    url = "https://" + laptop["specs"]
    try:
        resp = await client.get(url)
        # Report HTTP errors as such rather than as a confusing "NoneType has
        # no attribute ..." failure further down in the parsing code.
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, "html.parser")

        # Every "spec_box" is one category; its table rows are name/value pairs.
        info = {}
        for box in soup.find_all("div", class_="spec_box"):
            deets = {}
            for row in box.find_all("tr"):
                key = row.find("td", class_="spec_ttle").text.strip()
                val = row.find("td", class_="spec_des").text.strip()
                deets[key] = val

            catg = box.find("span").text
            info[catg.strip()] = deets

        # Price and ratings are read from the "content" attribute of the
        # itemprop-annotated elements, not from the visible text.
        price_span = soup.find("div", class_="price_div").find(
            "span", class_="big_prc", attrs={"itemprop": "price"}
        )
        price = int(price_span.attrs["content"])

        rating_value = soup.find("span", {"itemprop": "ratingValue"}).attrs["content"]
        rating_count = soup.find("span", {"itemprop": "ratingCount"}).attrs["content"]

        img_el = soup.find("img", {"class": "overview_lrg_pic_img", "id": "mainImage"})
        return {
            "url_specs": url,
            "url_img_small": "https://" + laptop["img"],
            # The large image src is protocol-relative, so only "https:" is added.
            "url_img_large": "https:" + img_el.attrs["src"],
            "title": laptop["title"],
            "specs": info,
            "price": price,
            # Keep only the part before "/" in case the value is "x/y".
            "rating_value": float(rating_value.split("/")[0]),
            "rating_count": int(rating_count),
        }
    except Exception as e:
        # Best effort: one broken page must not abort the whole batch, but the
        # cause is reported so that failures can be diagnosed afterwards.
        print(f"failed: {laptop['title']} ({type(e).__name__}: {e})")
        return {}

In [3]:
def chunk(iterable, n):
    """Yield the items of *iterable* as successive lists of at most *n* items.

    Every yielded list holds exactly *n* items, except possibly the last one,
    which holds the remainder. No empty list is yielded: an empty iterable
    produces nothing, and an input whose length is an exact multiple of *n*
    does not get a trailing ``[]``.

    Args:
        iterable: Any iterable; it is consumed lazily.
        n: Maximum number of items per yielded list.

    Yields:
        Lists of consecutive items, in their original order.
    """
    out = []
    for el in iterable:
        out.append(el)
        if len(out) == n:
            yield out
            out = []
    # Emit the remainder only if there is one.
    if out:
        yield out

In [4]:
list_specs = []

async with httpx.AsyncClient() as client:
    for i, laptops in enumerate(chunk(list_laptops, 100)):
        tasks = []
        for laptop in laptops:
            tasks.append(asyncio.ensure_future(fetch_specs(client, laptop)))
        list_specs.extend(await asyncio.gather(*tasks))
        print("iter finished")
        await asyncio.sleep(2)

iter finished
failed: HP 14s-DY2501TU (3T170PA) Laptop (Core i3 11th Gen/8 GB/256 GB SSD/Windows 10)
failed: Infinix INBook X1 Slim XL21 Laptop (Core i5 10th Gen/8 GB/512 GB SSD/Windows 11/128 MB)
failed: HP 15s-FR2508TU (546K3PA) Laptop (Core i3 11th Gen/8 GB/512 GB SSD/Windows 11)
failed: MSI GF65 Thin 10UE-410IN Laptop (Core i7 10th Gen/16 GB/1 TB SSD/Windows 10/6 GB)
failed: Lenovo Legion 5 (82B500BHIN) Laptop (AMD Hexa Core Ryzen 5/8 GB/1 TB 256 GB SSD/Windows 10/4 GB)
failed: HP 15s-fq5112TU (6Q2M3PA) Laptop (Core i5 12th Gen/16 GB/512 GB SSD/Windows 11)
failed: Asus Vivobook K15 OLED K513EA-L312WS Laptop (Core i3 11th Gen/8 GB/512 GB SSD/Windows 11)
failed: Microsoft Surface Book 2 (HNN-00001) Laptop (Core i7 8th Gen/16 GB/1 TB SSD/Windows 10/2 GB)
failed: Asus TUF FA766IC-HX005T Laptop (AMD Octa Core Ryzen 7/16 GB/512 GB SSD/Windows 10/4 GB)
failed: Infinix INBook X1 Pro XL12 Laptop (Core i7 10th Gen/16 GB/512 GB SSD/Windows 11)
failed: HP Pavilion x360 14-dy0190TU (533T7PA) La

In [5]:
import json

# Write the successfully parsed records as JSON Lines; failed fetches are
# represented by empty dicts and are left out.
with open("data/list_laptop_specs.jsonl", "w") as specs_file:
    specs_file.writelines(json.dumps(spec) + "\n" for spec in list_specs if spec)

In [9]:
# Read the saved specs back in (one JSON document per line) and show how many
# records there are.
with open("data/list_laptop_specs.jsonl", "r") as specs_file:
    rows = list(map(json.loads, specs_file))
len(rows)

1151

In [10]:
# Display the first parsed record as a quick look at the data.
rows[0]

{'url_specs': 'https://www.91mobiles.com//apple-m1-mgnd3hn-a-apple-m1-8-gb-256-gb-macos-big-sur-laptop-price-in-india-141587#specifications',
 'url_img_small': 'https://www.91-img.com/pictures/laptops/apple/apple-m1-mgnd3hn-a-141587-v1-small-1.jpg?tr=q-80',
 'url_img_large': 'https://www.91-img.com/pictures/laptops/apple/apple-m1-mgnd3hn-a-141587-v1-large-1.jpg?tr=q-80',
 'title': 'Apple MacBook Air M1 MGND3HN/A Ultrabook (Apple M1/8 GB/256 GB SSD/macOS Big Sur)',
 'specs': {'General Information': {'Brand': 'Apple',
   'Model': 'M1 MGND3HN/A',
   'Dimensions(WxHxD)': '304.1 x 212.4 x 10.9 \xa0mm',
   'Weight': '1.29 Kg',
   'Colors': 'Gold',
   'Operating System': 'macOS Big Sur'},
  'Display Details': {'Display Size': '13.3 Inches (33.78 cm)',
   'Display Resolution': '2560 x 1600 Pixels',
   'Pixel Density': '227 ppi',
   'Display Type': 'LED',
   'Display Features': 'Quad LED Backlit IPS Display (227 PPI, 400 nits Brightness, Wide Colour (P3), True Tone Technology)',
   'Display Tou