To feed into the recommender system downstream, we gotta clean up the scraped laptop data and make it search-friendly. For ex, parsing numerical attributes into numbers, or some string values into categories. 

In [1]:
import json

import pandas as pd

In [2]:
all_laptops = []
with open("data/list_laptop_specs.jsonl") as f:
    for line in f:
        all_laptops.append(json.loads(line))

In [3]:
# a sample product
all_laptops[100]

{'url_specs': 'https://www.91mobiles.com//asus-k15-oled-km513ua-l502ts-amd-hexa-core-ryzen-5-8-gb-1-tb-windows-10-laptop-price-in-india-150494#specifications',
 'url_img_small': 'https://www.91-img.com/pictures/laptops/asus/asus-k15-oled-km513ua-l502ts-150494-v1-small-1.jpg?tr=q-80',
 'url_img_large': 'https://www.91-img.com/pictures/laptops/asus/asus-k15-oled-km513ua-l502ts-150494-v1-large-1.jpg?tr=q-80',
 'title': 'Asus Vivobook K15 OLED KM513UA-L502TS Laptop (AMD Hexa Core Ryzen 5/8 GB/1 TB 256 GB SSD/Windows 10)',
 'specs': {'General Information': {'Brand': 'Asus',
   'Model': 'K15 OLED KM513UA-L502TS',
   'Dimensions(WxHxD)': '459 x 290 x 64 \xa0mm',
   'Weight': '1.8 Kg',
   'Colors': 'Black',
   'Operating System': 'Windows 10 Home Basic'},
  'Display Details': {'Display Size': '15.6 Inches (39.62 cm)',
   'Display Resolution': '1920 x 1080 Pixels',
   'Pixel Density': '141 ppi',
   'Display Type': 'OLED',
   'Display Features': 'NanoEdge display',
   'Display Touchscreen': 'No'

## Extract attributes

The attribute values are stored in nested dictionaries, and any given key in the chain could be absent. 

We need a custom getter to return None if any of the keys is missing. 

In [4]:
def get_prop(row, *keys):
    d = row
    for key in keys:
        if key in d:
            d = d[key]
        else:
            return None
    return d

#### Brand

The product brand is an important criteria for buyers. 

In [5]:
def read_brand(row):
    return get_prop(row, "specs", "General Information", "Brand")


brands = [read_brand(d) for d in all_laptops]
uniq_brands = pd.unique(brands)
uniq_brands

array(['Apple', 'Acer', 'HP', 'Asus', 'Realme', 'MSI', 'Infinix',
       'Xiaomi', 'Lenovo', 'Nokia', 'Dell', 'Samsung', 'Avita', 'VAIO',
       'Toshiba', 'Microsoft', 'Honor', 'RDP', 'Lava', 'LG', 'Nexstgo',
       'Wipro', 'Micromax'], dtype=object)

#### OS

Operating system is another.

In [6]:
def read_os(row):
    # The 64 bit vs 32 bit difference isn't relevant anymore
    return get_prop(row, "specs", "General Information", "Operating System")


oses = [read_os(d) for d in all_laptops]
uniq_oses = pd.unique(oses)
uniq_oses

array(['macOS Big Sur', 'Windows 11 Home Basic', 'Windows 11',
       'Windows 10 Home Basic', 'Google Chrome', 'macOS Sierra',
       'macOS Monterey Home Basic', 'DOS', 'macOS Monterey', 'Windows 8',
       'Windows 7 Home Basic', 'Windows 10 Professional',
       'macOS Catalina', 'Linux', 'Windows 8.1', 'Windows 10',
       'macOS Mojave', 'Windows 8 Professional',
       'Windows 8.1 Professional'], dtype=object)

Different editions of the same OS (Windows 10 vs 11) will come up as different entities, which we want to group since users can update later.

In [7]:
def read_os(row):
    os = get_prop(row, "specs", "General Information", "Operating System")
    if os is None:
        return None
    os = os.lower()
    if "windows" in os:
        return "Windows"
    elif "macos" in os:
        return "MacOS"
    elif "linux" in os or "ubuntu" in os:
        return "Linux"
    elif "chrome" in os:
        return "Chrome"
    else:
        return "Others"


oses = [read_os(d) for d in all_laptops]
uniq_oses = pd.unique(oses)
uniq_oses

array(['MacOS', 'Windows', 'Chrome', 'Others', 'Linux'], dtype=object)

#### Weight

In [8]:
get_prop(all_laptops[1], "specs", "General Information", "Weight")

'2.15 Kg'

We will have to parse the weights into numerical values. 

In [9]:
import re

re.match(r"([\d\.]+)\sKg", "2.15 Kg").groups()

('2.15',)

In [10]:
def read_weight(row):
    wt = get_prop(row, "specs", "General Information", "Weight")
    if wt is None:
        return None
    else:
        num = re.match(r"([\d\.]+)\sKg", wt).groups()[0]
        return float(num)


read_weight(all_laptops[2])

1.7

#### Display size

Display size next. Check if the units being used are standard. 

In [11]:
# pd.unique(
#     [get_prop(row, "specs", "Display Details", "Display Size") for row in all_laptops]
# )

We can read the screen size in inches

In [12]:
def read_screen_size(row):
    sz = get_prop(row, "specs", "Display Details", "Display Size")
    if sz is None:
        return None
    else:
        num = re.match(r"([\d\.]+)\sInches", sz).groups()[0]
        return float(num)


pd.unique([read_screen_size(row) for row in all_laptops])

array([13.3, 15.6, 14. , 17.3, 14.1, 11.6, 16.1, 16.6, 14.9, 16. , 15. ,
       16.2, 12.3, 14.2, 13.4, 13. , 10. , 12.5, 13.5, 15.2])

#### Display resolution

Next up is display resolution. 

In [13]:
# pd.unique(
#     [
#         get_prop(row, "specs", "Display Details", "Display Resolution")
#         for row in all_laptops
#     ]
# )

In [14]:
def read_resolution(row):
    res = get_prop(row, "specs", "Display Details", "Display Resolution")
    if res is None:
        return None
    else:
        nums = re.match(r"(\d+)\sx\s(\d+)", res)
        if nums is None:
            return None
        else:
            x = int(nums.groups()[0])
            y = int(nums.groups()[1])
            return (x, y)


pd.unique([read_resolution(r) for r in all_laptops])

array([(2560, 1600), (1920, 1080), (2160, 1440), (1366, 768),
       (2880, 1800), (2560, 1440), (1440, 900), (1920, 1200),
       (3456, 2234), (2736, 1824), (2256, 1504), (3000, 2000),
       (1800, 1200), (2240, 1400), (1920, 1280), (3840, 2160),
       (2496, 1664), (1600, 900), (3840, 2400), (3200, 1800), (1280, 800)],
      dtype=object)

#### Processor

In [15]:
# pd.unique([get_prop(r, "specs", "Performance", "Processor") for r in all_laptops])

The dominant brands seem to be Intel and AMD, besides Apple's own chips. We could bunch them together. 

In [16]:
def read_processor(row):
    proc = get_prop(row, "specs", "Performance", "Processor")
    if proc is None:
        return None
    proc = proc.lower()
    if "intel" in proc:
        return "Intel"
    elif "amd" in proc:
        return "AMD"
    elif "apple" in proc:
        return "Apple chip"
    else:
        return "Others"


pd.unique([read_processor(r) for r in all_laptops])

array(['Apple chip', 'AMD', 'Intel', 'Others'], dtype=object)

#### GPU

Whether a GPU is present or not is often a factor

In [17]:
pd.unique(
    [get_prop(row, "specs", "Performance", "Graphic Processor") for row in all_laptops]
)

array([None, 'NVIDIA GeForce GTX 1650', 'AMD Radeon', 'Intel UHD',
       'Intel Iris Xe Graphics', 'AMD Integrated',
       'NVIDIA GeForce RTX 3050', 'Intel Iris Xe', 'AMD Radeon RX5500M',
       'AMD Radeon RX 5500M', 'AMD Radeon RX Vega 10',
       'NVIDIA Geforce GTX 1650', 'AMD Radeon RX 6600M',
       'AMD Radeon Vega 6', 'Intel UHD 605', 'AMD Radeon Vega 8 Mobile',
       'MediaTek Integrated Graphics', 'NVIDIA GeForce GTX 1650 Max Q',
       'Intel UHD Graphics', 'Intel UHD 600', 'Intel Integrated',
       'NVIDIA GeForce GTX 1650 Ti', 'NVIDIA GeForce GTX 1650 MAX Q',
       'Intel Integrated UHD', 'Nvidia Geforce RTX 3050', 'Intel HD 500',
       'NVIDIA GeForce RTX 3060', 'AMD Radeon Graphics',
       'AMD Radeon Vega 8', 'PowerVR GX6250', 'Intel Integrated Graphics',
       'NVIDIA GeForce RTX 3050 Ti', 'NVIDIA GeForce RTX 3070',
       'NVIDIA GeForce RTX 2050', 'AMD Radeon R4', 'Intel Iris X',
       'NVIDIA GeForce RTX 1650', 'Intel HD 6000', 'Apple M2 GPU',
       'NVID

In [18]:
def read_gpu(row):
    gpu = get_prop(row, "specs", "Performance", "Graphic Processor")
    if gpu is None:
        return (None, None)
    gpu = gpu.lower()
    if "nvidia" in gpu or ("amd" in gpu and "integrated" not in gpu):
        sz = get_prop(row, "specs", "Performance", "Graphics Memory")
        brand = "NVIDIA" if "nvidia" in gpu else "AMD"
        if sz is not None:
            val = re.match(r"(\d+)", sz)
            if val is not None:
                sz = int(val.groups()[0])
        return (brand, sz)
    else:
        return ("Integrated", None)


pd.unique([read_gpu(r) for r in all_laptops])

array([(None, None), ('NVIDIA', 4), ('AMD', None), ('Integrated', None),
       ('AMD', 4), ('AMD', 8), ('NVIDIA', 6), ('NVIDIA', 8),
       ('NVIDIA', None), ('NVIDIA', 2), ('AMD', 12), ('NVIDIA', 3),
       ('AMD', 2)], dtype=object)

#### Battery

Battery life next. Lets look at values in the dataset for available numbers. 

In [19]:
pd.unique([get_prop(r, "specs", "Battery", "Battery Life") for r in all_laptops])

array(['15 Hrs', None, '7 Hrs', '6 Hrs', '4 Hrs', '11 Hrs', '12 Hrs',
       '10 Hrs', '8 Hrs', '9.45 Hrs', '9 Hrs', '7.15 Hrs', '13 Hrs',
       '9.30 Hrs', '9.3 Hrs', '14 Hrs', '3 Hrs', '18 Hrs', '9.5 Hrs',
       '5 Hrs', '9.29 Hrs', '10.8 Hrs', '20 Hrs', '7.5 Hrs', '11.5 Hrs',
       '9.15 Hrs', '8.5 Hrs', '21 Hrs', '12.5 Hrs', '6.5 Hrs', '10.5 Hrs',
       '6.7 Hrs', '13.5 Hrs', '8.45 Hrs', '17 Hrs', '11.1 Hrs', '19 Hrs',
       '16.5 Hrs', '8.4 Hrs', '5.5 Hrs', '15.5 Hrs', '25 Hrs'],
      dtype=object)

In some cases only the num of battery cells is mentioned. 

In [20]:
pd.unique([get_prop(r, "specs", "Battery", "Battery Cell") for r in all_laptops])

array([None, '3 Cell', '2 Cell', '4 Cell', '6 Cell', '8 Cell',
       '4 Cell Cell', '6 Cell Cell'], dtype=object)

To get a normalised score across all laptops, we could learn the relationship between the two.

In [21]:
import re

import altair as alt


def parse_num(s):
    match = re.match(r"(\d+(?:\.\d)?)", s)
    return float(match.groups()[0])


ds = []
for r in all_laptops:
    cells = get_prop(r, "specs", "Battery", "Battery Cell")
    life = get_prop(r, "specs", "Battery", "Battery Life")
    if cells is None or life is None:
        pass
    else:
        ds.append(
            {
                "cells": parse_num(cells),
                "life": parse_num(life),
            }
        )


alt.Chart(alt.Data(values=ds)).mark_point().encode(x="cells:Q", y="life:Q")

Clearly, the data is suspect. Macbooks also show up with 3 hr battery life which can't be true. 

I am just gonna make an arbitrary assignment. 

In [22]:
def read_battery(row):
    life = get_prop(row, "specs", "Battery", "Battery Life")
    cells = get_prop(row, "specs", "Battery", "Battery Cell")
    if life is not None:
        return parse_num(life)
    elif cells is not None:
        return 5 if parse_num(cells) < 4 else 10
    else:
        return None

In [23]:
pd.unique([read_battery(r) for r in all_laptops])

array([15.0, 5, 7.0, None, 6.0, 4.0, 11.0, 12.0, 10, 8.0, 9.4, 9.0, 7.1,
       13.0, 9.3, 14.0, 3.0, 18.0, 9.5, 9.2, 10.8, 20.0, 7.5, 11.5, 9.1,
       8.5, 21.0, 12.5, 6.5, 10.5, 6.7, 13.5, 8.4, 17.0, 11.1, 19.0, 16.5,
       5.5, 15.5, 25.0], dtype=object)

#### Warranty

Warranty seems like another important attribute. Let's read that. 

In [24]:
# pd.unique([get_prop(r, "specs", "Others", "Warranty") for r in all_laptops])

In [25]:
def read_warranty(row):
    warr = get_prop(row, "specs", "Others", "Warranty")
    if warr is None:
        return None
    else:
        val = re.match(r"(\d+)", warr)
        return int(val.groups()[0])


pd.unique([read_warranty(r) for r in all_laptops])

array([1, 2, None, 3], dtype=object)

#### Price

In [26]:
# pd.unique([get_prop(r, "price") for r in all_laptops])

In [27]:
def read_price(row):
    return get_prop(row, "price")


read_price(all_laptops[0])

90900

#### RAM

In [28]:
def read_ram(row):
    ram = get_prop(row, "specs", "Memory", "Capacity")
    if ram is None:
        return None
    else:
        return int(ram.split(" ")[0])


pd.unique([read_ram(r) for r in all_laptops])

array([ 8, 16,  4,  2, 32])

#### Storage

In [29]:
def _parse_mem(s):
    val = int(s.split(" ")[0])
    # some typos in data
    if "TB" in s and val < 10:
        return val * 1024
    else:
        return val


def read_storage(row):
    storage = get_prop(row, "specs", "Storage")
    ssd, hdd = None, None
    if storage is not None:
        if "SSD Capacity" in storage:
            ssd = _parse_mem(storage["SSD Capacity"])
        if "HDD Capacity" in storage:
            hdd = _parse_mem(storage["HDD Capacity"])
    return ssd, hdd


pd.unique([read_storage(r) for r in all_laptops])

array([(256, None), (512, None), (256, 1024), (None, 1024), (128, None),
       (64, None), (None, None), (1024, None), (512, 1024), (None, 500),
       (32, None), (16, None), (128, 1024), (None, 2048), (16, 1024),
       (None, 1), (2048, None), (None, 320)], dtype=object)

#### Colors

In [30]:
def read_colors(row):
    return tuple(get_prop(row, "specs", "General Information", "Colors").split(", "))


pd.unique([read_colors(r) for r in all_laptops])

array([('Gold',), ('Charcoal Black',), ('Jet black',),
       ('Transparent Silver',), ('Real Blue',), ('Black',),
       ('Natural Silver',), ('Shadow Black',), ('Black Plastic',),
       ('Blue',), ('Eclipse Gray',), ('Jet Black',), ('Shale Black',),
       ('Carbon Gray',), ('Indie Black',), ('Graphite Black',),
       ('Charcoal Gray',), ('Platinum Grey',), ('Starfall Grey',),
       ('Silver',), ('Space Grey',), ('Arctic Grey',), ('Slate Grey',),
       ('Noble Red',), ('Onyx Black',), ('Bonfire Black',),
       ('Cosmic Blue',), ('Performance blue',), ('Carbon Black',),
       ('Quiet Blue',), ('Pure Silver',), ('Performance Blue',),
       ('Indigo Blue',), ('Ash Grey',), ('Carbon Grey',),
       ('Hearty Gold',), ('Dark Grey',), ('Dreamy White',),
       ('Cobalt Blue',), ('Bespoke Black',),
       ('Shadow Black & Ultra Violet',), ('Dark Shadow Grey',),
       ('Dark Ash Silver',), ('Rose Gold',),
       ('Phantom Grey with speckles',), ('Volt Green',),
       ('Graphite Grey'

## Collect all the properties under a separate field

In [31]:
docs = []
for i, row in enumerate(all_laptops):
    gpu_make, gpu_memory = read_gpu(row)
    sdd, hdd = read_storage(row)
    doc = {
        "id": i,
        "warranty": read_warranty(row),
        "battery": read_battery(row),
        "ram": read_ram(row),
        "sdd": sdd,
        "hdd": hdd,
        "gpu_make": gpu_make,
        "gpu_memory": gpu_memory,
        "processor_make": read_processor(row),
        "display_size": read_screen_size(row),
        "resolution": read_resolution(row),
        "os": read_os(row),
        "weight": read_weight(row),
        "make": read_brand(row),
        "colors": read_colors(row),
        # pick out other attributes
        "price": read_price(row),
        "rating_value": row["rating_value"],
        "rating_count": row["rating_count"],
        "title": row["title"],
        "urls": {
            "specs": row["url_specs"],
            "img_small": row["url_img_small"],
            "img_large": row["url_img_large"],
        },
        "specs": row["specs"],
    }

    docs.append(doc)

In [32]:
len(docs)

1151

In [33]:
with open("data/list_laptop_attrs.jsonl", "w") as f:
    for doc in docs:
        f.write(json.dumps(doc) + "\n")

In [34]:
docs[0]

{'id': 0,
 'warranty': 1,
 'battery': 15.0,
 'ram': 8,
 'sdd': 256,
 'hdd': None,
 'gpu_make': None,
 'gpu_memory': None,
 'processor_make': 'Apple chip',
 'display_size': 13.3,
 'resolution': (2560, 1600),
 'os': 'MacOS',
 'weight': 1.29,
 'make': 'Apple',
 'colors': ('Gold',),
 'price': 90900,
 'rating_value': 4.7,
 'rating_count': 11912,
 'title': 'Apple MacBook Air M1 MGND3HN/A Ultrabook (Apple M1/8 GB/256 GB SSD/macOS Big Sur)',
 'urls': {'specs': 'https://www.91mobiles.com//apple-m1-mgnd3hn-a-apple-m1-8-gb-256-gb-macos-big-sur-laptop-price-in-india-141587#specifications',
  'img_small': 'https://www.91-img.com/pictures/laptops/apple/apple-m1-mgnd3hn-a-141587-v1-small-1.jpg?tr=q-80',
  'img_large': 'https://www.91-img.com/pictures/laptops/apple/apple-m1-mgnd3hn-a-141587-v1-large-1.jpg?tr=q-80'},
 'specs': {'General Information': {'Brand': 'Apple',
   'Model': 'M1 MGND3HN/A',
   'Dimensions(WxHxD)': '304.1 x 212.4 x 10.9 \xa0mm',
   'Weight': '1.29 Kg',
   'Colors': 'Gold',
   'Ope