In [1]:
from requests_html import HTMLSession
import requests
import pandas as pd
import re
import os

## GET PRODUCT'S META INFO FUNCTION

In [2]:
def get_products_tiki(html) -> list:
    """Extract the listing metadata of every product card found in ``html``.

    Args:
        html: parsed page exposing ``find(selector)``; each ``.product-item``
            element must expose ``attrs`` and ``find('a', first=True)``.

    Returns:
        One dict per product card, in page order, with the keys
        ``id``, ``title``, ``brand``, ``price`` and ``link``.

    Raises:
        KeyError: if a card lacks one of the ``data-*`` attributes or its
            first anchor has no ``href``.
    """
    def _to_record(card) -> dict:
        # Read the data-* attributes first, then the anchor's href.
        data = card.attrs
        item_id = data['data-id']
        name = data['data-title']
        cost = data['data-price']
        maker = data['data-brand']
        href = card.find('a', first=True).attrs['href']
        return {
            'id': item_id,
            # Only surrounding tab characters are trimmed from the title.
            'title': name.strip('\t'),
            'brand': maker,
            'price': cost,
            'link': href,
        }

    return [_to_record(card) for card in html.find('.product-item')]

def get_products_detail_tiki(html) -> dict:
    """Parse the ``#chi-tiet`` specification table of a product page.

    Each table row is identified by the ``rel`` attribute of its first
    ``td``; the value is the stripped text of the row's ``td.last`` cell.
    Only the fields listed in ``field_names`` below are kept, renamed to the
    output column names. Rows with no ``td``, no ``rel``, an unknown ``rel``
    or no ``td.last`` cell are skipped instead of raising.

    Args:
        html: parsed page exposing ``find(selector)``; rows must expose
            ``find(selector, first=True)``.

    Returns:
        Dict mapping output column name to value. ``price`` keeps digits
        only, ``ssd`` is a bool (whether the text mentions "ssd"), ``weight``
        has spaces and ASCII letters removed; other values are raw text.
    """
    # `rel` identifier on the page -> column name in the output.
    field_names = {
        'id': 'id',
        'title': 'name',
        'price': 'price',
        'brand': 'brand',
        'cpu': 'cpu',
        'chipset': 'type_cpu',
        'loai_o_dia': 'ssd',
        'ram': 'ram',
        'product_weight': 'weight',
        'card_man_hinh': 'graphic',
        'loai_pin': 'battery'
    }
    # Per-column normalisers; columns not listed keep the raw text.
    normalizers = {
        'price': lambda price: re.sub(r'\D+', '', price),
        'ssd': lambda ssd: 'ssd' in ssd.lower(),
        'weight': lambda weight: re.sub(r'[ a-zA-Z]', '', weight)
    }

    detail = {}
    for info in html.find('#chi-tiet tbody tr'):
        label_cell = info.find('td', first=True)
        if label_cell is None or 'rel' not in label_cell.attrs:
            continue

        field_name = label_cell.attrs['rel']
        # `rel` may arrive as a tuple of tokens; use the first one, if any.
        if isinstance(field_name, tuple):
            field_name = field_name[0] if field_name else None

        field_name_new = field_names.get(field_name)
        if field_name_new is None:
            continue

        value_cell = info.find('td.last', first=True)
        if value_cell is None:
            continue
        value = value_cell.text.strip()

        normalize = normalizers.get(field_name_new)
        detail[field_name_new] = normalize(value) if normalize is not None else value

    return detail

## CRAWL FUNCTION

In [4]:
def crawl_tiki_laptop(output: str):
    """Crawl the Tiki laptop listing page by page and save it as CSV.

    Pages are requested with an increasing ``page`` query parameter until a
    response is not OK, a page yields no products, or the number of collected
    products reaches the total parsed from the page's results-count element.

    Args:
        output: path of the CSV file to write, with the columns
            ``id``, ``title``, ``brand``, ``price`` and ``link``.
    """
    url = "https://tiki.vn/laptop/c8095"
    products = []
    page = 1
    total = 1

    # The context manager closes the session's connections when done.
    with HTMLSession() as session:
        while True:
            response = session.get(url, params={'page': page}, allow_redirects=True)
            if not response.ok:
                break

            # The results-count element may be absent; keep the last known total then.
            count_element = response.html.find('.product-box [name=results-count]', first=True)
            if count_element is not None:
                total_text_number = re.findall(r'\d+', count_element.text)
                if total_text_number:
                    total = int(total_text_number[0])

            page_products = get_products_tiki(response.html)
            if not page_products:
                # An empty page can never move us closer to `total`; stop
                # instead of requesting further pages forever.
                break

            products += page_products
            print(f"[TIKI] Crawled page {page}")
            page += 1
            if len(products) >= total:
                break

    df = pd.DataFrame(products, columns=['id', 'title', 'brand', 'price', 'link'])
    # NOTE: the DataFrame index is written as an unnamed first column, which
    # pd.read_csv later surfaces as 'Unnamed: 0'. Kept as-is so existing
    # files/readers keep the same layout.
    df.to_csv(output)
    
def crawl_tiki_laptop_detail(inp: str, out: str, limit: int = None):
    """Fetch the detail page of every product in ``inp`` and save merged rows.

    For each row of the input CSV the page at its ``link`` is downloaded and
    parsed with ``get_products_detail_tiki``; the row (without ``link``) and
    the parsed detail are merged, with detail values overriding row values on
    key collisions. If any step raises, crawling stops, the error is printed
    and the rows collected so far are still written.

    Args:
        inp: path of the CSV produced by the listing crawl (needs the
            ``link`` and ``title`` columns).
        out: path of the CSV file to write (no index column).
        limit: maximum number of products to crawl; ``None`` crawls all.
    """
    products = pd.read_csv(inp)
    products_detail = []

    # The context manager closes the session's connections when done.
    with HTMLSession() as session:
        try:
            for idx, row in products.iterrows():
                # Check before fetching so exactly `limit` products are crawled.
                if limit is not None and len(products_detail) >= limit:
                    break

                record = row.to_dict()
                link = record.pop('link')
                resp = session.get(link)
                detail = get_products_detail_tiki(resp.html)

                products_detail.append({
                    **record,
                    **detail
                })

                print(f"Crawled detail {idx}: {record['title']}")
        except Exception as e:
            # Format the exception; concatenating it to a str raises TypeError
            # and would prevent the partial results from being saved.
            print(f'Error: {e}')

    df = pd.DataFrame(products_detail)
    df.to_csv(out, index=False)
    print('DONE')
    

## BEGIN CRAWL

In [6]:
# Crawl the laptop listing pages into tiki_meta.csv.
crawl_tiki_laptop(output='tiki_meta.csv')
print('Done')

[TIKI] Crawled page 1
[TIKI] Crawled page 2
[TIKI] Crawled page 3
[TIKI] Crawled page 4
[TIKI] Crawled page 5
[TIKI] Crawled page 6
[TIKI] Crawled page 7
[TIKI] Crawled page 8
[TIKI] Crawled page 9
[TIKI] Crawled page 10
[TIKI] Crawled page 11
[TIKI] Crawled page 12
[TIKI] Crawled page 13
[TIKI] Crawled page 14
[TIKI] Crawled page 15
[TIKI] Crawled page 16
[TIKI] Crawled page 17
[TIKI] Crawled page 18
[TIKI] Crawled page 19
[TIKI] Crawled page 20
[TIKI] Crawled page 21
Done


In [7]:
# Enrich every product listed in tiki_meta.csv with its detail-page fields.
crawl_tiki_laptop_detail(inp='tiki_meta.csv', out='tiki_detail.csv')

Crawled detail 0: Laptop Asus ASUSPRO B9440UA-GV0488T Core i7-8550U/ Win10 (14
Crawled detail 1: Laptop Dell Inspiron 3493 N4I5136W S -Black I5 1035G1 8GB 256GB SDD 14FHD Win 10 - Hàng Chính Hãng 
Crawled detail 2: Laptop Asus ASUSPRO B9440UA-GV0495T Core i5-8250U/ Win10 (14
Crawled detail 3: Laptop Lenovo Legion Y530-15ICH 81FV008LVN Core i7-8750H/Win10 (15.6 inch) - Black - Hàng Chính Hãng
Crawled detail 4: Apple Macbook Air 2019 - 13 inchs (i5/ 8GB/ 128GB) - Hàng Nhập Khẩu Chính Hãng
Crawled detail 5: Macbook Air 2017 MQD32 (13.3 inch) - Hàng Chính Hãng
Crawled detail 6: Apple Macbook Pro Touch Bar 2019 - 13 inchs (i5/ 8GB/ 128GB) - Hàng Nhập Khẩu Chính Hãng
Crawled detail 7: Laptop Asus Vivobook 14 A412FA-EK224T Core i5-8265U/ Win10 (14 FHD) - Hàng Chính Hãng
Crawled detail 8: Laptop HP ENVY 13-AQ0026TU (Intel Core I5-8265U  8GB RAM DDR4  256GB SSD  13,3
Crawled detail 9: The New Macbook 2017 (12 inch) Core M3/ 256GB - Hàng Chính Hãng
Crawled detail 10: Laptop Xiaomi Mi Air JYU4063

## GET RATING

Ratings are fetched from the Tiki reviews API: `https://tiki.vn/api/v2/reviews?product_id=<id>`

Response:

```json
{
    "rating_average": "float",
    "reviews_count": "int",
    "...": "..."
}
```

In [22]:
def get_tiki_rating(inp: str = 'tiki_detail.csv', out: str = 'tiki_detail_with_rating.csv'):
    """Add review statistics from the Tiki reviews API to each product row.

    Reads the products from ``inp``, requests
    ``https://tiki.vn/api/v2/reviews?product_id=<id>`` for each of them and
    stores the response's ``rating_average`` and ``reviews_count`` on the
    row. Rows without an id are ignored; rows whose request or JSON decoding
    fails are reported and left out of the output.

    Args:
        inp: path of the CSV to read (needs an ``id`` column).
        out: path of the CSV to write (no index column).
    """
    data: pd.DataFrame = pd.read_csv(inp)
    data_with_rating = []

    for _, row in data.iterrows():
        product = row.to_dict()
        product_id = product.get('id')
        # read_csv represents a missing id as NaN, not None.
        if product_id is None or pd.isna(product_id):
            print(f'IGNORE {product_id}')
            continue
        # A column containing NaN is parsed as float; avoid sending "123.0".
        if isinstance(product_id, float) and product_id.is_integer():
            product_id = int(product_id)
        try:
            resp = requests.get(f'https://tiki.vn/api/v2/reviews?product_id={product_id}')
            payload = resp.json()

            product.update({
                'rating_average': payload.get('rating_average'),
                'reviews_count': payload.get('reviews_count')
            })
            data_with_rating.append(product)
            print(f'DONE {product_id}')

        except Exception as e:
            print(f"Error {product_id}: {str(e)}")

    df_with_rating: pd.DataFrame = pd.DataFrame(data_with_rating)
    df_with_rating.to_csv(out, index=False)

In [23]:
# Reads tiki_detail.csv and writes tiki_detail_with_rating.csv, printing one status line per product.
get_tiki_rating()

DONE 11011566
DONE 44146391
DONE 11007488
DONE 3080395
DONE 23264100
DONE 721995
DONE 23264373
DONE 17602293
DONE 22563635
DONE 2665113
DONE 2546955
DONE 25652461
DONE 2471077
DONE 19569288
DONE 20955090
DONE 21922160
DONE 5465955
DONE 5463387
DONE 23264107
DONE 9669826
DONE 23264368
DONE 11501789
DONE 20955084
DONE 32438329
DONE 40495525
DONE 6086513
DONE 10672462
DONE 24351715
DONE 1747305
DONE 4374913
DONE 4332521
DONE 2884733
DONE 32349095
DONE 21922154
DONE 26421283
DONE 7093043
DONE 10225512
DONE 39940256
DONE 6088421
DONE 4274393
DONE 3529775
DONE 29253421
DONE 2855321
DONE 2882223
DONE 39057428
DONE 4273159
DONE 10216162
DONE 32340626
DONE 19695390
DONE 3252931
DONE 34471728
DONE 7393319
DONE 13986891
DONE 38455224
DONE 15073948
DONE 6077101
DONE 24351539
DONE 11196536
DONE 15073946
DONE 7376963
DONE 43035421
DONE 42704936
DONE 4281519
DONE 4279807
DONE 2071717
DONE 15073942
DONE 8009524
DONE 24351545
DONE 29253431
DONE 14013514
DONE 20955082
DONE 21009368
DONE 15073960
DONE 26