In [2]:
from datetime import datetime
from typing import Any, Optional

import requests
from pydantic import BaseModel

class RawProduct(BaseModel):
    """Flat record describing one scraped product listing.

    The declaration order of the fields matters: the CSV writer in this
    notebook takes its column names from ``RawProduct.model_fields``, so
    reordering fields reorders the CSV columns.

    Fields annotated ``X | None`` without a default may hold ``None`` but
    must still be passed explicitly when constructing the model.
    """
    # Source marketplace; defaults to 'tiki'.
    platform: str = 'tiki'
    id: int | None
    name: str | None
    sale: int | None
    price: float | None
    shop_name: str | None
    shop_id: int | None
    # Annotated as nullable so that the ``None`` default matches the declared
    # type and an explicit ``unite_id=None`` passes validation.
    unite_id: int | None = None
    brand_name: str | None
    # Stored as a float; the scraper in this notebook passes a POSIX timestamp.
    timestamp: float | None

In [3]:
#! SETUP SESSION

# Default headers sent with every request made through the shared session.
# The values describe a desktop Chrome 129 browser on Windows.
# NOTE(review): 'accept-encoding' advertises br and zstd; confirm the installed
# requests/urllib3 stack can actually decode those encodings.
browser_headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-encoding': 'gzip, deflate, br, zstd',
    'accept-language': 'en-US,en;q=0.9,vi-VN;q=0.8,vi;q=0.7,fr;q=0.6',
    'cache-control': 'max-age=0',
    'priority': 'u=0, i',
    'sec-ch-ua': '"Google Chrome";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'none',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
}

session = requests.Session()
session.headers.update(browser_headers)

In [4]:
#! DEFINING SCRAPING FUNCTIONS
import json
import csv

# CSV output for the scraped product rows.
# The encoding is set explicitly: without it open() uses the platform's
# locale encoding, and together with errors="ignore" any character that the
# locale encoding cannot represent would be dropped silently.
fout = open('data.csv', mode='w', encoding='utf-8', errors="ignore", newline='')
# Column names (and their order) come from the RawProduct field declarations.
writer = csv.DictWriter(fout, fieldnames=list(RawProduct.model_fields))
# Log file that receives the raw API responses as JSON text.
log = open('data.json', mode='w+', encoding='utf-8')

def init_data():
    """Write the CSV header row using the module-level ``writer``."""
    writer.writeheader()
    
def append_to_log(data, stream=None):
    """Append ``data`` to the log as pretty-printed JSON.

    Args:
        data: Any JSON-serialisable object.
        stream: Optional writable text stream. When omitted, the
            module-level ``log`` file is used.

    Each call writes one JSON document followed by a newline, so
    consecutive documents are separated instead of being fused together.
    The stream is only written to; its existing content is never re-read,
    which avoids duplicating earlier entries.
    """
    target = log if stream is None else stream
    target.write(json.dumps(data, indent=4) + "\n")

def _to_raw_product(product):
    """Convert one product dict from the API response into a RawProduct dump."""
    # The sold quantity is nested two levels deep; treat a missing or null
    # level as "no data" instead of raising KeyError.
    impression = product.get('visible_impression_info') or {}
    amplitude = impression.get('amplitude') or {}
    return RawProduct(
        id=product.get('id'),
        # NOTE(review): `name` is filled from 'url_key', not from 'name' --
        # confirm this is intentional.
        name=product.get('url_key'),
        sale=amplitude.get('all_time_quantity_sold'),
        price=product.get('price'),
        shop_name=product.get('seller_name'),
        shop_id=product.get('seller_id'),
        brand_name=product.get('brand_name', 'NaN'),
        timestamp=datetime.now().timestamp(),
    ).model_dump()


def scrape_at_url(url, timeout=30):
    """Fetch one listing page and return its products as plain dicts.

    Args:
        url: Full URL of the listing API page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        A list of ``RawProduct.model_dump()`` dicts, one per product that
        could be converted. A product that fails conversion is printed
        together with the error and skipped; the other products on the
        page are still returned.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    response = session.get(url=url, timeout=timeout)
    response.raise_for_status()
    json_content = response.json()
    append_to_log(json_content)

    products_ = []
    # A missing or null 'data' entry is treated as an empty page.
    for product in json_content.get('data') or []:
        try:
            products_.append(_to_raw_product(product))
        except Exception as e:
            print(product)
            print(str(e))
    return products_
    
def export_to_file(products: list[dict[str, Any]]):
    """Write the given product rows to the CSV file via the module-level ``writer``.

    Args:
        products: Row dicts keyed by the CSV column names. An empty list
            writes nothing.
    """
    writer.writerows(products)
    
def close_files():
    """Close the JSON log and then the CSV output file."""
    for handle in (log, fout):
        handle.close()
    
# Write the CSV header row once, before any product rows are exported.
init_data()

In [5]:
#! GO SCRAPING

# baseURL = "https://tiki.vn/api/v2/products?limit=40&aggregations=2&trackity_id=55e38373-3402-501c-3da5-7a4f96edd60d&q=niveamen&page="
# prods = scrape_at_url(baseURL+str(1))
# Listing API URL without the page number, which is appended per request.
baseURL = "https://tiki.vn/api/v2/products?limit=40&aggregations=2&trackity_id=55e38373-3402-501c-3da5-7a4f96edd60d&q=niveamen&page="

# Accumulates the rows of every page (each page is also written to the CSV).
products = []
try:
    for i in range(1, 12):  # pages 1 through 11
        page_products = scrape_at_url(baseURL + str(i))
        export_to_file(page_products)
        products.extend(page_products)
finally:
    # Close the output files even if a page fails, so buffered rows are
    # flushed and the file handles are not leaked.
    close_files()

##### Extracting fields
- `id`
- `name`
- `url_key`
- `seller_id`
- `seller_name`
- `brand_id`
- `brand_name`
- `price`
- `visible_impression_info` → `amplitude` → `seller_type`
- `visible_impression_info` → `amplitude` → `all_time_quantity_sold`

##### Aggregate object
- `sold_quantity`
- `total_sale`

In [6]:
#! CLEANSE DATA
#: filter out products with `nivea`, `men`, `nam`
#: create an aggregate object