In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json
import re
from datetime import datetime
import os
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from requests.exceptions import InvalidURL
from unidecode import unidecode

In [2]:
def extract_link(base_url, max_pages=12, timeout=30):
    """Collect product-page links from a paginated category listing.

    The listing is requested once per page with the query parameter ``p``
    set to 1, 2, ..., ``max_pages``. From every page the ``href`` of each
    anchor tag matched by ``class_='v3_thumb_common_sp relative'`` is kept.

    Args:
        base_url: URL of the category listing.
        max_pages: Number of listing pages to request. Defaults to 12, the
            value that used to be hard-coded.
        timeout: Seconds to wait for each response; without it a stalled
            connection would block the notebook indefinitely.

    Returns:
        list: hrefs in page order. Duplicates are kept (callers de-duplicate
        themselves); anchors that carry no ``href`` are skipped.

    Raises:
        requests.exceptions.RequestException: on connection errors or
            timeouts (propagated from requests).
    """
    # Headers for request
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"}
    all_links = []

    # One session for all pages so the connection is reused and closed at the end.
    with requests.Session() as session:
        for page in range(1, max_pages + 1):
            # `params` lets requests build the query string, so a base_url
            # that already contains "?" still yields a valid URL.
            response = session.get(
                base_url,
                params={'p': page},
                headers=headers,
                timeout=timeout,
            )
            soup = BeautifulSoup(response.text, 'html.parser')

            a_tags = soup.find_all('a', class_='v3_thumb_common_sp relative')
            hrefs = (tag.get('href') for tag in a_tags)
            # tag.get() returns None for anchors without href; a None link
            # would make the later page download fail.
            all_links.extend(href for href in hrefs if href)

    return all_links

In [3]:
def _parse_vnd(text):
    """Return the digits of a price string such as '150.000 ₫' as an int, or None if it has no digits."""
    digits = re.sub(r'\D', '', text or '')
    return int(digits) if digits else None


def _stripped_text(element):
    """Return the element's text without surrounding whitespace, or None when the element was not found."""
    return element.text.strip() if element is not None else None


def _first_json_ld(soup):
    """Return the page's first JSON-LD payload as a dict, or {} when it is missing, empty, invalid or not an object."""
    script = soup.find('script', type='application/ld+json')
    raw = script.string if script is not None else None
    if not raw:
        return {}
    try:
        payload = json.loads(raw)
    except json.JSONDecodeError:
        return {}
    return payload if isinstance(payload, dict) else {}


def _first_image(image):
    """Reduce a JSON-LD ``image`` value (string, list, or object with ``url``) to a single value, or None."""
    if isinstance(image, (list, tuple)):
        image = image[0] if image else None
    if isinstance(image, dict):
        image = image.get('url')
    return image or None


def _parse_product_page(soup, url):
    """Build the product record for one parsed product page."""
    # Key order is significant: rows are appended to a header-less CSV later on.
    product_data = {
        "url": url,
        "cate": None,
        "listing_price": None,
        "price": None,
        "english_sku_name": None,
        "sku_name": None,
        "img_url": None,
        "sku_code": None,
        "rating": None,
        "rating_count": None,
        "status_text": "Status not found",
        "gift": None,
        "barcode": None,
        "brand": None,
        "brand_origin": None,
        "factory": None,
        "skin_type": None,
        "feature": None,
        "ingredients": None
    }

    # Breadcrumb names; an empty list when the page has none.
    product_data['cate'] = [
        element.get_text(strip=True)
        for element in soup.select('li span[itemprop="name"]')
    ]

    # Prices: keep only the digits so separators, currency sign and any
    # non-breaking space cannot make int() fail.
    price_element = soup.find('span', id='product-final_price')
    if price_element is not None:
        product_data['price'] = _parse_vnd(price_element.text)

    listing_element = soup.find('span', id='market_price')
    if listing_element is not None:
        product_data['listing_price'] = _parse_vnd(listing_element.text)

    product_data['english_sku_name'] = _stripped_text(
        soup.select_one('.page-title.english_name_detail .base')
    )

    # JSON-LD fields are read independently so one missing key does not
    # blank out the others.
    product_info = _first_json_ld(soup)
    product_data['sku_name'] = product_info.get('name')
    product_data['img_url'] = _first_image(product_info.get('image'))
    product_data['sku_code'] = product_info.get('sku')
    aggregate = product_info.get('aggregateRating')
    if isinstance(aggregate, dict):
        product_data['rating'] = aggregate.get('ratingValue')
        product_data['rating_count'] = aggregate.get('reviewCount')

    status = soup.select_one('.block_add_to_cart_nav .btn_site_3, .block_add_to_cart_nav .btnOutOfStock')
    if status is not None:
        product_data['status_text'] = status.text.strip()

    product_data['gift'] = _stripped_text(soup.select_one('.title_gif_detail span'))

    # Two-column specification table -> {label: value}
    product_details = {}
    for row in soup.select('.tb_info_sanpham tr'):
        cells = row.find_all('td')
        if len(cells) >= 2:
            product_details[cells[0].text.strip()] = cells[1].text.strip()

    product_data['barcode'] = product_details.get('Barcode')
    product_data['brand'] = product_details.get('Thương Hiệu')
    product_data['brand_origin'] = product_details.get('Xuất xứ thương hiệu')
    product_data['factory'] = product_details.get('Nơi sản xuất')
    product_data['skin_type'] = product_details.get('Loại da')
    product_data['feature'] = product_details.get('Đặc tính')

    ingredients_div = soup.select_one('#box_thanhphanchinh .ct_box_detail')
    if ingredients_div is not None:
        product_data['ingredients'] = ingredients_div.get_text(strip=True)

    return product_data


def extract_html(all_links, start, end, timeout=30):
    """Download and parse the product pages ``all_links[start:end]``.

    Args:
        all_links: List of product-page URLs.
        start: Slice start index into ``all_links``.
        end: Slice end index (exclusive); may exceed ``len(all_links)``.
        timeout: Seconds to wait for each response.

    Returns:
        list[dict]: one record per downloaded page, with the keys ``url``,
        ``cate``, ``listing_price``, ``price``, ``english_sku_name``,
        ``sku_name``, ``img_url``, ``sku_code``, ``rating``,
        ``rating_count``, ``status_text``, ``gift``, ``barcode``, ``brand``,
        ``brand_origin``, ``factory``, ``skin_type``, ``feature`` and
        ``ingredients``. Fields that cannot be found stay ``None``
        (``status_text`` stays ``'Status not found'``). Returns ``[]``
        without any network setup when the slice is empty. Empty or ``None``
        entries in the slice are skipped.

    Raises:
        requests.exceptions.RequestException: if a page still cannot be
            fetched after the configured retries, or the request times out.
    """
    batch = [url for url in all_links[start:end] if url]
    if not batch:
        return []

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'}

    # Retry configuration is the same for every URL, so build it once.
    retry_strategy = Retry(
        total=20,  # Total number of retries
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["HEAD", "GET", "OPTIONS"]
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)

    data = []
    # A single session for the whole batch; the context manager closes it.
    with requests.Session() as http:
        http.mount("https://", adapter)
        http.mount("http://", adapter)

        for url in batch:
            response = http.get(url, headers=headers, timeout=timeout)
            # errors='replace' keeps one stray byte from aborting the batch.
            html = response.content.decode('utf-8', errors='replace')
            soup = BeautifulSoup(html, 'html.parser')
            data.append(_parse_product_page(soup, url))

    return data

In [4]:
def extract_data(data, file_path='kemchongnang_taytrang_hasaki.csv'):
    """Turn scraped records into a dated DataFrame and append it to a CSV.

    Args:
        data: List of product dicts (one per row).
        file_path: CSV to write. Defaults to the file name that used to be
            hard-coded.

    Returns:
        pandas.DataFrame: the records plus a ``date`` column holding today's
        date at midnight.

    Side effects:
        Writes ``file_path``. If the file already has content the rows are
        appended without a header; otherwise the file is created (or
        overwritten when it is zero bytes) with a header. Nothing is written
        when ``data`` is empty, because a file whose only header is ``date``
        would misalign every later header-less append.
    """
    df = pd.DataFrame(data)
    df['date'] = pd.Timestamp.now().normalize()

    if df.empty:
        return df

    # A zero-byte file exists but still needs its header row.
    has_rows = os.path.exists(file_path) and os.path.getsize(file_path) > 0
    df.to_csv(
        file_path,
        mode='a' if has_rows else 'w',
        index=False,
        header=not has_rows,
        encoding='utf-8-sig',
    )

    return df

# 1. Get all_links

In [5]:
# Seed the link list from the first category listing (chong-nang-da-mat).
all_links = list(extract_link('https://hasaki.vn/danh-muc/chong-nang-da-mat-c11.html'))

In [6]:
# Add the links of the second category listing (tay-trang-mat) in place.
all_links += extract_link('https://hasaki.vn/danh-muc/tay-trang-mat-c48.html')

In [7]:
# Number of links gathered from both category listings, before de-duplication.
len(all_links)

574

In [8]:
# Drop repeated URLs; dict keys keep the first-seen order of the links.
all_links = list({link: None for link in all_links})
# Display how many distinct links remain.
len(all_links)

553

# 2. Extract data

In [9]:
# Scrape every product page in fixed-size batches.
# The range is bounded by len(all_links), so no empty trailing batches are
# requested, and `data` grows batch by batch so that pages already scraped
# remain available if a later batch raises.
BATCH_SIZE = 25

data = []
for batch_start in range(0, len(all_links), BATCH_SIZE):
    batch_end = min(batch_start + BATCH_SIZE, len(all_links))
    data.extend(extract_html(all_links, batch_start, batch_end))
    # Progress marker: the slice that has just been processed.
    print(batch_start, batch_end)


# 3. Export data

In [43]:
# Build the DataFrame, stamp it with today's date and write it to the CSV.
# Note: extract_data appends when the CSV already exists, so re-running this
# cell adds the same rows to the file again.
df = extract_data(data)
# Summary statistics as a quick sanity check of the scraped values.
df.describe()

Unnamed: 0,listing_price,price,rating,rating_count,date
count,520.0,553.0,553.0,553.0,553
mean,455591.5,331316.5,2.801447,21.180832,2024-11-27 00:00:00
min,14000.0,13000.0,0.0,0.0,2024-11-27 00:00:00
25%,229000.0,157000.0,0.0,0.0,2024-11-27 00:00:00
50%,415500.0,296000.0,4.3,2.0,2024-11-27 00:00:00
75%,586000.0,439000.0,4.8,12.0,2024-11-27 00:00:00
max,3360000.0,2900000.0,5.0,294.0,2024-11-27 00:00:00
std,307109.4,247375.0,2.29758,54.132184,
