In [1]:
import os
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup

class RozetkaDescriptionParser:
    """Scrape product descriptions from a paginated category listing.

    The parser drives a headless Chrome instance: it loads listing pages of
    ``category_url`` one by one, collects product links from them, then opens
    each product page and extracts its title plus description text.

    Scraping state (current page number, remaining links on the page and the
    counters) lives on the instance, so successive ``parse`` calls continue
    where the previous one stopped.

    The instance owns a browser process; call ``close()`` when done or use the
    parser as a context manager (``with RozetkaDescriptionParser(url) as p:``).
    """

    # CSS selectors tried in order until one of them matches description nodes.
    _DSCR_SELECTORS = (
        'div.product-about__description p',
        'div.text.rich-content div.rz-sec_text',
        'div.text.rich-content div.rz-section__subtitle',
    )

    def __init__(self, category_url, data_dir=None):
        """Start a headless browser and reset the scraping state.

        Args:
            category_url: URL of the category listing to walk through.
            data_dir: Optional output directory. It is only stored on the
                instance; no method of this class reads it. Passing it
                explicitly replaces the previous lookup of a notebook-level
                global, which failed on a fresh kernel.
        """
        # Init browser
        options = webdriver.ChromeOptions()
        options.add_argument('headless')
        options.add_argument('--lang=uk-UA')
        self.browser = webdriver.Chrome(options=options)

        self.category_url = category_url
        self.data_dir = data_dir
        self.page_count = 0                 # number of the last listing page requested
        self.product_hrefs_iter = iter([])  # product links not yet visited on that page
        self.total_dscr_count = 0           # descriptions extracted successfully so far
        self.page_dscr_count = 0            # position of the current product on its page

    def __enter__(self):
        """Return the parser itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Quit the browser when the ``with`` block ends; never suppress errors."""
        self.close()
        return False

    def close(self):
        """Quit the browser process. Safe to call more than once."""
        browser = getattr(self, 'browser', None)
        if browser is not None:
            browser.quit()
            self.browser = None

    def parse(self, num_dscr):
        """Yield up to ``num_dscr`` product descriptions.

        Products whose description cannot be extracted are reported on stdout
        and skipped; they do not count towards ``num_dscr``. The generator
        stops early when a freshly loaded listing page contains no product
        links (taken to mean the end of the category).

        Args:
            num_dscr: Maximum number of descriptions to yield.

        Yields:
            str: Product title followed by its description text.
        """
        exhausted = object()  # sentinel: distinguishes "no more links" from any real value
        yielded = 0
        while yielded < num_dscr:
            # Try to get the next product on the current page
            product_href = next(self.product_hrefs_iter, exhausted)
            if product_href is exhausted:
                # No products left on the current page, load the next one
                self.page_count += 1
                self.update_product_hrefs()
                product_href = next(self.product_hrefs_iter, exhausted)
                if product_href is exhausted:
                    # An empty listing page: stop cleanly. Letting StopIteration
                    # escape here would surface as RuntimeError (PEP 479).
                    return
                self.page_dscr_count = 1
            else:
                self.page_dscr_count += 1

            # Extract description
            soup = self._browser_get_soup(product_href)
            try:
                dscr = self._extract_dscr_from_prod(soup)
            except Exception:
                # Best-effort scraping: a page that cannot be parsed is skipped.
                # `Exception` (not a bare except) keeps KeyboardInterrupt and
                # GeneratorExit propagating.
                print(f'   Skipped product {self.total_dscr_count} {product_href} ({self.page_dscr_count} on page)')
                continue

            self.total_dscr_count += 1
            print(f'   Description for product {self.total_dscr_count} ({self.page_dscr_count} on page)')
            yielded += 1
            # The yield is deliberately outside the try block so that closing
            # the generator is not mistaken for a parsing failure.
            yield dscr

    def update_product_hrefs(self):
        """Load listing page number ``self.page_count`` and collect its product links."""
        # Normalise the trailing slash so the URL is well-formed whether or not
        # category_url ends with '/'.
        page_url = self.category_url.rstrip('/') + f'/page={self.page_count}/'
        soup = self._browser_get_soup(page_url)
        print('Updated product page', self.page_count)
        # Anchors without a usable href are ignored instead of raising KeyError.
        anchors = soup.select('a.goods-tile__heading')
        self.product_hrefs_iter = iter([a.get('href') for a in anchors if a.get('href')])

    def _browser_get_soup(self, query):
        """Open ``query`` in the browser and return its current HTML as BeautifulSoup."""
        self.browser.get(query)
        # NOTE(review): page_source is read right after get(); content rendered
        # later by scripts may be missing - confirm whether an explicit wait is needed.
        html = self.browser.page_source
        soup = BeautifulSoup(html, 'html.parser')
        return soup

    def _extract_dscr_from_prod(self, soup):
        """Build "<title>. <description>" from a parsed product page.

        Args:
            soup: Parsed product page supporting ``select_one`` and ``select``.

        Returns:
            str: Stripped title, then '. ', then the description fragments
            joined with single spaces.

        Raises:
            ValueError: If the title element or every known description
                container is missing from the page.
        """
        title_node = soup.select_one('.product__title')
        if title_node is None:
            raise ValueError('product title not found')
        prod_name = title_node.get_text(separator=' ').strip()

        for selector in self._DSCR_SELECTORS:
            prod_dscr_paras = soup.select(selector)
            if prod_dscr_paras:
                break
        else:
            # An explicit exception instead of `assert False`, which is
            # stripped when Python runs with -O.
            raise ValueError('product description not found')

        prod_dscr = [p.get_text(separator=' ').strip() for p in prod_dscr_paras]
        full_dscr = prod_name + '. ' + ' '.join(prod_dscr)
        return full_dscr

In [4]:
# Category listing to scrape and the folder that receives one .txt file per description.
category_url = 'https://rozetka.com.ua/ua/tablets/c130309/'
data_dir = os.path.join('.', 'data', 'dscr_tablets')
# Create the output folder up front: open() in write mode does not create
# missing directories, so the export loop would fail on a fresh checkout.
os.makedirs(data_dir, exist_ok=True)
parser = RozetkaDescriptionParser(category_url)

In [5]:
# Save every scraped description into its own numbered UTF-8 text file.
for dscr in parser.parse(400):
    # The file name is the running count of successfully parsed descriptions.
    out_name = f'{parser.total_dscr_count}.txt'
    out_path = os.path.join(data_dir, out_name)
    with open(out_path, 'w+', encoding='UTF-8') as file:
        file.write(dscr)

Updated product page 1
   Description for product 1 (1 on page)
   Description for product 2 (2 on page)
   Description for product 3 (3 on page)
   Description for product 4 (4 on page)
   Description for product 5 (5 on page)
   Description for product 6 (6 on page)
   Description for product 7 (7 on page)
   Description for product 8 (8 on page)
   Description for product 9 (9 on page)
   Description for product 10 (10 on page)
   Description for product 11 (11 on page)
   Description for product 12 (12 on page)
   Description for product 13 (13 on page)
   Description for product 14 (14 on page)
   Description for product 15 (15 on page)
   Description for product 16 (16 on page)
   Description for product 17 (17 on page)
   Description for product 18 (18 on page)
   Description for product 19 (19 on page)
   Description for product 20 (20 on page)
   Description for product 21 (21 on page)
   Description for product 22 (22 on page)
   Description for product 23 (23 on page)
   Des

   Skipped product 143 https://rozetka.com.ua/ua/planshet---telefon-hoozo-mt232-gold-chekhol-klaviatura-karta-pamyati-64gb/g19789890/ (46 on page)
   Description for product 144 (47 on page)
   Description for product 145 (48 on page)
   Description for product 146 (49 on page)
   Skipped product 146 https://rozetka.com.ua/ua/planshet-evromedia-play-pad-3g-2goo/g15575202/ (50 on page)
   Skipped product 146 https://rozetka.com.ua/ua/139487875/p139487875/ (51 on page)
   Description for product 147 (52 on page)
   Description for product 148 (53 on page)
   Description for product 149 (54 on page)
   Description for product 150 (55 on page)
   Description for product 151 (56 on page)
   Description for product 152 (57 on page)
   Description for product 153 (58 on page)
   Description for product 154 (59 on page)
   Skipped product 154 https://rozetka.com.ua/ua/planshet-telefon-hoozo-mt116-2gb-ram-black-chekhol-klaviatura-karta-pamyati-32gb/g19794433/ (60 on page)
Updated product page 4

   Description for product 237 (4 on page)
   Description for product 238 (5 on page)
   Description for product 239 (6 on page)
   Description for product 240 (7 on page)
   Description for product 241 (8 on page)
   Skipped product 241 https://rozetka.com.ua/ua/240054007/p240054007/ (9 on page)
   Skipped product 241 https://rozetka.com.ua/ua/hoozo-x1001-lite-full-hd-32gb-lte/g30572191/ (10 on page)
   Description for product 242 (11 on page)
   Description for product 243 (12 on page)
   Skipped product 243 https://rozetka.com.ua/ua/160667956/p160667956/ (13 on page)
   Description for product 244 (14 on page)
   Description for product 245 (15 on page)
   Description for product 246 (16 on page)
   Description for product 247 (17 on page)
   Skipped product 247 https://rozetka.com.ua/ua/151502849/p151502849/ (18 on page)
   Skipped product 247 https://rozetka.com.ua/ua/planshet-hoozo-mt232/g12609074/ (19 on page)
   Description for product 248 (20 on page)
   Skipped product 248 ht

   Skipped product 319 https://rozetka.com.ua/ua/231273157/p231273157/ (20 on page)
   Description for product 320 (21 on page)
   Description for product 321 (22 on page)
   Description for product 322 (23 on page)
   Skipped product 322 https://rozetka.com.ua/ua/68845725/p68845725/ (24 on page)
   Description for product 323 (25 on page)
   Skipped product 323 https://rozetka.com.ua/ua/219757507/p219757507/ (26 on page)
   Skipped product 323 https://rozetka.com.ua/ua/planshet-hoozo-mt232-silver-chekhol-knizhka-karta-pamyati-32gb/g19790023/ (27 on page)
   Description for product 324 (28 on page)
   Skipped product 324 https://rozetka.com.ua/ua/219758407/p219758407/ (29 on page)
   Description for product 325 (30 on page)
   Description for product 326 (31 on page)
   Description for product 327 (32 on page)
   Skipped product 327 https://rozetka.com.ua/ua/139488022/p139488022/ (33 on page)
   Description for product 328 (34 on page)
   Skipped product 328 https://rozetka.com.ua/ua/2

In [173]:
# Scratch cell (disabled): manual check of the title and description selectors
# on a single product page, using a visible (non-headless) browser window.
# options = webdriver.ChromeOptions()
# # options.add_argument('headless')
# browser = webdriver.Chrome(options=options)
# browser.set_window_size(1920, 1080)
# browser.get('https://rozetka.com.ua/ua/oppo_a91_128gb_blue/p208181857/')
# html = browser.page_source
# soup = BeautifulSoup(html, 'html.parser')

# soup.select_one('.product__title').get_text(separator=' ').strip()
# soup.select_one('div.text.rich-content div.rz-sec_text').get_text(separator=' ')