So, in this file I was trying to grab information from the website www.dermstore.com for skin-care segment. I've used the scrapy library, manual methods for scrapping main webpages.

Here is the libraries for Spider with manual methods

In [1]:
import pandas as pd
import scrapy
from scrapy.crawler import CrawlerProcess
import requests
import sidetable

## Let's try the Spider

### Version 1. iterating pages from the all pages

In [2]:
class DermStoreSpider (scrapy.Spider):
    """
    A Scrapy Spider to scrape skin care products and reviews from DermStore website.
    
    Attributes:
        name (str): The name of the Spider.
        url (str): The URL of the product listing page
    """
    name = "skin_care"
    url = "https://www.dermstore.com/skin-care.list"
    
    def start_requests(self):
        """
        Generates initial requests to start scraping.
        
        returns:
            scrapy.Request: A request to the URL specified in 'url' attribute.
        """
        #here we define the links that we'll scrape
        start_urls = ['https://www.dermstore.com/skin-care.list']
        for url in start_urls:
            yield scrapy.Request(url = url, callback = self.parse_current_page)
            
    def parse_current_page(self, response):
        """"    
        Method for pagination. Accept the start page's content and extract the number of total pages in this category. 
        It then starts the loop, where it defines the current page and follows it for further extraction in the
        parse_front() method.
        
        args:
            response (scrapy.http.Response): The response object containing the page content.
        
        returns:
            a request object for the current url with products
        """
        #define the number of total pages
        total_pages = response.xpath('//nav[@class="responsivePaginationPages"]/@data-total-pages').get()
        #define the current page for extracting and follow it 
        for page in range(1, int(total_pages) + 1):
            next_page_url = self.url + '?pageNumber=' + str(page)
            yield response.follow(url = next_page_url, callback=self.parse_front)
    
    def parse_front(self, response):
        """
        Accept the content from current page. It extracts the links for each product displayed on current page and follows
        it for further extraction in the parse_pages() method
        
        args:
            response (scrapy.http.Response): The response object containing the page content.
            
        returns:
            a request object for the product's url
        """
        #define the all products on the current page
        product_block = response.css('div.productBlock')
        
        #extract the link to the product page and follow it for further scraping
        for product in product_block:
            attr = product.css('a::attr(href)').get()
            product_url = 'www.dermstore.com' + attr
            yield response.follow(url=product_url, callback=self.parse_pages, meta={"product_url": product_url})
        
    def parse_pages(self, response):
        """
        Accept the content from the product page and extract the product name, product ingredients and the product url 
        as meta data from the function originated the request. Define the number of total reviews. 
        Depending on the number it proceed to next steps in follow logic:
        - if there are more than 10 reviews for product it extracts the link of all reviews and follows it for next 
        extraction in the parse_review() function
        - if there are less or equal 10 reviews it extracts the review date, review title, review body, 
        rating and returns the dictionary containig the scraped data
        
        args: 
            response (scrapy.http.Response): The response object containing the product page content and meta data
            from the function that originated the request.
            
        returns: 
            depends on number of reviews: 
            - a dictionary containing the scraped data 
            - a request object for the review's url
        """
        #extract the product name, product ingredients
        product_name = response.xpath('//h1[contains(@class,"productName_title")]/text()').get()
        product_ingredients_list = response.xpath('//*[@id="product-description-content-7"]/div/div/p/text()').getall() 
        product_ingredients = ' '.join(product_ingredients_list).strip()
        #access meta data
        product_url = response.request.meta['product_url']
            
        #define the number of total reviews
        total_reviews = response.xpath('//p[@class="athenaProductReviews_reviewCount Auto"]/text()').get()
        #define the logic of next steps if the reviews exists
        if total_reviews:
            total_reviews = int(total_reviews.replace(' Reviews', ''))
            #the logic when there are more than 10 reviews
            if total_reviews >= 10:
                #extract a link to the reviews page
                review_links = response.xpath('//a[contains(@class,"athenaProductReviews_seeReviewsButton")]/@href').getall()
                #follow the link to the reviews page for further scraping
                for next_review in review_links:
                    yield response.follow(url=next_review, callback=self.parse_review, 
                                          meta={"product_name": product_name,
                                                "product_ingredients": product_ingredients, 
                                                "product_url": response.url})
            #the logic when there are less or equal 10 reviews
            else:
                #access meta data
                product_url = response.request.meta['product_url']
                #define reviews
                reviews = response.xpath('//div[@class="athenaProductReviews_topReviewSingle"]')
                #extract the review date, review title, review body and rating
                for review in reviews:
                    review_date = review.xpath(".//span[@data-js-element='createdDate']/text()").get()
                    review_title = review.xpath(".//h3[@id='product-review-1-title']/text()").get()
                    review_body = review.xpath(".//p[@class='athenaProductReviews_topReviewsExcerpt']/text()").get().strip('\n ')
                    rating = float(
                        review.xpath('.//div[@class="athenaProductReviews_topReviewsRatingStarsContainer"]/@aria-label')
                        .get().replace(' Stars', ''))
                    
                    #yield the scraped data
                    item = {
                        'product_name': product_name if product_name else None, 
                        'product_url': product_url if product_url else None,
                        'product_ingredients': product_ingredients if product_ingredients else None,
                        'review_date': review_date if review_date else None,
                        'review_title': review_title.strip('\n ') if review_title else None,
                        'review_body': review_body if review_body else None,
                        'rating': rating if rating else None
                    }
                    yield item
        else:
            #yeild the scraped data
            item = {
                'product_name': product_name if product_name else None, 
                'product_url': product_url if product_url else None,
                'product_ingredients': product_ingredients if product_ingredients else None,
                'review_date': None,
                'review_title': None,
                'review_body': None,
                'rating': None
            }
            yield item
        
    def parse_review(self, response):
        """
        Accepts the content of the reviews page and the meta data from the function that originated the request.
        Extracts the date, title, body and rating from each review.
        Follows a link to the next page to repeat the process (if there is a next page)
        
        args: 
            response: holds the url page's content and meta data from the function that originated the request.
            
        returns:
            a dictionary containing the scraped data for each review
        """
        #access the meta data
        product_name = response.request.meta['product_name']
        product_url = response.request.meta['product_url']
        product_ingredients = response.request.meta['product_ingredients']
        
        #define all reviews on the page
        reviews = response.xpath('//div[@class = "athenaProductReviews_review"]')
        
        #extract the date, title, body and rating for each review
        for review in reviews:
            review_date = review.xpath(".//div/span[@data-js-element='createdDate']/text()").get()
            review_title = review.xpath('.//h3[@class="athenaProductReviews_reviewTitle"]/text()').get()
            review_body = review.xpath('.//p[@class="athenaProductReviews_reviewContent"]/text()').get().strip('\n ')
            rating = float(review.xpath('.//span[@class="athenaProductReviews_schemaRatingValue"]/text()').get())

            #yield the scraped data
            item = {
                'product_name': product_name if product_name else None, 
                'product_url': product_url if product_url else None,
                'product_ingredients': product_ingredients if product_ingredients else None,
                'review_date': review_date if review_date else None,
                'review_title': review_title if review_title else None,
                'review_body': review_body if review_body else None,
                'rating': rating if rating else None
            }
            
            yield item
        
        #pagination for a single product's review
        next_page_review = response.xpath('//a[@aria-label="Next page"]/@href').get()
        #if a next page link exists, follow and repeat the scraping procedure
        if next_page_review:
            yield response.follow(url=str(next_page_review), callback=self.parse_review,
                                     meta={"product_name": product_name,
                                          "product_ingredients": product_ingredients, 
                                           "product_url": product_url})

#define the empty DataFrame               
df_dermStore = pd.DataFrame(
    columns=['product_name', 'product_url', 'product_ingredients', 'review_date', 'review_title', 'review_body', 
             'rating'])

#define the Class for Pipeline
class DermStorePipeline:
    """"
    A Scrapy pipeline class to process and store scraped data into a pandas DataFrame.
    
    """
    #function for filling the empty DataFrame by yielded data
    def process_item(self, item, spider):
        """
        Processes the scraped item and stores it in the DataFrame.
        
        Attributes:
            item (dict): The scraped data item.
            spider (scrapy.Spider): The Spider instance.
            
        returns:
            dict: The processed item.
        """
        df_dermStore.loc[len(df_dermStore)] = [item['product_name'], item['product_url'], item['product_ingredients'], 
                                               item['review_date'], item['review_title'], item['review_body'], item['rating']]
        return item

process = CrawlerProcess(
    settings={'ITEM_PIPELINES': {'__main__.DermStorePipeline': 300}, 
              'LOG_LEVEL': 'INFO', 
              'AUTOTHROTTLE_ENABLED': 'True',
             'AUTOTHROTTLE_TARGET_CONCURRENCY':'1.0',
             'HTTPCACHE_ENABLED': 'True'})
process.crawl(DermStoreSpider)
process.start()


2023-08-10 21:13:05 [scrapy.utils.log] INFO: Scrapy 2.9.0 started (bot: scrapybot)
2023-08-10 21:13:05 [scrapy.utils.log] INFO: Versions: lxml 4.9.2.0, libxml2 2.9.12, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.1, Twisted 22.10.0, Python 3.7.10 (default, Feb 26 2021, 13:06:18) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 23.2.0 (OpenSSL 3.1.1 30 May 2023), cryptography 41.0.1, Platform Windows-10-10.0.19041-SP0
2023-08-10 21:13:05 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': 'True',
 'AUTOTHROTTLE_TARGET_CONCURRENCY': '1.0',
 'HTTPCACHE_ENABLED': 'True',
 'LOG_LEVEL': 'INFO'}


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2023-08-10 21:13:05 [scrapy.extensions.telnet] INFO: Telnet Password: 5d385743cc8a5e2f
2023-08-10 21:13:05 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.ex

2023-08-10 21:53:06 [scrapy.extensions.logstats] INFO: Crawled 5142 pages (at 130 pages/min), scraped 38033 items (at 914 items/min)
2023-08-10 21:54:05 [scrapy.extensions.logstats] INFO: Crawled 5251 pages (at 109 pages/min), scraped 38750 items (at 717 items/min)
2023-08-10 21:55:05 [scrapy.extensions.logstats] INFO: Crawled 5332 pages (at 81 pages/min), scraped 39148 items (at 398 items/min)
2023-08-10 21:56:05 [scrapy.extensions.logstats] INFO: Crawled 5390 pages (at 58 pages/min), scraped 39457 items (at 309 items/min)
2023-08-10 21:57:06 [scrapy.extensions.logstats] INFO: Crawled 5437 pages (at 47 pages/min), scraped 39646 items (at 189 items/min)
2023-08-10 21:58:05 [scrapy.extensions.logstats] INFO: Crawled 5512 pages (at 75 pages/min), scraped 39944 items (at 298 items/min)
2023-08-10 21:59:06 [scrapy.extensions.logstats] INFO: Crawled 5623 pages (at 111 pages/min), scraped 40563 items (at 619 items/min)
2023-08-10 22:00:05 [scrapy.extensions.logstats] INFO: Crawled 5719 pages

2023-08-10 22:55:06 [scrapy.extensions.logstats] INFO: Crawled 11160 pages (at 100 pages/min), scraped 78435 items (at 916 items/min)
2023-08-10 22:56:06 [scrapy.extensions.logstats] INFO: Crawled 11280 pages (at 120 pages/min), scraped 79617 items (at 1182 items/min)
2023-08-10 22:57:06 [scrapy.extensions.logstats] INFO: Crawled 11406 pages (at 126 pages/min), scraped 80819 items (at 1202 items/min)
2023-08-10 22:58:06 [scrapy.extensions.logstats] INFO: Crawled 11539 pages (at 133 pages/min), scraped 82018 items (at 1199 items/min)
2023-08-10 22:59:06 [scrapy.extensions.logstats] INFO: Crawled 11683 pages (at 144 pages/min), scraped 83357 items (at 1339 items/min)
2023-08-10 23:00:06 [scrapy.extensions.logstats] INFO: Crawled 11820 pages (at 137 pages/min), scraped 84639 items (at 1282 items/min)
2023-08-10 23:01:06 [scrapy.extensions.logstats] INFO: Crawled 11964 pages (at 144 pages/min), scraped 85964 items (at 1325 items/min)
2023-08-10 23:02:06 [scrapy.extensions.logstats] INFO: C

Let's take a look at the results

In [3]:
df_dermStore

Unnamed: 0,product_name,product_url,product_ingredients,review_date,review_title,review_body,rating
0,Replenix Lifting Firming Neck Cream (1.7 fl. oz.),https://www.dermstore.com/replenix-lifting-and...,"Purified Water, Caprylic/Capric Triglyceride, ...",10/30/22,Good Firming Neck Cream,I keep trying new neck creams. As an older wo...,5.0
1,Replenix Lifting Firming Neck Cream (1.7 fl. oz.),https://www.dermstore.com/replenix-lifting-and...,"Purified Water, Caprylic/Capric Triglyceride, ...",10/15/22,liked it,I liked this neck cream... I am not sure it re...,4.0
2,Replenix Lifting Firming Neck Cream (1.7 fl. oz.),https://www.dermstore.com/replenix-lifting-and...,"Purified Water, Caprylic/Capric Triglyceride, ...",9/10/22,My new holy grail,The neck cream has so many of the best ingredi...,5.0
3,Replenix Lifting Firming Neck Cream (1.7 fl. oz.),https://www.dermstore.com/replenix-lifting-and...,"Purified Water, Caprylic/Capric Triglyceride, ...",4/24/22,I see a difference with consistent use,I see a difference with consistent use. It too...,5.0
4,Replenix Lifting Firming Neck Cream (1.7 fl. oz.),https://www.dermstore.com/replenix-lifting-and...,"Purified Water, Caprylic/Capric Triglyceride, ...",4/3/22,Fantastic neck cream,My neck has held up well and I am in my 60's. ...,5.0
...,...,...,...,...,...,...,...
107040,SkinCeuticals C E Ferulic (1 fl. oz.),https://www.dermstore.com/skinceuticals-c-e-fe...,"Aqua/Water/Eau, Ethoxydiglycol, Ascorbic Acid,...",3/22/06,Wrinkle free serum,I have used this serum for several years and k...,4.0
107041,SkinCeuticals C E Ferulic (1 fl. oz.),https://www.dermstore.com/skinceuticals-c-e-fe...,"Aqua/Water/Eau, Ethoxydiglycol, Ascorbic Acid,...",2/8/21,Slow but steady,I have been using this in collaboration with S...,5.0
107042,SkinCeuticals C E Ferulic (1 fl. oz.),https://www.dermstore.com/skinceuticals-c-e-fe...,"Aqua/Water/Eau, Ethoxydiglycol, Ascorbic Acid,...",2/7/21,Awesome product,This leaves your face feeling so good! It has ...,5.0
107043,SkinCeuticals C E Ferulic (1 fl. oz.),https://www.dermstore.com/skinceuticals-c-e-fe...,"Aqua/Water/Eau, Ethoxydiglycol, Ascorbic Acid,...",2/7/21,Vitamin c potency,"This has very potent vitamin c in it, and it a...",5.0


In [4]:
df_dermStore['product_name'].nunique()

3517

Let's put the data to csv file

In [5]:
df_dermStore.to_csv('C:/Users/Dell/Documents/For project/MeNow!/web scraping project/dermstore/skin_care.csv', 
                    index = False)

In [6]:
df_dermStore.stb.missing(style = True)

Unnamed: 0,missing,total,percent
review_title,4316,107045,4.03%
review_date,1206,107045,1.13%
review_body,1206,107045,1.13%
rating,1206,107045,1.13%
product_ingredients,364,107045,0.34%
product_name,6,107045,0.01%
product_url,0,107045,0.00%


### Version 2 -  Let's check if I'll grab only name of products without diving to list of reviews

In [None]:
class DermStoreSpider_nameOnly (scrapy.Spider):
    name = "face_moisturizers_name_only"
    url = "https://www.dermstore.com/skin-care/moisturizers/face-moisturizer.list"
    
    def start_requests(self):
        urls = ['https://www.dermstore.com/skin-care/moisturizers/face-moisturizer.list']
        for url in urls:
            yield scrapy.Request(url = url, callback = self.parse_front)
            
    def parse_front(self, response):
        product_block = response.css('div.productBlock_itemDetails_wrapper')
        print(f"Found {len(product_block)} products on page")
        product_links = product_block.xpath('./a/@href').getall()
        print(f"Found {len(product_links)} links on the page")
        for url in product_links:
            yield response.follow(url=url, callback=self.parse_pages,
                                 meta={'product_url':url})
        
        total_pages = response.xpath('//nav[@class="responsivePaginationPages"]/@data-total-pages').get()
        for page in range(2, int(total_pages) + 1):
            next_page_url = self.url + '?pageNumber=' + str(page)
            yield response.follow(url = next_page_url, callback=self.parse_front)
            
    def parse_pages(self, response):
        product_url = response.request.meta['product_url']
        product_name = response.xpath('//h1[contains(@class,"productName_title")]/text()').get()
        item={'product_name': product_name if product_name else None, 
              'product_url': product_url if product_url else None}
        yield item
        
df_dermStore_nameOnly = pd.DataFrame(columns=['product_name','product_url'])

class DermStorePipeline_nameOnly:
    
    def process_item(self, item, spider):
        df_dermStore_nameOnly.loc[len(df_dermStore_nameOnly)] = [item['product_name'], item['product_url']]
        return item

process = CrawlerProcess(settings={
    'ITEM_PIPELINES': {'__main__.DermStorePipeline_nameOnly': 1},
    'LOG_LEVEL': 'INFO'})
process.crawl(DermStoreSpider_nameOnly)
process.start()

In [None]:
df_dermStore_nameOnly

In [None]:
df_dermStore_nameOnly['product_name'].nunique()