# Libraries

In [718]:
import scrapy
from scrapy.crawler import CrawlerRunner, CrawlerProcess
import re
import csv
import pandas as pd 
pd.set_option('Max_Columns', None)
from collections import defaultdict
from crochet import setup, wait_for
setup()

# Scrapy Spider and Pipeline

In [794]:
class AmazonToCsv(scrapy.Spider):
    name = "Amazon"
    start_urls = [
        'https://www.amazon.in/Head-Shoulders-Hairfall-Shampoo-675ml/dp/B00ENZRCBI',
    ]
    custom_settings = {
        'ITEM_PIPELINES': {
            '__main__.AmazonPipeline': 1
        },
        'FEEDS': {
            'amazon.csv': {
                'format': 'csv',
                'overwrite': True
            }
        }
    }

    def parse(self, response):
        #print(response.xpath('//*[@id="productDescription"]/ul[2]/li/span/span/text()').extract())
        img = []
        for image in response.xpath('//img/@src').extract():
            img.append(response.urljoin(image))
        table = response.xpath('//*[@id="poExpander"]/div[1]/div/table/tr/td/span/text()').extract()
        title = ''.join(response.xpath('//h1[@id="title"]/span/text()').extract()).strip()
        sale_price = ''.join(response.xpath('//*[@id="olp_feature_div"]/div[2]/span[1]/a/span[2]/text()').extract()).strip()
        delivery_charge = ''.join(response.xpath('//*[@id="olp_feature_div"]/div[2]/span[2]/text()').extract()).strip()
        size = ''.join(response.xpath('//*[@id="variation_size_name"]/div/span/text()').extract()).strip()
        available_size = response.xpath('//*[@id="twisterContainer"]/div/form/div/ul/li/span/div/span/span/span/button/div/div/p/text()').extract()
        available_design = response.xpath('//*[@id="variation_pattern_name"]/ul/li/span/div/span/span/span/button/div/div/img/@alt').extract()
        available_design_img = response.xpath('//*[@id="variation_pattern_name"]/ul/li/span/div/span/span/span/button/div/div/img/@src').extract()
        brand = ''.join(response.xpath('//*[@id="bylineInfo"]/text()').extract()).strip()
        rating = ''.join(response.xpath('//*[@id="acrPopover"]/span[1]/a/i[1]/span[1]/text()')[0].extract()).strip()
        total_rating = ''.join(response.xpath('//*[@id="acrCustomerReviewText"]/text()')[0].extract()).strip()
        answered_ques = ''.join(response.xpath('//*[@id="askATFLink"]/span/text()').extract()).strip()
        return_policy = ''.join(response.xpath('//*[@id="RETURNS_POLICY"]/span/div[2]/a/text()').extract()).strip()
        variation_name = ''.join(response.xpath('//*[@id="variation_pattern_name"]/div/span/text()').extract()).strip()
        about_line = response.xpath('//*[@id="featurebullets_feature_div"]/div/h1/text()').extract()[0]
        about_product = [', '.join(map(lambda x: x.strip(), response.xpath('//*[@id="featurebullets_feature_div"]/div/ul/li/span/text()').extract()))]
        #details_line = response.xpath('//*[@id="detailBulletsWrapper_feature_div"]/h2/text()').extract()[0]
        details_product = list(map(lambda x: x.strip('\n                                    \u200f\n                                        :\n                                    \u200e'), response.xpath('//*[@id="detailBulletsWrapper_feature_div"]/div/ul/li/span/span/text()').extract()))
        best_seller_line = response.xpath('//*[@id="detailBulletsWrapper_feature_div"]/ul/li/span/span/text()').extract()[0].strip()[:-1]
        best_seller_rank = [response.xpath('//*[@id="detailBulletsWrapper_feature_div"]/ul/li/span/text()').extract()[1].strip()[:-1], response.xpath('//*[@id="detailBulletsWrapper_feature_div"]/ul[1]/li/span/ul/li/span/text()').extract()[0].strip()+' '+response.xpath('//*[@id="detailBulletsWrapper_feature_div"]/ul[1]/li/span/ul/li/span/a/text()').extract()[0].strip()]
        category = [', '.join(map(lambda x: x.strip(), response.xpath('//a[@class="a-link-normal a-color-tertiary"]/text()').extract()))]
        imp_info_head = response.xpath('//*[@id="important-information"]/h2/text()').extract()[0]
        imp_info_label = response.xpath('//*[@id="important-information"]/div/h4/text()').extract()
        imp_info_str = response.xpath('//*[@id="important-information"]/div/p[2]/text()').extract()
        product_des_label = response.xpath('//*[@id="productDescription"]/h3/span/text()').extract()
        product_des_value = response.xpath('//*[@id="productDescription"]/p/span/text()').extract()
        product_benefit = response.xpath('//*[@id="productDescription"]/h4/span/text()').extract()
        product_bene_h5 = response.xpath('//*[@id="productDescription"]/h5/span/text()').extract()
        bene_available = response.xpath('//*[@id="productDescription"]/ul/li/span/span/text()').extract()
        
        
        d1 = defaultdict(dict)
        for i in range(len(product_benefit)):
            d1[product_benefit[i]][product_des_value[2*i+1]] =  product_des_value[2*i+2] 
            d1[product_benefit[i]][product_bene_h5[i]] =  response.xpath('//*[@id="productDescription"]/ul['+str(i+1)+']/li/span/span/text()').extract()
        
        product_details = {'Name':title,
                           'Sale_Price':sale_price,
                           'Delivery_Charge':delivery_charge,
                           'Size':size,
                           'Available_Size':available_size,
                           'Design_Name':variation_name,
                           'Available_Design':available_design,
                           'Available_Design_Image':available_design_img,
                           'Brands':brand,
                           'Rating':rating,
                           'Total_Rating':total_rating,
                           'Answered_Questions':answered_ques,
                           'Return_Policy':return_policy,
                           about_line:about_product,
                           'Category':category,
                           'Manufacture By': details_product[7],
                           best_seller_line:best_seller_rank,
                           imp_info_head:{imp_info_label[0][:-1]:imp_info_str[0],imp_info_label[1][:-1]:imp_info_str[1]},
                           product_des_label[0]: product_des_value[0],
                           product_des_label[1]:d1,
                           
                           'Image_URLS':img,
                          }
        for i in range(int(len(table)/2)):
            product_details[table[2*i]] =  table[2*i+1]
          
        for j in range(int(len(details_product)/2)):
            product_details[details_product[2*j]] =  details_product[2*j+1]
        
        yield product_details


class AmazonPipeline(object):
    
    def process_item(self, item, spider):
        #print(item)
        return item

# Run Spider

In [795]:
@wait_for(10)
def run_spider():
    crawler = CrawlerRunner()
    d = crawler.crawl(AmazonToCsv)
    return d
run_spider()

# Extracted Data

In [796]:
df = pd.read_csv('amazon.csv')
df

Unnamed: 0,Name,Sale_Price,Delivery_Charge,Size,Available_Size,Design_Name,Available_Design,Available_Design_Image,Brands,Rating,Total_Rating,Answered_Questions,Return_Policy,About this item,Category,Manufacture By,Best Sellers Rank,Important information,Product Description,From the Manufacturer,Image_URLS,Brand,Scent,Hair Type,Liquid Volume,Item Weight,Item Dimensions LxWxH,Recommended Uses For Product,Item Form,Age Range (Description),Is Discontinued By Manufacturer,Product Dimensions,Date First Available,Manufacturer,ASIN,Item model number,Packer,Importer,Net Quantity,Included Components
0,"Head & Shoulders, Anti-Hairfall, Anti-Dandruff...",₹505.00,+ ₹70.00 Delivery charge,650ml,"72ml,180ml,340ml,360ml,650ml,675ml",Shampoo,"Shampoo,Shampoo + Shampoo + Conditioner,Shampo...",https://m.media-amazon.com/images/I/3173bpPit2...,Brand: Head & Shoulders,4.3 out of 5 stars,"11,637 ratings",171 answered questions,Not Returnable,"Gives upto 100% dandruff free hair, Rescue hai...","Beauty, Hair Care, Shampoo & Conditioner, Sham...",Procter & Gamble Home Products Private Ltd,"#151 in Beauty ,#20 in Shampoos (Beauty)",{'Safety Information': 'Avoid contact with eye...,"Rescue hair from damage, dullness, and hairfal...","defaultdict(<class 'dict'>, {'Anti Hairfall': ...",https://images-eu.ssl-images-amazon.com/images...,Head & Shoulders,Fresh,Thinning/Hairfall,650 Millilitres,752 g,9 x 6.9 x 22.5 Centimeters,Anti-dandruff,Foam,Adult,No,8.99 x 6.89 x 22.5 cm; 751.7 Grams,1 January 2017,"Procter & Gamble Home Products Private Ltd, Pr...",B00ENZRCBI,H&S AHF 750ml,"Procter & Gamble Home Products Private Ltd, Pl...",Procter & Gamble Hygiene and Health Care Limit...,750.0 millilitre,Shampoo


In [799]:
df.shape

(1, 40)