In [1]:
import pandas as pd
import sqlite3
import time
import requests
import concurrent.futures
from bs4 import BeautifulSoup
import warnings
import math
warnings.filterwarnings('ignore')

In [2]:
# Retrieving Names of Categories
types_df = pd.read_csv("Zara listOfTypes.csv")
categories = types_df['Type'].to_list()
categories

['men-clothing',
 'women-clothing',
 'footwear',
 'watches',
 'wallets',
 'bags',
 'jewellery',
 'belts',
 'ties',
 'cufflinks',
 'pocket-squares',
 'caps',
 'hats',
 'scarves',
 'gloves',
 'phones-cases',
 'rings',
 'wristwear',
 'socks',
 'bracelets',
 'chains']

# Zara

In [3]:
base_url = "https://www.zara.com"
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0"}
page_url = "https://api.empathybroker.com/search/v1/query/zara/search?jsonCallback=_pdJsonpCallback_1&o=json&m=30&q={}&filter=sectionName%3A%22{}%22%20OR%20sectionName%3A%22ALL%22&filter=subsectionCode%3A%22default%22&scope=default&t=*&lang=en_GB&store=11744&catalogue=36551&warehouse=28551&section={}&start={}&rows=30&origin=default&session=7ad37610-1ec7-449f-94a8-392fb4df3e0e&user=8c9ee228-06a1-4040-8630-00243650bbc6&sort=score%20desc&sort=kpi_salesAmount_11744%20desc&hideOptionalProducts=false&contextualizeEnabled=true"
product_url = "https://www.zara.com/in/en/{}-p{}.html?v1={}"
per_page_total = 30

In [4]:
categ_avail = ["man", "woman", "footwear", "bags", "jewellery", "belts", "scarves", "hats", "caps"]
categ_avail

['man',
 'woman',
 'footwear',
 'bags',
 'jewellery',
 'belts',
 'scarves',
 'hats',
 'caps']

In [5]:
def get_page_nums(categories):
    
    keyword = '"numFound":'
    page_nums_man = []
    page_nums_woman = []
    
    print("For men's gender :- \n")
    for category in categories:
        res = requests.session().get(page_url.format(category, "MAN", "MAN", "0")).text
        index = res.find(keyword)+len(keyword)
        total = math.ceil(int(res[index:index + res[index:].find(",")])/per_page_total)
        print(category, total)
        page_nums_man.append(total)
    
    print("\nWomen's gender :- \n")
    for category in categories:
        res = requests.session().get(page_url.format(category, "WOMAN", "WOMAN", "0")).text
        index = res.find(keyword)+len(keyword)
        total = math.ceil(int(res[index:index + res[index:].find(",")])/per_page_total)
        print(category, total)
        page_nums_woman.append(total)
    
    return page_nums_man, page_nums_woman

page_nums_man, page_nums_woman = get_page_nums(categ_avail)

For men's gender :- 

man 98
woman 0
footwear 11
bags 11
jewellery 1
belts 2
scarves 1
hats 3
caps 3

Women's gender :- 

man 0
woman 195
footwear 21
bags 19
jewellery 5
belts 7
scarves 2
hats 3
caps 4


In [6]:
def get_prod_urls(page_data, categ):
    
    results = page_data.split('"seo":')[1:]
    prod_urls = []
    
    for item in results:
        
        try:
            key1 = '"keyword":"'
            key2 = '"seoProductId":"'
            key3 = '"discernProductId":"'    

            item1 = item[item.find(key1)+len(key1):item.find('","seoProductId":"')]
            item2 = item[item.find(key2)+len(key2):item.find('","discernProductId"')]
            item3 = item[item.find(key3)+len(key3):item.find('"},"ebTagging"')]
            
            req_url = product_url.format(item1, item2, item3)
            prod_urls.append([req_url, categ_avail[categ]])
            
        except:
            continue
    
    return prod_urls

In [7]:
urls_men = []
urls_women = []

start = time.time()


for gender in ['man', 'woman']:
    
    if gender == 'man':
        print("Now retrieving men's products\n")
        for categ in range(len(categ_avail)):
            count = 0
            categ_num = page_nums_man[categ]
            for page_num in range(categ_num):
                page_data = requests.session().get(page_url.format(categ_avail[categ], "MAN", "MAN", str(page_num*30))).text
                urls_men.extend(get_prod_urls(page_data, categ))
            
            print("Category:- {} Done".format(categ_avail[categ]))
            print("Time Elapsed {} mins\n".format(round((time.time() - start)/60, 2)))
            
    else:
        start_ = time.time()
        print("\n" + "*"*100 + "\n")
        print("Now retrieving women's products\n")   
        
        for categ in range(len(categ_avail)):
            
            categ_num = page_nums_woman[categ]
            for page_num in range(categ_num):
                page_data = requests.session().get(page_url.format(categ_avail[categ], "WOMAN", "WOMAN", str(page_num*30))).text
                urls_women.extend(get_prod_urls(page_data, categ))
                
            print("Category:- {} Done".format(categ_avail[categ]))
            print("Time Elapsed {} mins\n".format(round((time.time() - start_)/60, 2)))   
            
    print("Total Time Elapsed {} mins\n".format(round((time.time() - start)/60, 2)))   


Now retrieving men's products

Category:- man Done
Time Elapsed 1.82 mins

Category:- woman Done
Time Elapsed 1.82 mins

Category:- footwear Done
Time Elapsed 2.02 mins

Category:- bags Done
Time Elapsed 2.31 mins

Category:- jewellery Done
Time Elapsed 2.33 mins

Category:- belts Done
Time Elapsed 2.36 mins

Category:- scarves Done
Time Elapsed 2.37 mins

Category:- hats Done
Time Elapsed 2.43 mins

Category:- caps Done
Time Elapsed 2.48 mins

Total Time Elapsed 2.48 mins


****************************************************************************************************

Now retrieving women's products

Category:- man Done
Time Elapsed 0.0 mins

Category:- woman Done
Time Elapsed 2.28 mins

Category:- footwear Done
Time Elapsed 2.62 mins

Category:- bags Done
Time Elapsed 2.92 mins

Category:- jewellery Done
Time Elapsed 2.99 mins

Category:- belts Done
Time Elapsed 3.12 mins

Category:- scarves Done
Time Elapsed 3.15 mins

Category:- hats Done
Time Elapsed 3.2 mins

Category:- cap

In [50]:
Product_Link=[]
Product_Name=[]
Product_Brand=[]
Size_Available=[]
Price=[]
MRP=[]
Gender=[]
Description=[]
Cat_list=[]
Primary_Image_Link=[]
Secondary_Image_Link=[]

In [51]:
def extract(url, cat, gender):
    
    time.sleep(0.2)
    r = requests.session().get(url, headers=headers).text
    soup = BeautifulSoup(r, "html.parser")
    
    item = soup.find_all("script")
    
    item1 = str(item[22])
    item2 = str(item[23])
    
    
    try:
        
        website = base_url

        #product link
        keyword = '"url":"'
        index = item1.find(keyword)+len(keyword)
        product_link = item1[index:index + item1[index:].find('"')]

        #product name
        keyword = '"name":"'
        index = item1.find(keyword)+len(keyword)
        product_name = item1[index:index + item1[index:].find('"')]

        #product brand
        keyword = '"brand":"'
        index = item1.find(keyword)+len(keyword)
        product_brand = item1[index:index + item1[index:].find('"')]

        #product category
        product_category = cat

        #sizes available

        sizes = []

        keyword = '"sizes":[{"'
        index = item2.find('"sizes":[{"') + len(keyword)
        req_text = item2[index: index + item2[index:].find(']')]

        while True:
            keyword = 'availability":"in_stock"'
            index = req_text.find(keyword)
            if index != -1:
                req_text = req_text[index+len(keyword):]
                keyword = '"name":"'
                index = req_text.find(keyword) + len(keyword)
                sizes.append(req_text[index:index + req_text[index:].find('"')]) 
                req_text = req_text[index+4:]
            else:
                break

        if len(sizes) == 0:
            #print("here")
            sizes = "ONE SIZE"
        else:
            sizes = ",".join(sizes)

        #price
        keyword = '"price":"'
        index = item1.find(keyword)+len(keyword)
        price = item1[index:index + item1[index:].find('"')]

        #mrp
        mrp = price

        #gender
        gender = gender

        #description
        keyword = '"description":"'
        index = item1.find(keyword)+len(keyword)
        description = item1[index:index + item1[index:].find('"')].strip('\\n')
        description = item1[index:index + item1[index:].find('"')].strip('\n')


        """IMAGES"""

        keyword = '"image":["'
        index = item1.find(keyword) + len(keyword)
        req_text = item1[index: index + item1[index:].find(']')]
        images = req_text.split('","')[:-1]

        #primary image
        p_link = images[0]

        #secondary images
        sec_links = images[1:]
        sec_links = ",".join(sec_links)
        
        if sec_links == None:
            sec_links = "not available"

       
    except:
        pass
    
    else:
        Product_Link.append(product_link)
        Product_Brand.append(product_brand)
        Product_Name.append(product_name)
        Description.append(description)
        Gender.append(gender)
        Cat_list.append(product_category)
        MRP.append(mrp)
        Price.append(price)
        Primary_Image_Link.append(p_img_link)
        Secondary_Image_Link.append(sec_links)
        Size_Available.append(sizes)


In [64]:
len(Size_Available),len(Primary_Image_Link),len(Secondary_Image_Link),len(Product_Link),len(Description),len(Gender),len(Cat_list),len(MRP),len(Price)

(0, 0, 0, 4586, 4586, 4586, 4586, 4586, 4586)

In [52]:
"""
MAIN FUNCTION
"""

data = []
counter = 0
start = time.time()

jump = 5


"""
Men's section
"""

print("Now retrieving men's products \n")

for pivot in range(0, len(urls_men), jump):
    
    prod_urls = []
    prod_cats = []
    gender_lst = ["men"] * jump
    temp_lst = urls_men[pivot:pivot+jump]
    
    for i in temp_lst:
        prod_urls.append(i[0])
        if i[1] == 'man':
            prod_cats.append('men clothing')
        else:
            prod_cats.append(i[1])
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(extract, prod_urls, prod_cats, gender_lst)
    
    counter += (len(prod_urls))
    
    if counter % 50 == 0:
        print("Total Amount scraped :- ", counter, sep="  ")
        print("Current Category:- ", prod_cats[-1])
        print("\nTime Elapsed:- ", round((time.time() - start)/60, 2), "mins\n")
        print()


    # Re-initializing list for next page
    data = []

end = time.time()
print("\nTotal Time Elapsed:- ", round((end - start)/60, 2), "mins\n\n")


"""
Women's section
"""
data = []
counter = 0
start = time.time()

print("Now retrieving women's products \n")

for pivot in range(0, len(urls_women), jump):
    
    prod_urls = []
    prod_cats = []
    gender_lst = ["women"] * jump
    
    temp_lst = urls_women[pivot:pivot+jump]
    
    for i in temp_lst:
        prod_urls.append(i[0])
        if i[1] == 'woman':
            prod_cats.append('women clothing')
        else:
            prod_cats.append(i[1])
    with concurrent.futures.ThreadPoolExecutor() as executor:
        executor.map(extract, prod_urls, prod_cats, gender_lst)
    
    counter += (len(prod_urls))
    
    if counter % 50 == 0:
        print("Total Amount scraped :- ", counter, sep="  ")
        print("Current Category:- ", prod_cats[-1])
        print("\nTime Elapsed:- ", round((time.time() - start)/60, 2), "mins\n")
        print()


    # Re-initializing list for next page
    data = []

end = time.time()
print("\nTotal Time Elapsed:- ", round((end - start)/60, 2), "mins\n")

Now retrieving men's products 

Total Amount scraped :-   50
Current Category:-  men clothing

Time Elapsed:-  0.36 mins


Total Amount scraped :-   100
Current Category:-  men clothing

Time Elapsed:-  0.7 mins


Total Amount scraped :-   150
Current Category:-  men clothing

Time Elapsed:-  0.99 mins


Total Amount scraped :-   200
Current Category:-  men clothing

Time Elapsed:-  1.32 mins


Total Amount scraped :-   250
Current Category:-  men clothing

Time Elapsed:-  1.62 mins


Total Amount scraped :-   300
Current Category:-  men clothing

Time Elapsed:-  1.93 mins


Total Amount scraped :-   350
Current Category:-  men clothing

Time Elapsed:-  2.24 mins


Total Amount scraped :-   400
Current Category:-  men clothing

Time Elapsed:-  2.57 mins


Total Amount scraped :-   450
Current Category:-  men clothing

Time Elapsed:-  2.86 mins


Total Amount scraped :-   500
Current Category:-  men clothing

Time Elapsed:-  3.09 mins


Total Amount scraped :-   550
Current Category:-  

In [53]:
out=pd.DataFrame({
    
    "Website":'https://www.zara.com/in/',
    "Product_Link":Product_Link,
    "Product_Name":Product_Name,
    "Product_Brand":Product_Brand,
    "Product_Category":Cat_list,
    "Size_Avail":Size_Available,
    "Price":Price,
    "MRP":MRP,
    "Gender":Gender,
    "Description":Description,
    "Primary_Image_Links":Primary_Image_Link,
    "Secondary_Image_Links":Secondary_Image_Link,
    "Affiliate_Link":''
    
})

ValueError: arrays must all be same length

In [55]:
Size_Available

[]

In [12]:
con=sqlite3.connect("Zara.db")  
cur=con.cursor()
cur.execute('CREATE TABLE product_details (Website varchar(40) not null,Product_Link text PRIMARY KEY,Product_Name varchar(50) not null,Product_Brand varchar(50) not null,Product_Category varchar(50),Size_Avail varchar(20) not null,Price int not null,MRP int not null,Gender varchar(15) not null,Description text not null,Primary_Image_Links text not null,Secondary_Image_Links text not null,Affiliate_Link text not null )')
for i in range(len(Product_Link)):
    try:
        cur.execute('insert into product_details  values(%r,%r,%r,%r,%r,%r,%d,%d,%r,%r,%r,%r,%r)'%('https://www.zara.com/in/',Product_Link[i],Product_Name[i],Product_Brand[i],Cat_list[i],Size_Available[i],Price[i],MRP[i],Gender[i],Description[i],Primary_Image_Link[i],Secondary_Image_Link[i],''))
    except:
        pass
con.commit()
con.close()