In [1]:
import pandas as pd
import sqlite3
import time
import requests
import concurrent.futures
from bs4 import BeautifulSoup
import warnings
import json
import math
warnings.filterwarnings('ignore')



In [2]:
# Retrieving Names of Categories
types_df = pd.read_csv("VanHeusen listOfTypes.csv")
categories = types_df['Type'].to_list()
categories

['men-clothing',
 'women-clothing',
 'footwear',
 'watches',
 'wallets',
 'bags',
 'jewellery',
 'belts',
 'ties',
 'cufflinks',
 'pocket-squares',
 'caps',
 'hats',
 'scarves',
 'gloves',
 'phones-cases',
 'rings',
 'wristwear',
 'socks',
 'bracelets',
 'chains']

In [5]:
base_url = "https://www.vanheusenindia.com"
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0"}
main_url = "https://www.vanheusenindia.com/cat/fetchLefProductsWRTPage?category_id=0&category_name=&strShopName=Van%20Heusen&gender_name=&intShopID=6&search_query={}&intPageNo=1&orderby=position&orderway=asc&outofstock=&actiontype=refresh&hdnLoad=1&from={}&pageSize=500"
per_page_total = 500

In [6]:
# Get Each Category Number of Pages
def get_page_nums(categories):
    
    nums_lst = []
    
    for category in categories:
        
        res = json.loads(requests.session().get(main_url.format(category, "0")).text)
        pages_total = math.ceil(res['Products']['Results']['hits']['total']/per_page_total)
        print(category, " :- ", pages_total)
        nums_lst.append(pages_total)
    
    return nums_lst

page_nums = get_page_nums(categories)

men-clothing  :-  10
women-clothing  :-  5
footwear  :-  1
watches  :-  0
wallets  :-  1
bags  :-  1
jewellery  :-  0
belts  :-  1
ties  :-  1
cufflinks  :-  1
pocket-squares  :-  1
caps  :-  0
hats  :-  0
scarves  :-  1
gloves  :-  0
phones-cases  :-  0
rings  :-  0
wristwear  :-  0
socks  :-  1
bracelets  :-  0
chains  :-  0


In [7]:
Product_Link=[]
Product_Name=[]
Product_Brand=[]
Size_Available=[]
Price=[]
MRP=[]
Gender=[]
Description=[]
Cat_list=[]
Primary_Image_Link=[]
Secondary_Image_Link=[]

In [8]:
def extract(item):
    
    item = item['_source']
    
    try:
        website = base_url
        
        
        # product name
        product_name = item['Name']
        
        # product link
        
        template = "https://www.vanheusenindia.com/product/{}-{}.html"
        product_link = template.format("-".join(product_name.lower().split()), item["ProductID"])


        # product brand
        product_brand = item["Features"]['Brand']
        
        # product category
        product_category = item['Features']['ProductType']

        # sizes available
        
        sizes = ",".join([str(size['Name']) for size in item["Sizes"]])
        
        
        # price
        price = item['Price']

        # mrp
        mrp = price

        # gender
        gender = item["GenderName"]
        
        
        # description
        description = item['Description']
        
        
        # Images 
        img_template = "https://vanheusenindia.imgix.net/img/app/product/6/{}.jpg?auto=format&wa=91"
        
        # primary image link
        p_img_link = img_template.format(item['Media']['Images'][0]['Name'])

        # secondary image links
        sec_links = []
        for elem in item['Media']['Images']:
            sec_links.append(img_template.format(elem['Name']))

        sec_links = ",".join(sec_links)
    
    except:
        pass
    
    else:
        Product_Link.append(product_link)
        Product_Brand.append(product_brand)
        Product_Name.append(product_name)
        Description.append(description)
        Gender.append(gender)
        Cat_list.append(product_category)
        MRP.append(mrp)
        Price.append(price)
        Primary_Image_Link.append(p_img_link)
        Secondary_Image_Link.append(sec_links)
        Size_Available.append(sizes)
    
    
   

In [10]:
"""
Main Function 

"""
data = []

counter = 0
start = time.time()

for category in range(len(categories)):
    
    for page in range(page_nums[category]):
        
        page_data = requests.session().get(main_url.format(categories[category], page*per_page_total)).text
        page_data = json.loads(page_data)
        
        for product in page_data['Products']['Results']['hits']['hits']:
            extract(product)
        
        counter += len(data)
        
        
        print("Total Amount scraped :- ", counter, sep="  ")
        print("Current Category:- ", categories[category])
        print("\nTime Elapsed:- ", round((time.time() - start)/60, 2), "mins\n")
        print()
        
        data = []
    

end = time.time()
print("\nTotal Time Elapsed:- ", round((time.time() - start)/60, 2), "mins\n\n")

Total Amount scraped :-   0
Current Category:-  men-clothing

Time Elapsed:-  0.06 mins


Total Amount scraped :-   0
Current Category:-  men-clothing

Time Elapsed:-  0.12 mins


Total Amount scraped :-   0
Current Category:-  men-clothing

Time Elapsed:-  0.22 mins


Total Amount scraped :-   0
Current Category:-  men-clothing

Time Elapsed:-  0.34 mins


Total Amount scraped :-   0
Current Category:-  men-clothing

Time Elapsed:-  0.46 mins


Total Amount scraped :-   0
Current Category:-  men-clothing

Time Elapsed:-  0.5 mins


Total Amount scraped :-   0
Current Category:-  men-clothing

Time Elapsed:-  0.55 mins


Total Amount scraped :-   0
Current Category:-  men-clothing

Time Elapsed:-  0.6 mins


Total Amount scraped :-   0
Current Category:-  men-clothing

Time Elapsed:-  0.63 mins


Total Amount scraped :-   0
Current Category:-  men-clothing

Time Elapsed:-  0.67 mins


Total Amount scraped :-   0
Current Category:-  women-clothing

Time Elapsed:-  0.69 mins


Total Amou

In [11]:
con=sqlite3.connect("Van_Heusen.db")  
cur=con.cursor()
cur.execute('CREATE TABLE product_details (Website varchar(40) not null,Product_Link text PRIMARY KEY,Product_Name varchar(50) not null,Product_Brand varchar(50) not null,Product_Category varchar(50),Size_Avail varchar(20) not null,Price int not null,MRP int not null,Gender varchar(15) not null,Description text not null,Primary_Image_Links text not null,Secondary_Image_Links text not null,Affiliate_Link text not null )')
for i in range(len(Product_Link)):
    try:
        cur.execute('insert into product_details  values(%r,%r,%r,%r,%r,%r,%d,%d,%r,%r,%r,%r,%r)'%('https://www.vanheusenindia.com/',Product_Link[i],Product_Name[i],Product_Brand[i],Cat_list[i],Size_Available[i],Price[i],MRP[i],Gender[i],Description[i],Primary_Image_Link[i],Secondary_Image_Link[i],''))
    except:
        pass
con.commit()
con.close()