In [1]:
import pandas as pd
import sqlite3
import time
import requests
import concurrent.futures
from bs4 import BeautifulSoup
import warnings
import math
warnings.filterwarnings('ignore')

In [3]:
# Load the category slugs listed in the "Type" column of the types CSV
types_df = pd.read_csv("BananaRepublic listOfTypes.csv")
categories = types_df['Type'].tolist()
categories

['men-clothing',
 'women-clothing',
 'footwear',
 'watches',
 'wallets',
 'bags',
 'jewellery',
 'belts',
 'ties',
 'cufflinks',
 'pocket-squares',
 'caps',
 'hats',
 'scarves',
 'gloves',
 'phones-cases',
 'rings',
 'wristwear',
 'socks',
 'bracelets',
 'chains']

In [4]:
# Gender separation: the first two entries of `categories` are skipped and the
# gendered clothing slug is listed explicitly; every remaining slug gets a
# "men-" / "women-" prefix.
men_categories = ['men-clothing'] + ["men-" + slug for slug in categories[2:]]
women_categories = ['women-clothing'] + ["women-" + slug for slug in categories[2:]]

In [5]:
# Site root; extract() prepends it to the relative product and image paths.
base_url = "https://bananarepublic.gap.com"
# Browser-like User-Agent header.
# NOTE(review): none of the requests in the cells shown here pass `headers` --
# confirm whether it was meant to be sent.
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0"}
# Page size used to turn `numFound` into a page count and to compute the
# `start` offset. The URL template below hard-codes `rows=200`, so the two
# values must be changed together.
per_page_total = 200
# Search endpoint template. Its three `{}` slots are filled, in order, with the
# `q` value, the `l` value and the `start` offset via str.format().
main_url = "https://core.dxpapi.com/api/v1/core/?account_id=6105&auth_key=&domain_key=bananarepublic&request_id=2803065648391&_br_uid_2=uid%3D9716267812900%3Av%3D12.0%3Ats%3D1630514037116%3Ahc%3D20&url=https://bananarepublic.gap.com/index.html%23brm-search%3Frequest_type=search&search_type=keyword&q={}&l={}&br_origin=searchbox&realm=prod&ref_url=https://bananarepublic.gap.com/&request_type=search&rows=200&start={}&facet.limit=300&fl=sku_color,style_color_id,sku_sizes,swatch_image_attribute,pid,title,brand,price,maxSalePrice,minSalePrice,sale_price,price_type,promotions,thumb_image,sku_thumb_images,sku_swatch_images,sku_color_group,url,price_range,sale_price_range,description,is_live,score,defaultColorMarketingMessage,styleMarketingMessage&stats.field=sale_price&segment=customer_tier:160481&"

In [6]:
#Number of pages
def get_page_nums(categories):
    """Return how many result pages each search term needs.

    For every entry of ``categories`` the search endpoint (``main_url``) is
    queried once at offset 0, and the ``numFound`` value of the reply is
    divided by ``per_page_total`` and rounded up. One line
    ``<category>  :-  <pages>`` is printed per entry.

    Parameters
    ----------
    categories : iterable of str
        Search terms; each is substituted into both the ``q`` and ``l`` slots
        of ``main_url``.

    Returns
    -------
    list of int
        Page counts, in the same order as ``categories``. Empty input gives
        an empty list without any network access.

    Raises
    ------
    requests.RequestException
        If a request times out or the server answers with an error status.
    """
    categories = list(categories)
    if not categories:
        return []

    nums_lst = []

    # One session for all lookups: connections are reused and the session is
    # closed on exit instead of leaking a fresh one per category.
    with requests.Session() as session:
        for category in categories:
            # The timeout keeps a stalled connection from hanging the notebook.
            reply = session.get(main_url.format(category, category, "0"), timeout=30)
            # Fail with a clear HTTP error rather than an opaque JSON/KeyError.
            reply.raise_for_status()
            res = reply.json()

            pages_total = math.ceil(res['response']['numFound']/per_page_total)
            print(category, " :- ", pages_total)
            nums_lst.append(pages_total)

    return nums_lst

# Report the page size, then look up the page count of every men's and women's
# search term. get_page_nums() issues one HTTP request per term and prints each
# count as it is found, so the statement order determines the output order.
print("Per Page Total products = \n", per_page_total)
page_nums_men = get_page_nums(men_categories)
print("\n")
page_nums_women = get_page_nums(women_categories)

Per Page Total products = 
 200
men-clothing  :-  4
men-footwear  :-  1
men-watches  :-  0
men-wallets  :-  1
men-bags  :-  1
men-jewellery  :-  0
men-belts  :-  1
men-ties  :-  1
men-cufflinks  :-  1
men-pocket-squares  :-  1
men-caps  :-  1
men-hats  :-  1
men-scarves  :-  1
men-gloves  :-  1
men-phones-cases  :-  0
men-rings  :-  1
men-wristwear  :-  0
men-socks  :-  1
men-bracelets  :-  0
men-chains  :-  0


women-clothing  :-  8
women-footwear  :-  1
women-watches  :-  1
women-wallets  :-  1
women-bags  :-  1
women-jewellery  :-  0
women-belts  :-  2
women-ties  :-  1
women-cufflinks  :-  0
women-pocket-squares  :-  1
women-caps  :-  1
women-hats  :-  1
women-scarves  :-  1
women-gloves  :-  1
women-phones-cases  :-  1
women-rings  :-  1
women-wristwear  :-  0
women-socks  :-  1
women-bracelets  :-  1
women-chains  :-  1


In [6]:
def extract(item, cat, gender):
    """Flatten one product record from the search API into a row of ``data``.

    The row is appended to the module-level ``data`` list as
    ``[website, product_link, product_name, product_brand, product_category,
    sizes, price, mrp, gender, description, primary_image, secondary_images]``.

    Parameters
    ----------
    item : dict
        One product record. The keys read are ``url``, ``title``, ``brand``
        (optional), ``variants`` (each with ``sku_sizes`` and
        ``sku_thumb_images``), ``minSalePrice``, ``description`` and
        ``thumb_image``.
    cat : str
        Category label stored with the product.
    gender : str
        Gender label stored with the product.

    Returns
    -------
    None
        A record that lacks a required field or has an unexpected shape is
        skipped (best effort). Other errors, such as an undefined ``data`` or
        ``base_url``, are no longer swallowed and propagate to the caller.
    """
    try:
        # The API returns a site-relative product path.
        product_link = base_url + item['url']
        product_name = item['title']

        # Fall back to the house brand when the field is missing or blank
        # instead of discarding the whole product.
        product_brand = item.get('brand') or "Banana Republic"

        # Sizes are taken from the first variant only.
        sizes = ",".join(item['variants'][0]['sku_sizes'])

        price = item['minSalePrice']
        # NOTE(review): MRP mirrors the sale price; presumably the list price
        # was intended here -- confirm against the API fields.
        mrp = price

        # Drop the escaped-comma sequences (backslash-u + "002c") and any stray
        # backslashes, then keep the text up to the first period.
        cleaned = item['description'].replace('\\u', ' ').replace('\\', '').replace('002c', '')
        sentence_end = cleaned.find(".")
        if sentence_end == -1:
            # No period: keep the whole text rather than letting the -1
            # sentinel silently cut off the last character.
            sentence_end = len(cleaned)
        # The first two characters are skipped -- presumably a fixed prefix of
        # the raw field; TODO confirm against a raw API response.
        description = cleaned[2:sentence_end]

        # NOTE(review): stored as returned by the API (site-relative), unlike
        # the secondary links below which are prefixed with base_url.
        p_img_link = item['thumb_image']

        # Every thumbnail of every variant, as absolute URLs.
        sec_links = ",".join(
            base_url + img
            for variant in item['variants']
            for img in variant['sku_thumb_images']
        )
    except (KeyError, IndexError, TypeError, AttributeError):
        # Malformed or incomplete record: skip it.
        return

    data.append([base_url, product_link, product_name, product_brand, cat, sizes,
                 price, mrp, gender, description, p_img_link, sec_links])

In [13]:
# Start the output CSV with only its header row; the scraping loop appends
# data rows to this file afterwards.
columns = [
    "Website",
    "Product_Link",
    "Product_Name",
    "Product_Brand",
    "Product_Category",
    "Size_Avail",
    "Price",
    "MRP",
    "Gender",
    "Description",
    "Primary_Image_Links",
    "Secondary_Image_Links",
]

base_df = pd.DataFrame(columns=columns)
base_df.to_csv("bananarep.csv", index=False)

In [14]:
"""
Main Function 
"""


"""
Men's Section
"""

def scrape_section(section_categories, section_page_nums, gender, out_path='bananarep.csv'):
    """Scrape every page of every category of one section and append it to a CSV.

    For each category, pages ``0 .. n_pages-1`` are requested from
    ``main_url``; every product of a page is passed to ``extract`` and the
    resulting rows are appended (without header) to ``out_path``. Progress is
    printed after each page and the total elapsed time at the end.

    Parameters
    ----------
    section_categories : list of str
        Search terms of the section.
    section_page_nums : list of int
        Number of pages to fetch for the term at the same position.
    gender : str
        Gender label handed to ``extract``.
    out_path : str
        CSV file the rows are appended to.

    Returns
    -------
    int
        Number of rows written for this section.

    Raises
    ------
    ValueError
        If the two lists differ in length.
    requests.RequestException
        If a request times out or the server answers with an error status.
    """
    if len(section_categories) != len(section_page_nums):
        raise ValueError("section_categories and section_page_nums must have the same length")

    scraped_total = 0
    started = time.time()

    # One session per section: connections are reused and the session is
    # closed when the section is done, instead of leaking one per request.
    with requests.Session() as session:
        for category, n_pages in zip(section_categories, section_page_nums):
            for page in range(n_pages):
                offset = str(page * per_page_total)
                # The timeout keeps a stalled connection from hanging the run.
                reply = session.get(main_url.format(category, category, offset), timeout=30)
                reply.raise_for_status()
                page_data = reply.json()

                # extract() appends to the module-level `data` list, so it is
                # emptied in place before each page.
                data.clear()
                for product in page_data['response']['docs']:
                    extract(product, category, gender)

                scraped_total += len(data)

                # Append after every page so a failure later in the run does
                # not lose the pages already fetched.
                page_df = pd.DataFrame(data, columns=columns)
                page_df.to_csv(out_path, mode='a', header=False, index=False)

                print("Total Amount scraped :- ", scraped_total, sep="  ")
                print("Current Category:- ", category)
                print("\nTime Elapsed:- ", round((time.time() - started)/60, 2), "mins\n")
                print()

    data.clear()
    print("\nTotal Time Elapsed:- ", round((time.time() - started)/60, 2), "mins\n\n")
    return scraped_total


# Buffer that extract() appends rows to; it must exist at module level.
data = []

# Men's Section
counter = scrape_section(men_categories, page_nums_men, "men")

# Women's Section
counter = scrape_section(women_categories, page_nums_women, "women")

Total Amount scraped :-   200
Current Category:-  men-clothing

Time Elapsed:-  0.03 mins


Total Amount scraped :-   400
Current Category:-  men-clothing

Time Elapsed:-  0.06 mins


Total Amount scraped :-   600
Current Category:-  men-clothing

Time Elapsed:-  0.09 mins


Total Amount scraped :-   699
Current Category:-  men-clothing

Time Elapsed:-  0.12 mins


Total Amount scraped :-   724
Current Category:-  men-footwear

Time Elapsed:-  0.14 mins


Total Amount scraped :-   728
Current Category:-  men-wallets

Time Elapsed:-  0.15 mins


Total Amount scraped :-   743
Current Category:-  men-bags

Time Elapsed:-  0.17 mins


Total Amount scraped :-   809
Current Category:-  men-belts

Time Elapsed:-  0.2 mins


Total Amount scraped :-   839
Current Category:-  men-ties

Time Elapsed:-  0.21 mins


Total Amount scraped :-   842
Current Category:-  men-cufflinks

Time Elapsed:-  0.23 mins


Total Amount scraped :-   851
Current Category:-  men-pocket-squares

Time Elapsed:-  0.25 m

In [15]:
# Reload the scraped rows, drop exact duplicates and every row that has a
# missing value, then add an empty Affiliate_Link column.
df = pd.read_csv("bananarep.csv").drop_duplicates().dropna()
df["Affiliate_Link"] = [None] * len(df)
df

Unnamed: 0,Website,Product_Link,Product_Name,Product_Brand,Product_Category,Size_Avail,Price,MRP,Gender,Description,Primary_Image_Links,Secondary_Image_Links,Affiliate_Link
0,https://bananarepublic.gap.com,https://bananarepublic.gap.com/browse/product....,Organic Soft Wash Henley T-Shirt,Banana Republic,men-clothing,"Tops|size|Regular|XS|,Tops|size|Regular|XL|,To...",36.5,36.5,men,SOFT WASH: These shirts undergo a delicate was...,/webcontent/0026/927/786/cn26927786.jpg,https://bananarepublic.gap.com/webcontent/0026...,
1,https://bananarepublic.gap.com,https://bananarepublic.gap.com/browse/product....,Untucked Slim-Fit Linen-Cotton Shirt,Banana Republic,men-clothing,"Tops|size|Regular|M|,Tops|size|Regular|XXL|,To...",69.5,69.5,men,LINEN & COTTON: A match made in heaven this s...,/webcontent/0026/929/387/cn26929387.jpg,https://bananarepublic.gap.com/webcontent/0026...,
2,https://bananarepublic.gap.com,https://bananarepublic.gap.com/browse/product....,Organic Crew-Neck T-Shirt,Banana Republic,men-clothing,"Tops|size|Regular|XXL|,Tops|size|Tall|M|,Tops|...",34.5,34.5,men,Better cotton better planet,/webcontent/0027/248/766/cn27248766.jpg,https://bananarepublic.gap.com/webcontent/0027...,
3,https://bananarepublic.gap.com,https://bananarepublic.gap.com/browse/product....,Authentic SUPIMA® T-Shirt Hoodie,Banana Republic,men-clothing,"Tops|size|Regular|XL|,Tops|size|Regular|XS|,To...",59.5,59.5,men,SUPIMA® COTTON: Soft strong and American gro...,/webcontent/0026/937/790/cn26937790.jpg,https://bananarepublic.gap.com/webcontent/0027...,
4,https://bananarepublic.gap.com,https://bananarepublic.gap.com/browse/product....,Organic Shirt Jacket,Banana Republic,men-clothing,"Tops|size|Regular|S|,Tops|size|Tall|XXL|,Tops|...",98.5,98.5,men,Specially washed for added softness this vers...,/webcontent/0020/702/373/cn20702373.jpg,https://bananarepublic.gap.com/webcontent/0020...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2886,https://bananarepublic.gap.com,https://bananarepublic.gap.com/browse/product....,Stretch Leather Legging,Banana Republic,women-gloves,"Bottoms|size|Regular|20|XXL,Bottoms|inseam|Reg...",598.0,598.0,women,Stand apart in these exquisite leather legging...,/webcontent/0027/347/012/cn27347012.jpg,https://bananarepublic.gap.com/webcontent/0027...,
2902,https://bananarepublic.gap.com,https://bananarepublic.gap.com/browse/product....,Metallic Strappy Flat Sandal,Banana Republic,women-rings,"Shoes|size|Regular|7|,Shoes|size|Regular|9|,Sh...",88.0,88.0,women,So summer,/webcontent/0018/852/752/cn18852752.jpg,https://bananarepublic.gap.com/webcontent/0018...,
2992,https://bananarepublic.gap.com,https://bananarepublic.gap.com/browse/product....,Textured Basic Trouser Sock,Banana Republic,women-socks,Socks & Tights|size|Regular|One Size|,6.0,6.0,women,Lightweight cotton blend socks with angled toe...,/webcontent/0013/910/160/cn13910160.jpg,https://bananarepublic.gap.com/webcontent/0013...,
3006,https://bananarepublic.gap.com,https://bananarepublic.gap.com/browse/product....,Textured Bootie Sock,Banana Republic,women-socks,Socks & Tights|size|Regular|One Size|,5.0,5.0,women,Lightweight cotton blend socks with angled toe...,/webcontent/0013/910/140/cn13910140.jpg,https://bananarepublic.gap.com/webcontent/0013...,


In [16]:
conn = sqlite3.connect('BananaRepublic.db')
c = conn.cursor()
# IF NOT EXISTS keeps this cell re-runnable: the database file persists between
# runs, so a plain CREATE TABLE raises "table product_details already exists"
# the second time the notebook is executed.
c.execute('CREATE TABLE IF NOT EXISTS product_details (Website varchar(40) NOT NULL, Product_Link TEXT PRIMARY KEY,Product_Name varchar(50) NOT NULL,Product_Brand varchar(50) NOT NULL,Product_Category varchar(50),Size_Avail varchar(20) NOT NULL,Price int NOT NULL,MRP int NOT NULL,Gender varchar(15) NOT NULL,Description TEXT NOT NULL,Primary_Image_Links TEXT NOT NULL,Secondary_Image_Links TEXT NOT NULL,Affiliate_Link TEXT NOT NULL)')
conn.commit()
# NOTE(review): if_exists='replace' drops the table created above and recreates
# it from the DataFrame, so the PRIMARY KEY / NOT NULL constraints declared in
# the CREATE TABLE statement do not apply to the stored data. Switching to
# 'append' would enforce them, but Affiliate_Link is entirely None here and
# would violate its NOT NULL constraint.
df.to_sql('product_details', conn, if_exists='replace', index = False)

In [17]:
# Show the first 20 rows of the product table as raw tuples
c.execute('''
SELECT * FROM product_details
          ''')

for row in c.fetchmany(20):
    print(row)

('https://bananarepublic.gap.com', 'https://bananarepublic.gap.com/browse/product.do?pid=744184002', 'Organic Soft Wash Henley T-Shirt', 'Banana Republic', 'men-clothing', 'Tops|size|Regular|XS|,Tops|size|Regular|XL|,Tops|size|Regular|L|,Tops|size|Tall|M|,Tops|size|Tall|L|,Tops|size|Tall|XL|,Tops|size|Regular|M|,Tops|size|Regular|XXL|,Tops|size|Tall|XXL|,Tops|size|Regular|S|', 36.5, 36.5, 'men', 'SOFT WASH: These shirts undergo a delicate wash cycle for 60 minutes followed by an enzyme wash  giving the shirt a softer  worn-in finish', '/webcontent/0026/927/786/cn26927786.jpg', 'https://bananarepublic.gap.com/webcontent/0026/926/128/cn26926128.jpg,https://bananarepublic.gap.com/webcontent/0026/927/786/cn26927786.jpg,https://bananarepublic.gap.com/webcontent/0026/926/138/cn26926138.jpg,https://bananarepublic.gap.com/webcontent/0026/926/157/cn26926157.jpg', None)
('https://bananarepublic.gap.com', 'https://bananarepublic.gap.com/browse/product.do?pid=788558032', 'Untucked Slim-Fit Linen-C