In [None]:
# Import necessary libraries
import undetected_chromedriver as uc  # This module allows using Chrome in undetected mode to bypass bot protection
from selenium.webdriver.common.by import By  # Used for locating elements on the webpage
from selenium.webdriver.support.ui import WebDriverWait  # Used to introduce explicit waits in the script
from selenium.webdriver.support import expected_conditions as EC  # Conditions for WebDriverWait
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as bs  # BeautifulSoup is used for parsing and scraping HTML content
import time  # Used for adding delays between actions
import random  # Used for generating random wait times for mimicking human-like behavior

import urllib.request, urllib.error, urllib.parse # Importing the library to load URLS
import requests

import pandas as pd
import numpy as np

from scipy import stats

### Setting up a Selenium web driver instance to collect the links to every product (PID) on the Men's Cotton T-Shirt product listing page (PLP) from Macy's

In [None]:
# Build the Chrome options for the undetected-chromedriver session used on the Macy's listing pages
options = uc.ChromeOptions()
for chrome_flag in (
    '--no-sandbox',             # Disable sandbox for compatibility issues
    '--disable-dev-shm-usage',  # Avoid shared memory issues, especially in cloud-based environments
):
    options.add_argument(chrome_flag)
# Headless mode is left off for this session: the '--headless' flag is not added.

# Resolve a chromedriver binary through webdriver-manager, then start the browser with the options above.
# NOTE(review): undetected_chromedriver normally manages its own patched driver binary;
# confirm that the `service` argument is actually honoured by uc.Chrome.
chrome_service = Service(ChromeDriverManager().install())
driver = uc.Chrome(service=chrome_service, options=options)

# Function to scroll down slowly
def slow_scroll(driver, scroll_height, increment=600, pause_range=(0.1, 0.3)):
    """Scroll the page down step by step until `scroll_height` pixels are covered.

    Each step runs `window.scrollBy(0, increment)` in the browser and then sleeps
    for a random duration so the scrolling looks less mechanical and lazy-loaded
    content has a chance to render.

    Args:
        driver: Object exposing `execute_script(script)` (e.g. a Selenium WebDriver).
        scroll_height: Total number of pixels to cover. Values <= 0 result in no scrolling.
        increment: Pixels scrolled per step. Defaults to 600 (the original hard-coded step).
        pause_range: `(min_seconds, max_seconds)` bounds of the random pause after
            each step. Defaults to (0.1, 0.3).

    Raises:
        ValueError: If `increment` is not positive (the loop would never terminate).
    """
    if increment <= 0:
        raise ValueError("increment must be a positive number of pixels")

    min_pause, max_pause = pause_range
    current_height = 0
    while current_height < scroll_height:
        # Scroll relative to the current position rather than to an absolute offset
        driver.execute_script(f"window.scrollBy(0, {increment});")
        current_height += increment
        # Random pause between steps to mimic human behavior
        time.sleep(random.uniform(min_pause, max_pause))

product_links = []  # List to store product links scraped from each page

# Listing pages to scrape (one URL per results page)
urls = ['https://www.macys.com/shop/featured/mens-cotton-t-shirt/Pageindex/1?ss=true',
        'https://www.macys.com/shop/featured/mens-cotton-t-shirt/Pageindex/2?ss=true']

# try/finally guarantees the browser is closed even if a page fails to load or the
# title check below raises; otherwise a Chrome process would be left running.
try:
    # Loop through each URL in the 'urls' list
    for url in urls:
        driver.get(url)  # Open the URL in the browser
        assert 'Macy' in driver.title  # Ensure the page has loaded by checking the title contains 'Macy'

        # Get the total height of the page using JavaScript
        total_height = driver.execute_script("return document.body.scrollHeight")

        # Scroll down slowly until the (initially measured) bottom of the page
        slow_scroll(driver, total_height)
        # Pause for a random duration (3-7 seconds) after scrolling to allow the page to load completely
        time.sleep(random.uniform(3, 7))

        # Get the HTML source of the page after scrolling and parse it with BeautifulSoup
        html = driver.page_source
        soup = bs(html, 'html.parser')

        # Each div with the class 'description-spacing' is expected to wrap one product link
        for html_class in soup.find_all('div', class_="description-spacing"):
            sublink = html_class.find('a')  # find() returns None when the div has no 'a' tag
            link = sublink.get('href') if sublink is not None else None
            if not link:
                # Skip cards without a usable link instead of crashing or storing None
                continue
            print(link)  # Print the link (for debugging purposes)
            product_links.append(link)
finally:
    # Close the browser once the scraping is done (or has failed)
    driver.quit()


/shop/product/calvin-klein-mens-5-pk.-cotton-classics-crew-neck-undershirts-created-for-macys?ID=4864044&swatchColor=White
/shop/product/champion-mens-cotton-jersey-t-shirt?ID=4339197&swatchColor=Oxford Gray
/shop/product/polo-ralph-lauren-mens-classic-fit-jersey-crewneck-t-shirt?ID=13738535&swatchColor=Blue
/shop/product/lacoste-mens-classic-crew-neck-soft-pima-cotton-t-shirt?ID=19588855&swatchColor=Black
/shop/product/calvin-klein-mens-smooth-cotton-solid-crewneck-t-shirt?ID=13365087&swatchColor=Brilliant White
/shop/product/calvin-klein-mens-cotton-classics-3-pk.-crewneck-t-shirts?ID=11597359
/shop/product/mens-thistletown-hills-t-shirt?ID=12732051&swatchColor=Mountain Red
/shop/product/nautica-mens-classic-fit-solid-crew-neck-pocket-t-shirt?ID=1498845&swatchColor=True Black
/shop/product/hanes-mens-ultimate-6pk.-crewneck-undershirts?ID=13632373&swatchColor=White
/shop/product/alfani-mens-solid-supima-blend-crewneck-t-shirt-created-for-macys?ID=12535684&swatchColor=Navy Blue
/shop/p

### Using Requests and Beautiful Soup to scrape the price of every product in our list

In [None]:
# Browser-like HTTP request headers used for the product-page requests.
# A single profile (desktop Safari on macOS) is defined, stored as the only list element.
headers_list = [
    {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/605.1.15 (KHTML, like Gecko) "
            "Version/17.0 Safari/605.1.15"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Referer": "https://www.google.com/",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    },
]

In [None]:
%%time

product_price_list = []
product_id_list = []
# i = 0
for link in product_links:
    product_link_mod = link.replace(' ', '%20')
    link_final = 'https://www.macys.com' + product_link_mod
    print(link_final)
    headers = headers_list[0]
    r = requests.Session()
    r.headers = headers
    product_html = r.get(link_final).text

    soup = bs(product_html, 'html.parser')
    relevant = soup.find('span', class_='price-red price-lg')
    # if condition because the class is different for some products (discounted products have price in red and rest have it in black)
    if relevant is None:
        relevant = soup.find('span', class_='price-lg')

    # getting the product price
    product_price = relevant.text
    product_price = float(product_price.split('$')[1])

    # getting the product ID
    pid = link.split('&swatchColor')[0].split('ID=')[-1]
    pid = pid.split('&tdp=cm_app')[0]

    product_price_list.append(product_price)
    product_id_list.append(pid)
    time.sleep(random.uniform(3, 7))


https://www.macys.com/shop/product/calvin-klein-mens-5-pk.-cotton-classics-crew-neck-undershirts-created-for-macys?ID=4864044&swatchColor=White
https://www.macys.com/shop/product/champion-mens-cotton-jersey-t-shirt?ID=4339197&swatchColor=Oxford%20Gray
https://www.macys.com/shop/product/polo-ralph-lauren-mens-classic-fit-jersey-crewneck-t-shirt?ID=13738535&swatchColor=Blue
https://www.macys.com/shop/product/lacoste-mens-classic-crew-neck-soft-pima-cotton-t-shirt?ID=19588855&swatchColor=Black
https://www.macys.com/shop/product/calvin-klein-mens-smooth-cotton-solid-crewneck-t-shirt?ID=13365087&swatchColor=Brilliant%20White
https://www.macys.com/shop/product/calvin-klein-mens-cotton-classics-3-pk.-crewneck-t-shirts?ID=11597359
https://www.macys.com/shop/product/mens-thistletown-hills-t-shirt?ID=12732051&swatchColor=Mountain%20Red
https://www.macys.com/shop/product/nautica-mens-classic-fit-solid-crew-neck-pocket-t-shirt?ID=1498845&swatchColor=True%20Black
https://www.macys.com/shop/product/

In [None]:
# Pair each scraped product ID with its price and load the pairs into a two-column DataFrame
macys_records = list(zip(product_id_list, product_price_list))
macys_data = pd.DataFrame(macys_records, columns=['product_id', 'price'])
macys_data

Unnamed: 0,product_id,price
0,4864044,38.92
1,4339197,14.00
2,13738535,20.99
3,19588855,42.00
4,13365087,24.49
...,...,...
115,4687960,23.09
116,18201994,12.99
117,19620458,20.99
118,17103364,9.10


### Setting up a Selenium web driver instance to collect the links to every product (PID) on the Men's Cotton T-Shirt product listing page (PLP) from Gap

In [None]:
# Set up Chrome options for undetected-chromedriver
options = uc.ChromeOptions()
options.add_argument('--no-sandbox')  # Disable sandbox for compatibility issues
options.add_argument('--disable-dev-shm-usage')  # Avoid shared memory issues, especially in cloud-based environments
options.add_argument('--headless')  # Run without a visible browser window

# Initialize undetected Chrome WebDriver with the options specified
driver = uc.Chrome(service=Service(ChromeDriverManager().install()), options=options)

product_links_gap = []  # List to store product links scraped from the search results page

url_gap = 'https://www.gap.com/browse/search.do?searchText=mens%20cotton%20t%20shirt'
SCROLL_PAUSE_TIME = 5  # Seconds to wait after each scroll so newly loaded products can render

# try/finally guarantees the browser is closed even if loading or the title check fails;
# otherwise a headless Chrome process would be left running.
try:
    driver.get(url_gap)  # Open the URL in the browser
    assert 'Gap' in driver.title  # Ensure the page has loaded by checking the title contains 'Gap'

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")
    print(last_height)
    # Keep jumping to the bottom until the page height stops growing
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        print(new_height)
        if new_height == last_height:
            break
        last_height = new_height

    html_gap = driver.page_source  # Get the html of the fully scrolled page
finally:
    # Close the browser once the page source has been captured (or on failure)
    driver.quit()

# Parse the HTML using BeautifulSoup for scraping content
soup_gap = bs(html_gap, 'html.parser')

# Find all div elements with the class 'product-card css-zu5l9z' (these contain product links)
# NOTE(review): 'css-zu5l9z' looks like a generated class name and may change — confirm before re-running
for html_class_gap in soup_gap.find_all('div', class_="product-card css-zu5l9z"):
    sublink_gap = html_class_gap.find('a')  # find() returns None when the card has no 'a' tag
    link_gap = sublink_gap.get('href') if sublink_gap is not None else None
    if not link_gap:
        # Skip cards without a usable link instead of crashing or storing None
        continue
    print(link_gap)  # Print the link (for debugging purposes)
    product_links_gap.append(link_gap)


1166
9202
18662
18662
https://www.gap.com/browse/product.do?pid=796255112&vid=1&searchText=mens%20cotton%20t%20shirt
https://www.gap.com/browse/product.do?pid=878111272&vid=1&searchText=mens%20cotton%20t%20shirt
https://www.gap.com/browse/product.do?pid=540609042&vid=1&searchText=mens%20cotton%20t%20shirt
https://www.gap.com/browse/product.do?pid=428038022&vid=1&searchText=mens%20cotton%20t%20shirt
https://www.gap.com/browse/product.do?pid=540610022&vid=1&searchText=mens%20cotton%20t%20shirt
https://www.gap.com/browse/product.do?pid=440775002&vid=1&searchText=mens%20cotton%20t%20shirt
https://www.gap.com/browse/product.do?pid=855773032&vid=1&searchText=mens%20cotton%20t%20shirt
https://www.gap.com/browse/product.do?pid=500099012&vid=1&searchText=mens%20cotton%20t%20shirt
https://www.gap.com/browse/product.do?pid=521667012&vid=1&searchText=mens%20cotton%20t%20shirt
https://www.gap.com/browse/product.do?pid=606222052&vid=1&searchText=mens%20cotton%20t%20shirt
https://www.gap.com/browse/p

In [None]:
# Number of Gap product links collected from the search results page
len(product_links_gap)

159

### Using Requests and Beautiful Soup to scrape the price of every product in our list

In [None]:
%%time

product_price_list_gap = []
product_id_list_gap = []
i = 0
for link_gap in product_links_gap:

    headers = headers_list[0]
    r = requests.Session()
    r.headers = headers
    product_html_gap = r.get(link_gap).text

    soup_gap = bs(product_html_gap, 'html.parser')
    relevant_gap = soup_gap.find('span', class_='pdp-pricing--highlight pdp-pricing__selected pdp-mfe-1lkqao0')
    # if condition because the class is different for some products (discounted products have price in red and rest have it in black)
    if relevant_gap is None:
        relevant_gap = soup_gap.find('span', class_='pdp-pricing__selected pdp-mfe-1lkqao0')

    # getting the product price
    product_price_gap = relevant_gap.text
    product_price_gap = float(product_price_gap.split('$')[1])

    # getting the product ID

    pid_gap = link_gap.split('&vid')[0].split('pid=')[-1]
    pid_gap = pid_gap.split('&tdp=cm_app')[0]

    product_price_list_gap.append(product_price_gap)
    product_id_list_gap.append(pid_gap)
    time.sleep(random.uniform(3, 7))
    i=i+1
    print(i) # for debugging purposes

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
CPU times: total: 5.98 s
Wall time: 18min 57s


In [None]:
# Pair each scraped product ID with its price and load the pairs into a two-column DataFrame
gap_records = list(zip(product_id_list_gap, product_price_list_gap))
gap_data = pd.DataFrame(gap_records, columns=['product_id', 'price'])
gap_data

Unnamed: 0,product_id,price
0,796255112,14.00
1,878111272,12.00
2,540609042,14.00
3,428038022,17.00
4,540610022,14.00
...,...,...
154,432513022,16.99
155,682085012,8.00
156,881137022,54.99
157,876167002,34.95


### Running a t-test on the sample data from both companies to check whether the difference in mean price is statistically significant

In [None]:
# Take the first 100 rows (by position) of each retailer's price table as the comparison groups
macys_data_sample = macys_data.iloc[:100]
gap_data_sample = gap_data.iloc[:100]

In [None]:
# Mean price of each sample, displayed as a (Gap, Macy's) tuple
gap_data_sample['price'].mean(), macys_data_sample['price'].mean()

(20.646900000000002, 26.3373)

In [None]:
# Independent two-sample t-test on the Gap and Macy's price samples.
# With alternative='less', the alternative hypothesis is that the mean Gap price is
# lower than the mean Macy's price (first argument vs. second argument).
# NOTE(review): ttest_ind assumes equal variances by default (equal_var=True); consider
# equal_var=False (Welch's t-test) if the two price variances differ — confirm.
stats.ttest_ind(gap_data_sample['price'], macys_data_sample['price'], alternative='less')

TtestResult(statistic=-3.6374099236780477, pvalue=0.0001756105785187589, df=198.0)