# Initializing Libraries and tools

In [393]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from bs4 import BeautifulSoup
import requests

import pandas as pd
import time
from urllib.parse import urlparse
import re



# Functions to generate the product link

In [394]:
def get_html_content(url):
    # set Chrome options to set language to English
    options = Options()
#     options.add_argument("--lang=en-US")
    
    prefs = {
      "translate_whitelists": {"ja":"en"},
      "translate":{"enabled":"true"}
    }
    options.add_experimental_option("prefs", prefs)
    # initialize the webdriver with options
#     driver = webdriver.Chrome(executable_path='C:/Users/najmus/Documents/chromedriver/chromedriver.exe', options=options)
    driver = webdriver.Chrome(executable_path='/Users/pcworld/chromedriver/chromedriver', options=options)
    
    # send a request to the URL
    driver.get(url)
    
    # retrieve the page source
    html_content = driver.page_source
    
    # close the webdriver
    driver.quit()
    
    # return the HTML content
    return html_content


def get_divs_with_class(response, class_name):
    # parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response, 'html.parser')
    
    # find all div elements with the specified class
    divs = soup.find_all('div', {'class': class_name})
    
    products = []
    
    for div in divs:
        products.append(div)
    
    # return the list of attribute values
    return products

def get_anchor_with_class(response, class_name, attribute):
    # parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response, 'html.parser')
    
    # find all div elements with the specified class
    divs = soup.find_all('div', {'class': class_name})
    
    products_id = []
    
    for div in divs:
        anchor = div.find('a')
        if anchor:
            if anchor.get(attribute) in products_id:
                continue
            products_id.append(anchor.get(attribute))
    
    # return the list of attribute values
    return products_id

# Testing the functions

In [395]:
response = get_html_content('https://shop.adidas.jp/item/?gender=mens&category=footwear&order=1&page=1')
href = get_anchor_with_class(response,'itemCardArea-cards test-card css-dhpxhu','href')
# products = get_divs_with_class(response,'itemCardArea-cards test-card css-dhpxhu')
# products_id = get_divs_with_class(response,'itemCardArea-cards test-card css-dhpxhu','data-ga-eec-product-id')
print(href)
# len(products_id)



  del sys.path[0]


['/products/ID4770/', '/products/IE1667/', '/products/GV9063/', '/products/HP7598/', '/products/HP3021/', '/products/HR0718/', '/products/HR0725/', '/products/HR0728/', '/products/FZ6262/', '/products/H06187/', '/products/HQ4422/', '/products/HQ6498/']


# Function to generate the individual product url 

In [396]:
def make_product_link(url,base_url_product,items_count):
    page_count = 1
    item = 1
    total_product_count = 0
    base = base_url_product
    product_url = []
    while total_product_count < items_count:
        
        link = url + str(page_count)
        response = get_html_content(link)
        paths = get_anchor_with_class(response,'itemCardArea-cards test-card css-dhpxhu','href')
        for path in paths:
            full_link = base + path
#             print(full_link)
            if full_link in product_url:
                continue
            product_url.append(full_link)
            if len(product_url) == items_count:
                break
            
        total_product_count = len(product_url)
        
        page_count+=1
        item +=1
        
    return product_url
    
    

# Calling the product link generator and making the list of product urls

In [361]:
link = 'https://shop.adidas.jp/item/?gender=mens&category=wear&order=1&page='
product_url = make_product_link(link,'https://shop.adidas.jp',300)



In [388]:
product_url


['https://shop.adidas.jp/products/IC1986/',
 'https://shop.adidas.jp/products/IC1987/',
 'https://shop.adidas.jp/products/IC1999/',
 'https://shop.adidas.jp/products/IC1998/',
 'https://shop.adidas.jp/products/HY5372/',
 'https://shop.adidas.jp/products/HY4691/',
 'https://shop.adidas.jp/products/HS7581/',
 'https://shop.adidas.jp/products/HS7597/',
 'https://shop.adidas.jp/products/IC2429/',
 'https://shop.adidas.jp/products/IC2430/',
 'https://shop.adidas.jp/products/H63016/',
 'https://shop.adidas.jp/products/IA1453/',
 'https://shop.adidas.jp/products/HS2092/',
 'https://shop.adidas.jp/products/HZ4162/',
 'https://shop.adidas.jp/products/IC5725/',
 'https://shop.adidas.jp/products/HR3319/',
 'https://shop.adidas.jp/products/IC8879/',
 'https://shop.adidas.jp/products/IC5737/',
 'https://shop.adidas.jp/products/IC5738/',
 'https://shop.adidas.jp/products/HZ1155/',
 'https://shop.adidas.jp/products/IC8861/',
 'https://shop.adidas.jp/products/IC6007/',
 'https://shop.adidas.jp/product

# Checking the data duplicate links

In [363]:
from collections import Counter

# create a sample list
my_list = product_url

# count the frequency of each element in the list
counts = Counter(my_list)

# create a list of duplicates and their frequency
duplicates = [(value, count) for value, count in counts.items() if count > 1]

# print the duplicates and their frequency
print('Duplicate Values and their Frequency:')
for value, count in duplicates:
    print(value, ':', count)


Duplicate Values and their Frequency:


# Functions to crawl over the web pages and extract desired informations

In [381]:
def parse_sizes(html):
    sizes = []
    soup = BeautifulSoup(html, 'html.parser')
    size_buttons = soup.find_all('button', {'type': 'button'})
    for button in size_buttons:
        if 'disable' not in button.get('class', []):
            size = button.text.strip()
            sizes.append(size)
    return sizes

def parse_path(html):
    sizes = []
    soup = BeautifulSoup(html, 'html.parser')
    paths = soup.find_all('a', {'class': 'breadcrumbListItemLink'})
    p = '/'
    for i, path in enumerate(paths):
        if i != 0:  # Skip the first path
            size = path.text.strip()
            p = p + str(size) + '/'
            sizes.append(size)
    return p

def get_element(soup,tag,class_name):
    soup = soup
    if soup:
        element = soup.find(tag, {'class': class_name})
        element_text = element.text.strip() if tag else None
    else:
        element_text = None
    
    return element_text

def driver_config():
    
    options = Options()
    options.add_argument("--lang=en-US")

    prefs = {
      "translate_whitelists": {"ja":"en"},
      "translate":{"enabled":"true"}
    }
    options.add_experimental_option("prefs", prefs)
    
    driver = webdriver.Chrome(executable_path='/Users/pcworld/chromedriver/chromedriver', options=options)
    time.sleep(15)
    
    return driver

# def scroll_and_delay(driver):
#     driver = driver
#     time.sleep(5)
    
#     driver.execute_script("window.scrollBy(0, 1200)")
#     time.sleep(2)
#     driver.execute_script("window.scrollBy(0, 300)")
    
# #     element = driver.find_element("id", "BVRRContainer")
# #     driver.execute_script("arguments[0].scrollIntoView();", element)
#     time.sleep(5)
#     driver.execute_script("window.scrollBy(0, 300)")
#     time.sleep(2)
    
#     element = driver.find_element("id", "BVRRContainer")
#     driver.execute_script("arguments[0].scrollIntoView();", element)
    
#     time.sleep(3)

def sizechart(soup):

    soup = soup
    try:
        size_chart_div = soup.find("div", class_="sizeChart test-sizeChart css-l7ym9o")
        table = size_chart_div.find("table", class_="sizeChartTable")
        thead = table.find("thead", class_="sizeChartTHeader")
        header_row = thead.find_all("tr")[1]

        headers = []

        for th in table.find_all('th', {'class': True }):
            if th.text:
                headers.append(th.text.strip())
            else:
                headers.append('size')

        tbody = size_chart_div.find("tbody")
        rows = []
        for tr in tbody.find_all("tr"):
            row_data = [td.text.strip() for td in tr.find_all("td")]
            row_data = '-'.join(row_data).replace(" ", "").replace(".", "")
            rows.append(row_data)

        data = []
        data.append(str(headers))
        data.append(str(rows))
    except:
        data = ['null','null']
#     df = pd.DataFrame([data], columns=header)
    
    return data
def review_parser(soup,p_id):
#     soup = BeautifulSoup(html, 'html.parser')
    soup = soup
    review = {}
    df = []
    
    review['p_id'] = p_id
    df.append(p_id)
    
    rating = soup.find('div', {'class': 'BVRRRatingNormalImage'}).img['alt']
#     alt_value = soup.find('div', {'class': 'BVRRRatingNormalImage'}).img['alt']

    review['rating'] = rating
    df.append(rating)

    title = soup.find('span', {'class': 'BVRRReviewTitle'}).text
    review['title'] = title
    df.append(title)

    date = soup.find('span', {'class': 'BVRRReviewDate'}).text
    review['date'] = date
    df.append(date)

    full_review = soup.find('div', class_='BVRRReviewText').text
    review['full_review'] = full_review
    df.append(full_review)
    
    
    user = soup.find('span', class_='BVRRValue BVRRUserNickname').text
    review['user'] = user
    df.append(user)
    
    recommendations = soup.find('div', class_='BVRRReviewDisplayStyle5Recommendations').text
    review['recommendations'] = recommendations
    df.append(recommendations)
    
    return review

def get_image_url(p_id,soup):
    soup=soup
    image_srcs = []
    df = {}
    df['p_id']=p_id
    soup = soup.find('ul', {'class': 'slider-list test-slider-list'})
    for img in soup.find_all('img'):
        src = img.get('src')
        if src:
            image_srcs.append('https://shop.adidas.jp'+ src)
    df['resource_url'] = str(image_srcs)
    return df

#     try:
#         image_urls = soup.find_all('ul', {'class': 'slider-list test-slider-list'}).img['src']
#         for src in image_urls:
#             img_srcs.append(src)
#     except:
#         image_urls = soup.find('ul', {'class': 'slider-list test-slider-list'}).img['src']
#         img_srcs.append(image_urls)
        
#     return img_srcs

def click_all_tabs(driver):
    driver = driver

    tab_labels = driver.find_elements(By.CLASS_NAME, 'tabLink')
    driver.execute_script("window.scrollBy(0, 600)")
    for label in tab_labels:
        if label.text in ['Featured Products','styling','Other popular products']:
            continue
        data_href = label.get_attribute('data-href')
#         print(label.text)
        label.click()  # click the tab label
        time.sleep(10)  # pause for 5 seconds

def parse_span_class(html):
    pattern = r'mod-marker_(\d+_\d+)'
    match = re.search(pattern, str(html))
    if match:
        return match.group(1)
    else:
        return None
    
def parse_description_text(p_id,soup):
    soup = soup
    product_description = {}
    product_description['p_id']=p_id
    heading = soup.find('h4', class_='heading itemFeature test-commentItem-subheading')
    part1 = soup.find('div', class_='description_part details test-itemComment-descriptionPart')
    part2 = soup.find('ul', class_='articleFeatures description_part css-1lxspbu')
    bullets = part2.find_all('li')
    des_list = []
    for li in bullets:
        des_list.append(li.text.strip())
    
    if heading:
        product_description['Title_Description']=heading.text.strip()
    else:
        product_description['Title_Description']=None
    if part1:    
        product_description['Detailed_Description']=part1.text.strip()
    else:
        product_description['Detailed_Description']=None
    if part2:
        product_description['Summary_Description']=str(des_list)
    else:
        product_description['Summary_Description']=None
#     text = part1.text.strip()
    return product_description

def parse_tags(soup):
    soup = soup
    tags = []
    soup = soup.find('div',{'class':'itemTagsPosition'})
    for link in soup.find_all('a'):
        tag = link.text.strip()
        if tag:
            tags.append(tag)
    return ', '.join(tags)

def parse_product_info(p_id,soup):
    soup = soup
    product_list = []
    items = soup.find_all('div', {'class': 'coordinate_box'})
    for item in items:
        for li in item.find_all('li'):
            product = {}
            a = li.find('div', {'class': 'coordinate_item_tile test-coordinate_item_tile'})
            product['p_id'] = p_id
            product['coordinate_product_name'] = a.img['alt']
            product['coordinate_product_price'] = a['data-price']
            product['coordinate_product_id'] = a['data-articleid']
            product['coordinate_product_image_url'] = 'https://shop.adidas.jp' + a.img['src']
            product['coordinate_product_url'] = f"https://shop.adidas.jp/products/{a['data-articleid']}" + "/"
            product_list.append(product)
    return product_list

def parse_ratings(html):
    soup = html
    ratings = {}
    try:
        for entry in soup.find_all('div', {'class': 'BVRRRatingEntry'}):
            rating_type = entry.find('div', {'class': 'BVRRRatingHeader'}).text.strip()
            rating_values = entry.find('img', {'class': 'BVImgOrSprite'})
            rating_values = rating_values['alt']
            ratings[rating_type] = str(rating_values)
        return ratings
    except:
        return None

# Main crawler function that extracts informations from the webpages

In [382]:
def crawl_and_extract_data(links, class_name,span_class,salesst,size,c_path):
  
    #------------Driver Config-----------start---------

    driver = driver_config() #Initializing driver configs
    
    #------------Driver Config-----------end---------
    count = 1
    #Initializing dataframes to store extracted info
    data = []
    df = pd.DataFrame(columns=['Link','Product_id','Category_Name' ,'Name', 'Price','Product_Status','Available_Sizes','Size_sense','Path','Size_Attributes','Size_Attributes_Values','Rating','Sense_of_Rating','Recommended_rate','KWs'])
    reviews_df = pd.DataFrame(columns=['p_id', 'rating', 'title', 'date', 'full_review', 'user', 'recommendations'])
    product_description = pd.DataFrame(columns=['p_id','Title_Description', 'Detailed_Description', 'Summary_Description'])
    resources = pd.DataFrame(columns=['p_id', 'resource_url'])
    coordinate_products = pd.DataFrame(columns=['p_id', 'coordinate_product_name','coordinate_product_price','coordinate_product_id','coordinate_product_image_url','coordinate_product_url'])
    # Loop through each link and retrieve the desired elements with the specified class
    for link in links:
        
        try:
            driver.get(link) # Navigate to the link
                
            html_content = driver.page_source # Retrieve the page source
            soup = BeautifulSoup(html_content, 'html.parser') # Parse the page source with BeautifulSoup
            click_all_tabs(driver)
            if soup.find('div', {'class': 'errorMessage test-errorMessage'}):
                continue        
        except:
            continue
            

#         click_all_tabs(driver)
        
        html_content = driver.page_source # Retrieve the page source
    
        soup = BeautifulSoup(html_content, 'html.parser') # Parse the page source with BeautifulSoup
        
        #-------------Getting Product Info------------start------------------
        parsed_url = urlparse(link)
        product_id = parsed_url.path.split('/')[-2]
        
        description = parse_description_text(product_id,soup)
        product_description = product_description.append(description, ignore_index=True)
        
        cordinate_items = parse_product_info(product_id,soup)
        for item in cordinate_items:
            coordinate_products = coordinate_products.append(item, ignore_index=True)
        
        product_review = soup.find('div', {'class': 'BVRRDisplayContentBody'})
        
        if product_review:
            array = []
            for r in product_review:

                review = review_parser(r,product_id)
                reviews_df = reviews_df.append(review, ignore_index=True)

                array.append(str(review))
            
        product_size_chart = sizechart(soup)#SIZECHART
        product_name = get_element(soup,'h1',class_name)#NAME
        category_name = get_element(soup,'span','categoryName test-categoryName')
        product_price = get_element(soup,'div',span_class)#PRICE
        product_status = get_element(soup,'div',salesst)#SALESSTATUS
        
        #SIZE
        size_selector = soup.find('ul', {'class': size})
        sizes = parse_sizes(str(size_selector))
        
        #SIZE Sense
        if soup.find('div',{'class':'sizeFitBar'}):
            bar = soup.find('div',{'class':'sizeFitBar'})
            size_sense = parse_span_class(bar)
#             print(size_sense)
#             print(bar)
        else:
            size_sense = 'N/A'
        #BREADCRUM WRAP
        category_path = soup.find('div', {'class': c_path})
        path = parse_path(str(category_path))
        
        #Rating Informations
        try:
            rating_container = soup.find('div', {'class': 'BVRRQuickTakeCustomWrapper'})
            rating = rating_container.find('div', {'class': 'BVRRRatingNormalImage'}).img['alt']
    #         rating = get_element(rating_container,'span','BVRRRatingNormalImage') #BVRRRatingNormalImage
            recommended_rate = get_element(rating_container,'span','BVRRBuyAgainPercentage')    
            sense_of_rating = parse_ratings(soup.find('div',{'class':'BVRRRatingContainerRadio'}))
        except:
            rating = 'N/A'
            recommended_rate = 'N/A'
            sense_of_rating = 'N/A'
        
        #Resources urls
        image_urls = get_image_url(product_id,soup)
        resources = resources.append(image_urls, ignore_index=True)
        
        #KWs
        KWs = parse_tags(soup)
        df.append([link,product_id,category_name, product_name, product_price,product_status,str(sizes),size_sense,str(path),str(product_size_chart[0]),str(product_size_chart[1]),str(rating),str(sense_of_rating),str(recommended_rate),str(KWs)])

        #-------------Getting Product Info------------end------------------
        data = [link,product_id,category_name, product_name, product_price,product_status,str(sizes),size_sense,str(path),str(product_size_chart[0]),str(product_size_chart[1]),str(rating),str(sense_of_rating),str(recommended_rate),str(KWs)]
        df.loc[len(df)] = data
        
        print(count)
        count += 1
        #Saving the data to a spreadsheet every 5 iterations to avoid errors like Network and server issues
        if count % 5 == 0:
            writer = pd.ExcelWriter('adidas_japan.xlsx')

            # write each dataframe to a different sheet
            df.to_excel(writer, sheet_name='product_details', index=False)
            reviews_df.to_excel(writer, sheet_name='product_reviews', index=False)
            product_description.to_excel(writer, sheet_name='product_description', index=False)
            resources.to_excel(writer, sheet_name='resources', index=False)
            coordinate_products.to_excel(writer, sheet_name='coordinate_products_list', index=False)

            # save the writer
            writer.save()
            print('data saved upto' + '-' + str(count))
            time.sleep(10)
        
    # Close the webdriver
    driver.quit()
    
    return df,reviews_df,product_description,resources,coordinate_products


# Calling the main function to extract info

In [383]:
dataframe,review,product_description,resources,coordinate_products = crawl_and_extract_data(product_url, 'itemTitle test-itemTitle','articlePrice test-articlePrice css-1apqb46','articleSalesStatus css-prnrg3','sizeSelectorList','breadcrumb_wrap')



1
2
3
4
data saved upto-5
5
6
7
8
9
data saved upto-10
10
11
12
13
14
data saved upto-15
15
16
17
18
19
data saved upto-20
20
21
22
23
24
data saved upto-25
25
26
27
28
29
data saved upto-30
30
31
32
33
34
data saved upto-35
35
36
37
38
39
data saved upto-40
40
41
42
43
44
data saved upto-45
45
46
47
48
49
data saved upto-50
50
51
52
53
54
data saved upto-55
55
56
57
58
59
data saved upto-60
60
61
62
63
64
data saved upto-65
65
66
67
68
69
data saved upto-70
70
71
72
73
74
data saved upto-75
75
76
77
78
79
data saved upto-80
80
81
82
83
84
data saved upto-85
85
86
87
88
89
data saved upto-90
90
91
92
93
94
data saved upto-95
95
96
97
98
99
data saved upto-100
100
101
102
103
104
data saved upto-105
105
106
107
108
109
data saved upto-110
110
111
112
113
114
data saved upto-115
115
116
117
118
119
data saved upto-120
120
121
122
123
124
data saved upto-125
125
126
127
128
129
data saved upto-130
130
131
132
133
134
data saved upto-135
135
136
137
138
139
data saved upto-140
140
141
142


In [397]:
dataframe.head()

Unnamed: 0,Link,Product_id,Category_Name,Name,Price,Product_Status,Available_Sizes,Size_sense,Path,Size_Attributes,Size_Attributes_Values,Rating,Sense_of_Rating,Recommended_rate,KWs
0,https://shop.adidas.jp/products/IC1986/,IC1986,adidas terrex,National Geographic Short Sleeve T-Shirt,5170,NEW,"['S.', 'M.', 'L.', 'XL', '2XL']",,/mens/outdoors/wear/clothes/tops/T-shirt/Adida...,"['size', 'chest circumference', 'Back length',...","['S-M-L-XL-2XL', '98cm-103cm-105cm-110cm-116cm...",,,,"wear/clothes, tops, T-shirt, Adidas Terrex, ou..."
1,https://shop.adidas.jp/products/IC1987/,IC1987,adidas terrex,National Geographic Short Sleeve T-Shirt,5170,NEW,"['S.', 'M.', 'L.', 'XL', '2XL']",,/mens/outdoors/wear/clothes/tops/T-shirt/Adida...,"['size', 'chest circumference', 'Back length',...","['S-M-L-XL-2XL', '98cm-103cm-105cm-110cm-116cm...",,,,"wear/clothes, tops, T-shirt, Adidas Terrex, ou..."
2,https://shop.adidas.jp/products/IC1999/,IC1999,adidas terrex,National Geographic Twill Pants,12100,NEW,"['S.', 'M.', 'L.', 'XL', '2XL']",,/mens/outdoors/wear/clothes/bottoms/pants/Adid...,"['size', 'waist', 'inseam', 'Rise', 'hip', 'ar...","['S-M-L-XL-2XL', '73cm-78cm-80cm-85cm-91cm', '...",,,,"ウェア・服, ボトムス, パンツ, アディダス テレックス, アウトドア, NATIONAL..."
3,https://shop.adidas.jp/products/IC1998/,IC1998,adidas terrex,national geographic shorts,7700,NEW,"['S.', 'M.', 'L.', 'XL', '2XL']",,/mens/outdoors/wear/clothes/bottoms/Shorts/sho...,"['size', 'waist', 'inseam', 'Rise', 'hip', 'ar...","['S-M-L-XL-2XL', '74cm-79cm-81cm-86cm-92cm', '...",,,,"ウェア・服, ボトムス, ショートパンツ・短パン, アディダス テレックス, アウトドア, ..."
4,https://shop.adidas.jp/products/HY5372/,HY5372,golf,Energy flower print short-sleeved shirt,9350,NEW,"['S.', 'M.', 'L.', 'XL', '2XL']",,/mens/golf/wear/clothes/tops/polo shirt/Energy...,"['size', 'chest circumference', 'Back length',...","['S-M-L-XL-2XL', '98cm-103cm-105cm-110cm-116cm...",,,,"ウェア・服, トップス, ポロシャツ, ゴルフ, 半袖, コリン・モリカワ, プライムブルー..."


In [385]:
review
product_description
# resources
# coordinate_products

Unnamed: 0,p_id,Title_Description,Detailed_Description,Summary_Description
0,IC1986,AEROREADY short-sleeved shirt partially made f...,"Inspired by the outdoors, this adidas t-shirt ...","['regular fit', 'Crew neck', '91% recycled pol..."
1,IC1987,AEROREADY short-sleeved shirt partially made f...,"Inspired by the outdoors, this adidas t-shirt ...","['regular fit', 'Crew neck', '91% recycled pol..."
2,IC1999,Twill pants made from a blend of recycled and ...,Take a sightseeing tour of the city in the mor...,"['slim fit', 'Front zip fly / Buttoned', '63% ..."
3,IC1998,WIND. RDY shorts partially made from recycled ...,Created in collaboration with National Geograp...,"['slim fit', 'Front zip fly / Buttoned', '87% ..."
4,HY5372,A breathable golf polo shirt made partially fr...,Head into your next round in this shirt that l...,"['regular fit', '3-button polo collar', '88% r..."
...,...,...,...,...
228,IC2413,A basketball jacket made from a blend of recyc...,Introducing an adidas jacket that celebrates t...,"['loose fit', 'Full zip/ribbed stand-up collar..."
229,IC2415,Basketball pants made from a blend of recycled...,Celebrate the rich history of basketball style...,"['loose fit', 'Ribbed elastic waist with drawc..."
230,IC2420,A basketball hoodie made from a blend of recyc...,Celebrate the rich history of basketball style...,"['loose fit', '2-ply hood', '100% recycled pol..."
231,HS3041,The Daewon T-shirt is always in the sweet spot.,I have a weakness for sweet things. Adidas ska...,"['regular fit', 'Crew neck', '100% cotton (sin..."


# Saving extracted informations as a excel spreadsheets

In [322]:
writer = pd.ExcelWriter('adidas_japan.xlsx')

# write each dataframe to a different sheet
dataframe.to_excel(writer, sheet_name='product_details', index=False)
review.to_excel(writer, sheet_name='product_reviews', index=False)
product_description.to_excel(writer, sheet_name='product_description', index=False)
resources.to_excel(writer, sheet_name='resources', index=False)
coordinate_products.to_excel(writer, sheet_name='coordinate_products_list', index=False)

# save the writer
writer.save()

In [323]:
dataframe.to_csv('product_details_adidas.csv', index=False)
review.to_csv('review_adidas.csv', index=False)
product_description.to_csv('product_description_adidas.csv', index=False)
resources.to_csv('resources_adidas.csv', index=False)
coordinate_products.to_csv('coordinate_products_adidas.csv', index=False)


# Rest of the codes are on test purpose 

In [386]:
unique_names = dataframe['Product_id'].nunique()
print('Number of unique names:', unique_names)

Number of unique names: 233


In [387]:
duplicates = dataframe[dataframe.duplicated(['Product_id'], keep=False)]

# count the frequency of each duplicate
counts = duplicates['Product_id'].value_counts()

# print the duplicates and their frequency
print('Duplicate Names:')
print(counts)

Duplicate Names:
Series([], Name: Product_id, dtype: int64)


In [389]:
def parse_html_website(url):
    # Send a request to the website and parse the HTML with BeautifulSoup
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all the div elements with class attributes and extract their text and class names
    divs = soup.find_all('div', class_=True)
    div_text = []
    div_class = []
    
    # Loop through each div element and check if it has any text
    for div in divs:
        text = div.get_text(strip=True)
        if text:
            div_text.append(text)
            div_class.append(div['class'])
    
    # Create a Pandas DataFrame from the div text and class lists
    df = pd.DataFrame({'div_text': div_text, 'div_class': div_class})
    
    return df

In [390]:
df = parse_html_website('https://shop.adidas.jp/products/HC7132/')

In [391]:
df

Unnamed: 0,div_text,div_class
0,HeaderBanneradiClubヘルプ注文検索ログインiconUserメールアドレス登...,"[page, css-1umyepy]"
1,HeaderBanneradiClubヘルプ注文検索ログインiconUserメールアドレス登...,"[mainHeaderBody, test-mainHeaderBody]"
2,HeaderBanner,"[headerBanner, css-14g6l88]"
3,adiClubヘルプ注文検索ログインiconUser,"[personalMenu, personalMenuDesktop, css-1yqpnqo]"
4,メールアドレス登録するiconArrowCircleRight,"[test-newsLetter, css-l2i13c]"
...,...,...
87,⠀,"[inputText, css-krm055]"
88,iconArrowRight,[main_submit]
89,adiClub利用規約adidas Online Shop利用規約adidas Online...,[info]
90,adiClub利用規約adidas Online Shop利用規約adidas Online...,"[info_inner, clearfix]"


In [None]:
print(df)

In [None]:
from selenium.webdriver.common.action_chains import ActionChains

url = """<div class="sizeChart test-sizeChart css-l7ym9o"><table class="sizeChartTable"><thead class="sizeChartTHeader"><tr class="sizeChartTRow"><th class="sizeChartTHeaderCell test-combined_table_header"></th></tr><tr class="sizeChartTRow"><th class="sizeChartTHeaderCell test-combined_table_header"><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">chest circumference</font></font></th></tr><tr class="sizeChartTRow"><th class="sizeChartTHeaderCell test-combined_table_header"><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">Back length</font></font></th></tr></thead></table><table class="sizeChartTable"><tbody><tr class="sizeChartTRow"><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">XS</font></font></span></td><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">S.</font></font></span></td><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">M.</font></font></span></td><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">L.</font></font></span></td><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">O (XL)</font></font></span></td><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">XO (2XL)</font></font></span></td><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">2XO (3XL)</font></font></span></td></tr><tr class="sizeChartTRow"><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">98cm</font></font></span></td><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">102cm</font></font></span></td><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">106cm</font></font></span></td><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">110cm</font></font></span></td><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">114cm</font></font></span></td><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">118cm</font></font></span></td><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">122cm</font></font></span></td></tr><tr class="sizeChartTRow"><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">64cm</font></font></span></td><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">66cm</font></font></span></td><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">68cm</font></font></span></td><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">70cm</font></font></span></td><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">72cm</font></font></span></td><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">74cm</font></font></span></td><td class="sizeChartTCell"><span><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">76cm</font></font></span></td></tr></tbody></table></div>"""


def sizechart(url):

    driver = driver_config()

    # send a request to the URL
    driver.get(url)
    
    scroll_and_delay(driver)

    # retrieve the page source
    html_content = driver.page_source
    
    # close the webdriver
    driver.quit()
    
#     response = requests.get(url)
#     soup = BeautifulSoup(response.text, 'html.parser')

    # Create a BeautifulSoup object with the response text and specify the parser
    soup = BeautifulSoup(html_content, "html.parser")

    # Find the div containing the table and extract the table data
#     size_chart_div = soup.find('div', {'class' : 'sizeChart test-sizeChart css-l7ym9o' })

    size_chart_div = soup.find("div", class_="sizeChart test-sizeChart css-l7ym9o")
    table = size_chart_div.find("table", class_="sizeChartTable")
    thead = table.find("thead", class_="sizeChartTHeader")
    header_row = thead.find_all("tr")[1]
#     headers = [th.text.strip() for th in header_row.find_all("th")[1:]]
    
    headers = []
    for th in table.find_all('th', {'class': True }):
        if th.text:
            headers.append(th.text.strip())
        else:
            headers.append('size')

#     table = soup.find("table", class_="sizeChartTable")
#     thead = table.find("thead", class_="sizeChartTHeader")
#     header_row = thead.find_all("tr")
#     headers = [th.text.strip() for th in header_row.find_all("th")]
#     print(size_chart_div)
#     print('-----')
#     print(table)
#     print('-----')
#     print(thead)
#     print('-----')
#     print(header_row)
#     print('-----')
#     print(headers)

    tbody = size_chart_div.find("tbody")
    rows = []
    for tr in tbody.find_all("tr"):
        row_data = [td.text.strip() for td in tr.find_all("td")]
        row_data = '-'.join(row_data).replace(" ", "").replace(".", "")
        rows.append(row_data)
    
    data = []
    data.append(str(headers))
    data.append(str(rows))
#     print('-----')
#     print(rows)
#     print('-----')
#     print(data)
    header = ['attributes','values']
#     # Create a DataFrame with the extracted table data
    df = pd.DataFrame([data], columns=header)
#     df.set_index(headers[0], inplace=True)
    
    return data

In [None]:
url = 'https://shop.adidas.jp/products/HE7194/'
df = sizechart(url)

In [None]:
print(df)

In [None]:
url = 'https://shop.adidas.jp/products/IA3448/'
df1 = sizechart(url)

In [None]:
print(df1)

In [None]:
df

In [None]:
# merged_df = pd.merge(df, df1, on='attributes', how='outer')
# dfs = [df,df1]

# merged_df = pd.concat(dfs, axis=1, sort=False)
merged_df = pd.concat([df, df1], axis=0, ignore_index=True)

In [None]:
# merged_df = pd.merge(merged_df, df, on='size', how='outer')
merged_df

In [None]:
merged_df

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def click_all_tabs(link):
    driver = driver_config()
    driver.get(link)  # load the HTML code in the browser
    tab_labels = driver.find_elements(By.CLASS_NAME, 'tabLink')
    driver.execute_script("window.scrollBy(0, 600)")
    for label in tab_labels:
        data_href = label.get_attribute('data-href')
        label.click()  # click the tab label
        time.sleep(5)  # pause for 5 seconds
    driver.quit()  # close the browser


In [None]:
click_all_tabs('https://shop.adidas.jp/products/HS2081/')

In [195]:
import re

def parse_span_class(html):
    soup = BeautifulSoup(html, "html.parser")
    if soup.find('div',{'class':'bar'}):
        a = soup.find('div',{'class':'bar'})
        b = a.find('span')
        print(b)
        
    pattern = r'mod-marker_(\d+_\d+)'
    match = re.search(pattern, html)
    print(type(match.group(1)))
    
    if match:
        return match.group(1)
    else:
        return None


In [196]:
html = '<div class="bar"><span class="test-marker marker mod-marker_1_5"></span><ul class="content"><li></li><li></li><li></li><li></li><li></li></ul></div>'
span_class = parse_span_class(html)
print(span_class)  # Output: 2_5


<span class="test-marker marker mod-marker_1_5"></span>
<class 'str'>
1_5


In [99]:
def parse_product_info(soup):
    soup = soup
    product_list = []
    items = soup.find('div', {'class': 'coordinate_box'})
    for item in items:
        for li in item.find_all('li'):
            product = {}
            a = li.find('a', {'class': 'coordinate_item_tile'})
            product['name'] = a['data-articleid']
            product['price'] = a['data-price']
            product['id'] = a['data-articleid']
            product['link'] = f"https://www.example.com/products/{a['data-articleid']}"
            product_list.append(product)
    return items

In [100]:
response = requests.get('https://shop.adidas.jp/products/HY2728/')

# Create a BeautifulSoup object with the response text and specify the parser
soup = BeautifulSoup(response.text, "html.parser")
output = parse_product_info(soup)

In [101]:
output

<div class="coordinate_box"><div class="coordinateItems css-vjspya"><div class="item_tile_wrapper test-item_tile_wrapper clearfix"><div class="css-1dfrz3r"></div></div></div><div class="coordinate_item_container test-coordinate_item_container"></div></div>

In [247]:
def parse_ratings(html):
    soup = BeautifulSoup(html, 'html.parser')
    ratings = {}
    for entry in soup.find_all('div', {'class': 'BVRRRatingEntry'}):
        rating_type = entry.find('div', {'class': 'BVRRRatingHeader'}).text.strip()
        rating_values = entry.find('img', {'class': 'BVImgOrSprite'})
        rating_values = rating_values['alt']
        ratings[rating_type] = str(rating_values)
    return ratings

In [248]:
html = """<div class="BVRRRatingContainerRadio"><div class="BVRRCustomRatingEntryWrapper"><div class="BVRRRatingEntry BVRROdd"> <div class="BVRRRatingHeader BVRRRatingRadioHeader BVRRRatingHeaderFit"><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">
fit</font></font></div>
<div class="BVRRRating BVRRRatingRadio BVRRRatingFit"><div class="BVRRLabel BVRRRatingRadioLabel1"><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">too tight</font></font></div><div class="BVRRRatingRadioImage"><img src="https://adidasjp.ugc.bazaarvoice.com/7896-ja_jp/2_8/5/ratingSlider.gif" class="BVImgOrSprite" alt="2.8/5" title="2.8/5"></div><div class="BVRRLabel BVRRRatingRadioLabel2"><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">too loose</font></font></div></div></div><div class="BVRRRatingEntry BVRREven"> <div class="BVRRRatingHeader BVRRRatingRadioHeader BVRRRatingHeaderLength"><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">
sense of length</font></font></div>
<div class="BVRRRating BVRRRatingRadio BVRRRatingLength"><div class="BVRRLabel BVRRRatingRadioLabel1"><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">too short</font></font></div><div class="BVRRRatingRadioImage"><img src="https://adidasjp.ugc.bazaarvoice.com/7896-ja_jp/3_3/5/ratingSlider.gif" class="BVImgOrSprite" alt="3.3/5" title="3.3/5"></div><div class="BVRRLabel BVRRRatingRadioLabel2"><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">too long</font></font></div></div></div></div><div class="BVRRCustomRatingEntryWrapper"><div class="BVRRRatingEntry BVRROdd"> <div class="BVRRRatingHeader BVRRRatingRadioHeader BVRRRatingHeaderQuality"><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">
quality</font></font></div>
<div class="BVRRRating BVRRRatingRadio BVRRRatingQuality"><div class="BVRRLabel BVRRRatingRadioLabel1"><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">low</font></font></div><div class="BVRRRatingRadioImage"><img src="https://adidasjp.ugc.bazaarvoice.com/7896-ja_jp/4_5/5/ratingSlider.gif" class="BVImgOrSprite" alt="4.5/5" title="4.5/5"></div><div class="BVRRLabel BVRRRatingRadioLabel2"><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">expensive</font></font></div></div></div><div class="BVRRRatingEntry BVRREven"> <div class="BVRRRatingHeader BVRRRatingRadioHeader BVRRRatingHeaderComfort"><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">
comfort</font></font></div>
<div class="BVRRRating BVRRRatingRadio BVRRRatingComfort"><div class="BVRRLabel BVRRRatingRadioLabel1"><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">not comfortable at all</font></font></div><div class="BVRRRatingRadioImage"><img src="https://adidasjp.ugc.bazaarvoice.com/7896-ja_jp/5_0/5/ratingSlider.gif" class="BVImgOrSprite" alt="5/5" title="5/5"></div><div class="BVRRLabel BVRRRatingRadioLabel2"><font style="vertical-align: inherit;"><font style="vertical-align: inherit;">very comfortable</font></font></div></div></div></div></div>"""

rating_sense = parse_ratings(html)


In [249]:
rating_sense

{'fit': '2.8/5',
 'sense of length': '3.3/5',
 'quality': '4.5/5',
 'comfort': '5/5'}