### 1. Import necessary libraries

In [1]:
import os
import math
import pyautogui
import urllib.parse
import time
import pyperclip
import quopri
from bs4 import BeautifulSoup
from email import policy
from email.parser import BytesParser
import requests
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image, ImageEnhance
import math
import rembg
import pickle
import random
import warnings
from collections import OrderedDict
warnings.filterwarnings("ignore")

### 2. Download guitar info webpages 

In [2]:
# define a function to simulate manual downloading operation of webpages
def auto_download(url, file_name, address_bar_pos=(207, 55), scroll_hold_pos=(1910, 1020),
                  scroll_seconds=30, page_load_seconds=3):
    """Simulate the manual steps of opening `url` in the browser and saving the page.

    The function only sends keystrokes and mouse events through pyautogui, so a
    browser window must already be open and able to receive them. The default
    screen coordinates are the ones this notebook was written with and depend on
    one particular screen/window layout; pass your own when the layout differs.

    Parameters
    ----------
    url : str
        Address pasted into the browser's address bar.
    file_name : str
        Name pasted into the save dialog. Only the name is typed, so the page is
        written to whatever folder that dialog currently points at.
    address_bar_pos : tuple of int, optional
        (x, y) screen position clicked to focus the address bar.
    scroll_hold_pos : tuple of int, optional
        (x, y) screen position where the left mouse button is held down to
        scroll the page.
    scroll_seconds : float, optional
        How long the mouse button is held down while scrolling.
    page_load_seconds : float, optional
        Pause after pressing Enter, before scrolling starts.

    Returns
    -------
    None
    """
    pyautogui.hotkey('ctrl', '2') # switch Google Chrome tabs
    time.sleep(0.5)

    pyautogui.click(*address_bar_pos) # click the browser's address bar
    time.sleep(0.5)

    pyautogui.hotkey('ctrl', 'a') # select all the content in the address bar
    pyperclip.copy(url) # put the url on the clipboard
    time.sleep(0.5)

    pyautogui.hotkey('ctrl', 'v') # paste the url
    time.sleep(0.5)

    pyautogui.hotkey('enter') # access the url
    time.sleep(page_load_seconds)

    # click, then keep the left button pressed at the same spot so the page
    # scrolls down and all guitars get loaded before saving
    pyautogui.click(*scroll_hold_pos)
    pyautogui.mouseDown()
    time.sleep(scroll_seconds)
    pyautogui.mouseUp() # release the left mouse button

    pyautogui.hotkey('ctrl', 's') # open the save dialog
    time.sleep(2)

    pyperclip.copy(file_name) # put the file name on the clipboard
    pyautogui.hotkey('ctrl', 'v') # paste the file name
    time.sleep(2)

    pyautogui.hotkey('enter') # save the page with the given name

In [3]:
# Listing URL for Guitar Center 6-string acoustic guitars (96 records per page).
# The trailing "Nao=" parameter is left empty so that a record offset can be
# appended for each page that is requested.
base_url = "https://www.guitarcenter.com/6-String-Acoustic-Guitars.gc?N=1076+18154&Ns=bM&pageName=subcategory-page&recsPerPage=96&profileCountryCode=US&profileCurrencyCode=USD&SPA=true&Nao="

# directory for the downloaded webpages
path = 'guitar_pages'

# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race
os.makedirs(path, exist_ok=True)

# download pages
def download_pages_guitarcenter(base_url, pages, step):
    """Save `pages` consecutive listing pages through the browser automation helper.

    Page k (counting from 0) is requested with the offset k * step appended to
    `base_url` and is saved under the name "guitar_<k>". A 5 second pause
    follows every page.
    """
    record_offsets = range(0, pages * step, step)
    for page_number, offset in enumerate(record_offsets):
        page_url = base_url + str(offset)
        save_name = "guitar_" + str(page_number)
        auto_download(page_url, save_name)
        time.sleep(5)

# Fetch 44 listing pages. The step of 96 matches recsPerPage=96 in base_url, so the
# offsets appended to "Nao=" are 0, 96, ..., 4128 and the pages are saved as
# guitar_0 ... guitar_43.
download_pages_guitarcenter(base_url, 44, 96)

### 3. Parse the webpages for guitar image URLs

In [4]:
# define a function to parse the guitarcenter mhtml files
def parse_mhtml_guitarcenter(file_path):
    """Return the product tiles contained in a saved Guitar Center listing page.

    Parameters
    ----------
    file_path : str or path-like
        Path of a page saved in MHTML format.

    Returns
    -------
    list
        The ``<section>`` elements of the product grid, one per listed item.

    Raises
    ------
    ValueError
        If the file has no ``text/html`` part, or the product grid container
        cannot be found in the HTML.
    """
    # an MHTML archive is a MIME message, so the email parser can split it into parts
    with open(file_path, 'rb') as file:
        msg = BytesParser(policy=policy.default).parse(file)

    # take the first HTML part; decode=True undoes the content transfer encoding
    html_part = None
    charset = None
    for part in msg.walk():
        if part.get_content_type() == 'text/html':
            html_part = part.get_payload(decode=True)
            charset = part.get_content_charset()
            break

    if html_part is None:
        raise ValueError(f"no text/html part found in {file_path!r}")

    # honour the charset declared by the part, falling back to utf-8 when it is
    # absent, unknown, or does not match the actual bytes
    try:
        decoded_html = html_part.decode(charset or 'utf-8')
    except (LookupError, UnicodeDecodeError):
        decoded_html = html_part.decode('utf-8', errors='replace')

    # use BeautifulSoup to parse the decoded HTML
    soup = BeautifulSoup(decoded_html, 'html.parser')

    # container that wraps the product grid
    div = soup.find(class_="jsx-1611966181 flex flex-auto flex-wrap")
    if div is None:
        raise ValueError(f"product grid container not found in {file_path!r}")

    # every product tile is a <section> carrying this exact class string
    item_list = div.find_all("section",class_="plp-product-grid py-[19px] md:p-1.5 flex flex-none flex-col md:flex-row md:border-none border-b border-solid border-[#BBBBBB] w-full md:w-1/2 lg:w-1/3 xl:w-1/4")

    return item_list

# define a function to extract information
def instru_info_guitarcenter(item_list):
    """Collect the attributes of every product tile into a dictionary keyed by item id.

    Parameters
    ----------
    item_list : iterable
        Product tiles as returned by the page parser. Each tile must offer
        ``find(class_=...)``; the image container must offer ``find_all("img")``.

    Returns
    -------
    dict
        ``{item_id: {'image', 'title', 'location', 'price', 'condition'}}``.
        ``location``, ``price`` and ``condition`` are ``None`` when the tile
        does not contain the corresponding element. Tiles without a usable
        product image are skipped because the id is derived from the image URL.
        When two tiles share an id, the later one wins.
    """
    def _text_or_none(tile, css_class):
        # text of the first element carrying this class, or None when it is absent
        node = tile.find(class_=css_class)
        return node.text if node is not None else None

    item_dict = {}

    for tile in item_list:
        # image container of the tile
        img_box = tile.find(class_="jsx-406435821 w-[264px] mt-5 md:mt-0")
        images = img_box.find_all("img") if img_box is not None else []
        if len(images) < 2:
            # no product image -> no URL to derive the item id from
            continue

        # the second <img> carries the title and the image URL
        # (presumably the first one is a placeholder - TODO confirm)
        product_img = images[1]
        title = product_img["alt"]
        imgurl = product_img["src"]
        # the id is the part of the image file name before the first "-"
        item_id = imgurl.split("/")[-1].split("-")[0]

        price = _text_or_none(tile, "jsx-2420341498 sale-price gc-font-bold text-[#2d2d2d]")
        location = _text_or_none(tile, "jsx-3430979785 store-name-text")
        condition = _text_or_none(tile, "jsx-3430979785 gc-font-light mb-2 text-xs")
        if condition is not None:
            condition = condition.replace("Condition:","").strip()

        # key order is kept as before because it determines the column order downstream
        item_dict[item_id] = {
            'image': imgurl,
            'title': title,
            'location': location,
            'price': price,
            'condition': condition,
        }

    return item_dict

### 4. Download guitar images

In [5]:
# define a function for downloading guitar images
def image_download(info, path, delay=1.7, timeout=30):
    """Download the image of every entry in `info` that has not been downloaded yet.

    Parameters
    ----------
    info : dict
        ``{item_id: record}`` where ``record['image']`` is the image URL.
        Records that already contain the key ``'image_status'`` are skipped.
        The dictionary is updated in place: after a successful download the
        record gets ``record['image_status'] = 'Success'``.
    path : str
        Existing directory in which ``<item_id>.jpg`` is written.
    delay : float, optional
        Seconds to wait after every request.
    timeout : float, optional
        Seconds after which a stalled request is aborted. Without it a hung
        connection would block forever instead of raising.

    Returns
    -------
    None

    Notes
    -----
    Exceptions raised by ``requests.get`` (connection errors, timeouts) are not
    caught here; they propagate to the caller. A response with a status other
    than 200 leaves the record unmarked, so a later call tries it again.
    """
    for key, value in info.items():
        # only download images that have not been fetched yet
        if 'image_status' in value:
            continue

        image_url = value['image']
        save_path = os.path.join(path, key + ".jpg")

        # send a GET request to the image URL
        response = requests.get(image_url, timeout=timeout)

        # only store the body of a successful response
        if response.status_code == 200:
            with open(save_path, "wb") as file:
                file.write(response.content)
            value['image_status'] = 'Success'

        # pause between requests
        time.sleep(delay)

In [6]:
# folder that holds the saved listing pages
path = 'guitar_pages'

# guitar records collected from all pages, keyed by item id
guitarcenter_guitar_info = {}

# gather the path of every file below the folder, in os.walk order
page_files = [
    os.path.join(root, name)
    for root, _dirs, names in os.walk(path)
    for name in names
]

# parse each saved page and merge its guitars into the dictionary
for file_path in page_files:
    item_list = parse_mhtml_guitarcenter(file_path)
    item_dict = instru_info_guitarcenter(item_list)
    guitarcenter_guitar_info.update(item_dict)

# point every record at the high-resolution rendition of its image
for record in guitarcenter_guitar_info.values():
    record["image"] = record["image"].replace("264x264","600x600")

In [7]:
# download the high resolution guitar images

def count_downloaded(info):
    """Return how many records of `info` carry a truthy 'image_status'."""
    return sum(1 for value in info.values() if value.get("image_status"))

# count finished and total images
count = count_downloaded(guitarcenter_guitar_info)
total = len(guitarcenter_guitar_info)

# create a directory for downloading the images (safe to re-run)
path = 'guitar_images'
os.makedirs(path, exist_ok=True)

# Every downloaded image gets an image_status key in its record. Records without
# that key have not been downloaded yet (or failed), so the download is repeated
# until every record is marked.
#
# A pass that finishes without an exception but marks no new image means the
# remaining images keep answering with a non-200 status. After this many such
# passes in a row the loop stops instead of requesting them forever.
max_stalled_passes = 3
stalled_passes = 0

while count < total:
    try:
        # download images to the assigned directory using the guitar info dictionary
        image_download(guitarcenter_guitar_info, path)
    except Exception as error:
        # Catch Exception rather than everything so that interrupting the kernel
        # (KeyboardInterrupt) still stops the cell. Connection problems are
        # retried without limit: wait 2 minutes and try again.
        print("retry downloading images")
        print(f"  cause: {error!r}")
        count = count_downloaded(guitarcenter_guitar_info)
        time.sleep(120)
        continue

    previous_count = count
    count = count_downloaded(guitarcenter_guitar_info)

    if count > previous_count:
        stalled_passes = 0
    else:
        stalled_passes += 1
        if stalled_passes >= max_stalled_passes and count < total:
            print(f"giving up on {total - count} image(s) that could not be downloaded")
            break

### 5. Save guitar information (id, location, condition, price...)

In [8]:
# build a table from the nested dictionary; the guitar ids start out as columns,
# so transpose to get one row per guitar
guitar_table = pd.DataFrame(guitarcenter_guitar_info)
df = guitar_table.transpose()

# write the table, with the guitar ids in a leading column named "index"
df.to_csv("guitar_info.csv", index_label="index", index=True)