Website:1

url = "https://www.mykitsch.com/products/castor-oil-shampoo-conditioner-bar-combo-2pc"

extend_url = "https://www.mykitsch.com"

json_url = "https://www.mykitsch.com/products.json?limit=1000"

Website:2

url = "https://patternbeauty.com"

extend_url = "https://patternbeauty.com"

json_url = "https://patternbeauty.com/products.json?limit=1000"

Website: 3

url = "https://global.solawave.co/collections/shop-all"

extend_url = "https://global.solawave.co"

json_url = "https://global.solawave.co/products.json?limit=1000"


In [83]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import requests
import json

# Setup Chrome options (optional)
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run headless Chrome (optional)
chrome_options.add_argument("--no-sandbox")  # Bypass OS security model (optional)
chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems (optional)

# Path to your ChromeDriver
PATH = "./chromedriver.exe"
service = Service(PATH)  # Create a Service object with the path

# Function to extract HTML using Selenium and then parse with Beautiful Soup
def extract_html_with_selenium(url):
    # Initialize the Chrome driver using the Service object
    driver = webdriver.Chrome(service=service, options=chrome_options)
    
    # Open the webpage
    driver.get(url)

    # Wait for the page to fully load, including lazy-loaded content
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

    # Optionally, scroll down the page to trigger lazy loading (if applicable)
    SCROLL_PAUSE_TIME = 2  # Time to pause after scrolling
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to the bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load the page
        time.sleep(SCROLL_PAUSE_TIME)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Get the full HTML page content after scrolling
    html = driver.page_source

    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    
    driver.quit()  # Close the driver after extraction
    return soup

# Function to extract and render only the content from the <main> tag
def extract_main_content(soup):
    # Find the <main> tag content
    main_content = soup.find('main')
    
    if main_content:
        return main_content
    else:
        return None

# Function to extract anchor tags (inner HTML and links) from the <main> tag and save as DataFrame
def extract_anchor_tags(main_content):
    if main_content:
        anchors = main_content.find_all('a')  # Find all anchor tags within <main>

        # Function to get the text content of an element, ignoring child elements
        def get_visible_text(element):
            return ' '.join(element.stripped_strings)

        # Extract text content and link
        data = []
        for anchor in anchors:
            visible_text = get_visible_text(anchor)  # Get only the visible text
            link = anchor.get('href')  # Get the link (href)

            data.append({'inner html': visible_text, 'link': link})

        # Convert the data into a DataFrame
        df = pd.DataFrame(data)
        return df
    else:
        return pd.DataFrame(columns=['inner html', 'link'])

# Function to extract all data from URL
def extract_data_from_url(url):
    # Extract the fully loaded HTML using Selenium and parse it with Beautiful Soup
    soup = extract_html_with_selenium(url)
    
    # Extract the content from the <main> tag
    main_content = extract_main_content(soup)
    
    # Extract anchor tags from the <main> tag and create a DataFrame
    df = extract_anchor_tags(main_content)
    
    return df

# Function to clean and filter links
def get_cleaned_links(df):
    # Filter non-empty 'inner html'
    df_cleaned = df[df['inner html'] != '']
    # Filter rows where 'link' contains '/products/'
    df_products = df_cleaned[df_cleaned['link'].str.contains('/products/', na=False)]
    # Remove everything before and including '/products/' in the link
    df_products['cleaned_link'] = df_products['link'].apply(lambda link: link.split('/products/', 1)[-1])
    return df_products

# Function to fetch all products from paginated API
def get_all_products(json_url, max_limit=250):
    products = []
    page = 1
    while True:
        response = requests.get(f"{json_url}&limit={max_limit}&page={page}")
        if response.status_code != 200:
            break
        data = response.json()
        if 'products' in data:
            products.extend(data['products'])
            if len(data['products']) < max_limit:
                break
        else:
            break
        page += 1
    return products

# Function to extract product handles and titles
def extract_product_handles_and_titles(products):
    return [(product['handle'], product['title']) for product in products if 'handle' in product and 'title' in product]

# Function to match product handles with cleaned links and retain inner html
def match_links_with_handles(product_data, cleaned_links_df):
    df_products = pd.DataFrame(product_data, columns=['handle', 'title'])
    matched_df = pd.merge(df_products, cleaned_links_df, left_on='handle', right_on='cleaned_link', how='inner')
    return matched_df[['handle', 'title', 'inner html']]

# Function to add full URLs to matched product handles
def add_url_to_matched_links(matched_df, base_url):
    matched_df['full_url'] = matched_df['handle'].apply(lambda handle: f"{base_url}/products/{handle}")
    return matched_df

# Consolidated function to get the final DataFrame
def get_matched_df(url, extend_url, json_url):
    # Setup Chrome options for Selenium
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # Path to your ChromeDriver
    PATH = "./chromedriver.exe"
    service = Service(PATH)

    # Step 1: Extract data from the URL
    df = extract_data_from_url(url)

    # Step 2: Clean and filter product links
    cleaned_links_df = get_cleaned_links(df)

    # Step 3: Fetch all products from the API
    all_products = get_all_products(json_url)

    # Step 4: Extract product handles and titles
    product_data = extract_product_handles_and_titles(all_products)

    # Step 5: Match the links with handles and retain 'inner html'
    matched_df = match_links_with_handles(product_data, cleaned_links_df)

    # Step 6: Add full URLs to matched products
    final_df = add_url_to_matched_links(matched_df, extend_url)

    return final_df


In [84]:
# Xpath for all links

def get_xpath(driver, element):
    # This script computes the full XPath of an element using JavaScript executed through Selenium
    return driver.execute_script(
        """
        function getElementXPath(element) {
            var paths = [];
            for (; element && element.nodeType == Node.ELEMENT_NODE; element = element.parentNode) {
                var index = 0;
                for (var sibling = element.previousSibling; sibling; sibling = sibling.previousSibling) {
                    if (sibling.nodeType == Node.DOCUMENT_TYPE_NODE) {
                        continue;
                    }
                    if (sibling.nodeName == element.nodeName) {
                        ++index;
                    }
                }
                var tagName = element.nodeName.toLowerCase();
                var pathIndex = (index ? "[" + (index + 1) + "]" : "");
                paths.splice(0, 0, tagName + pathIndex);
            }
            return paths.length ? "/" + paths.join("/") : null;
        }
        return getElementXPath(arguments[0]);
        """, element)

# Function to extract all anchor tags from the page and their XPaths
def extract_all_anchor_tags(driver):
    # Find all anchor tags in the entire page
    anchors = driver.find_elements(By.TAG_NAME, 'a')

    data = []
    for anchor in anchors:
        visible_text = anchor.text.strip()  # Get only the visible text
        link = anchor.get_attribute('href')  # Get the link (href)
        xpath = get_xpath(driver, anchor)  # Extract XPath

        # Append data for the DataFrame
        data.append({'inner html': visible_text, 'link': link, 'xpath': xpath})

    # Convert the data into a DataFrame
    df = pd.DataFrame(data)
    return df

# Function to extract HTML and return a DataFrame with links and XPaths
def extract_links_and_xpaths(url):
    # Initialize the Chrome driver using the Service object
    driver = webdriver.Chrome(service=service, options=chrome_options)
    
    # Open the webpage
    driver.get(url)

    # Wait for the page to fully load, including lazy-loaded content
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

    # Optionally, scroll down the page to trigger lazy loading (if applicable)
    SCROLL_PAUSE_TIME = 2  # Time to pause after scrolling
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to the bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load the page
        time.sleep(SCROLL_PAUSE_TIME)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Extract all anchor tags from the page and their XPaths
    df_xPath = extract_all_anchor_tags(driver)

    # Close the driver
    driver.quit()
    
    return df_xPath

In [103]:
url = "https://www.mykitsch.com/products/castor-oil-shampoo-conditioner-bar-combo-2pc"
extend_url = "https://www.mykitsch.com"
json_url = "https://www.mykitsch.com/products.json?limit=1000"

matched_df = get_matched_df(url, extend_url, json_url)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_products['cleaned_link'] = df_products['link'].apply(lambda link: link.split('/products/', 1)[-1])


In [104]:
matched_df

Unnamed: 0,handle,title,inner html,full_url
0,pack-light-bundle-castor-oil,Pack Light Bundle - Castor Oil,Rs. 840.00 OFF 10% OFF,https://www.mykitsch.com/products/pack-light-b...
1,pack-light-bundle-castor-oil,Pack Light Bundle - Castor Oil,Pack Light Bundle - Castor Oil 4.8 Rated 4.8 o...,https://www.mykitsch.com/products/pack-light-b...
2,castor-oil-shampoo-conditioner-soap-dish-bundle,Castor Oil Shampoo + Conditioner + Soap Dish B...,"Rs. 1,080.00 OFF 10% OFF",https://www.mykitsch.com/products/castor-oil-s...
3,castor-oil-shampoo-conditioner-soap-dish-bundle,Castor Oil Shampoo + Conditioner + Soap Dish B...,Castor Oil Shampoo + Conditioner + Soap Dish B...,https://www.mykitsch.com/products/castor-oil-s...
4,bottle-free-beauty-bar-bag-blush,Shampoo Bar Bag,Rs. 120.00 OFF,https://www.mykitsch.com/products/bottle-free-...
5,bottle-free-beauty-bar-bag-blush,Shampoo Bar Bag,Shampoo Bar Bag 4.9 Rated 4.9 out of 5 stars 1...,https://www.mykitsch.com/products/bottle-free-...
6,rpet-bottle-free-beauty-bar-conditioner-bag-black,Conditioner Bar Bag,Rs. 120.00 OFF,https://www.mykitsch.com/products/rpet-bottle-...
7,rpet-bottle-free-beauty-bar-conditioner-bag-black,Conditioner Bar Bag,Conditioner Bar Bag 4.9 Rated 4.9 out of 5 sta...,https://www.mykitsch.com/products/rpet-bottle-...
8,kitsch-self-draining-shower-caddy,Kitsch Self-Draining Shower Caddy,Rs. 880.00 OFF,https://www.mykitsch.com/products/kitsch-self-...
9,kitsch-self-draining-shower-caddy,Kitsch Self-Draining Shower Caddy,Kitsch Self-Draining Shower Caddy 4.9 Rated 4....,https://www.mykitsch.com/products/kitsch-self-...


In [105]:
import pandas as pd
from fuzzywuzzy import fuzz

# Function to calculate fuzzy score
def calculate_fuzzy_score(row):
    return fuzz.ratio(row['title'], row['inner html'])

# Calculate the fuzzy score for each row
matched_df['fuzzy_score'] = matched_df.apply(calculate_fuzzy_score, axis=1)

# Group by 'full_url' and get the row with the maximum fuzzy score for each full_url
matched_df = matched_df.loc[matched_df.groupby('full_url')['fuzzy_score'].idxmax()]

In [106]:
matched_df

Unnamed: 0,handle,title,inner html,full_url,fuzzy_score
5,bottle-free-beauty-bar-bag-blush,Shampoo Bar Bag,Shampoo Bar Bag 4.9 Rated 4.9 out of 5 stars 1...,https://www.mykitsch.com/products/bottle-free-...,22
3,castor-oil-shampoo-conditioner-soap-dish-bundle,Castor Oil Shampoo + Conditioner + Soap Dish B...,Castor Oil Shampoo + Conditioner + Soap Dish B...,https://www.mykitsch.com/products/castor-oil-s...,43
11,deep-moisturizing-conditioner-bar,Deep-Moisturizing Conditioner Bar for Dry Dama...,Deep-Moisturizing Conditioner Bar for Dry Dama...,https://www.mykitsch.com/products/deep-moistur...,50
9,kitsch-self-draining-shower-caddy,Kitsch Self-Draining Shower Caddy,Kitsch Self-Draining Shower Caddy 4.9 Rated 4....,https://www.mykitsch.com/products/kitsch-self-...,33
1,pack-light-bundle-castor-oil,Pack Light Bundle - Castor Oil,Pack Light Bundle - Castor Oil 4.8 Rated 4.8 o...,https://www.mykitsch.com/products/pack-light-b...,31
7,rpet-bottle-free-beauty-bar-conditioner-bag-black,Conditioner Bar Bag,Conditioner Bar Bag 4.9 Rated 4.9 out of 5 sta...,https://www.mykitsch.com/products/rpet-bottle-...,26


In [116]:
matched_df

Unnamed: 0,handle,title,inner html,full_url,fuzzy_score,xpath,tag-id,tag-index
5,bottle-free-beauty-bar-bag-blush,Shampoo Bar Bag,Shampoo Bar Bag 4.9 Rated 4.9 out of 5 stars 1...,https://www.mykitsch.com/products/bottle-free-...,22,/html/body/main/section[4]/div/div/div[2]/ul/l...,shopify-section-template--17351358185653__prod...,5
3,castor-oil-shampoo-conditioner-soap-dish-bundle,Castor Oil Shampoo + Conditioner + Soap Dish B...,Castor Oil Shampoo + Conditioner + Soap Dish B...,https://www.mykitsch.com/products/castor-oil-s...,43,/html/body/main/section[2]/div/div/div[2]/ul/l...,shopify-section-template--17351358185653__prod...,2
11,deep-moisturizing-conditioner-bar,Deep-Moisturizing Conditioner Bar for Dry Dama...,Deep-Moisturizing Conditioner Bar for Dry Dama...,https://www.mykitsch.com/products/deep-moistur...,50,/html/body/main/section[4]/div/div/div[2]/ul/l...,shopify-section-template--17351358185653__prod...,5
9,kitsch-self-draining-shower-caddy,Kitsch Self-Draining Shower Caddy,Kitsch Self-Draining Shower Caddy 4.9 Rated 4....,https://www.mykitsch.com/products/kitsch-self-...,33,/html/body/main/section[4]/div/div/div[2]/ul/l...,shopify-section-template--17351358185653__prod...,5
1,pack-light-bundle-castor-oil,Pack Light Bundle - Castor Oil,Pack Light Bundle - Castor Oil 4.8 Rated 4.8 o...,https://www.mykitsch.com/products/pack-light-b...,31,/html/body/main/section[2]/div/div/div[2]/ul/l...,shopify-section-template--17351358185653__prod...,2
7,rpet-bottle-free-beauty-bar-conditioner-bag-black,Conditioner Bar Bag,Conditioner Bar Bag 4.9 Rated 4.9 out of 5 sta...,https://www.mykitsch.com/products/rpet-bottle-...,26,/html/body/main/section[4]/div/div/div[2]/ul/l...,shopify-section-template--17351358185653__prod...,5


In [108]:
df_xPath = extract_links_and_xpaths(url)
df_xPath = df_xPath[df_xPath['xpath'].str.contains('/main/', na=False)]

In [109]:

df_xPath

Unnamed: 0,inner html,link,xpath
114,Home,https://www.mykitsch.com/,/html/body/main/section/section/div/div[2]/div...
115,"Rated 4.8 out of 5 stars\n2,294 Reviews\nClick...",https://www.mykitsch.com/products/castor-oil-s...,/html/body/main/section/section/div/div[2]/div/a
116,,https://www.afterpay.com/purchase-payment-agre...,/html/body/main/section/section/div/div[2]/div...
117,10% OFF,https://www.mykitsch.com/products/castor-oil-s...,/html/body/main/section[2]/div/div/div[2]/ul/l...
118,Castor Oil Shampoo + Conditioner + Soap Dish B...,https://www.mykitsch.com/products/castor-oil-s...,/html/body/main/section[2]/div/div/div[2]/ul/l...
119,10% OFF,https://www.mykitsch.com/products/pack-light-b...,/html/body/main/section[2]/div/div/div[2]/ul/l...
120,Pack Light Bundle - Castor Oil\nRated 4.8 out ...,https://www.mykitsch.com/products/pack-light-b...,/html/body/main/section[2]/div/div/div[2]/ul/l...
121,,https://www.mykitsch.com/products/deep-moistur...,/html/body/main/section[4]/div/div/div[2]/ul/l...
122,Deep-Moisturizing Conditioner Bar for Dry Dama...,https://www.mykitsch.com/products/deep-moistur...,/html/body/main/section[4]/div/div/div[2]/ul/l...
123,,https://www.mykitsch.com/products/kitsch-self-...,/html/body/main/section[4]/div/div/div[2]/ul/l...


In [110]:
def add_xpath_to_matched_links(matched_df, df_xpath):
    # Clean the links in df_xpath to match with full_url in matched_df
    df_xpath['cleaned_link'] = df_xpath['link'].apply(lambda link: link.split('/products/', 1)[-1] if link else None)

    # Extract the handle from full_url in matched_df (the part after '/products/')
    matched_df['cleaned_full_url'] = matched_df['full_url'].apply(lambda url: url.split('/products/', 1)[-1] if url else None)

    # Initialize an empty list to store xpaths
    xpaths = []

    # Loop through each row in matched_df to find the corresponding xpath from df_xpath
    for index, row in matched_df.iterrows():
        # Find the corresponding xpath from df_xpath
        matching_row = df_xpath[df_xpath['cleaned_link'] == row['cleaned_full_url']]
        
        if not matching_row.empty:
            # If a match is found, append the xpath
            xpaths.append(matching_row['xpath'].values[0])
        else:
            # If no match is found, append None
            xpaths.append(None)
    
    # Add the xpath column to matched_df
    matched_df['xpath'] = xpaths

    # Drop the temporary 'cleaned_full_url' column
    matched_df.drop(columns=['cleaned_full_url'], inplace=True)
    
    return matched_df

# Example usage:
# df_xpath = DataFrame with 'link' and 'xpath'
# matched_df = DataFrame with 'title', 'inner html', 'full_url'

final_df_with_xpath = add_xpath_to_matched_links(matched_df, df_xPath)


In [111]:
final_df_with_xpath

Unnamed: 0,handle,title,inner html,full_url,fuzzy_score,xpath
5,bottle-free-beauty-bar-bag-blush,Shampoo Bar Bag,Shampoo Bar Bag 4.9 Rated 4.9 out of 5 stars 1...,https://www.mykitsch.com/products/bottle-free-...,22,/html/body/main/section[4]/div/div/div[2]/ul/l...
3,castor-oil-shampoo-conditioner-soap-dish-bundle,Castor Oil Shampoo + Conditioner + Soap Dish B...,Castor Oil Shampoo + Conditioner + Soap Dish B...,https://www.mykitsch.com/products/castor-oil-s...,43,/html/body/main/section[2]/div/div/div[2]/ul/l...
11,deep-moisturizing-conditioner-bar,Deep-Moisturizing Conditioner Bar for Dry Dama...,Deep-Moisturizing Conditioner Bar for Dry Dama...,https://www.mykitsch.com/products/deep-moistur...,50,/html/body/main/section[4]/div/div/div[2]/ul/l...
9,kitsch-self-draining-shower-caddy,Kitsch Self-Draining Shower Caddy,Kitsch Self-Draining Shower Caddy 4.9 Rated 4....,https://www.mykitsch.com/products/kitsch-self-...,33,/html/body/main/section[4]/div/div/div[2]/ul/l...
1,pack-light-bundle-castor-oil,Pack Light Bundle - Castor Oil,Pack Light Bundle - Castor Oil 4.8 Rated 4.8 o...,https://www.mykitsch.com/products/pack-light-b...,31,/html/body/main/section[2]/div/div/div[2]/ul/l...
7,rpet-bottle-free-beauty-bar-conditioner-bag-black,Conditioner Bar Bag,Conditioner Bar Bag 4.9 Rated 4.9 out of 5 sta...,https://www.mykitsch.com/products/rpet-bottle-...,26,/html/body/main/section[4]/div/div/div[2]/ul/l...


We neet X_path now

Extract Tag Id and Index

In [112]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

# Setup Chrome options (optional)
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run headless Chrome (optional)
chrome_options.add_argument("--no-sandbox")  # Bypass OS security model (optional)
chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems (optional)

# Path to your ChromeDriver
PATH = "./chromedriver.exe"
service = Service(PATH)  # Create a Service object with the path

# Function to extract HTML using Selenium and then parse with BeautifulSoup
def extract_html_with_selenium(url):
    # Initialize the Chrome driver using the Service object
    driver = webdriver.Chrome(service=service, options=chrome_options)
    
    # Open the webpage
    driver.get(url)

    # Wait for the page to fully load, including lazy-loaded content
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

    # Optionally, scroll down the page to trigger lazy loading (if applicable)
    SCROLL_PAUSE_TIME = 2  # Time to pause after scrolling
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to the bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load the page
        time.sleep(SCROLL_PAUSE_TIME)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Get the full HTML page content after scrolling
    html = driver.page_source

    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Close the driver
    driver.quit()
    
    return soup

# Function to extract all sections with tag-id 'shopify-section-template' and links inside them
def extract_shopify_sections(soup):
    # Find all elements with 'shopify-section-template' in their ID
    sections = soup.find_all(id=lambda x: x and 'shopify-section-template' in x)

    # Map to store tag-id and their index
    tag_id_map = {}
    current_index = 1  # Start indexing from 1

    # Extract tag-id, links, and tag index from each section
    data = []
    for section in sections:
        tag_id = section.get('id')  # Get the tag-id
        
        # Assign index to tag_id if it's new
        if tag_id not in tag_id_map:
            tag_id_map[tag_id] = current_index
            current_index += 1  # Increment the index for next unseen tag
        
        tag_index = tag_id_map[tag_id]  # Get the index of the current tag-id

        # Find all anchor tags with href inside this section
        links = section.find_all('a', href=True)

        # Append each link with the corresponding tag-id and tag-index
        for link in links:
            data.append({'tag-id': tag_id, 'tag-index': tag_index, 'link': link['href']})

    # Convert the data into a DataFrame
    df = pd.DataFrame(data)
    return df

# Function to take URL as input and return the DataFrame
def get_shopify_section_links(url):
    # Extract the fully loaded HTML using Selenium and parse it with Beautiful Soup
    soup = extract_html_with_selenium(url)

    # Extract shopify-section-template and links along with their index
    df_ids = extract_shopify_sections(soup)

    return df_ids



In [113]:
df_ids = get_shopify_section_links(url)
df_ids

Unnamed: 0,tag-id,tag-index,link
0,shopify-section-template--17351358185653__main,1,https://www.mykitsch.com
1,shopify-section-template--17351358185653__main,1,#shopify-block-okendo_reviews_widget_cG3WKq
2,shopify-section-template--17351358185653__main,1,https://www.afterpay.com/purchase-payment-agre...
3,shopify-section-template--17351358185653__prod...,2,/products/castor-oil-shampoo-conditioner-soap-...
4,shopify-section-template--17351358185653__prod...,2,/products/castor-oil-shampoo-conditioner-soap-...
5,shopify-section-template--17351358185653__prod...,2,/products/pack-light-bundle-castor-oil
6,shopify-section-template--17351358185653__prod...,2,/products/pack-light-bundle-castor-oil
7,shopify-section-template--17351358185653__prod...,5,/products/deep-moisturizing-conditioner-bar
8,shopify-section-template--17351358185653__prod...,5,/products/deep-moisturizing-conditioner-bar
9,shopify-section-template--17351358185653__prod...,5,/products/kitsch-self-draining-shower-caddy


In [96]:
import pandas as pd

# Function to match URLs and assign tag-id and tag-index to final_df_with_xpath
def match_tag_id_with_index(final_df_with_xpath, df):
    # Remove the initial part of the URL before '/products/' in both DataFrames
    final_df_with_xpath['processed_url'] = final_df_with_xpath['full_url'].str.extract(r'(/products/.*)')
    df['processed_link'] = df['link'].str.extract(r'(/products/.*)')

    # Initialize new columns in final_df_with_xpath for tag-id and tag-index
    final_df_with_xpath['tag-id'] = None
    final_df_with_xpath['tag-index'] = None

    # Iterate over final_df_with_xpath and match with df
    for i, row in final_df_with_xpath.iterrows():
        # Find the matching row in df based on processed URL
        match = df[df['processed_link'] == row['processed_url']]

        # If match is found, assign the tag-id and tag-index
        if not match.empty:
            final_df_with_xpath.at[i, 'tag-id'] = match['tag-id'].values[0]
            final_df_with_xpath.at[i, 'tag-index'] = match['tag-index'].values[0]
    
    # Drop the processed_url and processed_link columns as they are not needed anymore
    final_df_with_xpath.drop(columns=['processed_url'], inplace=True)
    df.drop(columns=['processed_link'], inplace=True)

    return final_df_with_xpath


In [119]:
final_df_with_xpath.shape

(6, 8)

In [114]:
final_df = match_tag_id_with_index(final_df_with_xpath, df_ids)

In [121]:
final_df

Unnamed: 0,handle,title,inner html,full_url,fuzzy_score,xpath,tag-id,tag-index
5,bottle-free-beauty-bar-bag-blush,Shampoo Bar Bag,Shampoo Bar Bag 4.9 Rated 4.9 out of 5 stars 1...,https://www.mykitsch.com/products/bottle-free-...,22,/html/body/main/section[4]/div/div/div[2]/ul/l...,shopify-section-template--17351358185653__prod...,5
3,castor-oil-shampoo-conditioner-soap-dish-bundle,Castor Oil Shampoo + Conditioner + Soap Dish B...,Castor Oil Shampoo + Conditioner + Soap Dish B...,https://www.mykitsch.com/products/castor-oil-s...,43,/html/body/main/section[2]/div/div/div[2]/ul/l...,shopify-section-template--17351358185653__prod...,2
11,deep-moisturizing-conditioner-bar,Deep-Moisturizing Conditioner Bar for Dry Dama...,Deep-Moisturizing Conditioner Bar for Dry Dama...,https://www.mykitsch.com/products/deep-moistur...,50,/html/body/main/section[4]/div/div/div[2]/ul/l...,shopify-section-template--17351358185653__prod...,5
9,kitsch-self-draining-shower-caddy,Kitsch Self-Draining Shower Caddy,Kitsch Self-Draining Shower Caddy 4.9 Rated 4....,https://www.mykitsch.com/products/kitsch-self-...,33,/html/body/main/section[4]/div/div/div[2]/ul/l...,shopify-section-template--17351358185653__prod...,5
1,pack-light-bundle-castor-oil,Pack Light Bundle - Castor Oil,Pack Light Bundle - Castor Oil 4.8 Rated 4.8 o...,https://www.mykitsch.com/products/pack-light-b...,31,/html/body/main/section[2]/div/div/div[2]/ul/l...,shopify-section-template--17351358185653__prod...,2
7,rpet-bottle-free-beauty-bar-conditioner-bag-black,Conditioner Bar Bag,Conditioner Bar Bag 4.9 Rated 4.9 out of 5 sta...,https://www.mykitsch.com/products/rpet-bottle-...,26,/html/body/main/section[4]/div/div/div[2]/ul/l...,shopify-section-template--17351358185653__prod...,5


Save the csv of the tag search

In [122]:
final_df.to_csv('mykitsch.csv',index=False)