In [25]:
# Extract Page

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time

chrome_options = Options()
chrome_options.add_argument("--headless") 
chrome_options.add_argument("--no-sandbox")  
chrome_options.add_argument("--disable-dev-shm-usage")  

PATH = "./chromedriver.exe"
service = Service(PATH)  

driver = webdriver.Chrome(service=service, options=chrome_options)

def extract_html_with_selenium(url):
    driver.get(url)

    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

    SCROLL_PAUSE_TIME = 2  
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_TIME)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    
    return soup

# Recursive function to find parent <li> or <section> tag
def find_parent_with_tag(element):
    # Look for parent tags like <li> or <section>
    while element:
        parent = element.find_parent(['li', 'section'])
        if parent:
            return parent
        element = element.parent
    return None

def extract_currency_text(soup):
    data = []
    counter = 1

    # Search for all occurrences of ₹ or $ in the HTML
    for symbol in soup.find_all(string=lambda text: text and ('₹' in text or '$' in text or 'Rs.' in text)):
        # Find the parent <li> or <section> for each symbol
        parent = find_parent_with_tag(symbol)
        
        if parent:
            inner_text_parts = parent.get_text(separator="\n", strip=True).split("\n")
            
            for part in inner_text_parts:
                if part.strip():  # Ensure it's not an empty string
                    data.append({"inner-text": part, "counter": counter})
                    counter += 1
    
    return pd.DataFrame(data)

def extract_main_content(soup):
    main_content = soup.find('main')
    
    if main_content:
        return main_content
    else:
        return None

# url = "https://patternbeauty.com"  # Replace with the actual URL
url = "https://global.solawave.co/collections/shop-all"
# url = "https://www.mykitsch.com/products/castor-oil-shampoo-conditioner-bar-combo-2pc"


soup = extract_html_with_selenium(url)

main_content = extract_main_content(soup)

if main_content:
    df = extract_currency_text(main_content)

driver.quit()

df

Unnamed: 0,inner-text,counter
0,best seller,1
1,anti-aging,2
2,hydration,3
3,488,4
4,Reviews,5
...,...,...
631,Save,632
632,Rs. 300,633
633,!,634
634,Add To Cart,635


In [26]:
import re

def preprocess_text(text, stopwords, exclude_words):
    """
    Preprocess a given text by applying the following:
    1. Convert text to lowercase.
    2. Remove the text if it ends with a stopword.
    3. Remove the text if it contains excluded words like 'price', 'add to cart', 'review', etc.
    4. Remove the text if it only contains special characters.
    """
    # Convert text to lowercase
    text_lower = text.lower().strip()

    # Remove text if it ends with a stopword
    words = text_lower.split()
    if words and words[-1] in stopwords:
        return None

    # Remove text if it contains excluded words
    if any(word in text_lower for word in exclude_words):
        return None

    # Remove text if it only contains special characters
    if re.fullmatch(r'\W+', text_lower):  # Matches only non-alphanumeric characters
        return None

    return text

stopwords = ['the', 'a', 'an', 'for', 'and', 'or', 'to', 'at', 'by', 'in', 'on', 'with', 'from', 'of', 'per']
exclude_words = ['price', 'add to cart', 'review', 'rating', 'shop', 'off', '$', '₹','sold','sale','title']

df['inner-text'] = df['inner-text'].apply(lambda x: preprocess_text(x, stopwords, exclude_words))

df = df.dropna().reset_index(drop=True)

df

Unnamed: 0,inner-text,counter
0,best seller,1
1,anti-aging,2
2,hydration,3
3,488,4
4,4-in-1 Skincare Wand & Activating Serum Kit,6
...,...,...
436,Replace your lost charger.,629
437,Rs. 600,630
438,Rs. 900,631
439,Save,632


In [27]:
# Fetch Product titles from Products.json

import requests
import json

def get_all_products(base_url, max_limit=100):
    products = []
    page = 1
    
    while True:
        response = requests.get(f"{base_url}?limit={max_limit}&page={page}")
        if response.status_code != 200:
            print(f"Failed to retrieve data for page {page}. Status code: {response.status_code}")
            break
        
        data = response.json()

        if 'products' in data:
            products.extend(data['products'])
            if len(data['products']) < max_limit:
                break
        else:
            break
        
        page += 1
    
    return products

def save_to_json(data, filename):
    # Save the data to a JSON file
    with open(filename, 'w') as json_file:
        json.dump(data, json_file, indent=4)

def extract_product_titles(products):
    return [product['title'] for product in products]

url = "https://global.solawave.co/products.json?limit=1000"

# Fetch all products
all_products = get_all_products(url, max_limit=250)

save_to_json(all_products, "kitsch_products.json")

product_titles = extract_product_titles(all_products)

print(product_titles)


['Radiant Renewal Eye Recovery Pro', '4-in-1 Advanced Skincare Wand (V1)', 'Magnetic Charging Cord Replacement for Radiant Renewal Skincare Wand', 'Radiant Renewal 2-in-1 Skincare Mini', 'Wrinkle Retreat Light Therapy Face Mask', 'Neck & Chest Rejuvenating Mask', 'Radiant Renewal Skincare Wand Replacement Carrying Case', '4-in-1 Red Light Therapy Skincare Wand & Activating Serum Kit', 'Skin Therapy Activating Serum', 'Renew Complex Activating Serum', 'Glowing Skin Starter Set', 'Pre- & Probiotic Nourishing Moisturizer', 'Pre- & Probiotic Plumping Peptide Serum', 'Pre- & Probiotic Refreshing Jelly Mist', 'Pre- & Probiotic Hydrating Gel Cleanser', '4-in-1 Radiant Renewal Skincare Wand with Red Light Therapy', 'Hydrating Sheet Mask', 'Bye Acne: 3-Minute Pimple Spot Treatment']


In [28]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import fuzz
import re


product_titles = [title.lower() for title in product_titles]

sentences = [title.split() for title in product_titles] + [text.split() for text in df['inner-text'].str.lower()]

word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Function to get average Word2Vec embeddings for a sentence
def get_sentence_vector(sentence, model):
    vector = np.zeros(model.vector_size)
    count = 0
    for word in sentence.split():
        if word in model.wv:
            vector += model.wv[word]
            count += 1
    if count > 0:
        vector /= count
    return vector

matched_data = []

product_vectors = np.array([get_sentence_vector(title, word2vec_model) for title in product_titles])

inner_vectors = np.array([get_sentence_vector(text, word2vec_model) for text in df['inner-text'].str.lower()])

# Compute cosine similarity between each inner text and all product titles
for i, inner_vector in enumerate(inner_vectors):
    cosine_similarities = cosine_similarity([inner_vector], product_vectors)
    
    best_match_idx = cosine_similarities.argmax()
    best_match_score = cosine_similarities[0, best_match_idx]
    
    # Set a threshold for the minimum acceptable match score (e.g., 0.2 or 20%)
    if best_match_score >= 0.5:
        matched_data.append((df['inner-text'][i], df['inner-text'][i].lower(), product_titles[best_match_idx], best_match_score))

matched_df = pd.DataFrame(matched_data, columns=['Original Inner Text', 'Lower Inner Text', 'Matched Product Title', 'Cosine Score'])

# Preprocessing function to remove brackets and content inside them
def preprocess_title(title):
    return re.sub(r'\s*\(.*?\)\s*', ' ', title).strip()  # Remove brackets and trim whitespace

matched_df['Matched Product Title'] = matched_df['Matched Product Title'].apply(preprocess_title)

# Now we calculate the fuzzy matching score for each row in `matched_df`
fuzzy_scores = []

for index, row in matched_df.iterrows():
    lower_inner_text = row['Lower Inner Text']
    matched_title = row['Matched Product Title']
    
    fuzzy_score = fuzz.partial_ratio(lower_inner_text, matched_title)
    
    fuzzy_scores.append(fuzzy_score)

matched_df['Fuzzy Score'] = fuzzy_scores

unique_df = matched_df.drop_duplicates(subset=['Original Inner Text', 'Matched Product Title'])

unique_df.reset_index(drop=True, inplace=True)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to calculate cosine similarity using BOW embeddings
def cosine_similarity_bow(sentence1, sentence2):
    vectorizer = CountVectorizer()
    bow_matrix = vectorizer.fit_transform([sentence1, sentence2])
    similarity = cosine_similarity(bow_matrix[0:1], bow_matrix[1:2])
    return similarity[0][0]

# Calculate BOW cosine similarity for each row in `unique_df`
unique_df['BOW-cosine-similarity'] = unique_df.apply(
    lambda row: cosine_similarity_bow(row['Lower Inner Text'], row['Matched Product Title']),
    axis=1
)

unique_df.reset_index(drop=True, inplace=True)
unique_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_df['BOW-cosine-similarity'] = unique_df.apply(


Unnamed: 0,Original Inner Text,Lower Inner Text,Matched Product Title,Cosine Score,Fuzzy Score,BOW-cosine-similarity
0,4-in-1 Skincare Wand & Activating Serum Kit,4-in-1 skincare wand & activating serum kit,4-in-1 red light therapy skincare wand & activ...,0.848641,86,0.816497
1,2-in-1 Skincare Mini,2-in-1 skincare mini,radiant renewal 2-in-1 skincare mini,0.82019,100,0.774597
2,Eye Recovery Pro,eye recovery pro,radiant renewal eye recovery pro,0.783095,100,0.774597
3,Neck & Chest Rejuvenating Mask,neck & chest rejuvenating mask,neck & chest rejuvenating mask,1.0,100,1.0
4,Wrinkle Retreat Light Therapy Face Mask,wrinkle retreat light therapy face mask,wrinkle retreat light therapy face mask,1.0,100,1.0
5,4-in-1 Skincare Wand,4-in-1 skincare wand,4-in-1 advanced skincare wand,0.781102,75,0.866025
6,3-Minute Pimple Spot Treatment,3-minute pimple spot treatment,bye acne: 3-minute pimple spot treatment,0.788414,100,0.816497
7,Harnesses the power of red and blue light ther...,harnesses the power of red and blue light ther...,wrinkle retreat light therapy face mask,0.531496,59,0.226455
8,Skin Therapy Activating Serum,skin therapy activating serum,skin therapy activating serum,1.0,100,1.0
9,Renew Complex Activating Serum,renew complex activating serum,renew complex activating serum,1.0,100,1.0


In [29]:
unique_df['avg_score'] = (unique_df['Cosine Score']*100 + unique_df['Fuzzy Score'] + 100*unique_df['BOW-cosine-similarity'])  /3
df_highest_avg_score = unique_df.loc[unique_df.groupby('Matched Product Title')['avg_score'].idxmax()]
unique_df = df_highest_avg_score.reset_index(drop=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_df['avg_score'] = (unique_df['Cosine Score']*100 + unique_df['Fuzzy Score'] + 100*unique_df['BOW-cosine-similarity'])  /3


In [31]:
unique_df

Unnamed: 0,Original Inner Text,Lower Inner Text,Matched Product Title,Cosine Score,Fuzzy Score,BOW-cosine-similarity,avg_score
0,4-in-1 Skincare Wand,4-in-1 skincare wand,4-in-1 advanced skincare wand,0.781102,75,0.866025,79.904237
1,4-in-1 Skincare Wand & Activating Serum Kit,4-in-1 skincare wand & activating serum kit,4-in-1 red light therapy skincare wand & activ...,0.848641,86,0.816497,84.171237
2,3-Minute Pimple Spot Treatment,3-minute pimple spot treatment,bye acne: 3-minute pimple spot treatment,0.788414,100,0.816497,86.830338
3,Glowing Skin Starter Set,glowing skin starter set,glowing skin starter set,1.0,100,1.0,100.0
4,Magnetic Charging Cord Replacement for Radiant...,magnetic charging cord replacement for radiant...,magnetic charging cord replacement for radiant...,1.0,100,1.0,100.0
5,Neck & Chest Rejuvenating Mask,neck & chest rejuvenating mask,neck & chest rejuvenating mask,1.0,100,1.0,100.0
6,Nourishing Moisturizer,nourishing moisturizer,pre- & probiotic nourishing moisturizer,0.670376,100,0.707107,79.24942
7,Plumping Peptide Serum,plumping peptide serum,pre- & probiotic plumping peptide serum,0.702726,100,0.774597,82.577422
8,Refreshing Jelly Mist,refreshing jelly mist,pre- & probiotic refreshing jelly mist,0.739215,100,0.774597,83.793729
9,2-in-1 Skincare Mini,2-in-1 skincare mini,radiant renewal 2-in-1 skincare mini,0.82019,100,0.774597,86.49288


In [32]:
unique_df = unique_df[unique_df['avg_score']>75]
unique_df.reset_index(drop=True, inplace=True)

In [33]:
unique_df

Unnamed: 0,Original Inner Text,Lower Inner Text,Matched Product Title,Cosine Score,Fuzzy Score,BOW-cosine-similarity,avg_score
0,4-in-1 Skincare Wand,4-in-1 skincare wand,4-in-1 advanced skincare wand,0.781102,75,0.866025,79.904237
1,4-in-1 Skincare Wand & Activating Serum Kit,4-in-1 skincare wand & activating serum kit,4-in-1 red light therapy skincare wand & activ...,0.848641,86,0.816497,84.171237
2,3-Minute Pimple Spot Treatment,3-minute pimple spot treatment,bye acne: 3-minute pimple spot treatment,0.788414,100,0.816497,86.830338
3,Glowing Skin Starter Set,glowing skin starter set,glowing skin starter set,1.0,100,1.0,100.0
4,Magnetic Charging Cord Replacement for Radiant...,magnetic charging cord replacement for radiant...,magnetic charging cord replacement for radiant...,1.0,100,1.0,100.0
5,Neck & Chest Rejuvenating Mask,neck & chest rejuvenating mask,neck & chest rejuvenating mask,1.0,100,1.0,100.0
6,Nourishing Moisturizer,nourishing moisturizer,pre- & probiotic nourishing moisturizer,0.670376,100,0.707107,79.24942
7,Plumping Peptide Serum,plumping peptide serum,pre- & probiotic plumping peptide serum,0.702726,100,0.774597,82.577422
8,Refreshing Jelly Mist,refreshing jelly mist,pre- & probiotic refreshing jelly mist,0.739215,100,0.774597,83.793729
9,2-in-1 Skincare Mini,2-in-1 skincare mini,radiant renewal 2-in-1 skincare mini,0.82019,100,0.774597,86.49288


In [35]:

df = unique_df

extracted_data = pd.read_csv('extracted_data_solawave.csv')

# Convert to lowercase for matching
df['Matched Product Title'] = df['Matched Product Title'].str.lower()
df['Lower Inner Text'] = df['Lower Inner Text'].str.lower()
extracted_data['text'] = extracted_data['text'].str.lower()

# Initialize the xpath column
df['xpath'] = ''

for index, row in df.iterrows():
    matched_title = row['Matched Product Title']
    fuzzy_score = row['Fuzzy Score']
    lower_inner_text = row['Lower Inner Text']
    
    #  First search for h1-h6 and p tags
    matches = extracted_data[(extracted_data['text'] == matched_title) & 
                             (extracted_data['tag'].isin(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']))]
    
    # If no match is found, search for 'a' tags
    if matches.empty:
        matches = extracted_data[(extracted_data['text'] == matched_title) & (extracted_data['tag'] == 'a')]
    
    # If no match is found and the fuzzy score is 100, retry with Lower Inner Text
    if matches.empty and fuzzy_score >= 75:
        # Step 4: First search for h1-h6 and p tags using Lower Inner Text
        matches = extracted_data[(extracted_data['text'] == lower_inner_text) & 
                                 (extracted_data['tag'].isin(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']))]
        
        # If still no match, search for 'a' tags using Lower Inner Text
        if matches.empty:
            matches = extracted_data[(extracted_data['text'] == lower_inner_text) & (extracted_data['tag'] == 'a')]
    
    if not matches.empty:
        df.at[index, 'xpath'] = matches.iloc[0]['xpath']
    else:
        df.at[index, 'xpath'] = None

df_filtered = df.dropna(subset=['xpath'])

df_filtered.reset_index(drop=True, inplace=True)

df_filtered


Unnamed: 0,Original Inner Text,Lower Inner Text,Matched Product Title,Cosine Score,Fuzzy Score,BOW-cosine-similarity,avg_score,xpath
0,4-in-1 Skincare Wand,4-in-1 skincare wand,4-in-1 advanced skincare wand,0.781102,75,0.866025,79.904237,/html/body/div[1]/div/main/div/section[2]/ul/l...
1,4-in-1 Skincare Wand & Activating Serum Kit,4-in-1 skincare wand & activating serum kit,4-in-1 red light therapy skincare wand & activ...,0.848641,86,0.816497,84.171237,/html/body/div[1]/div/main/div/section[2]/ul/l...
2,3-Minute Pimple Spot Treatment,3-minute pimple spot treatment,bye acne: 3-minute pimple spot treatment,0.788414,100,0.816497,86.830338,/html/body/div[1]/div/main/div/section[2]/ul/l...
3,Glowing Skin Starter Set,glowing skin starter set,glowing skin starter set,1.0,100,1.0,100.0,/html/body/div[1]/div/main/div/section[3]/ul/l...
4,Magnetic Charging Cord Replacement for Radiant...,magnetic charging cord replacement for radiant...,magnetic charging cord replacement for radiant...,1.0,100,1.0,100.0,/html/body/div[1]/div/main/div/section[4]/ul/l...
5,Neck & Chest Rejuvenating Mask,neck & chest rejuvenating mask,neck & chest rejuvenating mask,1.0,100,1.0,100.0,/html/body/div[1]/div/div[1]/div[3]/div[2]/div...
6,Nourishing Moisturizer,nourishing moisturizer,pre- & probiotic nourishing moisturizer,0.670376,100,0.707107,79.24942,/html/body/div[1]/div/main/div/section[3]/ul/l...
7,Plumping Peptide Serum,plumping peptide serum,pre- & probiotic plumping peptide serum,0.702726,100,0.774597,82.577422,/html/body/div[1]/div/main/div/section[3]/ul/l...
8,Refreshing Jelly Mist,refreshing jelly mist,pre- & probiotic refreshing jelly mist,0.739215,100,0.774597,83.793729,/html/body/div[1]/div/main/div/section[3]/ul/l...
9,2-in-1 Skincare Mini,2-in-1 skincare mini,radiant renewal 2-in-1 skincare mini,0.82019,100,0.774597,86.49288,/html/body/div[1]/div/main/div/section[2]/ul/l...


In [45]:
# Finding xpaths


from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()

# Open the webpage where the products are listed
driver.get("https://global.solawave.co/collections/shop-all")

xpaths = df_filtered['xpath']

section_ids = []
for xpath in xpaths:
    try:
        print(f"Trying XPath: {xpath}")  
        # Find the element by XPath
        element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, xpath)))

        # Traverse upwards to find the nearest parent <section> or <div> with an ID containing "shopify-section-template"
        try:
            parent_section = element.find_element(By.XPATH, 'ancestor::*[self::section or self::div][contains(@id, "shopify-section-template")]')
            section_id = parent_section.get_attribute('id')
            print(f"Found Section ID: {section_id}")
        except Exception as e:
            # If no section ID is found, try using class attributes as a fallback
            print(f"ID not found for {xpath}. Trying class...")
            parent_section = element.find_element(By.XPATH, 'ancestor::*[self::section or self::div][contains(@class, "shopify-section-template")]')
            section_id = parent_section.get_attribute('class')
            print(f"Found Section Class: {section_id}")
        
        section_ids.append(section_id)
        
    except Exception as e:
        print(f"Failed for XPath: {xpath} - {str(e)}")
        section_ids.append(None)

driver.quit()

print(section_ids)

Trying XPath: /html/body/div[1]/div/main/div/section[2]/ul/li[6]/form/div/a[2]/h4
Failed for XPath: /html/body/div[1]/div/main/div/section[2]/ul/li[6]/form/div/a[2]/h4 - Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=129.0.6668.70)
Stacktrace:
	GetHandleVerifier [0x00007FF7ACF8B125+29573]
	(No symbol) [0x00007FF7ACEFFF50]
	(No symbol) [0x00007FF7ACDBB6EA]
	(No symbol) [0x00007FF7ACD8FCD5]
	(No symbol) [0x00007FF7ACE3EF67]
	(No symbol) [0x00007FF7ACE57FC1]
	(No symbol) [0x00007FF7ACE370A3]
	(No symbol) [0x00007FF7ACE012DF]
	(No symbol) [0x00007FF7ACE02441]
	GetHandleVerifier [0x00007FF7AD2BC76D+3377613]
	GetHandleVerifier [0x00007FF7AD307B67+3685831]
	GetHandleVerifier [0x00007FF7AD2FCF8B+3641835]
	GetHandleVerifier [0x00007FF7AD04B2A6+816390]
	(No symbol) [0x00007FF7ACF0B25F]
	(No symbol) [0x00007FF7ACF07084]
	(No symbol) [0x00007FF7ACF07220]
	(No symbol) [0x00007FF7ACEF607F]
	BaseThreadInitThunk [0x00007FFFE01B257D+

In [37]:
list(section_ids)

['shopify-section-template--17078123823280__new_collection_category_6z8X4g',
 'shopify-section-template--17078123823280__new_collection_category_6z8X4g',
 'shopify-section-template--17078123823280__new_collection_category_6z8X4g',
 'shopify-section-template--17078123823280__new_collection_category_kqkBqp',
 'shopify-section-template--17078123823280__new_collection_category_Eqhmh8',
 None,
 'shopify-section-template--17078123823280__new_collection_category_kqkBqp',
 'shopify-section-template--17078123823280__new_collection_category_kqkBqp',
 'shopify-section-template--17078123823280__new_collection_category_kqkBqp',
 'shopify-section-template--17078123823280__new_collection_category_6z8X4g',
 'shopify-section-template--17078123823280__new_collection_category_6z8X4g',
 'shopify-section-template--17078123823280__new_collection_category_Eqhmh8',
 'shopify-section-template--17078123823280__new_collection_category_kqkBqp',
 'shopify-section-template--17078123823280__new_collection_category_k

In [38]:
df_filtered['section_id'] = section_ids
df_filtered

Unnamed: 0,Original Inner Text,Lower Inner Text,Matched Product Title,Cosine Score,Fuzzy Score,BOW-cosine-similarity,avg_score,xpath,section_id
0,4-in-1 Skincare Wand,4-in-1 skincare wand,4-in-1 advanced skincare wand,0.781102,75,0.866025,79.904237,/html/body/div[1]/div/main/div/section[2]/ul/l...,shopify-section-template--17078123823280__new_...
1,4-in-1 Skincare Wand & Activating Serum Kit,4-in-1 skincare wand & activating serum kit,4-in-1 red light therapy skincare wand & activ...,0.848641,86,0.816497,84.171237,/html/body/div[1]/div/main/div/section[2]/ul/l...,shopify-section-template--17078123823280__new_...
2,3-Minute Pimple Spot Treatment,3-minute pimple spot treatment,bye acne: 3-minute pimple spot treatment,0.788414,100,0.816497,86.830338,/html/body/div[1]/div/main/div/section[2]/ul/l...,shopify-section-template--17078123823280__new_...
3,Glowing Skin Starter Set,glowing skin starter set,glowing skin starter set,1.0,100,1.0,100.0,/html/body/div[1]/div/main/div/section[3]/ul/l...,shopify-section-template--17078123823280__new_...
4,Magnetic Charging Cord Replacement for Radiant...,magnetic charging cord replacement for radiant...,magnetic charging cord replacement for radiant...,1.0,100,1.0,100.0,/html/body/div[1]/div/main/div/section[4]/ul/l...,shopify-section-template--17078123823280__new_...
5,Neck & Chest Rejuvenating Mask,neck & chest rejuvenating mask,neck & chest rejuvenating mask,1.0,100,1.0,100.0,/html/body/div[1]/div/div[1]/div[3]/div[2]/div...,
6,Nourishing Moisturizer,nourishing moisturizer,pre- & probiotic nourishing moisturizer,0.670376,100,0.707107,79.24942,/html/body/div[1]/div/main/div/section[3]/ul/l...,shopify-section-template--17078123823280__new_...
7,Plumping Peptide Serum,plumping peptide serum,pre- & probiotic plumping peptide serum,0.702726,100,0.774597,82.577422,/html/body/div[1]/div/main/div/section[3]/ul/l...,shopify-section-template--17078123823280__new_...
8,Refreshing Jelly Mist,refreshing jelly mist,pre- & probiotic refreshing jelly mist,0.739215,100,0.774597,83.793729,/html/body/div[1]/div/main/div/section[3]/ul/l...,shopify-section-template--17078123823280__new_...
9,2-in-1 Skincare Mini,2-in-1 skincare mini,radiant renewal 2-in-1 skincare mini,0.82019,100,0.774597,86.49288,/html/body/div[1]/div/main/div/section[2]/ul/l...,shopify-section-template--17078123823280__new_...


In [42]:
with open('solawave_products.json', 'r') as f:
    products_data = json.load(f)

# Create a dictionary mapping product titles to their handles
# https://global.solawave.co/collections/shop-all
product_url_dict = {product['title'].lower(): f"https://global.solawave.co//products/{product['handle']}" for product in products_data['products']}

df_filtered['product_url'] = df_filtered['Matched Product Title'].apply(lambda title: product_url_dict.get(title.lower(), 'URL not found'))

df_filtered

Unnamed: 0,Original Inner Text,Lower Inner Text,Matched Product Title,Cosine Score,Fuzzy Score,BOW-cosine-similarity,avg_score,xpath,section_id,product_url
0,4-in-1 Skincare Wand,4-in-1 skincare wand,4-in-1 advanced skincare wand,0.781102,75,0.866025,79.904237,/html/body/div[1]/div/main/div/section[2]/ul/l...,shopify-section-template--17078123823280__new_...,URL not found
1,4-in-1 Skincare Wand & Activating Serum Kit,4-in-1 skincare wand & activating serum kit,4-in-1 red light therapy skincare wand & activ...,0.848641,86,0.816497,84.171237,/html/body/div[1]/div/main/div/section[2]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/radiant-r...
2,3-Minute Pimple Spot Treatment,3-minute pimple spot treatment,bye acne: 3-minute pimple spot treatment,0.788414,100,0.816497,86.830338,/html/body/div[1]/div/main/div/section[2]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/bye-acne-...
3,Glowing Skin Starter Set,glowing skin starter set,glowing skin starter set,1.0,100,1.0,100.0,/html/body/div[1]/div/main/div/section[3]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/solabiome...
4,Magnetic Charging Cord Replacement for Radiant...,magnetic charging cord replacement for radiant...,magnetic charging cord replacement for radiant...,1.0,100,1.0,100.0,/html/body/div[1]/div/main/div/section[4]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/radiant-r...
5,Neck & Chest Rejuvenating Mask,neck & chest rejuvenating mask,neck & chest rejuvenating mask,1.0,100,1.0,100.0,/html/body/div[1]/div/div[1]/div[3]/div[2]/div...,,https://global.solawave.co//products/red-light...
6,Nourishing Moisturizer,nourishing moisturizer,pre- & probiotic nourishing moisturizer,0.670376,100,0.707107,79.24942,/html/body/div[1]/div/main/div/section[3]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/nourishin...
7,Plumping Peptide Serum,plumping peptide serum,pre- & probiotic plumping peptide serum,0.702726,100,0.774597,82.577422,/html/body/div[1]/div/main/div/section[3]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/plumping-...
8,Refreshing Jelly Mist,refreshing jelly mist,pre- & probiotic refreshing jelly mist,0.739215,100,0.774597,83.793729,/html/body/div[1]/div/main/div/section[3]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/refreshin...
9,2-in-1 Skincare Mini,2-in-1 skincare mini,radiant renewal 2-in-1 skincare mini,0.82019,100,0.774597,86.49288,/html/body/div[1]/div/main/div/section[2]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/radiant-r...


In [43]:
df_filtered = df_filtered.drop(columns=['Cosine Score', 'Fuzzy Score','BOW-cosine-similarity','Original Inner Text'])
df_filtered

Unnamed: 0,Lower Inner Text,Matched Product Title,avg_score,xpath,section_id,product_url
0,4-in-1 skincare wand,4-in-1 advanced skincare wand,79.904237,/html/body/div[1]/div/main/div/section[2]/ul/l...,shopify-section-template--17078123823280__new_...,URL not found
1,4-in-1 skincare wand & activating serum kit,4-in-1 red light therapy skincare wand & activ...,84.171237,/html/body/div[1]/div/main/div/section[2]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/radiant-r...
2,3-minute pimple spot treatment,bye acne: 3-minute pimple spot treatment,86.830338,/html/body/div[1]/div/main/div/section[2]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/bye-acne-...
3,glowing skin starter set,glowing skin starter set,100.0,/html/body/div[1]/div/main/div/section[3]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/solabiome...
4,magnetic charging cord replacement for radiant...,magnetic charging cord replacement for radiant...,100.0,/html/body/div[1]/div/main/div/section[4]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/radiant-r...
5,neck & chest rejuvenating mask,neck & chest rejuvenating mask,100.0,/html/body/div[1]/div/div[1]/div[3]/div[2]/div...,,https://global.solawave.co//products/red-light...
6,nourishing moisturizer,pre- & probiotic nourishing moisturizer,79.24942,/html/body/div[1]/div/main/div/section[3]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/nourishin...
7,plumping peptide serum,pre- & probiotic plumping peptide serum,82.577422,/html/body/div[1]/div/main/div/section[3]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/plumping-...
8,refreshing jelly mist,pre- & probiotic refreshing jelly mist,83.793729,/html/body/div[1]/div/main/div/section[3]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/refreshin...
9,2-in-1 skincare mini,radiant renewal 2-in-1 skincare mini,86.49288,/html/body/div[1]/div/main/div/section[2]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/radiant-r...


In [44]:
df_filtered = df_filtered.drop(columns=['avg_score'])
df_filtered

Unnamed: 0,Lower Inner Text,Matched Product Title,xpath,section_id,product_url
0,4-in-1 skincare wand,4-in-1 advanced skincare wand,/html/body/div[1]/div/main/div/section[2]/ul/l...,shopify-section-template--17078123823280__new_...,URL not found
1,4-in-1 skincare wand & activating serum kit,4-in-1 red light therapy skincare wand & activ...,/html/body/div[1]/div/main/div/section[2]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/radiant-r...
2,3-minute pimple spot treatment,bye acne: 3-minute pimple spot treatment,/html/body/div[1]/div/main/div/section[2]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/bye-acne-...
3,glowing skin starter set,glowing skin starter set,/html/body/div[1]/div/main/div/section[3]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/solabiome...
4,magnetic charging cord replacement for radiant...,magnetic charging cord replacement for radiant...,/html/body/div[1]/div/main/div/section[4]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/radiant-r...
5,neck & chest rejuvenating mask,neck & chest rejuvenating mask,/html/body/div[1]/div/div[1]/div[3]/div[2]/div...,,https://global.solawave.co//products/red-light...
6,nourishing moisturizer,pre- & probiotic nourishing moisturizer,/html/body/div[1]/div/main/div/section[3]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/nourishin...
7,plumping peptide serum,pre- & probiotic plumping peptide serum,/html/body/div[1]/div/main/div/section[3]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/plumping-...
8,refreshing jelly mist,pre- & probiotic refreshing jelly mist,/html/body/div[1]/div/main/div/section[3]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/refreshin...
9,2-in-1 skincare mini,radiant renewal 2-in-1 skincare mini,/html/body/div[1]/div/main/div/section[2]/ul/l...,shopify-section-template--17078123823280__new_...,https://global.solawave.co//products/radiant-r...
