<a href="https://colab.research.google.com/github/Zia-Ul-Hasan/NLP_SEMACNTIC_PRODUCT_SEARCH/blob/main/clean_data_semantic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# data_cleaning.py

import pandas as pd
import re
import os

# Load the three tab-separated source tables (the files carry a .csv extension
# but are read with a tab delimiter).
labels_df = pd.read_csv("label.csv", sep='\t')
queries_df = pd.read_csv("query.csv", sep='\t')
products_df = pd.read_csv("product.csv", sep='\t')

# Start from the label rows, left-join the query and product tables onto them,
# then build one combined product text column. Missing name / description /
# feature values are replaced by '' so the concatenation never yields NaN.
merged_df = (
    labels_df
    .merge(queries_df, how='left', on='query_id')
    .merge(products_df, how='left', on='product_id')
    .assign(
        product_text=lambda frame: (
            frame['product_name'].fillna('')
            + ' '
            + frame['product_description'].fillna('')
            + ' '
            + frame['product_features'].fillna('')
        )
    )
)

# Clean text: lowercase, remove punctuation, normalize spaces
def clean_text(text):
    """Normalize a raw value into lowercase alphanumeric tokens.

    The value is lowercased, every character other than ``a-z``, ``0-9`` or
    whitespace is replaced by a space (so punctuation and non-ASCII letters
    are dropped), runs of whitespace are collapsed to a single space, and
    leading/trailing whitespace is removed.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``.
            ``None`` and float NaN are treated as missing.

    Returns:
        str: The cleaned text, or ``''`` when the input is missing.
    """
    # Without this guard str() would turn missing values into the literal
    # tokens "none" / "nan", which would then look like real text.
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Columns that must be present (non-null) for a row to be kept
required_for_model = ['query_class', 'product_class', 'category hierarchy', 'product_description']

# Normalize the two free-text columns, then discard rows missing any required
# column and renumber the index from 0.
merged_df = (
    merged_df
    .assign(
        query=lambda frame: frame['query'].apply(clean_text),
        product_text=lambda frame: frame['product_text'].apply(clean_text),
    )
    .dropna(subset=required_for_model)
    .reset_index(drop=True)
)

# Persist the cleaned table without the index column
output_path = "cleaned_merged_products.csv"
merged_df.to_csv(output_path, index=False)

print(f" Final shape: {merged_df.shape}")


 Final shape: (181907, 15)


In [6]:
# Plain-text preview of the first five rows of the cleaned table
print(merged_df.head(n=5))

   id  query_id  product_id       label        query     query_class  \
0   0         0       25434       Exact  salon chair  Massage Chairs   
1   1         0       12088  Irrelevant  salon chair  Massage Chairs   
2   2         0       42931       Exact  salon chair  Massage Chairs   
3   3         0        2636       Exact  salon chair  Massage Chairs   
4   5         0       41156       Exact  salon chair  Massage Chairs   

                                        product_name  \
0       21.7 '' w waiting room chair with wood frame   
1                  22.5 '' wide polyester side chair   
2      24.4 '' w metal lounge chair with metal frame   
3  25 '' wide faux leather manual swivel standard...   
4  31.6 '' wide faux leather manual swivel ergono...   

              product_class  \
0       Waiting Room Chairs   
1             Accent Chairs   
2  Reception Seating Chairs   
3                 Recliners   
4                 Recliners   

                                  category 

In [8]:
# Columns removed from the in-memory table from this point on
columns_to_drop = ['id', 'query_id', 'rating_count', 'review_count']

# errors='ignore' skips any listed column that is not present, so this cell
# can be re-run after the columns have already been removed.
merged_df = merged_df.drop(columns=columns_to_drop, errors='ignore')

In [9]:
# Show the first five rows after the column drop
merged_df.head(n=5)

Unnamed: 0,product_id,label,query,query_class,product_name,product_class,category hierarchy,product_description,product_features,average_rating,product_text
0,25434,Exact,salon chair,Massage Chairs,21.7 '' w waiting room chair with wood frame,Waiting Room Chairs,Commercial Business Furniture / Commercial Off...,"this is a salon chair , barber chair for a hai...",backupholsterycolor : champagne|primarymateria...,,21 7 w waiting room chair with wood frame this...
1,12088,Irrelevant,salon chair,Massage Chairs,22.5 '' wide polyester side chair,Accent Chairs,Furniture / Living Room Furniture / Chairs & S...,add a beautiful accent to any room with this m...,overalldepth-fronttoback:27.5|design : side ch...,,22 5 wide polyester side chair add a beautiful...
2,42931,Exact,salon chair,Massage Chairs,24.4 '' w metal lounge chair with metal frame,Reception Seating Chairs,Shop Product Type / Chairs / Guest & Reception...,the heavy duty barber chair is built to last ....,color : black|seatcushionorupholsteryfillmater...,4.0,24 4 w metal lounge chair with metal frame the...
3,2636,Exact,salon chair,Massage Chairs,25 '' wide faux leather manual swivel standard...,Recliners,Furniture / Living Room Furniture / Chairs & S...,this is a chair designed for your barbershop ....,design : standard recliner|warrantylength:60 d...,5.0,25 wide faux leather manual swivel standard re...
4,41156,Exact,salon chair,Massage Chairs,31.6 '' wide faux leather manual swivel ergono...,Recliners,Furniture / Living Room Furniture / Chairs & S...,this barber chair would be a perfect choice fo...,positiontype:3-position|supplierintendedandapp...,,31 6 wide faux leather manual swivel ergonomic...
