In [2]:
import pandas as pd

In [3]:
# Load the tab-separated product catalogue into the working frame `df`.
# NOTE(review): the displayed frame contains an `Unnamed: 0` column, which
# suggests the file carries a saved index column — consider `index_col=0`
# once it is confirmed that nothing downstream relies on that column.
product_csv_path = "dataset/product.csv"
df = pd.read_csv(product_csv_path, sep="\t")

In [4]:
# Show the loaded frame as the cell's rich output (pandas elides the middle
# rows, so only the first and last few records are rendered).
df

Unnamed: 0,product_id,product_name,product_class,category hierarchy,product_description,product_features,rating_count,average_rating,review_count
0,0,solid wood platform bed,Beds,Furniture / Bedroom Furniture / Beds & Headboa...,"good , deep sleep can be quite difficult to ha...",overallwidth-sidetoside:64.7|dsprimaryproducts...,15.0,4.5,15.0
1,1,all-clad 7 qt . slow cooker,Slow Cookers,Kitchen & Tabletop / Small Kitchen Appliances ...,"create delicious slow-cooked meals , from tend...",capacityquarts:7|producttype : slow cooker|pro...,100.0,2.0,98.0
2,2,all-clad electrics 6.5 qt . slow cooker,Slow Cookers,Kitchen & Tabletop / Small Kitchen Appliances ...,prepare home-cooked meals on any schedule with...,features : keep warm setting|capacityquarts:6....,208.0,3.0,181.0
3,3,all-clad all professional tools pizza cutter,"Slicers, Peelers And Graters",Browse By Brand / All-Clad,this original stainless tool was designed to c...,overallwidth-sidetoside:3.5|warrantylength : l...,69.0,4.5,42.0
4,4,baldwin prestige alcott passage knob with roun...,Door Knobs,Home Improvement / Doors & Door Hardware / Doo...,the hardware has a rich heritage of delivering...,compatibledoorthickness:1.375 '' |countryofori...,70.0,5.0,42.0
...,...,...,...,...,...,...,...,...,...
42989,42989,malibu pressure balanced diverter fixed shower...,Shower Panels,Home Improvement / Bathroom Remodel & Bathroom...,the malibu pressure balanced diverter fixed sh...,producttype : shower panel|spraypattern : rain...,3.0,4.5,2.0
42990,42990,emmeline 5 piece breakfast dining set,Dining Table Sets,Furniture / Kitchen & Dining Furniture / Dinin...,,basematerialdetails : steel| : gray wood|ofhar...,1314.0,4.5,864.0
42991,42991,maloney 3 piece pub table set,Dining Table Sets,Furniture / Kitchen & Dining Furniture / Dinin...,this pub table set includes 1 counter height t...,additionaltoolsrequirednotincluded : power dri...,49.0,4.0,41.0
42992,42992,fletcher 27.5 '' wide polyester armchair,Teen Lounge Furniture|Accent Chairs,Furniture / Living Room Furniture / Chairs & S...,"bring iconic , modern style to your space in a...",legmaterialdetails : rubberwood|backheight-sea...,1746.0,4.5,1226.0


In [7]:
import pandas as pd

# Step 0: `df` is expected to exist already, loaded in an earlier cell with:
# df = pd.read_csv("dataset/product.csv", sep="\t")

# Step 1: Text columns that are checked for missing content
target_cols = ["product_name", "product_description", "product_features"]


def _nan_and_blank_counts(series):
    """Return (NaN count, whitespace-only count) for one column.

    Values are cast to str before stripping, so NaN entries never compare
    equal to the empty string and are reported only in the NaN count.
    """
    nan_count = series.isna().sum()
    blank_count = series.astype(str).str.strip().eq("").sum()
    return nan_count, blank_count


# Step 2: Print the NaN / blank tallies column by column
print("Missing or empty value counts:")
for col in target_cols:
    missing, empty = _nan_and_blank_counts(df[col])
    total_missing = missing + empty
    print(f"{col}: {total_missing} (NaN: {missing}, Empty: {empty})")


Missing or empty value counts:
product_name: 0 (NaN: 0, Empty: 0)
product_description: 6008 (NaN: 6008, Empty: 0)
product_features: 0 (NaN: 0, Empty: 0)


In [8]:
# Step 3: Drop rows where any of the target columns is NaN or a blank string.
before_shape = df.shape

# A value is unusable when it is NaN or consists only of whitespace.
# The blank check casts to str first (same approach as the counting cell), so
# it also works on columns that are not string-typed; NaN cells never compare
# equal to "" after the cast and are caught by the isna() mask instead.
is_nan = df[target_cols].isna()
is_blank = df[target_cols].apply(lambda s: s.astype(str).str.strip() == "")
unusable_row = (is_nan | is_blank).any(axis=1)

# Explicit .copy(): df_clean must be an independent frame, otherwise adding
# columns to it later triggers pandas' SettingWithCopyWarning.
df_clean = df.loc[~unusable_row].copy()
after_shape = df_clean.shape

# Step 4: Show how many rows were removed
print(f"\nOriginal shape: {before_shape}")
print(f"Cleaned shape: {after_shape}")
print(f"Rows removed: {before_shape[0] - after_shape[0]}")



Original shape: (42994, 9)
Cleaned shape: (36986, 9)
Rows removed: 6008


In [11]:
# Compute character lengths as standalone Series instead of writing helper
# columns into df_clean: df_clean is a filtered slice, so assigning columns to
# it raises SettingWithCopyWarning and leaves the helpers behind on df_clean.
# The Series are renamed so the describe() footer still reads
# "Name: name_char_len" / "Name: desc_char_len".
name_char_len = df_clean['product_name'].astype(str).str.len().rename('name_char_len')
desc_char_len = df_clean['product_description'].astype(str).str.len().rename('desc_char_len')

# Print some stats
print("\nCharacter length stats:")
print("product_name chars:\n", name_char_len.describe())
print("product_description chars:\n", desc_char_len.describe())

# Define thresholds (adjust as needed): minimum number of characters a name /
# description must have for the row to be kept.
name_char_threshold = 20
desc_char_threshold = 100

# Keep only rows where BOTH the name and the description are long enough
char_mask = (
    (name_char_len >= name_char_threshold) &
    (desc_char_len >= desc_char_threshold)
)

# errors="ignore": the helper columns are normally absent now, but a df_clean
# left over from an earlier run of the old cell may still carry them.
df_info_char = (
    df_clean.loc[char_mask]
    .drop(columns=['name_char_len', 'desc_char_len'], errors='ignore')
    .copy()
)

print(f"\nRemoved {len(df_clean) - len(df_info_char)} low-information rows by char count. Remaining: {len(df_info_char)}")



Character length stats:
product_name chars:
 count    36986.000000
mean        40.603796
std         21.090450
min          3.000000
25%         28.000000
50%         37.000000
75%         49.000000
max        257.000000
Name: name_char_len, dtype: float64
product_description chars:
 count    36986.000000
mean       458.090115
std        299.150924
min          6.000000
25%        259.000000
50%        418.000000
75%        554.000000
max       4060.000000
Name: desc_char_len, dtype: float64

Removed 3913 low-information rows by char count. Remaining: 33073


In [None]:
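## Optional alternative: token-level (whitespace-split) length filtering.
## The code below is left commented out; the token statistics shown after this
## cell appear to come from an earlier run with it enabled.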
# # Count tokens (very basic word-level tokenization by whitespace)
# df_clean['name_token_len'] = df_clean['product_name'].astype(str).str.split().str.len()
# df_clean['desc_token_len'] = df_clean['product_description'].astype(str).str.split().str.len()

# # Print some stats
# print("\nToken length stats:")
# print("product_name tokens:\n", df_clean['name_token_len'].describe())
# print("product_description tokens:\n", df_clean['desc_token_len'].describe())

# # Define thresholds (can tweak as needed)
# name_threshold = 5
# desc_threshold = 10

# # Filter low-information rows
# info_mask = (df_clean['name_token_len'] >= name_threshold) & (df_clean['desc_token_len'] >= desc_threshold)
# df_info = df_clean[info_mask].copy()

# # Drop helper columns
# df_info.drop(['name_token_len', 'desc_token_len'], axis=1, inplace=True)

# print(f"\nRemoved {len(df_clean) - len(df_info)} low-information rows. Remaining: {len(df_info)}")



Token length stats:
product_name tokens:
 count    36986.000000
mean         6.767534
std          3.837314
min          1.000000
25%          4.000000
50%          6.000000
75%          8.000000
max         49.000000
Name: name_token_len, dtype: float64
product_description tokens:
 count    36986.000000
mean        83.023036
std         54.795643
min          1.000000
25%         46.000000
50%         76.000000
75%        102.000000
max        791.000000
Name: desc_token_len, dtype: float64

Removed 9766 low-information rows. Remaining: 27220
