### Pipeline to Transform Data

Assuming file is already present in notebooks folder

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from dotenv import load_dotenv
import os
from IPython.display import display
import re

In [3]:
df = pd.read_csv("Tyroo-dummy-data.csv", engine='pyarrow')

In [4]:
def print_bad_rows(df):
    col_names = set(df.columns.astype(str)) #get column names as strings

    mask = df.astype(str).isin(col_names) #convert all values to string (temporary) and check if they match any column name

    bad_row_indices = mask.any(axis=1) #identify rows with any such match

    bad_rows = df[bad_row_indices] #extract and count bad rows
    print(f"Found {bad_rows.shape[0]} suspicious rows.")
    display(bad_rows)
    return bad_row_indices

In [5]:
bad_row_indices = print_bad_rows(df)

Found 3 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price
5000,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price
10001,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price
15002,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


In [6]:
df = df[~bad_row_indices].reset_index(drop=True)

In [7]:
num_cols=[
    'platform_commission_rate', 'product_commission_rate', 
    'bonus_commission_rate', 'promotion_price', 'current_price', 
    'price', 'discount_percentage', 'number_of_reviews', 
    'rating_avg_value', 'seller_rating'
]

In [8]:
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0)

In [9]:
df['is_free_shipping'] = df['is_free_shipping'].fillna('0').astype(bool)
df['availability'] = df['availability'].fillna('out of stock')
df['availability'] = df['availability'].map({'in stock': True, 'out of stock': False})

In [10]:
image_cols = [col for col in df.columns if 'img' in col or 'image_url' in col]
df[image_cols] = df[image_cols].fillna('')

In [11]:
df['deeplink'] = df['deeplink'].fillna('')
df['product_url'] = df['product_url'].fillna('')
df['seller_url'] = df['seller_url'].fillna('')

In [12]:
text_cols = [
    'venture_category3_name_en', 'venture_category2_name_en', 'venture_category1_name_en',
    'venture_category_name_local', 'brand_name', 'business_type', 'business_area',
    'product_name', 'seller_name'
]

df[text_cols] = df[text_cols].fillna('Unknown')

In [13]:
df['description'] = df['description'].fillna(df['product_name'])

In [14]:
def clean_description(text, product_name):
    if pd.isna(text):
        return product_name  # Return product_name if description is NaN
    
    text = str(text).strip().lower()
    
    junk_patterns = [               # Known garbage patterns
        r'^(&nbsp;)+$',             # only non-breaking spaces
        r'^-+$',                    # dashes only
        r'^\.{1,2}$',               # just . or ..
        r'^welcome to my shop.*$',  # shop templates
        r'^www.*$',                 # website urls
        r'^$',                      # empty strings
        r'^No description currently*$',  #pattern
        r'^no description currently*$', #pattern
        r'^_*$',                    # underscore only
    ]
    
    for pattern in junk_patterns:
        if re.match(pattern, text):
            return product_name  # Return product_name instead of NaN

    text = re.sub(r'&[a-z]+;', '', text)    # Strip HTML entities like &nbsp;

    text = re.sub(r'\s{2,}', ' ', text)     # Remove excessive spaces or dots
    text = text.strip()
    
    if len(text) < 10:  # If after cleaning it's too short, return product_name
        return product_name
    
    return text

# Apply the function row-wise, passing both description and product_name
df['description'] = df.apply(lambda row: clean_description(row['description'], row['product_name']), axis=1)


In [15]:
new_df = df[["product_name", "description"]].copy()

In [16]:
def fast_clean(series):
    return (
        series
        .fillna('')                               # Handle NaNs
        .str.encode('ascii', errors='ignore')     # Remove non-ASCII
        .str.decode('ascii')                      # Decode back to str
        .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)  # Remove special chars
        .str.replace(r'\s+', ' ', regex=True)     # Normalize whitespace
        .str.strip()                              # Remove leading/trailing spaces
        .str.lower()                              # Optional: lowercase
    )


In [17]:
new_df['product_name'] = fast_clean(new_df['product_name'])
new_df['description'] = fast_clean(new_df['description'])

In [18]:
from sklearn.feature_extraction.text import HashingVectorizer

vectorizer = HashingVectorizer(
    stop_words='english',
    n_features=2**10,
    alternate_sign=False,
    norm='l2'                  # ensures vectors are normalized for cosine via dot product
)

# Transform columns
name_vecs = vectorizer.transform(new_df['product_name'])
desc_vecs = vectorizer.transform(new_df['description'])

similarities = (name_vecs.multiply(desc_vecs)).sum(axis=1).A1  # faster row-wise dot product

# Add similarity scores
new_df['similarity'] = similarities

new_df


Unnamed: 0,product_name,description,similarity
0,myvi viva side bumper clipair panelwiper panel...,myvi viva side bumper clipair panelwiper panel...,1.000000
1,buy one get oneolaplex no7 bonding oil 30ml an...,key benefitsrepairs damaged and compromised ha...,0.287476
2,70mai s500 rearview dashcam wide with night vi...,70mai s500 rearview dashcam wide with night vi...,1.000000
3,tissues paper disposable cleaning cloths wipe ...,size50pcsroll the patterns are randomizedthe c...,0.291730
4,natural aloe vera gel 300g containing plant es...,natural aloe vera gel 300g containing plant es...,1.000000
...,...,...,...
999995,maylee 2 in 1 single fitted bedsheet set cadar...,what is inside this 2pcs set 1 pc of fitted be...,0.308697
999996,bullcaptain genuine leather mens wallet high q...,material cowhide made from the first layer of ...,0.303466
999997,ladies korean style loose long sleeve shirt top,style sweet and freshcollege clothing style de...,0.242091
999998,deli multi functional tools bag waist pouch be...,deli multi functional tools bag waist pouch be...,1.000000


In [19]:
new_df.loc[new_df['similarity'] == 0.0, 'description'] = new_df.loc[new_df['similarity'] == 0.0, 'product_name']

In [20]:
df['product_name'] = new_df['product_name']
df['description'] = new_df['description']

In [21]:
df.to_csv('transformed_data.csv', index=False)

-------------------------------

In [None]:
import pandas as pd
import numpy as np
import re
import os
import swifter
from sklearn.feature_extraction.text import HashingVectorizer
from IPython.display import display

# 1. Chunked CSV Reading
chunk_size = 50000
# chunks = pd.read_csv("Tyroo-dummy-data.csv", engine='pyarrow')
chunks = pd.read_csv("Tyroo-dummy-data.csv", chunksize=chunk_size)

def print_bad_rows(df):
    col_names = set(df.columns.astype(str))
    mask = df.astype(str).isin(col_names)
    bad_row_indices = mask.any(axis=1)
    bad_rows = df[bad_row_indices]
    print(f"Found {bad_rows.shape[0]} suspicious rows.")
    display(bad_rows)
    return bad_row_indices

# Process chunks and clean
df_list = []
for chunk in chunks:
    bad_row_indices = print_bad_rows(chunk)
    chunk = chunk[~bad_row_indices].reset_index(drop=True)
    df_list.append(chunk)

df = pd.concat(df_list, ignore_index=True)


# 2. Basic Data Cleanup
num_cols = [
    'platform_commission_rate', 'product_commission_rate',
    'bonus_commission_rate', 'promotion_price', 'current_price',
    'price', 'discount_percentage', 'number_of_reviews',
    'rating_avg_value', 'seller_rating'
]

df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce').fillna(0.0)

text_cols = [
    'venture_category3_name_en', 'venture_category2_name_en', 'venture_category1_name_en',
    'venture_category_name_local', 'brand_name', 'business_type', 'business_area',
    'product_name', 'seller_name'
]

df.fillna({
    'is_free_shipping': '0',
    'availability': 'out of stock',
    'deeplink': '',
    'product_url': '',
    'seller_url': '',
    'description': np.nan,
    **{col: 'Unknown' for col in text_cols}
}, inplace=True)

df['is_free_shipping'] = df['is_free_shipping'].astype(bool)
df['availability'] = df['availability'].map({'in stock': True, 'out of stock': False})

image_cols = [col for col in df.columns if 'img' in col or 'image_url' in col]
df[image_cols] = df[image_cols].fillna('')

df['description'] = df['description'].fillna(df['product_name'])


# 3. Clean Description
junk_patterns = [
    r'^(&nbsp;)+$', r'^-+$', r'^\.{1,2}$', r'^welcome to my shop.*$', r'^www.*$',
    r'^$', r'^No description currently*$', r'^no description currently*$', r'^_*$'
]
compiled_patterns = [re.compile(pat) for pat in junk_patterns]

def clean_description(text, product_name):
    if pd.isna(text):
        return product_name
    text = str(text).strip().lower()
    if any(p.match(text) for p in compiled_patterns):
        return product_name
    text = re.sub(r'&[a-z]+;', '', text)
    text = re.sub(r'\s{2,}', ' ', text).strip()
    if len(text) < 10:
        return product_name
    return text

df['description'] = df.swifter.apply(
    lambda row: clean_description(row['description'], row['product_name']),
    axis=1
)


# 4. Text Normalization
def fast_clean(series):
    return (
        series.fillna('')
              .str.encode('ascii', errors='ignore').str.decode('ascii')
              .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
              .str.replace(r'\s+', ' ', regex=True)
              .str.strip()
              .str.lower()
    )

df['product_name'] = fast_clean(df['product_name'])
df['description'] = fast_clean(df['description'])


# 5. Vectorize and Compare
vectorizer = HashingVectorizer(
    stop_words='english',
    n_features=2**10,
    alternate_sign=False,
    norm='l2'
)

name_vecs = vectorizer.transform(df['product_name'])
desc_vecs = vectorizer.transform(df['description'])

# Dot product row-wise
similarities = (name_vecs.multiply(desc_vecs)).sum(axis=1).A1

# Update description if similarity is zero
df['similarity'] = similarities
df.loc[df['similarity'] == 0.0, 'description'] = df.loc[df['similarity'] == 0.0, 'product_name']

# Drop similarity column
df.drop(columns=['similarity'], inplace=True)

# 6. Save Output
df.to_csv('transformed_data.csv', index=False)
print("✅ Data transformation complete and saved to 'transformed_data.csv'")


  for chunk in chunks:


Found 3 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price
5000,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price
10001,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price
15002,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Found 0 suspicious rows.


Unnamed: 0,platform_commission_rate,venture_category3_name_en,product_small_img,deeplink,availability,image_url_5,number_of_reviews,is_free_shipping,promotion_price,venture_category2_name_en,...,image_url_2,discount_percentage,seller_name,product_url,product_id,venture_category_name_local,rating_avg_value,product_big_img,image_url_3,price


Pandas Apply: 100%|██████████| 1000000/1000000 [00:33<00:00, 29732.31it/s]


✅ Data transformation complete and saved to 'transformed_data.csv'


In [None]:
import pandas as pd
import numpy as np
import re
import os
import swifter
from sklearn.feature_extraction.text import HashingVectorizer
from IPython.display import display


# 1. Chunked CSV Reading

chunk_size = 50000
# chunks = pd.read_csv("Tyroo-dummy-data.csv", engine='pyarrow')
chunks = pd.read_csv("Tyroo-dummy-data.csv", chunksize=chunk_size)

def print_bad_rows(df):
    col_names = set(df.columns.astype(str))
    mask = df.astype(str).isin(col_names)
    bad_row_indices = mask.any(axis=1)
    bad_rows = df[bad_row_indices]
    print(f"Found {bad_rows.shape[0]} suspicious rows.")
    display(bad_rows)
    return bad_row_indices

# Process chunks and clean
df_list = []
for chunk in chunks:
    bad_row_indices = print_bad_rows(chunk)
    chunk = chunk[~bad_row_indices].reset_index(drop=True)
    df_list.append(chunk)

df = pd.concat(df_list, ignore_index=True)


# 2. Basic Data Cleanup

num_cols = [
    'platform_commission_rate', 'product_commission_rate',
    'bonus_commission_rate', 'promotion_price', 'current_price',
    'price', 'discount_percentage', 'number_of_reviews',
    'rating_avg_value', 'seller_rating'
]

df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce').fillna(0.0)

text_cols = [
    'venture_category3_name_en', 'venture_category2_name_en', 'venture_category1_name_en',
    'venture_category_name_local', 'brand_name', 'business_type', 'business_area',
    'product_name', 'seller_name'
]

df.fillna({
    'is_free_shipping': '0',
    'availability': 'out of stock',
    'deeplink': '',
    'product_url': '',
    'seller_url': '',
    'description': np.nan,
    **{col: 'Unknown' for col in text_cols}
}, inplace=True)

df['is_free_shipping'] = df['is_free_shipping'].astype(bool)
df['availability'] = df['availability'].map({'in stock': True, 'out of stock': False})

image_cols = [col for col in df.columns if 'img' in col or 'image_url' in col]
df[image_cols] = df[image_cols].fillna('')

df['description'] = df['description'].fillna(df['product_name'])


# 3. Clean Description

junk_patterns = [
    r'^(&nbsp;)+$', r'^-+$', r'^\.{1,2}$', r'^welcome to my shop.*$', r'^www.*$',
    r'^$', r'^No description currently*$', r'^no description currently*$', r'^_*$'
]
compiled_patterns = [re.compile(pat) for pat in junk_patterns]

def clean_description(text, product_name):
    if pd.isna(text):
        return product_name
    text = str(text).strip().lower()
    if any(p.match(text) for p in compiled_patterns):
        return product_name
    text = re.sub(r'&[a-z]+;', '', text)
    text = re.sub(r'\s{2,}', ' ', text).strip()
    if len(text) < 10:
        return product_name
    return text

df['description'] = df.swifter.apply(
    lambda row: clean_description(row['description'], row['product_name']),
    axis=1
)


# 4. Text Normalization

def fast_clean(series):
    return (
        series.fillna('')
              .str.encode('ascii', errors='ignore').str.decode('ascii')
              .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
              .str.replace(r'\s+', ' ', regex=True)
              .str.strip()
              .str.lower()
    )

df['product_name'] = fast_clean(df['product_name'])
df['description'] = fast_clean(df['description'])


# 5. Vectorize and Compare

vectorizer = HashingVectorizer(
    stop_words='english',
    n_features=2**10,
    alternate_sign=False,
    norm='l2'
)

name_vecs = vectorizer.transform(df['product_name'])
desc_vecs = vectorizer.transform(df['description'])

# Dot product row-wise
similarities = (name_vecs.multiply(desc_vecs)).sum(axis=1).A1

# Update description if similarity is zero
df['similarity'] = similarities
df.loc[df['similarity'] == 0.0, 'description'] = df.loc[df['similarity'] == 0.0, 'product_name']

# Drop similarity column
df.drop(columns=['similarity'], inplace=True)


# 6. Save Output

df.to_csv('transformed_data.csv', index=False)
print("✅ Data transformation complete and saved to 'transformed_data.csv'")


In [None]:
import pandas as pd
import numpy as np
import re
import os
import multiprocessing
from sklearn.feature_extraction.text import HashingVectorizer


# 1. Chunked CSV Reading

chunk_size = 50000
chunks = pd.read_csv("Tyroo-dummy-data.csv", chunksize=chunk_size)

def print_bad_rows(df):
    col_names = set(df.columns.astype(str))
    mask = df.astype(str).isin(col_names)
    bad_row_indices = mask.any(axis=1)
    print(f"Found {bad_row_indices.sum()} suspicious rows.")
    return bad_row_indices

df_list = []
for chunk in chunks:
    bad_row_indices = print_bad_rows(chunk)
    chunk = chunk[~bad_row_indices].reset_index(drop=True)
    df_list.append(chunk)

df = pd.concat(df_list, ignore_index=True)


# 2. Basic Data Cleanup

num_cols = [
    'platform_commission_rate', 'product_commission_rate',
    'bonus_commission_rate', 'promotion_price', 'current_price',
    'price', 'discount_percentage', 'number_of_reviews',
    'rating_avg_value', 'seller_rating'
]

df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce').fillna(0.0)

text_cols = [
    'venture_category3_name_en', 'venture_category2_name_en', 'venture_category1_name_en',
    'venture_category_name_local', 'brand_name', 'business_type', 'business_area',
    'product_name', 'seller_name'
]

df.fillna({
    'is_free_shipping': '0',
    'availability': 'out of stock',
    'deeplink': '',
    'product_url': '',
    'seller_url': '',
    'description': np.nan,
    **{col: 'Unknown' for col in text_cols}
}, inplace=True)

df['is_free_shipping'] = df['is_free_shipping'].astype(bool)
df['availability'] = df['availability'].map({'in stock': True, 'out of stock': False})

image_cols = [col for col in df.columns if 'img' in col or 'image_url' in col]
df[image_cols] = df[image_cols].fillna('')

df['description'] = df['description'].fillna(df['product_name'])


# 3. Clean Description in Parallel

junk_patterns = [
    r'^(&nbsp;)+$', r'^-+$', r'^\.{1,2}$', r'^welcome to my shop.*$', r'^www.*$',
    r'^$', r'^No description currently*$', r'^no description currently*$', r'^_*$'
]
compiled_patterns = [re.compile(pat) for pat in junk_patterns]

def clean_single_desc(args):
    desc, product_name = args
    if pd.isna(desc):
        return product_name
    text = str(desc).strip().lower()
    if any(p.match(text) for p in compiled_patterns):
        return product_name
    text = re.sub(r'&[a-z]+;', '', text)
    text = re.sub(r'\s{2,}', ' ', text).strip()
    if len(text) < 10:
        return product_name
    return text

# Run in parallel using multiprocessing
with multiprocessing.Pool() as pool:
    cleaned_descriptions = pool.map(
        clean_single_desc,
        zip(df['description'], df['product_name'])
    )

df['description'] = cleaned_descriptions


# 4. Text Normalization

def fast_clean(series):
    return (
        series.fillna('')
              .str.encode('ascii', errors='ignore').str.decode('ascii')
              .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
              .str.replace(r'\s+', ' ', regex=True)
              .str.strip()
              .str.lower()
    )

df['product_name'] = fast_clean(df['product_name'])
df['description'] = fast_clean(df['description'])


# 5. Vectorize and Compare

vectorizer = HashingVectorizer(
    stop_words='english',
    n_features=2**10,
    alternate_sign=False,
    norm='l2'
)

name_vecs = vectorizer.transform(df['product_name'])
desc_vecs = vectorizer.transform(df['description'])

similarities = (name_vecs.multiply(desc_vecs)).sum(axis=1).A1

df['similarity'] = similarities
df.loc[df['similarity'] == 0.0, 'description'] = df.loc[df['similarity'] == 0.0, 'product_name']
df.drop(columns=['similarity'], inplace=True)


# 6. Save Output

df.to_csv('transformed_data.csv', index=False)
print("✅ Data transformation complete and saved to 'transformed_data.csv'")


  for chunk in chunks:


Found 3 suspicious rows.
Found 0 suspicious rows.
Found 0 suspicious rows.
Found 0 suspicious rows.
Found 0 suspicious rows.
Found 0 suspicious rows.
Found 0 suspicious rows.
Found 0 suspicious rows.
Found 0 suspicious rows.
Found 0 suspicious rows.
Found 0 suspicious rows.
Found 0 suspicious rows.
Found 0 suspicious rows.
Found 0 suspicious rows.
Found 0 suspicious rows.
Found 0 suspicious rows.
Found 0 suspicious rows.
Found 0 suspicious rows.
Found 0 suspicious rows.
Found 0 suspicious rows.
Found 0 suspicious rows.


In [None]:
import pandas as pd
import numpy as np
import re
import os
import multiprocessing
import logging
import time
from sklearn.feature_extraction.text import HashingVectorizer


# Logging Configuration

logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s: %(message)s',
    datefmt='%H:%M:%S'
)

def log_time(msg, start_time):
    elapsed = time.time() - start_time
    logging.info(f"{msg} — completed in {elapsed:.2f} seconds")


# 1. Chunked CSV Reading

start = time.time()
logging.info("🔍 Reading CSV in chunks...")
chunk_size = 50000
chunks = pd.read_csv("Tyroo-dummy-data.csv", chunksize=chunk_size)

def print_bad_rows(df):
    col_names = set(df.columns.astype(str))
    mask = df.astype(str).isin(col_names)
    bad_row_indices = mask.any(axis=1)
    logging.info(f"⚠️  Found {bad_row_indices.sum()} suspicious rows.")
    return bad_row_indices

df_list = []
for i, chunk in enumerate(chunks):
    logging.info(f"📦 Processing chunk {i + 1}")
    bad_row_indices = print_bad_rows(chunk)
    chunk = chunk[~bad_row_indices].reset_index(drop=True)
    df_list.append(chunk)

df = pd.concat(df_list, ignore_index=True)
log_time("✅ CSV loading and filtering", start)


# 2. Basic Data Cleanup

start = time.time()
logging.info("🧹 Starting basic cleanup...")

num_cols = [
    'platform_commission_rate', 'product_commission_rate',
    'bonus_commission_rate', 'promotion_price', 'current_price',
    'price', 'discount_percentage', 'number_of_reviews',
    'rating_avg_value', 'seller_rating'
]

df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce').fillna(0.0)

text_cols = [
    'venture_category3_name_en', 'venture_category2_name_en', 'venture_category1_name_en',
    'venture_category_name_local', 'brand_name', 'business_type', 'business_area',
    'product_name', 'seller_name'
]

df.fillna({
    'is_free_shipping': '0',
    'availability': 'out of stock',
    'deeplink': '',
    'product_url': '',
    'seller_url': '',
    'description': np.nan,
    **{col: 'Unknown' for col in text_cols}
}, inplace=True)

df['is_free_shipping'] = df['is_free_shipping'].astype(bool)
df['availability'] = df['availability'].map({'in stock': True, 'out of stock': False})

image_cols = [col for col in df.columns if 'img' in col or 'image_url' in col]
df[image_cols] = df[image_cols].fillna('')
df['description'] = df['description'].fillna(df['product_name'])

log_time("✅ Basic cleanup", start)


# 3. Clean Description in Parallel

start = time.time()
logging.info("🧼 Cleaning descriptions in parallel...")

junk_patterns = [
    r'^(&nbsp;)+$', r'^-+$', r'^\.{1,2}$', r'^welcome to my shop.*$', r'^www.*$',
    r'^$', r'^No description currently*$', r'^no description currently*$', r'^_*$'
]
compiled_patterns = [re.compile(pat) for pat in junk_patterns]

def clean_single_desc(args):
    desc, product_name = args
    if pd.isna(desc):
        return product_name
    text = str(desc).strip().lower()
    if any(p.match(text) for p in compiled_patterns):
        return product_name
    text = re.sub(r'&[a-z]+;', '', text)
    text = re.sub(r'\s{2,}', ' ', text).strip()
    if len(text) < 10:
        return product_name
    return text

import swifter

df['description'] = df.swifter.apply(
    lambda row: clean_single_desc((row['description'], row['product_name'])),
    axis=1
)

log_time("✅ Description cleaning", start)


# 4. Text Normalization

start = time.time()
logging.info("🔠 Normalizing text fields...")

def fast_clean(series):
    return (
        series.fillna('')
              .str.encode('ascii', errors='ignore').str.decode('ascii')
              .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
              .str.replace(r'\s+', ' ', regex=True)
              .str.strip()
              .str.lower()
    )

df['product_name'] = fast_clean(df['product_name'])
df['description'] = fast_clean(df['description'])

log_time("✅ Text normalization", start)


# 5. Vectorize and Compare

start = time.time()
logging.info("📊 Vectorizing and computing similarities...")

vectorizer = HashingVectorizer(
    stop_words='english',
    n_features=2**10,
    alternate_sign=False,
    norm='l2'
)

name_vecs = vectorizer.transform(df['product_name'])
desc_vecs = vectorizer.transform(df['description'])

similarities = (name_vecs.multiply(desc_vecs)).sum(axis=1).A1
df['similarity'] = similarities

df.loc[df['similarity'] == 0.0, 'description'] = df.loc[df['similarity'] == 0.0, 'product_name']
df.drop(columns=['similarity'], inplace=True)

log_time("✅ Similarity vectorization", start)


# 6. Save Output

start = time.time()
logging.info("💾 Saving final CSV...")

df.to_csv('transformed_data.csv', index=False)

log_time("✅ CSV saved to 'transformed_data.csv'", start)
logging.info("🎉 All steps completed successfully!")


[15:48:28] INFO: 🔍 Reading CSV in chunks...
  for i, chunk in enumerate(chunks):
[15:48:30] INFO: 📦 Processing chunk 1
[15:48:30] INFO: ⚠️  Found 3 suspicious rows.
[15:48:32] INFO: 📦 Processing chunk 2
[15:48:32] INFO: ⚠️  Found 0 suspicious rows.
[15:48:34] INFO: 📦 Processing chunk 3
[15:48:35] INFO: ⚠️  Found 0 suspicious rows.
[15:48:37] INFO: 📦 Processing chunk 4
[15:48:37] INFO: ⚠️  Found 0 suspicious rows.
[15:48:39] INFO: 📦 Processing chunk 5
[15:48:39] INFO: ⚠️  Found 0 suspicious rows.
[15:48:41] INFO: 📦 Processing chunk 6
[15:48:42] INFO: ⚠️  Found 0 suspicious rows.
[15:48:43] INFO: 📦 Processing chunk 7
[15:48:44] INFO: ⚠️  Found 0 suspicious rows.
[15:48:46] INFO: 📦 Processing chunk 8
[15:48:46] INFO: ⚠️  Found 0 suspicious rows.
[15:48:48] INFO: 📦 Processing chunk 9
[15:48:48] INFO: ⚠️  Found 0 suspicious rows.
[15:48:50] INFO: 📦 Processing chunk 10
[15:48:50] INFO: ⚠️  Found 0 suspicious rows.
[15:48:52] INFO: 📦 Processing chunk 11
[15:48:53] INFO: ⚠️  Found 0 suspicious

In [None]:
import pandas as pd
import numpy as np
import re
import os
import logging
import time
from sklearn.feature_extraction.text import HashingVectorizer
import swifter


# Logging Configuration

logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] %(levelname)s: %(message)s',
    datefmt='%H:%M:%S'
)

def log_time(msg, start_time):
    elapsed = time.time() - start_time
    logging.info(f"{msg} — completed in {elapsed:.2f} seconds")


# 1. Chunked CSV Reading

start = time.time()
logging.info("🔍 Reading CSV in chunks...")
chunk_size = 50000
chunks = pd.read_csv("Tyroo-dummy-data.csv", chunksize=chunk_size)

def print_bad_rows(df):
    col_names = set(df.columns.astype(str))
    mask = df.astype(str).isin(col_names)
    bad_row_indices = mask.any(axis=1)
    logging.info(f"⚠️  Found {bad_row_indices.sum()} suspicious rows.")
    return bad_row_indices

df_list = []
for i, chunk in enumerate(chunks):
    logging.info(f"📦 Processing chunk {i + 1}")
    bad_row_indices = print_bad_rows(chunk)
    chunk = chunk[~bad_row_indices].reset_index(drop=True)
    df_list.append(chunk)

df = pd.concat(df_list, ignore_index=True)
log_time("✅ CSV loading and filtering", start)


# 2. Basic Data Cleanup

start = time.time()
logging.info("🧹 Starting basic cleanup...")

num_cols = [
    'platform_commission_rate', 'product_commission_rate',
    'bonus_commission_rate', 'promotion_price', 'current_price',
    'price', 'discount_percentage', 'number_of_reviews',
    'rating_avg_value', 'seller_rating'
]

df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce').fillna(0.0)

text_cols = [
    'venture_category3_name_en', 'venture_category2_name_en', 'venture_category1_name_en',
    'venture_category_name_local', 'brand_name', 'business_type', 'business_area',
    'product_name', 'seller_name'
]

df.fillna({
    'is_free_shipping': '0',
    'availability': 'out of stock',
    'deeplink': '',
    'product_url': '',
    'seller_url': '',
    'description': np.nan,
    **{col: 'Unknown' for col in text_cols}
}, inplace=True)

df['is_free_shipping'] = df['is_free_shipping'].astype(bool)
df['availability'] = df['availability'].map({'in stock': True, 'out of stock': False})

image_cols = [col for col in df.columns if 'img' in col or 'image_url' in col]
df[image_cols] = df[image_cols].fillna('')
df['description'] = df['description'].fillna(df['product_name'])

log_time("✅ Basic cleanup", start)


# 3. Clean Description

start = time.time()
logging.info("🧼 Cleaning descriptions with vectorized filter + swifter...")

# Vectorized junk description detection
junk_regex = r'^(&nbsp;)+$|^-+$|^\.{1,2}$|^welcome to my shop.*$|^www.*$|^$|^no description currently.*$|^_*$'
desc_str = df['description'].fillna('').str.strip().str.lower()
junk_mask = desc_str.str.match(junk_regex)

# Fast replace with product_name
df.loc[junk_mask, 'description'] = df.loc[junk_mask, 'product_name']

# Clean remaining descriptions
def clean_single_desc(row):
    desc = row['description']
    product_name = row['product_name']
    if pd.isna(desc):
        return product_name
    text = str(desc).strip().lower()
    text = re.sub(r'&[a-z]+;', '', text)
    text = re.sub(r'\s{2,}', ' ', text).strip()
    if len(text) < 10:
        return product_name
    return text

df['description'] = df.swifter.apply(clean_single_desc, axis=1)

log_time("✅ Description cleaning", start)


# 4. Text Normalization

start = time.time()
logging.info("🔠 Normalizing text fields...")

def fast_clean(series):
    return (
        series.fillna('')
              .str.encode('ascii', errors='ignore').str.decode('ascii')
              .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
              .str.replace(r'\s+', ' ', regex=True)
              .str.strip()
              .str.lower()
    )

df['product_name'] = fast_clean(df['product_name'])
df['description'] = fast_clean(df['description'])

log_time("✅ Text normalization", start)


# 5. Vectorize and Compare

start = time.time()
logging.info("📊 Vectorizing and computing similarities...")

vectorizer = HashingVectorizer(
    stop_words='english',
    n_features=2**10,
    alternate_sign=False,
    norm='l2'
)

name_vecs = vectorizer.transform(df['product_name'])
desc_vecs = vectorizer.transform(df['description'])

similarities = (name_vecs.multiply(desc_vecs)).sum(axis=1).A1
df['similarity'] = similarities

df.loc[df['similarity'] == 0.0, 'description'] = df.loc[df['similarity'] == 0.0, 'product_name']
df.drop(columns=['similarity'], inplace=True)

log_time("✅ Similarity vectorization", start)


# 6. Save Output

import sqlite3

start = time.time()
logging.info("💾 Saving final DataFrame to SQLite...")

# Connect to (or create) SQLite DB
conn = sqlite3.connect("transformed_data.db")

# Save DataFrame to table
df.to_sql("products_cleaned", conn, if_exists="replace", index=False)

conn.close()
log_time("✅ Data saved to SQLite table 'products_cleaned'", start)

logging.info("🎉 All steps completed successfully!")


  from .autonotebook import tqdm as notebook_tqdm
[16:01:46] INFO: 🔍 Reading CSV in chunks...
  for i, chunk in enumerate(chunks):
[16:01:48] INFO: 📦 Processing chunk 1
[16:01:48] INFO: ⚠️  Found 3 suspicious rows.
[16:01:50] INFO: 📦 Processing chunk 2
[16:01:50] INFO: ⚠️  Found 0 suspicious rows.
[16:01:52] INFO: 📦 Processing chunk 3
[16:01:53] INFO: ⚠️  Found 0 suspicious rows.
[16:01:55] INFO: 📦 Processing chunk 4
[16:01:55] INFO: ⚠️  Found 0 suspicious rows.
[16:01:57] INFO: 📦 Processing chunk 5
[16:01:57] INFO: ⚠️  Found 0 suspicious rows.
[16:01:59] INFO: 📦 Processing chunk 6
[16:02:00] INFO: ⚠️  Found 0 suspicious rows.
[16:02:02] INFO: 📦 Processing chunk 7
[16:02:02] INFO: ⚠️  Found 0 suspicious rows.
[16:02:04] INFO: 📦 Processing chunk 8
[16:02:04] INFO: ⚠️  Found 0 suspicious rows.
[16:02:06] INFO: 📦 Processing chunk 9
[16:02:07] INFO: ⚠️  Found 0 suspicious rows.
[16:02:09] INFO: 📦 Processing chunk 10
[16:02:09] INFO: ⚠️  Found 0 suspicious rows.
[16:02:11] INFO: 📦 Processin

In [2]:
import sqlite3
import pandas as pd

# Connect to the SQLite database
conn = sqlite3.connect("transformed_data.db")

# Show all tables
tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table';", conn)
print("Tables:", tables)

# Load table into DataFrame
df = pd.read_sql_query("SELECT * FROM products_cleaned LIMIT 5;", conn)
print("🔍 Sample rows:")
print(df)

# Get row count
row_count = pd.read_sql_query("SELECT COUNT(*) as count FROM products_cleaned;", conn)
print(f"📦 Total rows: {row_count['count'][0]}")

# Get column stats (for numeric columns)
stats = pd.read_sql_query("SELECT * FROM products_cleaned LIMIT 100000;", conn).describe()
print("📊 Basic stats:")
print(stats)

conn.close()


Tables:                name
0  products_cleaned
🔍 Sample rows:
   platform_commission_rate venture_category3_name_en  \
0                      0.07            Parts & Spares   
1                      0.10                 Hair Care   
2                      0.07               Electronics   
3                      0.07        Disposable Napkins   
4                      0.10           Serum & Essence   

                                   product_small_img  \
0  https://my-live.slatic.net/p/98b201dcb23a6f15c...   
1  https://my-live.slatic.net/p/90097ebc33ddd0641...   
2  https://my-live.slatic.net/p/35c1806bcb2b6895f...   
3  https://my-live.slatic.net/p/06e24c1ead0e47cb3...   
4  https://my-live.slatic.net/p/e17a59e38fc2418f8...   

                                            deeplink  availability  \
0  lazada://my/d?uri=https://www.lazada.com.my/pr...             1   
1  lazada://my/d?uri=https://www.lazada.com.my/pr...             1   
2  lazada://my/d?uri=https://www.lazada.com.my/

In [4]:
#sqlite3 transformed_data.db

In [5]:
# .schema products_cleaned