In [4]:
%pip install pyarabic

from pyarabic import araby 
def normalize_text(text):
    text = araby.normalize_hamza(text)
    text = araby.normalize_ligature(text)
    text = araby.normalize_alef(text)  # Convert أ إ آ to ا
    text = araby.normalize_teh(text)  # Convert ة to ه

    
    
    return text

import re

def clean_text(text):
    text = re.sub(r'[!"#%\'()*+,./:;<=>?@[\\]^_`{|}~]', ' ', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text



Note: you may need to restart the kernel to use updated packages.


In [5]:

import pandas as pd
def read_product_names(file_path):
    df = pd.read_excel(file_path)
    product_names = df['product_name_ar'].tolist()
    return product_names

file_path = './Masterfile.xlsx'
product_names = read_product_names(file_path)
print(product_names)

['إندرال 10 مجم 50 قرص', 'إندرال 40 مجم 50 قرص', 'ابتاميل 1 لبن 400 جرام', 'ابل-لايت 30 قرص', 'ابيدرا سولوستار 100 وحدة دولية/مل 5 اقلام', 'ابيدون شراب 125 مل', 'ابيكسيدون 1 مجم 20 قرص', 'ابيكسيدون 1مجم/مل شراب 100 مل', 'ابيكسيدون 3 مجم 30 قرص', 'ابيكسيدون 4 مجم 30 قرص', 'ابيكوبرد 20 مجم 20 قرص سريع الذوبان بالفم', 'ابيكوبرد 5 مجم 30 قرص سريع الذوبان بالفم', 'ابيكوتيل 20 مجم 10 قرص', 'ابيمول 300 مجم 5 اقماع', 'ابيمول 500 مجم 20 قرص', 'اتاكاند 16 مجم 14 قرص', 'اتاكاند 4 مجم 14 قرص', 'اتاكاند 8 مجم 14 قرص', 'اتاكاند بلس 16/12.5مجم 14 قرص', 'اترابكس 100 مجم 4 كبسول', 'اتور 10 مجم 7 قرص', 'اتور 20 مجم 10 قرص', 'اتور 40 مجم 10 قرص', 'اتورستات 10 مجم 14 قرص', 'اتورستات 20 مجم 14 قرص', 'اتورستات 40 مجم 14 قرص', 'اتوريزا 10/10 مجم 28 قرص', 'اتوريزا 10/20 مجم 21 قرص', 'اتوريزا 10/40 مجم 28 قرص', 'اجركس 75 مجم 60 قرص', 'ادابالين 0.1% جيل 30 جم', 'ادويفلام 75مجم/3مل 6 امبول', 'اديمكس 1 مجم 20 قرص', 'ار اكس كريم مساج 50 جم', 'ارث فرى 20 مجم 30 قرص', 'ارثروفاست 150 مجم 14 قرص', 'اركوكسيا 90 مجم 14 

In [6]:
def filter_product_names(master_file_path, filter_file_path, output_file_path):
    # Read product names from master file
    master_df = pd.read_excel(master_file_path)
    master_product_names = master_df['product_name_ar'].tolist()
    
    # Read product names from filter file
    filter_df = pd.read_excel(filter_file_path)
    filter_product_names = filter_df['product_name_ar'].tolist()
    
    # Filter out names that are in the filter file
    filtered_names = [name for name in master_product_names if name not in filter_product_names]
    
    # Create a new DataFrame with the filtered names
    filtered_df = pd.DataFrame({'product_name_ar': filtered_names})
    
    # Write the filtered names to a new Excel file
    filtered_df.to_excel(output_file_path, index=False)

# Example usage
file_path = './Masterfile.xlsx'
filter_file_path = './filter.xlsx'
output_file_path = './filtered_masterfile.xlsx'
filter_product_names(file_path, filter_file_path, output_file_path)

In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from openpyxl import load_workbook


# Function to process each chunk
def process_chunk(chunk_df):
    if 'seller_item_name' not in chunk_df.columns or 'marketplace_product_name_ar' not in chunk_df.columns:
        raise ValueError("Required columns are missing from the dataset")

    # Normalize and clean text
    chunk_df['seller_item_name'] = chunk_df['seller_item_name'].apply(normalize_text).apply(clean_text)
    chunk_df['marketplace_product_name_ar'] = chunk_df['marketplace_product_name_ar'].apply(normalize_text).apply(clean_text)

    # Compute TF-IDF and cosine similarity
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(chunk_df['seller_item_name'])
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Create negative pairs
    for idx, row in chunk_df.iterrows():
        current_index = idx
        sim_scores = list(enumerate(cosine_sim[current_index]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        
        # Select non-matching items with moderate similarity
        top_matches = [i for i in sim_scores if chunk_df.iloc[i[0]]['marketplace_product_name_ar'] != row['marketplace_product_name_ar']][1:5]  # Skip the first match (itself)
        
        for match in top_matches:
            chunk_df.at[match[0], 'match'] = 0  # Mark as negative pair

    return chunk_df

# Read the Dataset.xlsx file in chunks
dataset_path = './Dataset.xlsx'
chunksize = 1000  # Adjust the chunk size based on your memory constraints

# Initialize an empty DataFrame to store the results
augmented_df = pd.DataFrame()

# Load the workbook and get the sheet
wb = load_workbook(filename=dataset_path, read_only=True)
ws = wb.active

# Read the header
header = [cell.value for cell in next(ws.iter_rows(min_row=1, max_row=1))]

# Process the file in chunks
chunk = []
for row in ws.iter_rows(min_row=2, values_only=True):
    chunk.append(row)
    if len(chunk) == chunksize:
        chunk_df = pd.DataFrame(chunk, columns=header)
        processed_chunk = process_chunk(chunk_df)
        augmented_df = pd.concat([augmented_df, processed_chunk])
        chunk = []

# Process any remaining rows
if chunk:
    chunk_df = pd.DataFrame(chunk, columns=header)
    processed_chunk = process_chunk(chunk_df)
    augmented_df = pd.concat([augmented_df, processed_chunk])

# Save the augmented dataset to a new Excel file
augmented_dataset_path = './Augmented_Dataset.xlsx'
augmented_df.to_excel(augmented_dataset_path, index=False)