In [2]:
import pandas as pd
from fuzzywuzzy import process
# Location of the product CSV; the leading "./" makes it relative to the
# current working directory of the kernel.
PRODUCT_PATH = "./Data/Product_Information_Dataset.csv"
# Product table loaded once at notebook start; search_products() reads this
# module-level frame directly.
df_product = pd.read_csv(PRODUCT_PATH)

In [10]:
def fuzzy_search(df, query, column, limit=10):
    """Return the rows of ``df`` whose ``column`` value best fuzzy-matches ``query``.

    Args:
        df: DataFrame to search.
        query: Text to compare against the column values.
        column: Name of the column in ``df`` whose values are matched.
        limit: Maximum number of matches requested from ``process.extract``.

    Returns:
        A copy of the matched rows, in the order ``process.extract`` ranked
        them, with an added ``score`` column holding each row's match score.
    """
    # Key every value by its row position. Given a mapping, process.extract
    # returns (value, score, key) triples, so each match points back at its
    # own row even when several rows contain the same text. Looking the text
    # up with list.index() would always resolve to the first such row (and
    # rescan the whole list once per match).
    choices = dict(enumerate(df[column].tolist()))

    # Perform fuzzy matching
    results = process.extract(query, choices, limit=limit)

    # Pull the matched rows by position and attach their scores
    positions = [position for _, _, position in results]
    matched_df = df.iloc[positions].copy()
    matched_df["score"] = [score for _, score, _ in results]

    return matched_df


def search_products(
    query: str,
    sort_column: str = "average_rating",
    sort_order: str = "desc",
    limit: int = 5,
):
    """Fuzzy-search the product table and return the best matches as records.

    Every row of the module-level ``df_product`` is scored against ``query``
    once per searched column. Each product keeps only its best score, the
    ``limit`` best-scoring products are selected, and those are then ordered
    by ``sort_column``.

    Args:
        query: Text to search for.
        sort_column: Column used to order the final results.
        sort_order: ``"asc"`` for ascending; any other value sorts descending.
        limit: Maximum number of products to return.

    Returns:
        A list of dicts (one per product, including its ``score``) with
        missing values replaced by empty strings.
    """
    # Perform fuzzy search on multiple columns
    columns_to_search = [
        "title",
        "description",
        "main_category",
        # "features",
        # "categories",
        # "details",
    ]

    # Ask for as many matches as there are rows so every row gets a score in
    # every column, and build the combined frame with a single concat.
    per_column_matches = [
        fuzzy_search(df_product, query, column, limit=len(df_product))
        for column in columns_to_search
    ]
    combined_matches = pd.concat(per_column_matches).drop_duplicates()

    # Sort the combined matches by score in descending order (stable, so ties
    # keep a deterministic order).
    combined_matches = combined_matches.sort_values(
        by="score", ascending=False, kind="stable"
    )

    # A row matched in several columns appears once per column with a
    # different score, which drop_duplicates() cannot collapse. Keep only the
    # best-scoring hit per source row so one product cannot occupy several
    # result slots. Rows are identified by their index label, which is unique
    # for a frame freshly loaded with read_csv.
    combined_matches = combined_matches[
        ~combined_matches.index.duplicated(keep="first")
    ]

    # Limit the results
    combined_matches = combined_matches.head(limit)

    # Sort the combined matches based on the specified column and order
    top_results = combined_matches.sort_values(
        by=sort_column, ascending=(sort_order == "asc")
    )
    # Fill NaN values to avoid JSON serialization issues
    top_results = top_results.fillna("")

    return top_results.to_dict(orient="records")

In [11]:
# Example run: the ten best fuzzy matches, cheapest first.
# Note from the original author: "BOYA BYM1 Microphone doesn't find it".
query, sort_column, sort_order, limit = "BOYA BYM1", "price", "asc", 10

results = search_products(
    query=query,
    sort_column=sort_column,
    sort_order=sort_order,
    limit=limit,
)

# Persist the hits to a CSV file for inspection
results_df = pd.DataFrame(results)
results_df.to_csv('search_results.csv', index=False)

In [67]:
# Toy check of how process.extract ranks a short word list against a
# misspelled query.
# NOTE: this rebinds the notebook-level `query` and `results` names that the
# product-search cell also assigns.
choices = ['horse', 'dog', 'fish', 'cat']
query = 'catdf'
# limit=5 exceeds the four choices, so every choice comes back with its score.
results = process.extract(query, choices, limit=5)
print(results)

[('cat', 90), ('dog', 36), ('fish', 22), ('horse', 0)]
