In [33]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
def preprocess_text(text):
    # convert to lowercase
    text = text.lower()
    # remove punctuation and digits
    text = "".join(char for char in text if char.isalpha() or char.isspace())
    # remove stopwords
    stopwords = ["a", "an", "the"]
    words = text.split()
    words = [word for word in words if word not in stopwords]
    # join words back to text
    text = " ".join(words)
    return text

In [35]:
def calculate_similarity(tfidf_matrix, index):
    # calculate cosine similarity between the product at the given index and all other products
    similarity_scores = cosine_similarity(tfidf_matrix[index:index+1], tfidf_matrix).flatten()
    # sort the similarity scores in descending order and get the indices of the top 25 most similar products
    top_indices = similarity_scores.argsort()[::-1][1:26]
    # return the top 25 most similar products
    return top_indices

In [36]:
# load the data from a CSV file
df = pd.read_csv("C:\\Users\\20100\\Desktop\\graduation project\\ML Model\\furniture_store - Copy_db.csv",
                 usecols=["product_category","product_name","product_description"])

# preprocess the product description column
df["product_description"] = df["product_description"].apply(preprocess_text)

# create a tf-idf vectorizer and fit it on the product description column
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["product_description"])

In [37]:
def get_related_products(product_name):
    # preprocess the input product name
    product_name = preprocess_text(product_name)
    # transform the input product name to a tf-idf vector
    product_vector = vectorizer.transform([product_name])
    # calculate cosine similarity between the input product and all other products
    similarity_scores = cosine_similarity(product_vector, tfidf_matrix).flatten()
    # sort the similarity scores in descending order and get the indices of the top 25 most similar products
    top_indices = similarity_scores.argsort()[::-1][:25]
    # create a dataframe containing the top 25 most similar products
    related_products = df.iloc[top_indices][["product_category", "product_name", "product_description"]]
    # reset the index of the dataframe and rename the index column
    related_products = related_products.reset_index(drop=True)
    related_products.index.name = "rank"
    # return the dataframe
    return related_products

In [38]:
get_related_products("BRIMNES")

Unnamed: 0_level_0,product_category,product_name,product_description
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Wardrobes,PAX / MEHAMN/AULI,wardrobe combination xx cm
1,Cabinets & cupboards,LIXHULT,storage combination x cm
2,Cabinets & cupboards,GALANT,cabinet with doors x cm
3,Cabinets & cupboards,HÄLLAN,storage combination with doors xx cm
4,Cabinets & cupboards,HÄLLAN,storage combination with doors xx cm
5,Cabinets & cupboards,BESTÅ,frame xx cm
6,Cabinets & cupboards,FABRIKÖR,glassdoor cabinet x cm
7,Cabinets & cupboards,EKET,cabinet combination with feet xx cm
8,Cabinets & cupboards,IDÅSEN,high cabinet with drawer and doors x cm
9,Cabinets & cupboards,HAVSTA,glassdoor cabinet with plinth xx cm


In [44]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the data
data = pd.read_csv("C:\\Users\\20100\\Desktop\\graduation project\\ML Model\\furniture_store - Copy_db.csv",
                 usecols=["product_category","product_name","product_description"]) # change the filename to match your data

# Define the content-based filtering algorithm
tfidf = TfidfVectorizer(stop_words='english')
data['product_description'] = data['product_description'].fillna('')
tfidf_matrix = tfidf.fit_transform(data['product_description'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def get_related_products(product_name, num_products=25):
    # Find the index of the input product
    idx = data[data['product_name']==product_name].index[0]

    # Get the cosine similarity scores of all products to the input product
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the products by their similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the top num_products similar products
    sim_indices = [i[0] for i in sim_scores[1:num_products+1]]

    # Return the top num_products similar products
    return data.iloc[sim_indices][['product_category', 'product_name', 'product_description']]

# Example usage
related_products = get_related_products('INGOLF')
related_products

Unnamed: 0,product_category,product_name,product_description
12,Bar furniture,RÅSKOG,"Bar stool, 63 cm"
21,Bar furniture,RÅSKOG,"Bar stool, 63 cm"
2,Bar furniture,FRANKLIN,"Bar stool with backrest, foldable, ..."
3,Bar furniture,FRANKLIN,"Bar stool with backrest, foldable, ..."
5,Bar furniture,INGOLF,"Bar stool with backrest, 74 cm"
6,Bar furniture,NORRARYD,"Bar stool with backrest, 74 cm"
16,Bar furniture,HENRIKSDAL,"Bar stool with backrest, 74 cm"
17,Bar furniture,HENRIKSDAL,"Bar stool with backrest, 74 cm"
18,Bar furniture,NORRÅKER,"Bar stool with backrest, 74 cm"
675,Chairs,STIG,"Bar stool with backrest, 74 cm"
