In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings('ignore')
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
import kagglehub

# Download the latest version of the dataset via kagglehub; `path` holds the
# location returned by the download call.
path = kagglehub.dataset_download("atharvjairath/flipkart-ecommerce-dataset")

print("Path to dataset files:", path)

In [None]:
# Load the product catalogue.
# The Kaggle input path only exists inside a Kaggle kernel, so fall back to the
# directory returned by kagglehub (`path`) when running elsewhere.
# NOTE(review): assumes the CSV sits at the top level of the downloaded
# dataset directory, as it does under /kaggle/input - confirm.
DATA_FILE = "flipkart_com-ecommerce_sample.csv"
kaggle_csv = os.path.join("/kaggle/input/flipkart-ecommerce-dataset", DATA_FILE)
csv_path = kaggle_csv if os.path.exists(kaggle_csv) else os.path.join(path, DATA_FILE)
df = pd.read_csv(csv_path)
df.info()

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Count missing values per column before any imputation.
df.isnull().sum()

### Handling missing values

In [None]:
# Peek at the first few values of the brand column.
df["brand"].head()

In [None]:
# Frequency of each brand value (value_counts skips missing values by default).
df["brand"].value_counts()

In [None]:
# Rows without an image get a fixed placeholder list of four image URLs.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df['image'] operates on an intermediate Series and does not update `df`
# under pandas Copy-on-Write (and the warning is silenced in this notebook).
df['image'] = df['image'].fillna('["http://img5a.flixcart.com/image/short/u/4/a/altht-3p-21-alisha-38-original-imaeh2d5vm5zbtgg.jpeg", "http://img5a.flixcart.com/image/short/p/j/z/altght4p-26-alisha-38-original-imaeh2d5kbufss6n.jpeg", "http://img5a.flixcart.com/image/short/p/j/z/altght4p-26-alisha-38-original-imaeh2d5npdybzyt.jpeg", "http://img5a.flixcart.com/image/short/z/j/7/altght-7-alisha-38-original-imaeh2d5jsz2ghd6.jpeg"]')

In [None]:
# Fill missing text fields. Each result is assigned back to its column because
# fillna(inplace=True) on a column selection does not reliably modify `df`
# (it is a chained in-place call on an intermediate Series).
df['product_specifications'] = df['product_specifications'].fillna("")
df['description'] = df['description'].fillna("")
# NOTE(review): "Allure Auto" is used as the default brand - presumably the
# most frequent value from the value_counts cell; confirm this is intended.
df["brand"] = df["brand"].fillna("Allure Auto")

In [None]:
# Summary statistics of the retail price, inspected before imputing it.
df['retail_price'].describe()

In [None]:
# Impute missing prices with the column mean. The result is assigned back
# rather than using fillna(inplace=True) on the column selection, which does
# not update `df` under pandas Copy-on-Write.
df['retail_price'] = df['retail_price'].fillna(df['retail_price'].mean())
df['discounted_price'] = df['discounted_price'].fillna(df['discounted_price'].mean())

In [None]:
# Re-check missing values per column after the fills above.
df.isnull().sum()

In [None]:
# (rows, columns) before dropping any remaining incomplete rows.
df.shape

In [None]:
# Drop rows that still contain missing values, then renumber the index so the
# index labels equal row positions. Later cells build a positional similarity
# matrix from `df` and look rows up through the index; without the reset, any
# dropped row would make labels and positions disagree.
df = df.dropna().reset_index(drop=True)
df.shape

## Data Processing 

In [None]:
# Preview the frame after missing-value handling.
df.head()

In [None]:
# Distribution of the raw overall_rating values.
df['overall_rating'].value_counts()

In [None]:
# Swap the textual "no rating" marker for 0 in both rating columns.
for rating_col in ('overall_rating', 'product_rating'):
    df[rating_col] = df[rating_col].replace('No rating available', 0)

In [None]:
# Preview the frame after the rating placeholder replacement.
df.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer configured with the built-in English stop-word list.
tf = TfidfVectorizer(stop_words='english')

In [None]:
# The product descriptions are the text corpus for the TF-IDF model.
documents = df['description']
documents

In [None]:
# Fit the vectorizer on the descriptions and transform them in one step;
# row i of the result corresponds to the i-th entry of `documents`.
tfidf_matrix = tf.fit_transform(documents)


In [None]:

from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between every pair of product descriptions.
# With a single argument the matrix is compared against itself.
# NOTE(review): the result is a dense (n_products x n_products) array, so
# memory grows quadratically with the number of rows.
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim

In [None]:
# (number of documents, vocabulary size) of the TF-IDF matrix.
tfidf_matrix.shape

In [None]:
# Wrap the similarity matrix in a DataFrame purely for a readable preview.
sim_df = pd.DataFrame(cosine_sim)

sim_df.head(10)

In [None]:

# Lookup table: product name -> row POSITION in `df` (and therefore in
# `cosine_sim`, whose rows follow the order of `df`). Positions are stored
# instead of index labels because the similarity matrix is indexed
# positionally. When a name occurs more than once only its first row is kept,
# so a lookup always yields a single integer.
indices = pd.Series(np.arange(len(df)), index=df['product_name'])
indices = indices[~indices.index.duplicated(keep='first')]
indices

## Product recommendation

### 1. Recommendation based on a purchased product

In [None]:

def recommend_similar_products(product_name, cosine_sim = cosine_sim, top_n = 10):
    """Return the names of the products most similar to ``product_name``.

    Parameters
    ----------
    product_name : str
        Name to look up in the module-level ``indices`` Series.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` must correspond to row ``i`` of
        the module-level ``df``. The default is bound when this cell is run.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Up to ``top_n`` product names, most similar first. The queried row
        itself is never included.

    Raises
    ------
    KeyError
        If ``product_name`` is not present in ``indices``.
    """
    idx = indices[product_name]
    # A duplicated product name makes the lookup return a Series; use its
    # first entry so that `idx` is always a single row number.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    row_scores = np.asarray(cosine_sim[idx]).ravel()
    # Stable sort on the negated scores: descending similarity, ties kept in
    # their original row order.
    order = np.argsort(-row_scores, kind='stable')
    # Exclude the queried row explicitly; it is not guaranteed to be first
    # when other rows have an identical description (score 1.0).
    order = order[order != idx][:top_n]
    return df['product_name'].iloc[order].tolist()

In [None]:
# Unique product names (first occurrence kept), still carrying df's index.
product_list = df['product_name'].drop_duplicates()
product_list.head(5)

In [None]:
import random
# Pick one of the first (up to) 101 unique products at random.
# .iloc selects by position: a plain product_list[n] is a label lookup and
# raises KeyError whenever label n was removed by dropna/drop_duplicates.
random_number = random.randint(0, min(100, len(product_list) - 1))
random_product_name = product_list.iloc[random_number]
# To try a specific product instead, uncomment:
#random_product_name = "Sicons All Purpose Arnica Dog Shampoo"
random_product_name

In [None]:
# Example: print the names recommended for the randomly chosen product.
print("Below are the recommendations for the product -",  random_product_name, "\n")
print(recommend_similar_products(random_product_name))

### 2. Search-based recommendation

In [None]:

def find_similar_products(search_term, data_frame, top_n=25):
    """Rank products by TF-IDF cosine similarity between a search term and
    their descriptions.

    Parameters
    ----------
    search_term : str
        Free-text query.
    data_frame : pandas.DataFrame
        Must contain ``description`` and ``product_name`` columns. It is not
        modified.
    top_n : int, default 25
        Maximum number of product names to return.

    Returns
    -------
    pandas.Series
        Product names, best match first, with a fresh 0..k-1 index. Products
        with zero similarity are omitted, so fewer than ``top_n`` names (or
        none at all) may be returned.
    """
    descriptions = data_frame['description'].fillna('').str.lower()
    # The vectorizer is refitted on every call so the vocabulary always
    # matches the frame that was passed in.
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(descriptions)
    search_vector = vectorizer.transform([search_term.lower()])
    similarity_scores = cosine_similarity(search_vector, tfidf_matrix).flatten()

    # Score on a separate frame instead of writing a 'similarity_score'
    # column into the caller's data_frame.
    scored = data_frame[['product_name']].assign(similarity_score=similarity_scores)
    # Rows sharing no term with the query are not matches at all.
    scored = scored[scored['similarity_score'] > 0]
    ranked = (
        scored
        .sort_values(by='similarity_score', ascending=False, kind='stable')
        # De-duplicate on the product name: equal float scores do not imply
        # that two rows are the same product.
        .drop_duplicates(subset='product_name', keep='first')
        .head(top_n)
        .reset_index(drop=True)
    )
    return ranked['product_name']

In [None]:
# Example search: product names whose descriptions best match 'shoes'.
similar_products = find_similar_products('shoes', df)
similar_products.to_list()

## Recently trending products
#### Trend over the last 2 months


In [None]:
import re
# A number (optionally with decimals) followed by a unit. Matching is
# case-insensitive so variants such as "mL", "Ml" or "tablet" are recognised.
_QUANTITY_PATTERN = re.compile(
    r'(\d+\.?\d*)\s*(mg|ml|l|tablet)(?:\b|$)',
    re.IGNORECASE,
)


def process_row(text):
    """Split a product string into name, numeric quantity and unit.

    Parameters
    ----------
    text : str
        Product text that may contain a quantity such as ``"500 mg"``.
        Non-string values (for example NaN) are passed through unchanged.

    Returns
    -------
    pandas.Series
        Three entries: ``[name, value, unit]``. ``value`` is a float and
        ``unit`` a lower-case string taken from the first quantity found.
        When no quantity is found, ``value`` is ``None`` and ``unit`` is
        ``''``. Every quantity occurrence is removed from ``name``.
    """
    # re.search raises TypeError on non-strings (e.g. NaN from a missing cell).
    if not isinstance(text, str):
        return pd.Series([text, None, ''])

    match = _QUANTITY_PATTERN.search(text)
    if match is None:
        # If no quantity found, return original text as name
        return pd.Series([text, None, ''])

    value, unit = match.groups()
    # Remove the quantity part(s), then collapse the whitespace left behind.
    name = _QUANTITY_PATTERN.sub('', text)
    name = re.sub(r'\s+', ' ', name).strip()
    # Standardize unit to lowercase
    return pd.Series([name, float(value), unit.lower()])


In [None]:
from datetime import datetime, timedelta

def find_trending_products(df, days=62):
    """Count how often each product appears in the most recent time window.

    The window ends at the latest ``crawl_timestamp`` found in ``df`` and
    reaches back ``days`` days (inclusive).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``crawl_timestamp`` (anything ``pd.to_datetime`` can
        parse) and ``product_name`` columns. It is not modified.
    days : int, default 62
        Length of the look-back window in days (roughly two months).

    Returns
    -------
    pandas.Series
        Occurrence count per product name within the window, highest first.
    """
    # Parse into a local Series; the caller's column is left untouched.
    crawl_times = pd.to_datetime(df['crawl_timestamp'])
    window_start = crawl_times.max() - timedelta(days=days)
    recent_names = df.loc[crawl_times >= window_start, 'product_name']
    # value_counts already returns the counts in descending order.
    return recent_names.value_counts()

In [None]:
# Try the trending-product helper on the full frame and show the counts.
trending_products = find_trending_products(df)
trending_products

In [None]:
# Preview the frame again after the recommendation and trend cells have run.
df.head()

In [None]:
# Occurrences per uniq_id; any count above 1 would indicate a repeated id.
df['uniq_id'].value_counts()