In [7]:
# ===========================================
# STEP 0: INSTALL & IMPORT REQUIRED LIBRARIES
# ===========================================
!pip install scikit-surprise  # For collaborative filtering

import numpy as np
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# For demonstration:
import warnings
warnings.filterwarnings("ignore")

# ===========================================
# STEP 1: CREATE A SYNTHETIC DATASET
# ===========================================
# We simulate user-product ratings (collaborative filtering)
# and product metadata (content-based filtering).

# ---- 1A. Synthetic Ratings Dataset ----
# Each tuple is one observed (user_id, product_id, rating) triple.
_rating_rows = [
    (1, 101, 4), (1, 102, 5), (1, 103, 3),
    (2, 101, 2), (2, 103, 4),
    (3, 101, 5), (3, 102, 2),
    (4, 102, 3),
    (5, 104, 4), (5, 105, 1),
    (6, 104, 5), (6, 105, 3),
]
# Transpose the rows into the column-oriented dict that the DataFrame is built from.
ratings_dict = {
    column: list(values)
    for column, values in zip(("user_id", "product_id", "rating"), zip(*_rating_rows))
}
ratings_df = pd.DataFrame(ratings_dict)

# ---- 1B. Synthetic Product Metadata ----
# One tuple per product: (product_id, name, category, brand, ingredients).
_product_rows = [
    (101, "Aloe Day Cream",   "Cream",    "BrandX", "aloe vera for hydration"),
    (102, "Rose Night Cream", "Cream",    "BrandY", "rose extract for soothing"),
    (103, "Vitamin C Serum",  "Serum",    "BrandX", "vitamin c, antioxidants"),
    (104, "Anti-Aging Serum", "Serum",    "BrandZ", "retinol, hyaluronic acid"),
    (105, "Matte Lipstick",   "Lipstick", "BrandY", "matte formula, hydrating oils"),
]
product_info_dict = {
    column: list(values)
    for column, values in zip(
        ("product_id", "name", "category", "brand", "ingredients"),
        zip(*_product_rows),
    )
}
products_df = pd.DataFrame(product_info_dict)

# Show both frames, each under its own heading.
print("Ratings DataFrame:", ratings_df, sep="\n")
print("\nProduct Metadata DataFrame:", products_df, sep="\n")

# ===========================================
# STEP 2: COLLABORATIVE FILTERING (Surprise)
# ===========================================
# We'll train an SVD model on user-product rating data.

# 2A. Wrap the ratings frame for Surprise; the Reader declares the 1-5 rating scale
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    ratings_df.loc[:, ['user_id', 'product_id', 'rating']],
    reader,
)
# Hold out 25% of the ratings for evaluation; the seed fixes the split
trainset, testset = train_test_split(data, random_state=42, test_size=0.25)

# 2B. Fit an SVD model with 50 latent factors on the training portion
svd_model = SVD(random_state=42, n_factors=50)
svd_model.fit(trainset)

# 2C. Score the held-out ratings and keep the RMSE
predictions = svd_model.test(testset)
rmse = accuracy.rmse(predictions)

# 2D. Function to get top-N product recommendations for a given user
def recommend_collaborative(user_id, df_products, model, n_recommend=3,
                            exclude_product_ids=None):
    """Return the top-N products for a user, ranked by the model's predicted rating.

    Args:
        user_id: User id, passed unchanged to ``model.predict``.
        df_products: DataFrame with a ``product_id`` column; its remaining
            columns are merged onto the result.
        model: Object whose ``predict(user_id, product_id)`` result exposes an
            ``est`` attribute (this notebook passes the fitted SVD model).
        n_recommend: Maximum number of rows to return.
        exclude_product_ids: Optional iterable of product ids to leave out of
            the candidates, e.g. the items the user has already rated. The
            default ``None`` scores every product, as before.

    Returns:
        DataFrame with ``product_id``, ``estimated_rating`` and the columns of
        ``df_products``, ordered by ``estimated_rating`` descending.
    """
    excluded = set(exclude_product_ids) if exclude_product_ids is not None else set()

    # Score every candidate product that was not explicitly excluded.
    scored = [
        (pid, model.predict(user_id, pid).est)
        for pid in df_products['product_id'].unique()
        if pid not in excluded
    ]

    # Highest estimated rating first, then keep the first N.
    scored.sort(key=lambda item: item[1], reverse=True)
    top_n = scored[:n_recommend]

    top_n_products = pd.DataFrame(top_n, columns=['product_id', 'estimated_rating'])
    if top_n_products.empty:
        # An empty frame defaults to object columns; align the dtypes with the
        # catalogue so the merge key and the result stay consistently typed.
        top_n_products = top_n_products.astype({
            'product_id': df_products['product_id'].dtype,
            'estimated_rating': 'float64',
        })

    # A left merge keeps the ranking order while attaching the product metadata.
    return top_n_products.merge(df_products, on='product_id', how='left')

# Example: top-3 collaborative-filtering picks for user_id=1
print("\nCollaborative Filtering Recommendations for User 1:")
cf_recs_user1 = recommend_collaborative(1, products_df, svd_model, n_recommend=3)
print(cf_recs_user1)

# ===========================================
# STEP 3: CONTENT-BASED RECOMMENDATION
# ===========================================
# Every product is described by a single text string (category, brand, ingredients, name).
# TF-IDF vectors of those strings are compared with cosine similarity further below.

# 3A. Join the descriptive columns into one space-separated text column
products_df['combined_text'] = products_df['category'].str.cat(
    [products_df['brand'], products_df['ingredients'], products_df['name']],
    sep=' ',
)

# 3B. Fit TF-IDF on the combined text, dropping English stop words
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(products_df['combined_text'])

# 3C. Function to get top-N similar products given a product ID
def recommend_content_based(product_id, df, tfidf_mat, n_recommend=3):
    """Return the products most similar to ``product_id`` by cosine similarity.

    Args:
        product_id: Id of the reference product, looked up in ``df['product_id']``.
        df: Product DataFrame with the columns ``product_id``, ``name``,
            ``category``, ``brand`` and ``ingredients``.
        tfidf_mat: Feature matrix indexed by row position.
            NOTE(review): assumed to have one row per row of ``df``, in the
            same order — confirm against the caller.
        n_recommend: Maximum number of similar products to return.

    Returns:
        DataFrame of the selected products with an added ``similarity_score``
        column, most similar first. If ``product_id`` is not present in ``df``
        a message is printed and an empty DataFrame is returned.
    """
    # Use the row *position* rather than the index label: tfidf_mat and
    # df.iloc are positional, so a label would only work on a default RangeIndex.
    matches = np.flatnonzero((df['product_id'] == product_id).to_numpy())
    if matches.size == 0:
        print(f"Product ID {product_id} not found in product list.")
        return pd.DataFrame()
    idx = int(matches[0])

    # Similarity of the reference row against every row, itself included.
    cos_similarities = cosine_similarity(tfidf_mat[idx], tfidf_mat).flatten()

    # Rank rows by descending similarity and drop the reference product.
    similar_indices = cos_similarities.argsort()[::-1]
    similar_indices = similar_indices[similar_indices != idx]
    top_n_indices = similar_indices[:n_recommend]

    # .copy() so the score column is added to an independent frame, not a slice of df.
    recommended_products = df.iloc[top_n_indices][
        ['product_id', 'name', 'category', 'brand', 'ingredients']
    ].copy()
    recommended_products['similarity_score'] = cos_similarities[top_n_indices]
    return recommended_products

# Example: the two products closest in content to product_id=103 ("Vitamin C Serum")
print("\nContent-Based Recommendations for Product 103 (Vitamin C Serum):")
cb_recs_103 = recommend_content_based(103, products_df, tfidf_matrix, n_recommend=2)
print(cb_recs_103)

# ===========================================
# STEP 4: HYBRID APPROACH (Optional Concept)
# ===========================================
# CF and CB can be combined, for example by:
#  - letting CF pick a set of top items for a user,
#  - then re-ranking those items by content similarity
# (or the other way round).

# Simplified, conceptual example:
# take the top CF recommendations for user 1 and order them by how
# similar they are to the product that user rated highest.

# In the ratings above, user 1 gave product 102 a 5, their highest rating.
favorite_product_id = 102  # "Rose Night Cream"
# Look up that product's row and keep its TF-IDF vector as the re-ranking reference.
favorite_idx = products_df.index[
    products_df['product_id'].eq(favorite_product_id)
].tolist()[0]
favorite_vector = tfidf_matrix[favorite_idx]

def hybrid_rerank(cf_recommendations, favorite_vector, df, tfidf_mat=None):
    """Re-rank CF recommendations by content similarity to a favourite product.

    Args:
        cf_recommendations: DataFrame with a ``product_id`` column. It is not
            modified; a re-ranked copy is returned.
        favorite_vector: Feature vector of the favourite product, passed as the
            first argument to ``cosine_similarity``.
        df: Product DataFrame whose ``product_id`` column is used to find each
            recommended product's row position.
        tfidf_mat: Feature matrix indexed by row position. Defaults to the
            notebook-level ``tfidf_matrix``.
            NOTE(review): assumed to have one row per row of ``df``, in the
            same order — confirm against the caller.

    Returns:
        Copy of ``cf_recommendations`` with a
        ``content_similarity_to_favorite`` column, sorted by it descending.

    Raises:
        IndexError: If a recommended ``product_id`` is not present in ``df``.
    """
    if tfidf_mat is None:
        tfidf_mat = tfidf_matrix  # fall back to the matrix built earlier in the notebook

    # Resolve each recommended product to its row *position* in df; positions
    # (not index labels) are what the feature matrix is indexed by.
    catalog_ids = df['product_id'].to_numpy()
    positions = []
    for pid in cf_recommendations['product_id']:
        hits = np.flatnonzero(catalog_ids == pid)
        if hits.size == 0:
            raise IndexError(f"product_id {pid!r} from cf_recommendations is not present in df")
        positions.append(int(hits[0]))

    if positions:
        # One batched similarity call instead of one call per recommendation.
        scores = np.asarray(cosine_similarity(favorite_vector, tfidf_mat[positions])).ravel()
    else:
        scores = np.empty(0, dtype=float)

    # Work on a copy so the caller's recommendations frame is left untouched.
    reranked = cf_recommendations.copy()
    reranked['content_similarity_to_favorite'] = scores
    return reranked.sort_values('content_similarity_to_favorite', ascending=False)

# Order user 1's CF picks by their content similarity to the favourite product and show them.
print("\nHybrid Re-ranking of CF Recs for User 1, based on similarity to user's favorite product (ID=102):")
cf_recs_user1_hybrid = hybrid_rerank(
    cf_recommendations=cf_recs_user1,
    favorite_vector=favorite_vector,
    df=products_df,
)
print(cf_recs_user1_hybrid)

# ===========================================
# DONE: We have demonstrated:
#  1) Collaborative Filtering using SVD (Surprise)
#  2) Content-Based Recommendation using TF-IDF + Cosine Similarity
#  3) A simple Hybrid Re-ranking approach
# ===========================================


Ratings DataFrame:
    user_id  product_id  rating
0         1         101       4
1         1         102       5
2         1         103       3
3         2         101       2
4         2         103       4
5         3         101       5
6         3         102       2
7         4         102       3
8         5         104       4
9         5         105       1
10        6         104       5
11        6         105       3

Product Metadata DataFrame:
   product_id              name  category   brand  \
0         101    Aloe Day Cream     Cream  BrandX   
1         102  Rose Night Cream     Cream  BrandY   
2         103   Vitamin C Serum     Serum  BrandX   
3         104  Anti-Aging Serum     Serum  BrandZ   
4         105    Matte Lipstick  Lipstick  BrandY   

                     ingredients  
0        aloe vera for hydration  
1      rose extract for soothing  
2        vitamin c, antioxidants  
3       retinol, hyaluronic acid  
4  matte formula, hydrating oils  
RMSE: 1

In [10]:
# ===========================================
# STEP 1: INSTALL & IMPORT REQUIRED LIBRARIES
# ===========================================
!pip install scikit-surprise --quiet

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Surprise for collaborative filtering
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

# For content-based filtering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")

# ===========================================
# STEP 2: LOADING THE CSV
# ===========================================
# If you're running this code in Colab, you have two options to get the CSV file:
#  (A) Upload the CSV manually in the Colab file explorer (left side) and then
#      use the local relative path, e.g. pd.read_csv('/content/cosmetics.csv')
#  (B) Mount your Google Drive and use a path like: /content/drive/MyDrive/cosmetics.csv
#
# If you're running locally in a Jupyter notebook, you can directly use your Windows path:
# df = pd.read_csv(r"C:\Users\91936\Desktop\AIRPLANE\cosmetics.csv")
#
# For demonstration, let's assume you have columns like:
# [user_id, product_id, rating, brand, category, ingredients, ...]
# We'll show a placeholder read below. Change the path to point to your actual CSV.

csv_path = r"C:\Users\91936\Desktop\AIRPLANE\cosmetics.csv"  # Adjust as needed

try:
    df = pd.read_csv(csv_path)
except Exception as exc:
    # Deliberate best-effort fallback: any read failure (missing file, parse
    # error, bad encoding) switches to a small synthetic frame so the rest of
    # the notebook can still run. Catching Exception rather than using a bare
    # `except:` lets KeyboardInterrupt / SystemExit propagate, and the reason
    # is reported instead of being silently discarded.
    print("WARNING: Could not read the local path. Using a synthetic DataFrame for demo.\n")
    print(f"(Reason: {type(exc).__name__}: {exc})\n")
    sample_data = {
        'user_id':    [1, 1, 2, 2, 3, 4],
        'product_id': [101, 102, 101, 103, 104, 105],
        'rating':     [5, 4, 3, 4, 2, 5],
        'brand':      ['BrandX', 'BrandY', 'BrandX', 'BrandX', 'BrandZ', 'BrandY'],
        'category':   ['Cream', 'Cream', 'Cream', 'Serum', 'Serum', 'Lipstick'],
        'ingredients':['aloe vera', 'rose extract', 'aloe vera',
                       'vitamin c, antioxidants', 'retinol, hyaluronic acid',
                       'matte formula, hydrating oils']
    }
    df = pd.DataFrame(sample_data)

print("Sample of loaded DataFrame:")
display(df.head())

# ===========================================
# STEP 3: BASIC DATA PREPARATION
# ===========================================
# Check that the CSV has all required columns. Adjust names as needed.
required_columns = ['user_id', 'product_id', 'rating', 'brand', 'category', 'ingredients']
missing_columns = [col for col in required_columns if col not in df.columns]
for col in missing_columns:
    print(f"WARNING: Column '{col}' not found in DataFrame. Please adjust code/column names.")
if missing_columns:
    # Every required column is indexed below, so a missing one would fail with a
    # bare KeyError a few lines later anyway; fail here with a message that names them all.
    raise KeyError(f"Missing required column(s): {missing_columns}")

# Each row is assumed to be one rating given by a user to a product, with the
# product's metadata repeated on every row. The ratings and the per-product
# metadata are separated below (metadata is de-duplicated by product_id).

# 3A. Basic cleaning (optional, adapt to your data)
df.dropna(subset=['user_id','product_id','rating'], inplace=True)  # drop rows missing these
# Store ids as strings. A float-typed id column would stringify 1 as "1.0" and
# then never match the str(user_id) / str(product_id) lookups used when
# predicting, so whole-number floats are converted to integers first.
for id_col in ('user_id', 'product_id'):
    id_values = df[id_col]
    if pd.api.types.is_float_dtype(id_values) and (id_values % 1 == 0).all():
        id_values = id_values.astype('int64')
    df[id_col] = id_values.astype(str)

# 3B. Extract product metadata: the first row seen for each product_id
product_cols = ['product_id', 'brand', 'category', 'ingredients']
products_df = df[product_cols].drop_duplicates(subset=['product_id']).reset_index(drop=True)

# The rating dataset for Surprise
ratings_df = df[['user_id', 'product_id', 'rating']]

print("\nRatings DataFrame:")
display(ratings_df.head())

print("\nUnique Product Metadata:")
display(products_df)

# ===========================================
# STEP 4: COLLABORATIVE FILTERING (Surprise)
# ===========================================
# 4A. Wrap the ratings frame for Surprise; the Reader declares the 1-5 rating scale
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    ratings_df.loc[:, ['user_id', 'product_id', 'rating']],
    reader,
)

# 4B. Hold out 25% of the ratings for evaluation; the seed fixes the split
trainset, testset = train_test_split(data, random_state=42, test_size=0.25)

# 4C. Fit an SVD model with 50 latent factors on the training portion
svd_model = SVD(random_state=42, n_factors=50)
svd_model.fit(trainset)

# 4D. Score the held-out ratings and keep the RMSE
predictions = svd_model.test(testset)
rmse = accuracy.rmse(predictions)

# 4E. Collaborative Filtering Recommendation Function
def recommend_collaborative(user_id, products_df, model, n_recommend=3,
                            exclude_product_ids=None):
    """Return the top-N products for a user, ranked by the model's predicted rating.

    Args:
        user_id: User id; converted with ``str()`` before it is passed to
            ``model.predict``.
        products_df: DataFrame with a ``product_id`` column; its remaining
            columns are merged onto the result.
        model: Object whose ``predict(user_id, product_id)`` result exposes an
            ``est`` attribute (this notebook passes the fitted SVD model).
        n_recommend: Maximum number of rows to return.
        exclude_product_ids: Optional iterable of product ids to leave out of
            the candidates, e.g. the items the user has already rated. Ids are
            compared as strings. The default ``None`` scores every product, as
            before.

    Returns:
        DataFrame with ``product_id``, ``estimated_rating`` and the columns of
        ``products_df``, ordered by ``estimated_rating`` descending.
    """
    excluded = (
        {str(pid) for pid in exclude_product_ids}
        if exclude_product_ids is not None
        else set()
    )
    uid = str(user_id)

    # Score every candidate product that was not explicitly excluded.
    preds = [
        (pid, model.predict(uid, str(pid)).est)
        for pid in products_df['product_id'].unique()
        if str(pid) not in excluded
    ]

    # Highest estimated rating first, then keep the first N.
    preds.sort(key=lambda item: item[1], reverse=True)
    top_n = preds[:n_recommend]

    top_n_df = pd.DataFrame(top_n, columns=['product_id', 'estimated_rating'])
    if top_n_df.empty:
        # An empty frame defaults to object columns; align the dtypes with the
        # catalogue so the merge key and the result stay consistently typed.
        top_n_df = top_n_df.astype({
            'product_id': products_df['product_id'].dtype,
            'estimated_rating': 'float64',
        })

    # A left merge keeps the ranking order while attaching the product metadata.
    return top_n_df.merge(products_df, on='product_id', how='left')

# Example: top-3 collaborative-filtering picks for user_id=1
user_id_example = 1
print(f"\nCollaborative Filtering Recommendations for User {user_id_example}:")
cf_recs = recommend_collaborative(user_id_example, products_df, model=svd_model, n_recommend=3)
display(cf_recs)

# ===========================================
# STEP 5: CONTENT-BASED FILTERING (TF-IDF)
# ===========================================
# Describe each product by one space-separated string of brand, category and
# ingredients; missing values count as empty strings.
products_df['combined_text'] = products_df['brand'].fillna('').str.cat(
    [products_df['category'].fillna(''), products_df['ingredients'].fillna('')],
    sep=' ',
)

# 5A. Fit TF-IDF on the combined text, dropping English stop words
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(products_df['combined_text'])

# 5B. Content-Based Recommendation Function
def recommend_content_based(product_id, df_products, tfidf_mat, n_recommend=3):
    """Return the products most similar to ``product_id`` by cosine similarity.

    Args:
        product_id: Id of the reference product, looked up in
            ``df_products['product_id']``.
        df_products: Product DataFrame with the columns ``product_id``,
            ``brand``, ``category`` and ``ingredients``.
        tfidf_mat: Feature matrix indexed by row position.
            NOTE(review): assumed to have one row per row of ``df_products``,
            in the same order — confirm against the caller.
        n_recommend: Maximum number of similar products to return.

    Returns:
        DataFrame with ``product_id``, ``brand``, ``category``, ``ingredients``
        and ``similarity_score``, most similar first. If ``product_id`` is not
        present a message is printed and an empty DataFrame is returned.
    """
    # Use the row *position* rather than the index label: tfidf_mat and
    # df_products.iloc are positional, so a label would only work on a default RangeIndex.
    positions = np.flatnonzero((df_products['product_id'] == product_id).to_numpy())
    if positions.size == 0:
        print(f"Product ID {product_id} not found in product list.")
        return pd.DataFrame()
    idx = int(positions[0])

    # Similarity of the reference row against every row, itself included.
    cos_sims = cosine_similarity(tfidf_mat[idx], tfidf_mat).flatten()

    # Rank rows by descending similarity and drop the reference product.
    ranked = cos_sims.argsort()[::-1]
    ranked = ranked[ranked != idx]
    top_n_indices = ranked[:n_recommend]

    recs = df_products.iloc[top_n_indices].copy()
    recs['similarity_score'] = cos_sims[top_n_indices]
    return recs[['product_id', 'brand', 'category', 'ingredients', 'similarity_score']]

# Example: the three products closest in content to product '101'
product_id_example = '101'
print(f"\nContent-Based Recommendations similar to Product ID {product_id_example}:")
cb_recs = recommend_content_based(
    product_id_example,
    products_df,
    tfidf_mat=tfidf_matrix,
    n_recommend=3,
)
display(cb_recs)

# ===========================================
# STEP 6: DONE
# ===========================================
# Final marker so the end of the run is visible in the output.
print("\nPROCESS COMPLETED.\n")



Sample of loaded DataFrame:


Unnamed: 0,user_id,product_id,rating,brand,category,ingredients
0,1,101,5,BrandX,Cream,aloe vera
1,1,102,4,BrandY,Cream,rose extract
2,2,101,3,BrandX,Cream,aloe vera
3,2,103,4,BrandX,Serum,"vitamin c, antioxidants"
4,3,104,2,BrandZ,Serum,"retinol, hyaluronic acid"



Ratings DataFrame:


Unnamed: 0,user_id,product_id,rating
0,1,101,5
1,1,102,4
2,2,101,3
3,2,103,4
4,3,104,2



Unique Product Metadata:


Unnamed: 0,product_id,brand,category,ingredients
0,101,BrandX,Cream,aloe vera
1,102,BrandY,Cream,rose extract
2,103,BrandX,Serum,"vitamin c, antioxidants"
3,104,BrandZ,Serum,"retinol, hyaluronic acid"
4,105,BrandY,Lipstick,"matte formula, hydrating oils"


RMSE: 1.5939

Collaborative Filtering Recommendations for User 1:


Unnamed: 0,product_id,estimated_rating,brand,category,ingredients
0,105,4.445546,BrandY,Lipstick,"matte formula, hydrating oils"
1,102,4.351855,BrandY,Cream,rose extract
2,101,4.335792,BrandX,Cream,aloe vera



Content-Based Recommendations similar to Product ID 101:


Unnamed: 0,product_id,brand,category,ingredients,similarity_score
2,103,BrandX,Serum,"vitamin c, antioxidants",0.197138
1,102,BrandY,Cream,rose extract,0.197138
4,105,BrandY,Lipstick,"matte formula, hydrating oils",0.0



PROCESS COMPLETED.



In [12]:
# Open the collaborative-filtering recommendations (`cf_recs`) in an interactive sheet;
# the cell output below is a Google Sheets URL.
# NOTE(review): `google.colab` is presumably importable only inside Google Colab — confirm
# before running elsewhere. The other sheets cell also assigns `sheet`, so whichever of the
# two runs last determines what the name refers to.
from google.colab import sheets
sheet = sheets.InteractiveSheet(df=cf_recs)

https://docs.google.com/spreadsheets/d/1j4om1uci0J_yjIrZPngwugS2Cioeq43IJeAeikgw2-0#gid=0


In [11]:
# Open the product metadata frame (`products_df`) in an interactive sheet;
# the cell output below is a Google Sheets URL.
# NOTE(review): `google.colab` is presumably importable only inside Google Colab — confirm
# before running elsewhere. This cell's execution count (11) is lower than that of the
# sheets cell placed above it (12), i.e. the two were run out of document order.
from google.colab import sheets
sheet = sheets.InteractiveSheet(df=products_df)

https://docs.google.com/spreadsheets/d/1RSXm8AShCDJNy6LsMHp6j2CwtQHJjVgHcuXr7NQCu7o#gid=0
