In [2]:
pip install implicit==0.7.2 --prefer-binary

Collecting implicit==0.7.2
  Downloading implicit-0.7.2.tar.gz (70 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: implicit
  Building wheel for implicit (pyproject.toml) ... [?25l[?25hdone
  Created wheel for implicit: filename=implicit-0.7.2-cp312-cp312-linux_x86_64.whl size=10855112 sha256=a30332c416fe9ed82bc130bc90ab2f8db19475ee66969e19241be30bb204b149
  Stored in directory: /root/.cache/pip/wheels/b2/00/4f/9ff8af07a0a53ac6007ea5d739da19cfe147a2df542b6899f8
Successfully built implicit
Installing collected packages: implicit
Successfully installed implicit-0.7.2


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse as sp
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

from implicit.als import AlternatingLeastSquares

In [24]:
import seaborn as sns

In [4]:
# Read the raw e-commerce log from the working directory.
data = pd.read_csv("ecommerce_recommendation_dataset.csv")
df = pd.DataFrame(data)

# Sanity check: print the dimensions and the column index, then preview the first rows.
for summary in (df.shape, df.columns):
    print(summary)
df.head()

(60000, 51)
Index(['user_id', 'product_id', 'category', 'price', 'rating', 'review_count',
       'user_age', 'user_gender', 'user_location', 'purchase_history',
       'time_on_page', 'add_to_cart_count', 'search_keywords',
       'discount_applied', 'user_membership', 'user_browser', 'user_device',
       'purchase_time', 'session_duration', 'clicks_on_ads', 'page_views',
       'referral_source', 'wishlist_additions', 'cart_abandonment_rate',
       'average_spent', 'user_income', 'user_education', 'user_marital_status',
       'product_availability', 'stock_status', 'product_return_rate',
       'product_color', 'product_size', 'is_top_seller', 'discount_percentage',
       'time_to_purchase', 'delivery_time', 'shipping_fee', 'seller_rating',
       'seller_response_time', 'seller_location', 'product_rating_variance',
       'review_sentiment_score', 'user_engagement_score', 'ad_click_rate',
       'time_of_day', 'day_of_week', 'season', 'payment_method', 'coupon_used',
       'pro

Unnamed: 0,user_id,product_id,category,price,rating,review_count,user_age,user_gender,user_location,purchase_history,...,product_rating_variance,review_sentiment_score,user_engagement_score,ad_click_rate,time_of_day,day_of_week,season,payment_method,coupon_used,product_popularity
0,78517,1645,Books,842.23,2,155,24,Other,Urban,False,...,0.13,-0.28,0.68,0.04,Night,Thursday,Summer,Debit Card,False,0.54
1,52887,100,Books,253.76,3,331,43,Other,Suburban,False,...,0.02,0.28,0.11,0.89,Morning,Saturday,Summer,Debit Card,False,0.77
2,59395,585,Books,483.65,2,236,64,Female,Rural,True,...,1.55,0.23,0.35,0.99,Evening,Tuesday,Fall,Debit Card,False,0.14
3,54739,3774,Groceries,459.37,2,227,34,Female,Urban,False,...,1.41,0.93,0.73,0.16,Afternoon,Tuesday,Spring,Credit Card,False,0.18
4,42723,2119,Groceries,150.11,2,214,51,Female,Urban,True,...,1.29,0.11,0.26,0.17,Night,Wednesday,Spring,PayPal,False,0.66


In [5]:
# Columns retained for modelling: the interaction keys and signal first,
# followed by the product attributes used by the content-based features.
cols_needed = [
    'user_id',
    'product_id',
    'purchase_history',
    'category',
    'price',
    'discount_percentage',
    'is_top_seller',
    'rating',
    'review_count',
    'product_popularity',
    'product_availability',
    'stock_status',
    'delivery_time',
    'shipping_fee',
    'seller_rating',
    'review_sentiment_score',
    'product_color',
    'product_size',
]

# Keep only those columns and discard rows that lack either identifier.
df = df.loc[:, cols_needed].dropna(subset=['user_id', 'product_id'])

# Collaborative Filtering

In [6]:
# User x product interaction table (implicit feedback).
# Repeated (user, product) pairs are combined by pivot_table's default
# aggregation of `purchase_history`; pairs that never occur are filled with 0.
user_item_matrix = pd.pivot_table(
    df,
    values='purchase_history',
    index='user_id',
    columns='product_id',
    fill_value=0,
)

# CSR representation of the same table (rows = users, columns = products).
sparse_user_item = sp.csr_matrix(user_item_matrix.to_numpy())

# Fit the ALS factorisation; the fixed random_state keeps runs repeatable.
als_model = AlternatingLeastSquares(
    factors=50,
    regularization=0.1,
    iterations=20,
    random_state=42,
)
als_model.fit(sparse_user_item)

print("ALS model trained. Matrix shape:", sparse_user_item.shape)

  0%|          | 0/20 [00:00<?, ?it/s]

ALS model trained. Matrix shape: (45154, 4999)


# Content-Based Filtering

In [14]:
# Columns that describe a product; `product_id` becomes the index below.
product_features = [
    'product_id',
    'category',
    'price',
    'discount_percentage',
    'is_top_seller',
    'rating',
    'review_count',
    'product_popularity',
    'product_availability',
    'stock_status',
    'delivery_time',
    'shipping_fee',
    'seller_rating',
    'review_sentiment_score',
    'product_color',
    'product_size',
]

# One row per product: the first row seen for each product_id supplies its attributes.
product_df = (
    df[product_features]
    .drop_duplicates(subset=['product_id'])
    .set_index('product_id')
)

# Single lookup table shared by both availability-like columns.
# Labels missing from this table are turned into NaN by `.map`.
availability_map = {
    'Out of Stock': 0, 'Pre-order': 0.5, 'In Stock': 1,
    'Low': 0, 'Medium': 0.5, 'High': 1,
}
product_df = product_df.assign(
    product_availability=product_df['product_availability'].map(availability_map),
    stock_status=product_df['stock_status'].map(availability_map),
)

# One-hot encode the remaining categorical columns (first level dropped).
cat_cols = ['category', 'product_color', 'product_size']
product_df = pd.get_dummies(
    product_df,
    columns=[col for col in cat_cols if col in product_df.columns],
    drop_first=True,
)

# Min-max scale every column of the feature table, dummies included.
scaler = MinMaxScaler()
product_df[product_df.columns] = scaler.fit_transform(product_df)



In [17]:
# Missing-value audit of the feature table: the ten columns with the most NaNs.
(
    product_df
    .isna()
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

Unnamed: 0,0
price,0
discount_percentage,0
is_top_seller,0
rating,0
review_count,0
product_popularity,0
product_availability,0
stock_status,0
delivery_time,0
shipping_fee,0


In [18]:
# Pairwise cosine similarity between product feature vectors,
# wrapped in a DataFrame labelled by product_id on both axes.
similarity_matrix = cosine_similarity(product_df)
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=product_df.index,
    columns=product_df.index,
)

print("Product similarity matrix created:", similarity_df.shape)

Product similarity matrix created: (4999, 4999)


In [19]:
def get_cold_start_recommendations(top_n=5, top_seller_weight=0.6, discount_weight=0.4):
    """
    Popularity-style fallback for users that have no interaction history.

    Each product is scored from its per-product means in the global `df`:
        score = top_seller_weight * mean(is_top_seller)
              + discount_weight   * mean(discount_percentage)

    Parameters
    ----------
    top_n : int
        Number of product ids to return. Zero or negative values yield [].
    top_seller_weight : float
        Weight of the mean `is_top_seller` flag (default 0.6).
    discount_weight : float
        Weight of the mean `discount_percentage` (default 0.4).

    Returns
    -------
    list
        Product ids ordered from highest to lowest score.
    """
    # Series.head() with a negative n returns "all but the last |n|" rows,
    # which is never what a caller asking for top-N wants.
    if top_n is not None and top_n <= 0:
        return []

    # NOTE(review): discount_percentage is used unscaled. If it is stored on a
    # 0-100 scale it will dominate the 0-1 top-seller flag -- confirm the intent.
    cold_recs = (
        df.groupby('product_id')
        .agg({'is_top_seller': 'mean', 'discount_percentage': 'mean'})
        .assign(
            score=lambda x: top_seller_weight * x['is_top_seller']
            + discount_weight * x['discount_percentage']
        )
        .sort_values('score', ascending=False)
        .head(top_n)
        .index.tolist()
    )
    return cold_recs

In [25]:
def recommend_for_user(user_id, top_n=5, alpha=0.7):
    """
    Hybrid recommender blending collaborative and content-based signals.

    Parameters
    ----------
    user_id
        Label of the user in `user_item_matrix.index`. Users that are not in
        the training matrix get `get_cold_start_recommendations(top_n)`.
    top_n : int
        Number of product ids to return.
    alpha : float
        Weight of the collaborative-filtering (ALS) score; the content-based
        score is weighted by (1 - alpha).

    Returns
    -------
    list
        Up to `top_n` product ids, ordered by descending blended score.
    """
    # Users absent from the training matrix fall back to the cold-start ranking.
    if user_id not in user_item_matrix.index:
        return get_cold_start_recommendations(top_n)

    # Positional row of the user; Index.get_loc avoids copying the whole index
    # into a Python list and scanning it linearly on every call.
    user_idx = user_item_matrix.index.get_loc(user_id)

    # --- Collaborative filtering recommendations ---
    cf_recs = als_model.recommend(user_idx, sparse_user_item[user_idx], N=top_n)

    # Normalise both return formats into (column position, score) pairs.
    if isinstance(cf_recs, (tuple, np.ndarray)):
        # implicit>=0.7 returns two arrays: (item positions, scores)
        product_indices, scores = cf_recs
        cf_pairs = zip(product_indices, scores)
    else:
        # implicit<0.7 returns a list of (item position, score)
        cf_pairs = cf_recs
    cf_scores = {user_item_matrix.columns[i]: score for i, score in cf_pairs}
    cf_products = list(cf_scores)

    # --- Content-based candidates: nearest neighbours of each CF pick ---
    similar_items = []
    for pid in cf_products:
        if pid in similarity_df.index:
            # Remove the product itself by label. Skipping "the first row after
            # sorting" is wrong when another product ties with the
            # self-similarity, because the product may then not sort first.
            sims = (
                similarity_df[pid]
                .drop(labels=pid, errors="ignore")
                .sort_values(ascending=False)
                .head(top_n)
            )
            similar_items.extend(sims.index.tolist())

    # Content score = share of neighbour lists in which a product appears.
    if similar_items:
        cb_scores = pd.Series(similar_items).value_counts(normalize=True).to_dict()
    else:
        cb_scores = {}

    # --- Combine CF + CBF ---
    # NOTE(review): raw ALS scores and normalised neighbour frequencies are on
    # different scales, so `alpha` is not a pure mixing proportion -- confirm
    # whether the CF scores should be rescaled before blending.
    final_scores = {
        pid: alpha * cf_scores.get(pid, 0) + (1 - alpha) * cb_scores.get(pid, 0)
        for pid in set(cf_scores) | set(cb_scores)
    }

    ranked = sorted(final_scores.items(), key=lambda item: item[1], reverse=True)[:top_n]
    return [pid for pid, _ in ranked]

In [34]:
# Known user: draw one user_id that occurs in the data. The seed is fixed so
# that re-running the notebook exercises the same user every time.
user_id = df['user_id'].sample(1, random_state=42).iloc[0]

print(f"\n Recommendations for User {user_id}:")
print(recommend_for_user(user_id, top_n=5))

# Cold start: 999999 is assumed not to occur in the training data, so this
# call is expected to take the fallback path -- verify if the dataset changes.
print("\n Recommendations for Cold User:")
print(recommend_for_user(user_id=999999, top_n=5))


 Recommendations for User 4171:
[1798, 396, 1165, 397, 3729]

 Recommendations for Cold User:
[1333, 1931, 4917, 174, 4814]


In [36]:
def get_product_details(product_ids, df, top_n=5):
    """
    Look up category, price and rating for the given product ids.

    The result is indexed by product_id and follows the order of
    `product_ids`. When a product occurs in several rows of `df`, its first
    row is used; ids that do not occur in `df` show up as all-NaN rows.
    At most `top_n` rows are returned.
    """
    detail_cols = ['product_id', 'category', 'price', 'rating']

    # Rows belonging to the requested products, restricted to the display columns.
    matching_rows = df.loc[df['product_id'].isin(product_ids), detail_cols]

    # Collapse to one row per product and key the table by product_id.
    one_per_product = matching_rows.drop_duplicates(subset=['product_id'])
    by_product = one_per_product.set_index('product_id')

    # Restore the caller's ordering, then truncate.
    in_requested_order = by_product.reindex(product_ids)
    return in_requested_order.head(top_n)

In [37]:
# Hybrid recommendations for one specific, hand-picked user.
user_id = 65217
recommended_ids = recommend_for_user(user_id=user_id, top_n=5)
print(f"Recommendations for User {user_id}: {recommended_ids}")

# Show category, price and rating for the recommended products.
rec_details = get_product_details(product_ids=recommended_ids, df=df)
display(rec_details)

Recommendations for User 65217: [4356, 2438, 2450, 1823, 1954]


Unnamed: 0_level_0,category,price,rating
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4356,Electronics,469.64,3
2438,Groceries,609.16,4
2450,Furniture,348.45,4
1823,Books,510.82,4
1954,Groceries,804.62,3


In [38]:
# Fallback ranking served to users without any interaction history.
cold_recs = get_cold_start_recommendations(5)
print("Recommendations for Cold User:", cold_recs)

# Show category, price and rating for the fallback products.
cold_details = get_product_details(product_ids=cold_recs, df=df)
display(cold_details)

Recommendations for Cold User: [1333, 1931, 4917, 174, 4814]


Unnamed: 0_level_0,category,price,rating
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1333,Electronics,54.84,5
1931,Books,891.69,3
4917,Groceries,30.4,2
174,Clothing,489.06,3
4814,Furniture,73.87,3
