In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ecommerce-dataset/item_properties_part1.csv
/kaggle/input/ecommerce-dataset/category_tree.csv
/kaggle/input/ecommerce-dataset/item_properties_part2.csv
/kaggle/input/ecommerce-dataset/events.csv


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import pdist, squareform
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

In [3]:
# Read data
events = pd.read_csv('/kaggle/input/ecommerce-dataset/events.csv')

# Data preprocessing
events = events.dropna(subset=['event'])
events['event'] = events['event'].replace({'view': 1.0, 'addtocart': 2.0, 'transaction': 3.0})
events = events.drop(columns=['timestamp', 'transactionid'])

  events['event'] = events['event'].replace({'view': 1.0, 'addtocart': 2.0, 'transaction': 3.0})


In [4]:
# Data sorting / thinning
agg_events = events.groupby('itemid').agg(mean_event=('event', 'mean'), number_of_events=('event', 'count')).reset_index()
agg_events_GT50 = agg_events[agg_events['number_of_events'] > 50]
df_GT50 = pd.merge(events, agg_events_GT50[['itemid']], on='itemid', how='inner')

# Sampling data for better weighting of view, addtocart events
view_events = df_GT50[df_GT50['event'] == 1.0].sample(frac=0.01, random_state=1)
addtocart_events = df_GT50[df_GT50['event'] == 2.0].sample(frac=0.1, random_state=1)
transaction_events = df_GT50[df_GT50['event'] == 3.0]

# Combining sampled data
df_sampled = pd.concat([view_events, addtocart_events, transaction_events])

# Reducing to users with high interactions
user_interaction_counts = df_sampled['visitorid'].value_counts()
high_interaction_users = user_interaction_counts[user_interaction_counts > 10].index
df_reduced = df_sampled[df_sampled['visitorid'].isin(high_interaction_users)]

# Implementation
matrix = df_reduced.pivot_table(index='visitorid', columns='itemid', values='event')
matrix_std = pd.DataFrame(StandardScaler().fit_transform(matrix.fillna(0)), index=matrix.index, columns=matrix.columns)

# Generating similarity matrices
item_similarity_pearson = matrix_std.corr()
item_similarity_cosine = cosine_similarity(matrix_std.fillna(0).T)
item_similarity_cosine_df = pd.DataFrame(item_similarity_cosine, index=matrix_std.columns, columns=matrix_std.columns)
item_similarity_euclidean = pdist(matrix_std.fillna(0).T, 'euclidean')
# Convert Euclidean distances to similarities
item_similarity_euclidean_df = pd.DataFrame(1 / (1 + squareform(item_similarity_euclidean)), index=matrix_std.columns, columns=matrix_std.columns)

# Hybrid similarity matrix
item_similarity_hybrid = (0.5 * item_similarity_pearson + 
                          0.3 * item_similarity_cosine_df + 
                          0.2 * item_similarity_euclidean_df)

In [5]:
# Function to get top-N recommendations
def get_top_n_recommendations(item_ids, similarity_df, n=15, weights=None):
    similar_items = pd.Series(dtype=float)
    for item in item_ids:
        if item in similarity_df:
            similar_items = pd.concat([similar_items, similarity_df[item]])
    similar_items = similar_items.groupby(similar_items.index).mean()
    similar_items = similar_items.drop(item_ids, errors='ignore')
    similar_items = similar_items.sort_values(ascending=False)
    if weights is not None:
        similar_items *= weights
    return similar_items.head(n)

# Leave-One-Out Cross Validation (LOOCV) function
def loocv_recommendation(user_viewed_items, similarity_df, n=15, weights=None):
    if not user_viewed_items:
        print("No viewed items for LOOCV.")
        return 0
    hit_count = 0
    for i in range(len(user_viewed_items)):
        test_item = user_viewed_items[i]
        train_items = user_viewed_items[:i] + user_viewed_items[i+1:]
        if not train_items:
            continue
        recommendations = get_top_n_recommendations(train_items, similarity_df, n, weights)
        if test_item in recommendations.index:
            hit_count += 1
    success_rate = hit_count / len(user_viewed_items) if user_viewed_items else 0
    print(f"LOOCV Success Rate: {success_rate}")
    return success_rate

In [6]:
# Example: List of items viewed or purchased by a user
user_viewed_items = df_reduced[df_reduced['visitorid'] == df_reduced['visitorid'].iloc[0]]['itemid'].tolist()
print(f"User viewed items: {user_viewed_items}")


User viewed items: [392074, 134191, 314952, 338395, 280888, 361656, 46156, 85334, 119736, 458637, 186702, 371390, 353111, 119736, 422561, 359336, 448453, 86824, 403969, 287449, 280888, 141090, 321850, 314952, 72640, 338395]


In [7]:
# Calculate success rate for each similarity metric
results_cf = {}
for name, similarity_df in [('Pearson', item_similarity_pearson), 
                            ('Cosine', item_similarity_cosine_df), 
                            ('Euclidean', item_similarity_euclidean_df)]:
    print(f"Calculating success rate for {name} similarity...")
    success_rate = loocv_recommendation(user_viewed_items, similarity_df, n=15)
    results_cf[name] = success_rate

Calculating success rate for Pearson similarity...
LOOCV Success Rate: 0.6923076923076923
Calculating success rate for Cosine similarity...
LOOCV Success Rate: 0.6923076923076923
Calculating success rate for Euclidean similarity...
LOOCV Success Rate: 0.6923076923076923


In [8]:
# Calculate success rate using hybrid similarity
print("Calculating success rate for Hybrid similarity...")
hybrid_success_rate = loocv_recommendation(user_viewed_items, item_similarity_hybrid, n=15)

# Output results
for name, success_rate in results_cf.items():
    print(f"\n{name} Similarity Success Rate: {success_rate}")
print(f"\nHybrid Similarity Success Rate: {hybrid_success_rate}")

Calculating success rate for Hybrid similarity...
LOOCV Success Rate: 0.6923076923076923

Pearson Similarity Success Rate: 0.6923076923076923

Cosine Similarity Success Rate: 0.6923076923076923

Euclidean Similarity Success Rate: 0.6923076923076923

Hybrid Similarity Success Rate: 0.6923076923076923
