In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ecommerce-dataset/item_properties_part1.csv
/kaggle/input/ecommerce-dataset/category_tree.csv
/kaggle/input/ecommerce-dataset/item_properties_part2.csv
/kaggle/input/ecommerce-dataset/events.csv


In [2]:
! pip install implicit

Collecting implicit
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: implicit
Successfully installed implicit-0.7.2
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Import notwendige Bibliotheken
import pandas as pd
import numpy as np0
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import pdist, squareform
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_score, recall_score

from sklearn.decomposition import NMF
import scipy.sparse as sparse
from implicit.als import AlternatingLeastSquares


In [4]:
# Laden und Vorbereiten der Daten
events = pd.read_csv('/kaggle/input/ecommerce-dataset/events.csv')
items_raw_df = pd.read_csv('/kaggle/input/ecommerce-dataset/item_properties_part1.csv')
items_raw1_df = pd.read_csv('/kaggle/input/ecommerce-dataset/item_properties_part2.csv')
items_raw_df = pd.concat([items_raw_df, items_raw1_df])

In [5]:
# Datenvorverarbeitung
events = events.dropna(subset=['event'])
events['event'] = events['event'].replace({'view': 1.0, 'addtocart': 2.0, 'transaction': 3.0})
events = events.drop(columns=['timestamp', 'transactionid'])

agg_events = events.groupby('itemid').agg(mean_event=('event', 'mean'), number_of_events=('event', 'count')).reset_index()
agg_events_GT50 = agg_events[agg_events['number_of_events'] > 50]
df_GT50 = pd.merge(events, agg_events_GT50[['itemid']], on='itemid', how='inner')

view_events = df_GT50[df_GT50['event'] == 1.0].sample(frac=0.01, random_state=1)
addtocart_events = df_GT50[df_GT50['event'] == 2.0].sample(frac=0.1, random_state=1)
transaction_events = df_GT50[df_GT50['event'] == 3.0]
df_sampled = pd.concat([view_events, addtocart_events, transaction_events])

user_interaction_counts = df_sampled['visitorid'].value_counts()
high_interaction_users = user_interaction_counts[user_interaction_counts > 10].index
df_reduced = df_sampled[df_sampled['visitorid'].isin(high_interaction_users)]

  events['event'] = events['event'].replace({'view': 1.0, 'addtocart': 2.0, 'transaction': 3.0})


In [6]:
# Inhaltsbasiertes Empfehlungssystem: TF-IDF-Vektorisierung
np.random.seed(1)
items_to_keep = np.random.choice(items_raw_df['itemid'].unique(), size=10000, replace=False)
items_df = items_raw_df[items_raw_df['itemid'].isin(items_to_keep)]
items_df = items_df.groupby(['itemid', 'property'])['value'].last().to_frame().reset_index()
items_df = items_df.groupby('itemid')['value'].apply(lambda x: ' '.join(x)).to_frame()

tfidfvec = TfidfVectorizer(min_df=500, max_df=0.7)
vectorized_data = tfidfvec.fit_transform(items_df['value'])
tfidf_df = pd.DataFrame(vectorized_data.toarray(), columns=tfidfvec.get_feature_names_out())
tfidf_df.index = items_df.index

similarity_matrix = cosine_similarity(vectorized_data)
content_similarity_df = pd.DataFrame(similarity_matrix, index=items_df.index, columns=items_df.index)

# Collaborative Filtering: Generierung der Benutzer-Item-Matrix
matrix = df_reduced.pivot_table(index='visitorid', columns='itemid', values='event')
matrix_norm = (matrix - matrix.min()) / (matrix.max() - matrix.min())
matrix_norm = matrix_norm.fillna(0)

# Ähnlichkeitsmatrix nach Pearson
item_similarity_pearson = matrix_norm.corr()
item_similarity_cosine = cosine_similarity(matrix_norm.fillna(0).T)
collab_similarity_df = pd.DataFrame(item_similarity_cosine, index=matrix_norm.columns, columns=matrix_norm.columns)


In [7]:
# Funktion zur Berechnung der Top-N-Empfehlungen
def get_top_n_recommendations(item_ids, similarity_df, n=5):
    similar_items = pd.Series(dtype=float)
    for item in item_ids:
        if item in similarity_df:
            similar_items = pd.concat([similar_items, similarity_df[item]])
    similar_items = similar_items.groupby(similar_items.index).mean()
    similar_items = similar_items.drop(item_ids, errors='ignore')
    similar_items = similar_items.sort_values(ascending=False)
    return similar_items.head(n)

# Switching-Hybrid Empfehlungssystem
def switching_hybrid_recommendation(user_viewed_items, content_similarity_df, collab_similarity_df, threshold=0.3, n=5):
    recommendations = get_top_n_recommendations(user_viewed_items, content_similarity_df, n)
    if recommendations.empty or recommendations.iloc[0] < threshold:
        recommendations = get_top_n_recommendations(user_viewed_items, collab_similarity_df, n)
    return recommendations

# Leave-One-Out Cross Validation (LOOCV) für Switching-Hybrid System
def loocv_recommendation_switching_hybrid(user_viewed_items, content_similarity_df, collab_similarity_df, threshold=0.3, n=5):
    hit_count = 0
    for i in range(len(user_viewed_items)):
        test_item = user_viewed_items[i]
        train_items = user_viewed_items[:i] + user_viewed_items[i+1:]
        recommendations = switching_hybrid_recommendation(train_items, content_similarity_df, collab_similarity_df, threshold, n)
        if test_item in recommendations.index:
            hit_count += 1
    success_rate = hit_count / len(user_viewed_items)
    return success_rate

# Define a simpler LOOCV function for individual similarity measures
def loocv_recommendation(user_viewed_items, similarity_df, n=5):
    hit_count = 0
    for i in range(len(user_viewed_items)):
        test_item = user_viewed_items[i]
        train_items = user_viewed_items[:i] + user_viewed_items[i+1:]
        similar_items = get_top_n_recommendations(train_items, similarity_df, n)
        if test_item in similar_items.index:
            hit_count += 1
    success_rate = hit_count / len(user_viewed_items)
    return success_rate

In [8]:
# Beispiel: Liste von Artikeln, die ein Nutzer bereits gesehen oder gekauft hat
user_viewed_items = df_reduced[df_reduced['visitorid'] == df_reduced['visitorid'].iloc[0]]['itemid'].tolist()

# Berechnung der Erfolgsrate für das Switching-Hybrid Empfehlungssystem
success_rate_switching_hybrid = loocv_recommendation_switching_hybrid(user_viewed_items, content_similarity_df, collab_similarity_df, threshold=0.3, n=5)

print(f"Erfolgsquote des Switching-Hybrid Empfehlungssystems (Top-5): {success_rate_switching_hybrid:.2f}")

# Vergleich mit den anderen Ansätzen
print("\nErgebnisse der Collaborative Filtering Ansätze:")
for name, similarity_df in [('Pearson', item_similarity_pearson), 
                            ('Cosine', collab_similarity_df), 
                            ('Euclidean', item_similarity_cosine)]:
    success_rate = loocv_recommendation(user_viewed_items, similarity_df, n=5)
    print(f"{name} Similarity Success Rate: {success_rate:.2f}")

print("\nErgebnis des Inhaltsbasierten Ansatzes:")
success_rate_content_based = loocv_recommendation(user_viewed_items, content_similarity_df, n=5)
print(f"Content-Based Success Rate: {success_rate_content_based:.2f}")


Erfolgsquote des Switching-Hybrid Empfehlungssystems (Top-5): 0.00

Ergebnisse der Collaborative Filtering Ansätze:
Pearson Similarity Success Rate: 0.19
Cosine Similarity Success Rate: 0.19
Euclidean Similarity Success Rate: 0.00

Ergebnis des Inhaltsbasierten Ansatzes:
Content-Based Success Rate: 0.00


In [9]:
# Matrixfaktorisierung
def loocv_matrix_factorization(matrix, model_func, n_components=15):
    hit_count = 0
    num_interactions = 0
    matrix = matrix.fillna(0)
    for user in range(matrix.shape[0]):
        for item in range(matrix.shape[1]):
            if matrix.iloc[user, item] > 0:
                num_interactions += 1
                test_value = matrix.iloc[user, item]
                matrix.iloc[user, item] = 0
                
                model, user_factors, item_factors = model_func(matrix, n_components)
                
                user_factor = user_factors[user]
                scores = np.dot(item_factors, user_factor)
                top_n_items = np.argsort(scores)[::-1][:5]
                
                if item in top_n_items:
                    hit_count += 1
                
                matrix.iloc[user, item] = test_value
    
    success_rate = hit_count / num_interactions
    return success_rate

In [10]:
def nmf_sgd_model(matrix, n_components):
    nmf_sgd = NMF(n_components=n_components, init='random', random_state=42, solver='mu')
    W_sgd = nmf_sgd.fit_transform(matrix)
    H_sgd = nmf_sgd.components_
    return nmf_sgd, W_sgd, H_sgd.T

def als_model(matrix, n_components):
    train_sparse = sparse.csr_matrix(matrix)
    als = AlternatingLeastSquares(factors=n_components, regularization=0.1, iterations=50)
    als.fit(train_sparse.T)
    return als, als.user_factors, als.item_factors

In [None]:
results_mf = {}
results_mf['SGD'] = loocv_matrix_factorization(matrix_norm, nmf_sgd_model, n_components=15)
results_mf['ALS'] = loocv_matrix_factorization(matrix_norm, als_model, n_components=15)

print("\nErgebnisse der Matrixfaktorisierung Ansätze:")
for name, success_rate in results_mf.items():
    print(f"{name} Matrix Factorization Success Rate: {success_rate:.2f}")

  delta_H /= denominator
  H *= delta_H
