In [19]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ecommerce-dataset/item_properties_part1.csv
/kaggle/input/ecommerce-dataset/category_tree.csv
/kaggle/input/ecommerce-dataset/item_properties_part2.csv
/kaggle/input/ecommerce-dataset/events.csv


In [26]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from scipy.spatial.distance import pdist, squareform
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_score, recall_score


In [27]:
# Laden und Vorbereiten der Daten
events = pd.read_csv('/kaggle/input/ecommerce-dataset/events.csv')
items_raw_df = pd.read_csv('/kaggle/input/ecommerce-dataset/item_properties_part1.csv')
items_raw1_df = pd.read_csv('/kaggle/input/ecommerce-dataset/item_properties_part2.csv')
items_raw_df = pd.concat([items_raw_df, items_raw1_df])

In [28]:
# Datenvorverarbeitung
events = events.dropna(subset=['event'])
events['event'] = events['event'].replace({'view': 1.0, 'addtocart': 2.0, 'transaction': 3.0})
events = events.drop(columns=['timestamp', 'transactionid'])

agg_events = events.groupby('itemid').agg(mean_event=('event', 'mean'), number_of_events=('event', 'count')).reset_index()
agg_events_GT50 = agg_events[agg_events['number_of_events'] > 50]
df_GT50 = pd.merge(events, agg_events_GT50[['itemid']], on='itemid', how='inner')

# Adjust sample fractions to increase data size
view_events = df_GT50[df_GT50['event'] == 1.0].sample(frac=0.05, random_state=1)
addtocart_events = df_GT50[df_GT50['event'] == 2.0].sample(frac=0.2, random_state=1)
transaction_events = df_GT50[df_GT50['event'] == 3.0]
df_sampled = pd.concat([view_events, addtocart_events, transaction_events])

user_interaction_counts = df_sampled['visitorid'].value_counts()
high_interaction_users = user_interaction_counts[user_interaction_counts > 10].index
df_reduced = df_sampled[df_sampled['visitorid'].isin(high_interaction_users)]

# Collaborative Filtering: Generierung der Benutzer-Item-Matrix
matrix = df_reduced.pivot_table(index='visitorid', columns='itemid', values='event')
matrix_norm = (matrix - matrix.min()) / (matrix.max() - matrix.min())
matrix_norm = matrix_norm.fillna(0)

# Ähnlichkeitsmatrix nach Pearson
item_similarity_pearson = matrix_norm.corr()
item_similarity_cosine = cosine_similarity(matrix_norm.fillna(0).T)
item_similarity_cosine_df = pd.DataFrame(item_similarity_cosine, index=matrix_norm.columns, columns=matrix_norm.columns)

# Using Adjusted Cosine Similarity
item_similarity_adjusted_cosine = 1 - pairwise_distances(matrix_norm.T, metric='cosine')
item_similarity_adjusted_cosine_df = pd.DataFrame(item_similarity_adjusted_cosine, index=matrix_norm.columns, columns=matrix_norm.columns)

# Inhaltsbasiertes Empfehlungssystem: TF-IDF-Vektorisierung
np.random.seed(1)
items_to_keep = np.random.choice(items_raw_df['itemid'].unique(), size=10000, replace=False)
items_df = items_raw_df[items_raw_df['itemid'].isin(items_to_keep)]
items_df = items_df.groupby(['itemid', 'property'])['value'].last().to_frame().reset_index()
items_df = items_df.groupby('itemid')['value'].apply(lambda x: ' '.join(x)).to_frame()

# Adjusted TF-IDF Parameters
tfidfvec = TfidfVectorizer(min_df=100, max_df=0.9)  # Adjusted min_df and max_df
vectorized_data = tfidfvec.fit_transform(items_df['value'])
tfidf_df = pd.DataFrame(vectorized_data.toarray(), columns=tfidfvec.get_feature_names_out())
tfidf_df.index = items_df.index

similarity_matrix = cosine_similarity(vectorized_data)
similarity_df = pd.DataFrame(similarity_matrix, index=items_df.index, columns=items_df.index)


  events['event'] = events['event'].replace({'view': 1.0, 'addtocart': 2.0, 'transaction': 3.0})


In [29]:
# Funktion zur Berechnung der Top-N-Empfehlungen
def get_top_n_recommendations(item_ids, similarity_df, n=5):
    similar_items = pd.Series(dtype=float)
    for item in item_ids:
        if item in similarity_df.index:  # Ensure item is in the index
            similar_items = pd.concat([similar_items, similarity_df[item]])
        else:
            print(f"Item ID {item} not found in similarity DataFrame.")
    similar_items = similar_items.groupby(similar_items.index).mean()
    similar_items = similar_items.drop(item_ids, errors='ignore')
    similar_items = similar_items.sort_values(ascending=False)
    return similar_items.head(n)

# Gewichtetes Hybrides Empfehlungssystem
def hybrid_recommendation(user_viewed_items, similarity_dfs, weights, n=5):
    combined_similar_items = pd.Series(dtype=float)
    for item in user_viewed_items:
        if all(item in similarity_df.index for similarity_df in similarity_dfs):  # Check for all similarity DataFrames
            weighted_similar_items = sum(w * similarity_dfs[i][item] for i, w in enumerate(weights))
            combined_similar_items = pd.concat([combined_similar_items, weighted_similar_items])
        else:
            print(f"Item ID {item} not found in one of the similarity DataFrames.")
    combined_similar_items = combined_similar_items.groupby(combined_similar_items.index).mean()
    combined_similar_items = combined_similar_items.drop(user_viewed_items, errors='ignore')
    combined_similar_items = combined_similar_items.sort_values(ascending=False)
    return combined_similar_items.head(n)

# Leave-One-Out Cross Validation (LOOCV) für hybrides System
def loocv_recommendation_hybrid(user_viewed_items, similarity_dfs, weights, n=5):
    hit_count = 0
    for i in range(len(user_viewed_items)):
        test_item = user_viewed_items[i]
        train_items = user_viewed_items[:i] + user_viewed_items[i+1:]
        recommendations = hybrid_recommendation(train_items, similarity_dfs, weights, n)
        if test_item in recommendations.index:
            hit_count += 1
    success_rate = hit_count / len(user_viewed_items)
    return success_rate

In [30]:
# Beispiel: Liste von Artikeln, die ein Nutzer bereits gesehen oder gekauft hat
user_viewed_items = df_reduced[df_reduced['visitorid'] == df_reduced['visitorid'].iloc[0]]['itemid'].tolist()

# Filter out items not present in similarity DataFrames
filtered_user_viewed_items = [item for item in user_viewed_items if all(item in similarity_df.index for similarity_df in [item_similarity_cosine_df, similarity_df])]
print(f"Filtered user viewed items: {filtered_user_viewed_items}")

# Berechnung der Erfolgsrate für das gewichtete hybride Empfehlungssystem
similarity_dfs = [item_similarity_cosine_df, similarity_df]
weights = [0.7, 0.3]  # Adjusted weights

success_rate_hybrid = loocv_recommendation_hybrid(filtered_user_viewed_items, similarity_dfs, weights, n=5)

print(f"Erfolgsquote des gewichteten hybriden Empfehlungssystems (Top-5): {success_rate_hybrid:.2f}")

# Vergleich mit den anderen Ansätzen
print("\nErgebnisse der Collaborative Filtering Ansätze:")

Filtered user viewed items: [287449, 338395, 458637, 287449, 338395]
Erfolgsquote des gewichteten hybriden Empfehlungssystems (Top-5): 0.20

Ergebnisse der Collaborative Filtering Ansätze:
