In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm
import gc

In [2]:
# Load the data: read both item-property CSV parts (part 1 first, then part 2)
# and stack them into a single frame. `items_raw1_df` is kept as its own name
# because a later cell deletes it explicitly.
items_raw_df, items_raw1_df = (
    pd.read_csv(csv_name)
    for csv_name in ('item_properties_part1.csv', 'item_properties_part2.csv')
)
items_raw_df = pd.concat([items_raw_df, items_raw1_df])

In [3]:
# Select random items for faster execution.
# The seed makes the sample reproducible; the sample size is capped at the
# number of distinct items so the draw cannot fail on a smaller dataset.
np.random.seed(1)
unique_item_ids = items_raw_df['itemid'].unique()
items_to_keep = np.random.choice(
    unique_item_ids, size=min(2000, len(unique_item_ids)), replace=False
)
items_df = items_raw_df[items_raw_df['itemid'].isin(items_to_keep)]

# Use the last value per item and property ("last" means last in row order of
# the concatenated frame; the rows are not sorted here).
items_df = items_df.groupby(['itemid', 'property'])['value'].last().to_frame().reset_index()

# Build one whitespace-separated "document" per item. Missing values are
# dropped and everything is cast to str first, because str.join raises a
# TypeError on non-string entries (read_csv can yield mixed-type columns).
items_df = (
    items_df.dropna(subset=['value'])
    .assign(value=lambda frame: frame['value'].astype(str))
    .groupby('itemid')['value']
    .apply(' '.join)
    .to_frame()
)

# Row i of the tf-idf matrix describes item_ids[i]. Keep this mapping:
# items_df is deleted below, and without it row positions could not be
# translated back to item IDs.
item_ids = items_df.index.to_numpy()

# Tf-idf settings: keep terms that occur in at least 10 items and in at most
# 90% of items; use unigrams and bigrams.
tfidfvec = TfidfVectorizer(min_df=10, max_df=0.9, ngram_range=(1, 2))
vectorized_data = tfidfvec.fit_transform(items_df['value'])

# Free up memory. After this, the loading cell has to be run again before
# this cell can be re-executed.
del items_raw_df, items_raw1_df, items_df
gc.collect()

0

In [4]:
# Scale every tf-idf row to unit L2 length so that multiplying the matrix by
# its own transpose gives the pairwise cosine similarity between items.
# NOTE(review): TfidfVectorizer's default norm='l2' should already yield unit
# rows, in which case this step is only a safeguard - confirm.
row_norms = norm(vectorized_data, axis=1)
# All-zero rows have norm 0; divide those by 1 instead so they stay all-zero.
row_norms = np.where(row_norms == 0, 1, row_norms)
normalized_data = vectorized_data.multiply((1.0 / row_norms).reshape(-1, 1))
similarity_matrix = normalized_data @ normalized_data.T

In [8]:
# Function to calculate top-N recommendations
def get_top_n_recommendations(user_items, similarity_matrix, n=5):
    """Return the ``n`` items most similar, on average, to ``user_items``.

    Parameters
    ----------
    user_items : int or sequence of int
        Row position(s) in ``similarity_matrix`` of the items the user has
        seen. A scalar, list, tuple, NumPy array or pandas Series is accepted.
    similarity_matrix : square sparse matrix or 2-D array
        Item-to-item similarities; row/column ``i`` refers to item ``i``.
    n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Mean similarity scores indexed by row position, sorted in descending
        order, with the ``user_items`` themselves removed. Empty when
        ``user_items`` is empty.
    """
    # Normalise every accepted input form to a flat 1-D array of positions.
    seed_rows = np.atleast_1d(np.asarray(user_items)).ravel()
    if seed_rows.size == 0:
        # Averaging over zero rows is undefined; nothing can be recommended.
        return pd.Series(dtype=float)

    # Average the similarity rows of the seed items. np.asarray(...).ravel()
    # handles both the np.matrix returned by sparse matrices and the plain
    # ndarray returned by dense inputs or sparse arrays.
    mean_scores = np.asarray(similarity_matrix[seed_rows].mean(axis=0)).ravel()
    similar_items = pd.Series(mean_scores)

    # Never recommend an item the user has already seen.
    similar_items = similar_items.drop(seed_rows, errors='ignore')
    similar_items = similar_items.sort_values(ascending=False)

    return similar_items.head(n)

# Leave-One-Out Cross Validation
def loocv_recommendation(user_viewed_items, similarity_matrix, n=5, verbose=True):
    """Estimate the top-``n`` hit rate with leave-one-out cross validation.

    Each viewed item is held out in turn; recommendations are computed from
    the remaining items and a hit is counted when the held-out item appears
    among them.

    Parameters
    ----------
    user_viewed_items : sequence of int
        Row positions (in ``similarity_matrix``) of the items the user viewed.
        Any sequence is accepted, including NumPy arrays.
    similarity_matrix : square sparse matrix or 2-D array
        Item-to-item similarities, passed through to
        ``get_top_n_recommendations``.
    n : int, default 5
        Number of recommendations generated per fold.
    verbose : bool, default True
        Print the recommendations and misses of every fold.

    Returns
    -------
    float
        Fraction of folds in which the held-out item was recommended;
        ``0.0`` when ``user_viewed_items`` is empty.
    """
    # Copy into a list so slicing + "+" concatenates (for NumPy arrays "+"
    # would add element-wise instead).
    viewed = list(user_viewed_items)
    if not viewed:
        return 0.0

    hit_count = 0
    for i, test_item in enumerate(viewed):
        train_items = viewed[:i] + viewed[i + 1:]

        if not train_items:
            # Only one viewed item: there is nothing to recommend from, so
            # the fold counts as a miss.
            if verbose:
                print(f"Test item {test_item} not in recommendations.")
            continue

        recommendations = get_top_n_recommendations(train_items, similarity_matrix, n)
        if verbose:
            print(f"Recommendations for train items {train_items}: {recommendations}")

        if test_item in recommendations.index:
            hit_count += 1
        elif verbose:
            print(f"Test item {test_item} not in recommendations.")

    success_rate = hit_count / len(viewed)
    return success_rate

# Example user: the entries are row positions in similarity_matrix (they are
# used to index its rows), not raw item IDs from the CSV files.
# NOTE(review): the saved output below was produced with [649, 335, 1972];
# re-run this cell so that code and output agree.
user_viewed_items = [649, 335, 133]

# Run leave-one-out cross validation and report the top-N hit rate.
n_recommendations = 5
success_rate_content_based = loocv_recommendation(
    user_viewed_items, similarity_matrix, n=n_recommendations
)

print(f"Recommendation success rate (Top-{n_recommendations}): {success_rate_content_based:.2f}")

Recommendations for train items [335, 1972]: 1280    0.900456
649     0.880631
735     0.810560
959     0.747593
1336    0.734627
dtype: float64
Recommendations for train items [649, 1972]: 1280    0.922734
335     0.858021
735     0.855283
959     0.745577
1336    0.708639
dtype: float64
Recommendations for train items [649, 335]: 735     0.855342
1972    0.850267
1280    0.837824
959     0.799198
1556    0.726610
dtype: float64
Recommendation success rate (Top-5): 1.00
