In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from sklearn.model_selection import train_test_split

In [2]:
def load_data(dataset_file='dataset_purchases.csv'):
    """Load the purchase log and derive the time features used downstream.

    Parameters
    ----------
    dataset_file : str or path-like, default 'dataset_purchases.csv'
        CSV file to read. It must contain a ``timestamp`` column that
        ``pd.to_datetime`` can parse and a numeric ``usd`` column.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with ``timestamp`` converted to datetime, ``usd``
        rounded to 2 decimals, and two added columns: ``hour`` (hour of day
        of the timestamp) and ``week`` (ISO week number of the timestamp).
    """
    df = pd.read_csv(dataset_file)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    # Round prices to cents so equal prices compare equal when they are
    # later turned into strings.
    df['usd'] = df['usd'].round(2)
    df['hour'] = df['timestamp'].dt.hour
    df['week'] = df['timestamp'].dt.isocalendar().week
    return df

In [33]:
def prepare_data(df):
    """Derive ratings from purchases, split train/test, and build the Surprise trainset.

    Each purchase row gets a synthetic rating in [0, 1]:
    ``0.7 * norm_frequency + 0.3 * norm_vfm``, where ``norm_frequency`` is the
    number of purchases of the item by the same (user, hour) pair divided by
    the largest such count, and ``norm_vfm`` is the min-max normalised mean
    ``VFM`` of the item.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``usd``, ``coins``, ``user_id``, ``hour`` and
        ``VFM``. The frame is copied; the caller's object is not modified.

    Returns
    -------
    tuple
        ``(trainset, train_df, test_df)``: the Surprise trainset built from
        ``train_df``, plus the 80/20 train and test frames (one row per
        purchase, with ``item_id``, ``user_hour``, ``norm_frequency``,
        ``norm_vfm`` and ``rating`` columns added).

    Notes
    -----
    NOTE(review): frequency and VFM aggregates are computed on the full frame
    *before* the split, so test-set purchases contribute to the ratings the
    model is trained on. Kept as-is to preserve the returned frames; revisit
    if the evaluation needs to be leak-free.

    NOTE(review): the split stratifies on ``user_id``; per the scikit-learn
    docs this requires every user to appear at least twice, otherwise
    ``train_test_split`` raises ``ValueError``.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    # Create item_id from price-coins combination, and a per-hour user key.
    df['item_id'] = df['usd'].astype(str) + '_' + df['coins'].astype(str)
    df['user_hour'] = df['user_id'].astype(str) + '_' + df['hour'].astype(str)

    # Purchase count per (user_hour, item) and mean VFM per item.
    purchase_frequency = df.groupby(['user_hour', 'item_id']).size().reset_index(name='frequency')
    avg_vfm = df.groupby(['item_id'])['VFM'].mean().reset_index(name='avg_vfm')

    # Normalize frequency and VFM to 0-1 scale.
    purchase_frequency['norm_frequency'] = purchase_frequency['frequency'] / purchase_frequency['frequency'].max()
    vfm_min = avg_vfm['avg_vfm'].min()
    vfm_range = avg_vfm['avg_vfm'].max() - vfm_min
    if vfm_range > 0:
        avg_vfm['norm_vfm'] = (avg_vfm['avg_vfm'] - vfm_min) / vfm_range
    else:
        # A single item, or all items sharing one average VFM, gives a zero
        # (or undefined) spread; dividing by it would turn every rating into
        # NaN. The VFM term carries no information then, so it contributes 0.
        avg_vfm['norm_vfm'] = 0.0

    # Attach both normalised components to every purchase row.
    df = df.merge(purchase_frequency[['user_hour', 'item_id', 'norm_frequency']], on=['user_hour', 'item_id'])
    df = df.merge(avg_vfm[['item_id', 'norm_vfm']], on='item_id')

    # Create actual rating based on frequency and VFM (you can adjust the weighting)
    df['rating'] = 0.7 * df['norm_frequency'] + 0.3 * df['norm_vfm']

    # Split the data, keeping each user's share equal in train and test.
    train_df, test_df = train_test_split(
        df,
        test_size=0.2,
        random_state=42,
        stratify=df['user_id']
    )

    # Surprise expects exactly three columns in (user, item, rating) order.
    reader = Reader(rating_scale=(0, 1))
    trainset = Dataset.load_from_df(
        train_df[['user_hour', 'item_id', 'rating']],
        reader
    ).build_full_trainset()

    # Report rating statistics (computed over the full frame, not just train).
    print("\nRating Statistics:")
    print(f"Average rating: {df['rating'].mean():.3f}")
    print(f"Rating standard deviation: {df['rating'].std():.3f}")
    print("\nCorrelations with rating:")
    print(f"Frequency correlation: {df['norm_frequency'].corr(df['rating']):.3f}")
    print(f"VFM correlation: {df['norm_vfm'].corr(df['rating']):.3f}")

    return trainset, train_df, test_df

In [34]:
def train_model(trainset, n_factors=50, n_epochs=30, lr_all=0.1, reg_all=0.05,
                random_state=42):
    """
    Train a Surprise SVD model on the prepared trainset.

    Parameters
    ----------
    trainset : surprise.Trainset
        Training interactions, as returned by ``prepare_data``.
    n_factors : int, default 50
        Number of latent factors (Surprise's own default is 100).
    n_epochs : int, default 30
        Number of SGD epochs (Surprise's own default is 20).
    lr_all : float, default 0.1
        Learning rate for all parameters. NOTE(review): this is far above
        Surprise's default of 0.005, not "slightly lower" as an earlier
        comment claimed; confirm it is intentional for the 0-1 rating scale.
    reg_all : float, default 0.05
        Regularization term for all parameters (Surprise's default is 0.02).
    random_state : int or None, default 42
        Seed for the factor initialisation so repeated runs give the same
        model. Pass ``None`` for non-deterministic initialisation.

    Returns
    -------
    surprise.SVD
        The fitted model.
    """
    model = SVD(
        n_factors=n_factors,
        n_epochs=n_epochs,
        lr_all=lr_all,
        reg_all=reg_all,
        random_state=random_state,
    )

    # Train the model
    model.fit(trainset)
    return model

def get_recommendations(model, train_df, user_id, hour, n_items=6):
    """Return the raw ids of the ``n_items`` best-scored items for a user at an hour.

    Every distinct ``item_id`` in ``train_df`` is scored with
    ``model.predict`` for the raw user key ``"<user_id>_<hour>"``; the items
    are then ordered by estimated rating, highest first. Items with equal
    estimates keep their order of first appearance in ``train_df``.
    """
    raw_user_key = "{}_{}".format(user_id, hour)

    # Score each candidate item once.
    scored = []
    for candidate in train_df['item_id'].unique():
        scored.append(model.predict(raw_user_key, candidate))

    # Stable in-place sort, best estimate first.
    scored.sort(key=lambda prediction: prediction.est, reverse=True)

    best = scored[:n_items]
    return [prediction.iid for prediction in best]

In [35]:
# Load the purchase log, then derive ratings, the train/test split and the
# Surprise trainset. prepare_data also prints the rating statistics below.
df = load_data()
trainset, train_df, test_df = prepare_data(df)


Rating Statistics:
Average rating: 0.065
Rating standard deviation: 0.057

Correlations with rating:
Frequency correlation: 0.359
VFM correlation: 0.858


In [36]:
# Fit the SVD model on the training interactions built above.
model = train_model(trainset)

In [37]:
%%time
# Hit rate @6: the share of sampled test purchases whose item appears in the
# top-6 recommendations for that purchase's (user, hour).
sample_size = 200
n_rounds = 2     # independent sampling rounds to average over
eval_seed = 42   # fixed so the reported hit rate is reproducible on re-run
hit_rate_list = []
for i in range(n_rounds):
    hit_rate = 0
    # A different seed per round keeps the rounds distinct while the whole
    # evaluation stays deterministic across kernel restarts.
    eval_rows = test_df.sample(sample_size, random_state=eval_seed + i)
    # Only three columns are needed; itertuples avoids building a full
    # Series per row as iterrows does.
    for user_id, hour, actual_item in eval_rows[['user_id', 'hour', 'item_id']].itertuples(index=False, name=None):
        predictions = get_recommendations(model, train_df, user_id, hour, 6)
        if actual_item in predictions:
            hit_rate += 1
    hit_rate_list.append(hit_rate / sample_size)
print('Avg Hit Rate:', np.mean(hit_rate_list))

Avg Hit Rate: 0.0075
CPU times: total: 55.4 s
Wall time: 1min 58s
