In [19]:
import pandas as pd
from surprise import Dataset, Reader, SVD
# from surprise.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from collections import defaultdict

In [20]:
def load_data():
    dataset_file = 'dataset_purchases.csv'
    df = pd.read_csv(dataset_file)
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['usd'] = df['usd'].round(2)
    df['hour'] = df['timestamp'].dt.hour
    df['week'] = df['timestamp'].dt.isocalendar().week
    return df

In [44]:
def prepare_data(df):
    # Create item_id from price-coins combination
    df['item_id'] = df['usd'].astype(str) + '_' + df['coins'].astype(str)
    df['user_hour'] = df['user_id'].astype(str) + '_' + df['hour'].astype(str)
    df['rating'] = 1
    
    # First split the DataFrame with stratification
    train_df, test_df = train_test_split(
        df,
        test_size=0.2,
        random_state=42,
        stratify=df['user_id']
    )
    
    # Then create Surprise datasets
    reader = Reader(rating_scale=(0, 1))
    trainset = Dataset.load_from_df(train_df[['user_hour', 'item_id', 'rating']], reader).build_full_trainset()
    # Keep test_df as DataFrame instead of converting to Surprise format
    
    return trainset, train_df, test_df

def train_model(trainset):
    model = SVD(n_factors=30)
    model.fit(trainset)
    return model

def get_recommendations(model, train_df, user_id, hour, n_items=6):
    user_hour = f"{user_id}_{hour}"
    items = train_df['item_id'].unique()
    predictions = [model.predict(user_hour, item_id) for item_id in items]
    top_items = sorted(predictions, key=lambda x: x.est, reverse=True)[:n_items]
    return [pred.iid for pred in top_items]

In [49]:
def evaluate_recommendations(model, train_df, test_df, n_items=6):
    hits = 0
    total = 0
    
    for user_id in test_df['user_id'].unique():
        for hour in test_df[test_df['user_id'] == user_id]['hour'].unique():
            # Get actual items bought by user at this hour
            actual_items = test_df[
                (test_df['user_id'] == user_id) & 
                (test_df['hour'] == hour)
            ]['item_id'].values
            
            # Get recommendations
            predictions = get_recommendations(model, train_df, user_id, hour, n_items)
            
            # Check if any actual item is in predictions
            hits += any(item in predictions for item in actual_items)
            total += 1
    
    hit_rate = hits / total if total > 0 else 0
    print(f"Hit Rate @ {n_items}: {hit_rate:.3f}")
    return hit_rate

In [46]:
df = load_data()
trainset, train_df, test_df = prepare_data(df)

In [47]:
model = train_model(trainset)

In [51]:
test_df

Unnamed: 0,user_id,timestamp,usd,coins,VFM,hour,week,item_id,user_hour,rating
1525029,12644735767,2024-02-06 21:29:59+00:00,4.99,500,100.20,21,6,4.99_500,12644735767_21,1
542191,10363295563,2024-02-13 00:07:44+00:00,4.99,575,115.23,0,7,4.99_575,10363295563_0,1
47308,11200437351,2024-02-05 19:48:19+00:00,34.99,3500,100.03,19,6,34.99_3500,11200437351_19,1
883422,14173101663,2024-01-31 00:34:49+00:00,0.99,100,101.01,0,5,0.99_100,14173101663_0,1
481335,14073898879,2024-01-21 18:26:18+00:00,1.99,200,100.50,18,3,1.99_200,14073898879_18,1
...,...,...,...,...,...,...,...,...,...,...
946784,10970021727,2024-01-25 11:01:33+00:00,0.99,100,101.01,11,4,0.99_100,10970021727_11,1
891813,13957047303,2024-01-29 19:06:10+00:00,0.99,100,101.01,19,5,0.99_100,13957047303_19,1
1649427,12965882919,2024-01-15 02:38:08+00:00,9.99,1500,150.15,2,3,9.99_1500,12965882919_2,1
357393,12072550119,2024-02-03 19:44:44+00:00,1.99,200,100.50,19,5,1.99_200,12072550119_19,1


In [None]:
for index, row  in test_df.iterrows():
    print(index)

1525029
542191
47308
883422
481335
840451
295341
1092049
793128
1586114
1322240
10372
605580
732009
989177
1345292
238514
142035
435844
702456
204213
997
1081659
321980
819390
1194053
696958
1597059
1621927
942600
1365520
840320
1432399
219013
1274363
606919
1095043
1405698
1144469
1467388
1606485
434528
1497355
231917
1191477
1657055
341220
1665758
19908
22337
69062
1010428
475736
1688164
88571
22270
907893
744265
901012
1570074
704530
134711
320128
1214782
1343606
658597
27343
1365494
660699
247237
1504381
1724389
1539089
950142
1298229
344584
625030
1180473
1672470
1016219
1040991
1366652
1726982
1083471
1669927
1506626
846460
1007071
337346
107887
1735786
663172
344312
18297
449641
1479472
1334343
740943
709584
866515
1382098
783985
1695289
1635657
503661
951626
155826
836637
173786
676121
1228835
1047698
23031
418763
1740186
535539
1540779
576622
1116840
428503
235928
585676
988962
1394538
483557
1202860
938203
851670
1598945
805178
295427
121735
1199241
911397
51951
1237985
83429

In [50]:
# Evaluate
hit_rate = evaluate_recommendations(model, train_df, test_df, n_items=6)

KeyboardInterrupt: 

In [27]:
user_id = '10035998487 '
hour = 14
user_hour = f"{user_id}_{hour}"
predictions = get_recommendations(model, df, user_id, hour)

In [28]:
predictions

['4.99_775', '1.99_290', '4.99_875', '4.99_850', '1.99_360', '249.99_42000']