In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer

In [None]:
# Dataset switches: the MovieLens and LastFM preprocessing cells each run only
# when the corresponding flag is True; set a flag to False to skip that dataset.
MOVIELENS = True
LASTFM = True

In [None]:
if MOVIELENS:
    # --- Load ---------------------------------------------------------------
    # The column names are supplied explicitly with header=None: the MovieLens-1M
    # ratings file ships without a header row, and letting read_csv infer one
    # would silently consume the first rating as column labels.
    # The multi-character '::' separator requires the python parsing engine.
    # NOTE(review): this input path is absolute ('/data/...') while the outputs
    # below are written relative to the working directory ('data/...') --
    # confirm that this is intentional.
    df = pd.read_csv(
        '/data/ml-1m/ratings.dat',
        sep='::',
        engine='python',
        header=None,
        names=['user_id', 'item_id', 'rating', 'timestamp'],
    )

    # --- Explicit -> implicit feedback --------------------------------------
    # Ratings greater than or equal to `threshold` are kept as positive
    # interactions (rating = 1); everything else is discarded.
    # `.assign` returns a new frame, so nothing is written into a filtered view.
    threshold = 3
    df = df[df['rating'] >= threshold].assign(rating=1)

    # --- Leave-one-out split ------------------------------------------------
    # Hold out, for every user, the interaction with the largest timestamp.
    # idxmax yields one row label per user (the first such row when several
    # share the maximum timestamp); sort=False keeps users in order of first
    # appearance.
    held_out = df.groupby('user_id', sort=False)['timestamp'].idxmax().to_numpy()
    test_df = df.loc[held_out]
    # Remove the held-out rows by index label rather than by value, so rows
    # that merely look identical are never dropped from the training set.
    train_df = df.drop(index=held_out)

    # Drop test users that have no remaining interaction in the training set.
    test_df = test_df[test_df['user_id'].isin(train_df['user_id'])]

    # --- Save splits --------------------------------------------------------
    columns = ['user_id', 'item_id', 'rating', 'timestamp']
    train_df[columns].to_csv('data/ml-1m/implicit_train_df.csv', index=False)
    test_df[columns].to_csv('data/ml-1m/implicit_test_df.csv', index=False)

In [None]:
if LASTFM:
    # --- Load ---------------------------------------------------------------
    # Tab-separated listening log without a header row; malformed lines are
    # skipped instead of aborting the whole read.
    df = pd.read_csv(
        'data/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv',
        sep='\t',
        header=None,
        names=['user_id', 'timestamp', 'artist_id', 'artist_name', 'item_id', 'item_name'],
        on_bad_lines='skip',
    )

    # Discard rows with any missing field, keep only the columns needed below
    # and parse the timestamps.  `.assign` builds a new frame, so no value is
    # written into a column-subset view.
    df = (
        df.dropna()[['user_id', 'item_id', 'timestamp']]
          .assign(timestamp=lambda plays: pd.to_datetime(plays['timestamp']))
    )

    # 1. retain only the initial occurrence of each (user, item) interaction,
    #    i.e. the earliest timestamp of every pair
    unique_interactions = df.groupby(['user_id', 'item_id']).min().reset_index()

    # 2. keep the 3000 most frequently occurring songs
    #    NOTE: frequency is counted on `df`, i.e. over all play events
    #    (repeat listens included), not over the de-duplicated interactions.
    top_k = 3000
    itemid_to_save = df.groupby('item_id').count().sort_values('user_id', ascending=False)[:top_k].index
    df = unique_interactions[unique_interactions['item_id'].isin(itemid_to_save)]

    # 3. keep users with at least `min_interactions` remaining interactions
    min_interactions = 20
    interactions_per_user = df.groupby('user_id')['item_id'].count()
    userid_to_save = interactions_per_user[interactions_per_user >= min_interactions].index
    df = df[df['user_id'].isin(userid_to_save)]

    # --- Re-index ids -------------------------------------------------------
    # Map item_id and user_id to consecutive integers in order of first
    # appearance.  pd.factorize produces exactly the codes that
    # enumerate(unique()) would, without the very slow dict-based replace and
    # without mutating a filtered slice in place.
    item_codes, item_labels = pd.factorize(df['item_id'])
    user_codes, user_labels = pd.factorize(df['user_id'])
    map_dict_items = {item: index for index, item in enumerate(item_labels)}
    map_dict_users = {user: index for index, user in enumerate(user_labels)}
    df = df.assign(item_id=item_codes, user_id=user_codes).reset_index(drop=True)

    # --- Leave-one-out split ------------------------------------------------
    # Hold out, for every user, the interaction with the largest timestamp.
    # idxmax yields one row label per user (the first such row when several
    # share the maximum timestamp); sort=False keeps users in order of first
    # appearance.
    held_out = df.groupby('user_id', sort=False)['timestamp'].idxmax().to_numpy()
    test_df = df.loc[held_out]
    # Remove the held-out rows by index label (unique after reset_index).
    train_df = df.drop(index=held_out)

    # --- Save splits --------------------------------------------------------
    columns = ['user_id', 'item_id', 'timestamp']
    train_df[columns].to_csv('data/lastfm-dataset-1K/implicit_train_df.csv', index=False)
    test_df[columns].to_csv('data/lastfm-dataset-1K/implicit_test_df.csv', index=False)