In [39]:
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k

np.random.seed(42)

In [3]:
ratings = pd.read_csv("../ml-100k/u.data", sep='\t',
                   header=None, names='user_id | item_id | rating | timestamp'.split(' | '))
ratings['date'] = pd.to_datetime(ratings['timestamp'], unit='s').min()
ratings.drop('timestamp', axis=1, inplace=True)
ratings.sort_values('date', inplace=True)
ratings.head()

Unnamed: 0,user_id,item_id,rating,date
0,196,242,3,1997-09-20 03:05:10
66671,334,129,4,1997-09-20 03:05:10
66670,806,1074,3,1997-09-20 03:05:10
66669,606,585,4,1997-09-20 03:05:10
66668,788,227,3,1997-09-20 03:05:10


In [4]:
#number_of_items = ratings.shape[0]
#index_to_train = int(round(number_of_items * 0.2, 0))
#ratings.iloc[:-index_to_train].drop('date', axis=1).to_csv('train_ratings.csv', sep='\t', index=False)
#ratings.iloc[-index_to_train:].drop('date', axis=1).to_csv('test_ratings.csv', sep='\t', index=False)

train_ratings = pd.read_csv('train_ratings.csv', sep='\t')
test_ratings = pd.read_csv('test_ratings.csv', sep='\t')

In [5]:
item_features_names = """movie id | movie title | release date | video release date | IMDb URL | unknown | Action | Adventure | Animation | Children's | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western""".split(' | ')

item_features = pd.read_csv("../ml-100k/u.item", sep='|',
                             encoding='latin-1', names=item_features_names)

item_features['release date'] = pd.to_datetime(item_features['release date'])
most_frequent_date = item_features['release date'].value_counts().idxmax()
item_features['release date'].fillna(most_frequent_date, inplace=True)

item_features['release_year'] = item_features['release date'].dt.year
item_features['release_month'] = item_features['release date'].dt.month
item_features['release_day'] = item_features['release date'].dt.day

item_features = item_features.drop(['release date', 'video release date', 'movie title', 'IMDb URL'], axis=1)
item_features = item_features.rename(columns={'movie id': 'item_id'})

item_features_names = item_features.columns
item_features.head()

Unnamed: 0,item_id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release_year,release_month,release_day
0,1,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,1995,1,1
1,2,0,1,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1995,1,1
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1995,1,1
3,4,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,1995,1,1
4,5,0,0,0,0,0,0,1,0,1,...,0,0,0,0,1,0,0,1995,1,1


In [24]:
user_features_names = 'user_id | age | gender | occupation | zip_code'.split(' | ')
user_features = pd.read_csv("../ml-100k/u.user", sep='|',
                             encoding='latin-1', names=user_features_names)

user_features['age_bin'] = pd.qcut(user_features['age'], 10, labels=False)
user_features.drop('age', axis=1, inplace=True)
user_features.head()

Unnamed: 0,user_id,gender,occupation,zip_code,age_bin
0,1,M,technician,85711,2
1,2,F,other,94043,9
2,3,M,writer,32067,1
3,4,M,technician,43537,2
4,5,F,other,15213,5


In [28]:
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from sklearn.model_selection import train_test_split


def get_item_feature_values(item_features):
    GENRES = item_features.columns[1:-3]
    genres = GENRES.tolist()
    years = ['release_year:' + str(year) for year in item_features['release_year'].unique()]
    months = ['release_month:' + str(month) for month in item_features['release_month'].unique()]
    days = ['release_day:' + str(day) for day in item_features['release_day'].unique()]
    
    return genres + years + months + days


def get_user_feature_values(user_features):
    genders = user_features['gender'].unique().tolist()
    occupations = user_features['occupation'].unique().tolist()
    zip_codes = ['zip_code:' + str(zip_code) for zip_code in user_features['zip_code'].unique()]
    age_bins = ['age_bin:' + str(age_bin) for age_bin in user_features['age_bin'].unique()]
    
    return genders + occupations + zip_codes + age_bins


def collect_item_feature_values(item_features):
    res = []
    GENRES = item_features.columns[1:-3]
    for _, row in item_features.iterrows():
        feature_values = []
        
        feature_values += GENRES[row[GENRES] == 1].tolist()
        
        feature_values.append('release_year:' + str(row['release_year']))
        feature_values.append('release_month:' + str(row['release_month']))
        feature_values.append('release_day:' + str(row['release_day']))
        
        res.append(feature_values)
        
    return zip(item_features['item_id'], res)


def collect_user_feature_values(user_features):
    res = []
    
    for _, row in user_features.iterrows():
        feature_values = []
        
        feature_values.append(row['gender'])
        feature_values.append(row['occupation'])
        feature_values.append('zip_code:' + row['zip_code'])
        feature_values.append('age_bin:' + str(row['age_bin']))
        
        res.append(feature_values)
        
    return zip(user_features['user_id'], res)


def build_dataset(ratings, user_features, item_features):
    user_feature_values = get_user_feature_values(user_features)
    item_feature_values = get_item_feature_values(item_features)
    
    dataset = Dataset()
    dataset.fit(
        users=ratings['user_id'].unique(),
        items=ratings['item_id'].unique(),
        user_features=user_feature_values,
        item_features=item_feature_values
    )
    
    dataset.fit_partial(items=item_features['item_id'])
    dataset.fit_partial(users=user_features['user_id'])
    
    item_features_b = dataset.build_item_features(collect_item_feature_values(item_features))
    user_features_b = dataset.build_user_features(collect_user_feature_values(user_features))
    
    return dataset, item_features_b, user_features_b


def prepare_data_for_lightfm(train_ratings, test_ratings, user_features, item_features):

    ratings = pd.concat([train_ratings, test_ratings])
    
    dataset, item_features_b, user_features_b = build_dataset(ratings, user_features, item_features)
    
    train, train_weights = dataset.build_interactions(
        (user_id, item_id, rating) for _, (user_id, item_id, rating) in train_ratings.iterrows()
    )
        
    test, test_weights = dataset.build_interactions(
        (user_id, item_id, rating) for _, (user_id, item_id, rating) in test_ratings.iterrows()
    )
    
    return train, test, train_weights, test_weights, item_features_b, user_features_b

In [31]:
(train, test,
train_weights, test_weights,
item_features_b, user_features_b) = prepare_data_for_lightfm(train_ratings, test_ratings, user_features, item_features)

In [40]:
def test_lightfm_no_components(no_components_range=[ 10, 20, 30, 40, 50, 100], epochs = [50, 80, 100]):
    train, test, train_weights, test_weights, item_features_b, user_features_b = prepare_data_for_lightfm(train_ratings, test_ratings, user_features, item_features)
    results = list()
    for epoch in epochs:
        print('no_epochs = ',epoch)
        for no_components in no_components_range:
            print('no_components = ', no_components)
            model = LightFM(learning_rate=0.01, loss='warp', no_components=no_components)
            model.fit(train,
                      user_features=user_features_b, 
                      item_features=item_features_b,
                      sample_weight=train_weights,
                      epochs=epoch)
        
            train_precision = precision_at_k(model, train, user_features=user_features_b, item_features=item_features_b, k=20).mean()
            test_precision = precision_at_k(model, test, train_interactions=train, user_features=user_features_b, item_features=item_features_b, k=20).mean()

            train_recall = recall_at_k(model, train, user_features=user_features_b, item_features=item_features_b, k=20).mean()
            test_recall = recall_at_k(model, test, train_interactions=train, user_features=user_features_b, item_features=item_features_b, k=20).mean()
            
            answers = [epoch, no_components, train_precision, test_precision, train_recall, test_recall]
            results.append(answers)
    return results

In [44]:
data = test_lightfm_no_components()

no_epochs =  50
no_components =  10
no_components =  20
no_components =  30
no_components =  40
no_components =  50
no_components =  100
no_epochs =  80
no_components =  10
no_components =  20
no_components =  30
no_components =  40
no_components =  50
no_components =  100
no_epochs =  100
no_components =  10
no_components =  20
no_components =  30
no_components =  40
no_components =  50
no_components =  100


In [46]:
columns = ['no_epochs', 'no_components', 'train_precision', 'test_precision', 'train_recall', 'test_recall']

In [48]:
pd.DataFrame(data, columns=columns).to_csv('lightFM_test.csv', index=False)