In [4]:
# IMPORTS
from itertools import chain, zip_longest
from datetime import datetime
import pandas as pd
import numpy as np
import gc
import random
import os
import json
import tensorflow as tf

# Run a garbage-collection pass, then record the reference time used by the
# elapsed-time prefix of the log line below.
gc.collect()
start = datetime.now()
print('{} - Initialized environment'.format(datetime.now() - start))

0:00:00.000075 - Initialized environment


## Quick Guide
Here we will train a Factorized Personalized Markov Chains (FPMC) model on the MovieLens 1M dataset.

To start, we assume that the data files (ratings.dat, etc.) have been extracted to the `datasets/ml-1m` folder.

The following block will process the raw data file to make it into a pandas DataFrame.

In [2]:
def get_interactions(filename):
    """Load a ``::``-delimited MovieLens ratings file into a DataFrame.

    Each line has the form ``user_id::item_id::rating::timestamp``. The C
    parser only accepts single-character separators, so the file is split on
    a single ``:`` instead; every ``::`` then produces one empty filler field
    between two real fields. The fillers get throwaway names and are excluded
    at parse time through ``usecols``, so they are never materialized.

    Parameters
    ----------
    filename : str or file-like
        Location of the ratings file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per line, with columns ``user_id``, ``item_id``, ``rating``
        and ``timestamp`` (in that order).
    """
    columns = ['user_id', 'item_id', 'rating', 'timestamp']
    # One filler name per '::' gap between real columns: '0', '1', '2'.
    interleave = [str(i) for i in range(len(columns) - 1)]
    # Alternate real and filler names: user_id, '0', item_id, '1', ...
    read_names = [x for x in chain(*zip_longest(columns, interleave)) if x is not None]
    # Name every parsed field, but keep only the real columns.
    return pd.read_csv(filename, sep=':', header=None, names=read_names, usecols=columns)

# Raw ratings file and the serialized cache of the parsed interactions.
movie_dir = os.path.join('datasets', 'ml-1m')
ratings_file = os.path.join(movie_dir, 'ratings.dat')
interactions_loc = os.path.join(movie_dir, 'interactions.msgpack')

start = datetime.now()
try:
    # Fast path: reuse the cached frame.
    df = pd.read_msgpack(interactions_loc)
except Exception as e:
    # Any failure to read the cache falls back to re-parsing the raw ratings
    # file, after which the cache is (re)written for the next run.
    print('Error unpickling {}, reconstructing from ratings.dat: {}'.format(interactions_loc, e))
    df = get_interactions(ratings_file)
    print('{} - Processed interactions from ratings.dat'.format(datetime.now() - start))
    df.to_msgpack(interactions_loc)
    print('{} - Serialized interactions to {}'.format(datetime.now() - start, interactions_loc))
else:
    print('{} - Retrieved interactions df.'.format(datetime.now() - start))

Error unpickling datasets/ml-1m/interactions.msgpack, reconstructing from ratings.dat: path_or_buf needs to be a string file path or file-like
0:00:00.734927 - Processed interactions from ratings.dat
0:00:00.844579 - Serialized interactions to datasets/ml-1m/interactions.msgpack


In [3]:
# Preview the first five interaction rows.
df.head(n=5)

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## Train/Test/Validation Splits
Perform the following operations to prepare our MovieLens interactions data for the recommender system model:
- K-core (preserve only users and items with more than 5 interactions)
- Create temporal columns
- Map user/item IDs to contiguous integer series

In [4]:
'''
INITIALIZE data and train/validation/test splits
'''
from recsys_models.data import process_temporal_columns, kcore, map_user_items, print_basic_stats
from recsys_models.data.sampling import train_test_validation_split, get_user_interactions_df

user_col = 'user_id'
item_col = 'item_id'

# Get temporal columns.
# The epoch-second timestamps are converted in one vectorized call; like
# datetime.utcfromtimestamp, unit='s' yields timezone-naive UTC datetimes, but
# without a Python-level call per row.
df['date'] = pd.to_datetime(df['timestamp'], unit='s')
df = process_temporal_columns(df)

# K-core
cores = 5
df = kcore(df, user_col, item_col, cores)

# User/item ix -> id mappings
df = map_user_items(df, user_col, item_col)

# Get stats
gc.collect()
print_basic_stats(df, user_col, item_col)
n_users = df[user_col].nunique()
n_items = df[item_col].nunique()

# Create train, validation, and test DFs by holding out the latest interaction per user for test and second-to-last for validation
start = datetime.now()
# NOTE(review): eval_size is passed straight to train_test_validation_split;
# presumably it caps the size of the evaluation sets - confirm against that function.
eval_size = 2000000
train_df, validation_df, test_df, all_int_by_user_df = train_test_validation_split(df, eval_size)
# Mapping of user index 'u' -> set of item indices 'i' seen in the training split.
train_items_by_user = train_df.groupby(['u'])['i'].agg(lambda x: set(x)).to_dict()
print('{} - Generated train/validation/test splits and user : items dictionary mappings'.format(
    datetime.now() - start
))

0:00:01.240343 - Added proper temporal columns to df
Removing 0/6040 users (0.00 %) and 290/3706 items (7.83 %) from 1000209 total interactions (95.53164% Sparsity)
Removing 0/6040 users (0.00 %) and 0/3416 items (0.00 %) from 999611 total interactions (95.15520% Sparsity)
0:00:01.146771 - Done: 5-core decomposition after 2 iterations
0:00:01.207438 - Mapped u-i indices
0:00:01.810072 - Created "prior" column
6040 Users interacted with 3416 items 993571 times (95.1845% sparsity, 164.499 actions/user, 290.858 actions/item)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 993571 entries, 289422 to 856074
Data columns (total 12 columns):
user_id        993571 non-null int64
item_id        993571 non-null int64
rating         993571 non-null int64
timestamp      993571 non-null int64
date           993571 non-null datetime64[ns]
year           993571 non-null int64
month          993571 non-null int64
day_of_week    993571 non-null int64
day_of_year    993571 non-null int64
u            

In [5]:
# Show the first three rows of each split, in train / validation / test order.
display(train_df.head(3), validation_df.head(3), test_df.head(3))

Unnamed: 0,user_id,item_id,rating,timestamp,date,year,month,day_of_week,day_of_year,u,i,prior
289422,6040,2384,4,956703954,2000-04-25 23:05:54,2000,4,1,115,6039,323,669
138498,6040,593,5,956703954,2000-04-25 23:05:54,2000,4,1,115,6039,128,323
45032,6040,1961,4,956703977,2000-04-25 23:06:17,2000,4,1,115,6039,41,128


Unnamed: 0,user_id,item_id,rating,timestamp,date,year,month,day_of_week,day_of_year,u,i,prior,j
0,1875,1416,3,975769669,2000-12-02 15:07:49,2000,12,5,336,1874,2138,1644,337
1,1455,3471,3,978990819,2001-01-08 21:53:39,2001,1,0,8,1454,96,209,69
2,4670,2565,3,963817553,2000-07-17 07:05:53,2000,7,0,198,4669,1531,642,56


Unnamed: 0,user_id,item_id,rating,timestamp,date,year,month,day_of_week,day_of_year,u,i,prior,j
0,4585,1073,4,1005533993,2001-11-12 02:59:53,2001,11,0,316,4584,741,1297,1336
1,5692,2491,5,958678637,2000-05-18 19:37:17,2000,5,3,138,5691,1750,225,202
2,5879,3300,2,957546719,2000-05-05 17:11:59,2000,5,4,125,5878,993,188,1890


## Training Models
Now we can initialize our model with a few parameters, train an FPMC model, and compare it to the PopRec baseline (pick the most popular item as seen in the training set):

In [6]:
'''
PopRec Baseline - Pick the more popular item based on training interactions
'''
from recsys_models.models import pop_rec
from recsys_models.data.sampling import sample_unobserved

start = datetime.now()

# Training AUC is measured on a sample drawn from the training interactions,
# with as many rows requested as the test split has.
pop_auc_tr = pop_rec(
    train_df,
    sample_unobserved(train_df, train_items_by_user, n_items, len(test_df)),
)
# Held-out splits are scored in validation-then-test order.
pop_auc_v, pop_auc_t = (
    pop_rec(train_df, held_out) for held_out in (validation_df, test_df)
)

print('{} - PopRec:\nTraining AUC:\t\t{:.5f}\nValidation AUC:\t\t{:.5f}\nTesting AUC:\t\t{:.5f}'.format(
    datetime.now() - start, pop_auc_tr, pop_auc_v, pop_auc_t
))

0:00:15.724033 - PopRec:
Training AUC:		0.84727
Validation AUC:		0.80068
Testing AUC:		0.79472


In [11]:
'''
Run FPMC
'''
from recsys_models.models.fpmc import FPMC
from recsys_models.pipeline import train_model

# Set training parameters
max_epochs = 200
n_iterations = 1000
batch_size = 512
stopping_threshold = 1e-5

# Get the validation and testing matrices
start = datetime.now()
validation_data = validation_df[['u', 'prior', 'i', 'j']].values
test_data = test_df[['u', 'prior', 'i', 'j']].values
print('{} - Generated u-i-j matrices for validation and testing'.format(
    datetime.now() - start
))

# Initialize the graph
tf.reset_default_graph()
model = FPMC(n_users, n_items, k=5, lambda_emb=1e-4, lambda_bias=1e-4,
             opt_type=tf.contrib.opt.LazyAdamOptimizer, opt_args={'learning_rate': 0.007})
print('\n=== BEGIN Optimization for {} ==='.format(model.model_id))
print('    {} Max epochs, with early stoppage at {} Validation AUC change'.format(max_epochs, stopping_threshold))
print('    {} Iterations per epoch with {}-sized batches'.format(n_iterations, batch_size))

suffix = '_ml-1m'

# Open the session as a context manager so it is closed even if training or
# saving raises part-way through.
with tf.Session() as session:
    # Initialize graph weights
    session.run(tf.global_variables_initializer())

    # Train the model!
    model, train_auc, validation_auc, test_auc = train_model(
        session, model, train_df, validation_data, test_data,
        n_iterations=n_iterations, batch_size=batch_size,
        min_epochs=10, max_epochs=max_epochs,
        stopping_threshold=stopping_threshold,
        sample_columns=['u', 'prior', 'i'], column_order=['u', 'prior', 'i', 'j'],
        n_items=n_items, items_by_user=train_items_by_user
    )

    # Save model. model_folder is also read by the later cell that reloads
    # the trained weights.
    full_model_id = '{}{}'.format(model.model_id, suffix)
    model_folder = os.path.join('tf_models', full_model_id)
    os.makedirs(model_folder, exist_ok=True)
    model.save(session, 'tf_models', suffix=suffix)
    print('{} - Saved model to {}'.format(
        datetime.now() - start, model_folder
    ))

# Cleanup (the session has already been closed by the with-block)
gc.collect()
print()

0:00:00.209102 - Generated u-i-j matrices for validation and testing

=== BEGIN Optimization for fpmc_5k_0.0001l2_0.0001l2bias ===
    200 Max epochs, with early stoppage at 1e-05 Validation AUC change
    1000 Iterations per epoch with 512-sized batches
0:00:01.132435 - Prior: 0.50019 Validation AUC, 0.50303 Testing AUC
[0:00:05.196760 - Epoch 1] 0.57655 Loss, 0.86135 Training AUC, 0.80213 Validation AUC (0.30194 Change)
[0:00:09.023570 - Epoch 2] 0.43467 Loss, 0.88797 Training AUC, 0.83548 Validation AUC (0.03335 Change)
[0:00:12.296919 - Epoch 3] 0.40422 Loss, 0.90188 Training AUC, 0.86033 Validation AUC (0.02485 Change)
[0:00:15.598650 - Epoch 4] 0.39443 Loss, 0.90944 Training AUC, 0.87093 Validation AUC (0.01059 Change)
[0:00:18.901762 - Epoch 5] 0.39030 Loss, 0.91366 Training AUC, 0.87654 Validation AUC (0.00561 Change)
[0:00:22.341142 - Epoch 6] 0.38729 Loss, 0.91787 Training AUC, 0.87962 Validation AUC (0.00308 Change)
[0:00:25.629051 - Epoch 7] 0.38557 Loss, 0.92047 Training A

## Loading Weights from Pretrained Model
We can load weights from another model and initialize the weight matrices.

We can thus evaluate existing models using the RecSysModels framework:

In [6]:
'''
Retrieve pretrained weights and evaluate with model
'''
# This cell relies on FPMC, model_folder and test_data, all of which are
# defined by the FPMC training cell; run that cell first in a fresh kernel.
start = datetime.now()

# Initialize TF session
tf.reset_default_graph()

# The session is a context manager so it is closed even when loading or
# evaluation fails (e.g. a NameError because model_folder is not defined).
with tf.Session() as session:
    # Retrieve the model we just trained
    model2 = FPMC.load(model_folder)
    # NOTE(review): the initializer runs after load(); confirm that this does
    # not overwrite the loaded weights with fresh initial values.
    session.run(tf.global_variables_initializer())

    # Evaluate on the test data:
    test_auc_2 = model2.evaluate_auc(session, test_data)
    print('{} - Pretrained model from {} evaluated on test data, with AUC: {:.5f}'.format(
        datetime.now() - start,
        model_folder,
        test_auc_2
    ))

# Cleanup (the session has already been closed by the with-block)
gc.collect()
print()

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



NameError: name 'model_folder' is not defined