In [None]:
# code for training a movie recommendation model with wikidata item features

# 1. train and eval a baseline model
# 2. train/eval a model with grouplens user and item features
# 3. train/eval a model with additonal features from wikidata
# 4. plot model performance and compare


In [124]:
from typing import Dict, List

from more_itertools import flatten
from itertools import chain

from tqdm import tqdm
import pandas as pd
import numpy as np
import altair as alt

tqdm.pandas()

from lightfm import LightFM
from lightfm.data import Dataset
from lightfm import cross_validation

# lightfm's evaluation metrics
from lightfm.evaluation import precision_at_k as lightfm_prec_at_k
from lightfm.evaluation import recall_at_k as lightfm_recall_at_k

from recommenders.datasets import movielens


In [50]:

MOVIELENS_DATA_SIZE = '100k'
K = 10 # default number of recommendations
SEED = 143

NO_COMPONENTS = 20
NO_THREADS = 32
NO_EPOCHS = 20
LEARNING_RATE = 0.25
# regularisation for both user and item features
ITEM_ALPHA = 1e-6
USER_ALPHA = 1e-6
TEST_PERCENTAGE = 0.25 # percentage of data used for testing

item_data_URL = "https://raw.githubusercontent.com/c-koster/movielens-wikidata/master/data/items_movielens_{size}.csv".format(
    size=MOVIELENS_DATA_SIZE
)

In [6]:
data = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    genres_col='genre',
    title_col='title',
    year_col='year',
    header=["userID", "itemID", "rating"]
)
# quick look at the data
data.sample(5, random_state=SEED)

100%|██████████| 4.81k/4.81k [00:00<00:00, 8.25kKB/s]


Unnamed: 0,userID,itemID,rating,title,genre,year
17015,148,174,5.0,Raiders of the Lost Ark (1981),Action|Adventure,1981
32255,144,68,2.0,"Crow, The (1994)",Action|Romance|Thriller,1994
93392,896,358,1.0,Spawn (1997),Action|Adventure|Sci-Fi|Thriller,1997
69003,693,131,3.0,Breakfast at Tiffany's (1961),Drama|Romance,1961
90986,798,610,3.0,Gigi (1958),Musical,1958


In [7]:
# 1. create a baseline model

dataset_baseline = Dataset()

dataset_baseline.fit(
    users=data['userID'], 
    items=data['itemID']
)

num_users, num_items = dataset_baseline.interactions_shape()
print(f'Num users: {num_users}, num_topics: {num_items}.')

Num users: 943, num_topics: 1682.


In [8]:
(interactions, weights) = dataset_baseline.build_interactions(data.iloc[:, 0:3].values)

train_interactions, test_interactions = cross_validation.random_train_test_split(
    # some splitters have logic to ensure that all users have training history. this one does not
    interactions, test_percentage=TEST_PERCENTAGE,
    random_state=np.random.RandomState(SEED)
)

print(f"Shape of train interactions: {train_interactions.shape}")
print(f"Shape of test interactions: {test_interactions.shape}")

Shape of train interactions: (943, 1682)
Shape of test interactions: (943, 1682)


In [10]:
m_baseline = LightFM(loss='warp', no_components=NO_COMPONENTS, 
                 learning_rate=LEARNING_RATE,                 
                 random_state=np.random.RandomState(SEED))

m_baseline.fit(interactions=train_interactions, epochs=NO_EPOCHS)

<lightfm.lightfm.LightFM at 0x12e55af90>

In [45]:
from typing import Dict

def score_model(model, test, train, k, model_name, user_features = None, item_features = None) -> Dict[str,float]:
    
    precision = lightfm_prec_at_k(model, test, train, k, user_features, item_features).mean()
    recall    = lightfm_recall_at_k(model, test, train, k, user_features, item_features).mean()
    return {
        "model_name" : model_name,
        "precision"  : precision,
        "recall"     : recall,
        "K"          : k
    }

results_baseline = score_model(m_baseline, test_interactions,train_interactions,K,"baseline")
    
print(
    f"Precision@K:\t{results_baseline['precision']:.3f}",
    f"Recall@K:\t{results_baseline['recall']:.3f}", 
    sep='\n'
)

Precision@K:	0.148
Recall@K:	0.045


In [20]:
# 2. model with grouplens features added

# we have zipcode, gender, occupation, and movie genre(s)
user_feature_URL = 'http://files.grouplens.org/datasets/movielens/ml-100k/u.user'
columns = ['userID','age','gender','occupation','zipcode']

user_data = pd.read_table(user_feature_URL, sep='|', header=None, names=columns)

user_data["zipcode"].nunique(), user_data["occupation"].nunique()

(795, 21)

In [None]:
# zipcode will likely not have enough collisions so just take the first
# (least descriptive) piece of a person's zipcode
user_data['region'] = user_data['zipcode'].apply(lambda s:s[0])
# user_data['county'] = user_data['zipcode'].apply(lambda s:s[0:3])

In [23]:
alt.Chart(user_data).mark_bar().encode( # type:ignore
    x="age:Q",
    y="count()"
)

In [27]:
# turn user age into a categorical variable
def apply_age_category(user_age: int):
    if user_age < 18: return "18-" 
    if user_age < 25: return "18-24"
    if user_age < 32: return "25-31"
    if user_age < 45: return "32-45"
    if user_age < 50: return "45-50"
    else:             return  "50+"

user_data['age_category'] = user_data["age"].apply(apply_age_category)

alt.Chart(user_data).mark_bar().encode( # type:ignore
    x="age:Q",
    y="count()",
    color="age_category:N"
)

In [30]:
# set up the lightfm model to work with added features. ths is mostly boilerplate

data_grouplens_features = data.merge(
    user_data[['userID','occupation','age_category','gender','region']],
    on='userID', how='inner'
)


all_occupations = sorted(user_data['occupation'].unique().tolist())
all_regions     = sorted(user_data['region'].unique().tolist())
all_age_cats    = sorted(user_data['age_category'].unique().tolist())
all_gender_cats = sorted(user_data['gender'].unique().tolist())

movie_genre = [x.split('|') for x in data_grouplens_features['genre']]
all_movie_genre = list(set(flatten(movie_genre)))

#
grouplens_user_features = all_occupations + all_regions + all_age_cats + all_gender_cats
grouplens_item_features = all_movie_genre


# make a dataset object
ds_grouplens = Dataset()
ds_grouplens.fit(
    data['userID'], 
    data['itemID'], 
    item_features=grouplens_item_features,
    user_features=grouplens_user_features
)

In [41]:
grouplens_user_features_iter = (
    (x, [y,z,a,b]) for x, y, z, a, b in zip(
        data_grouplens_features.userID, 
        data_grouplens_features.occupation, 
        data_grouplens_features.region, 
        data_grouplens_features.age_category, 
        data_grouplens_features.gender
    )
)

grouplens_item_features_iter = (
    (x, y) for x, y in zip( data_grouplens_features.itemID, movie_genre )
)


item_features = ds_grouplens.build_item_features(grouplens_item_features_iter)
user_features = ds_grouplens.build_user_features(grouplens_user_features_iter)

In [33]:
interactions_grouplens, weights_grouplens = ds_grouplens.build_interactions(data.iloc[:, 0:3].values)


train_interactions_grouplens, test_interactions_grouplens = cross_validation.random_train_test_split(
    interactions_grouplens, 
    test_percentage=TEST_PERCENTAGE,
    random_state=np.random.RandomState(SEED)
)

In [34]:
m_grouplens = LightFM(
    loss='warp', no_components=NO_COMPONENTS, 
    learning_rate=LEARNING_RATE, 
    item_alpha=ITEM_ALPHA,
    user_alpha=USER_ALPHA,
    random_state=np.random.RandomState(SEED)
)

In [43]:
m_grouplens.fit(
    interactions=train_interactions_grouplens,
    user_features=user_features,
    item_features=item_features,
    epochs=NO_EPOCHS
)

<lightfm.lightfm.LightFM at 0x10d3ea7d0>

In [47]:
results_grouplens = score_model(
    m_grouplens, test_interactions_grouplens, train_interactions_grouplens, K, "grouplens", user_features, item_features
)
    
print(
    f"Precision@K:\t{results_grouplens['precision']:.3f}",
    f"Recall@K:\t{results_grouplens['recall']:.3f}", 
    sep='\n'
)

# it does quite a bit better

Precision@K:	0.163
Recall@K:	0.062


In [128]:
# 3. train a model with grouplens features AND ones from my extracted wiki dataset

wiki_features = pd.read_csv(item_data_URL) # load from my url
wiki_features.pop("title") # title is already in the df

data_wiki = data_grouplens_features.merge(
    wiki_features,
    how="inner",
    on="itemID"
)

data_wiki[data_wiki.maxDuration.isna()].sample(5,random_state=SEED)

Unnamed: 0,userID,itemID,rating,title,genre,year,occupation,age_category,gender,region,wikiID,bechdelOutcomeLabel,makoMoriOutcomeLabel,mpaRatingLabel,tomatoScore,maxDuration,origin,academyAwardCount,academyNominationCount
98992,13,897,1.0,Time Tracers (1995),Action|Adventure|Sci-Fi,1995,educator,45-50,M,2,Q619942,,,,,,,0.0,0.0
67494,704,631,3.0,"Crying Game, The (1992)",Action|Drama|Romance|War,1992,librarian,50+,F,9,Q3986479,,,,,,,0.0,0.0
79873,497,163,2.0,"Return of the Pink Panther, The (1974)",Comedy,1974,student,18-24,M,5,Q15012089,,,,,,,0.0,0.0
26302,194,179,4.0,"Clockwork Orange, A (1971)",Sci-Fi,1971,administrator,32-45,M,0,Q692557,,,,,,United Kingdom,0.0,0.0
60447,758,441,3.0,"Amityville Horror, The (1979)",Horror,1979,student,25-31,M,5,Q2252749,,,,,,United States of America,0.0,0.0


In [139]:
# preprocessng to be done before I can train on these features:
# 1. there are some null categorical values. add an UNK tag or similar:
categorical_cols = ["bechdelOutcomeLabel","makoMoriOutcomeLabel","mpaRatingLabel","origin"]
# 2. there are some scalar values (with null values too) that I must convert to categorical variables too. these are:
scalar_cols = ["academyAwardCount", "academyNominationCount", "maxDuration", "tomatoScore"]

# 3. some categoricals will have the same name e.g. {passes,fails} and this will break lightFM
# need to transform them so that they have a distinct prefix. e.g. bechdel:passes



In [130]:
# fix nulls first: 
fill_mean_cols = ["maxDuration", "tomatoScore"]
data_wiki[fill_mean_cols] = data_wiki[fill_mean_cols].fillna(data_wiki[fill_mean_cols].mean())

fill_0_cols = ["academyAwardCount", "academyNominationCount"]
data_wiki[fill_0_cols] = data_wiki[fill_0_cols].fillna(0.0)

data_wiki[categorical_cols] = data_wiki[categorical_cols].fillna("UNK")

In [131]:
data_wiki.info() # fixed all null values with the exception of year which I am not usingn

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 19 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   userID                  100000 non-null  int64  
 1   itemID                  100000 non-null  int64  
 2   rating                  100000 non-null  float64
 3   title                   100000 non-null  object 
 4   genre                   100000 non-null  object 
 5   year                    99985 non-null   object 
 6   occupation              100000 non-null  object 
 7   age_category            100000 non-null  object 
 8   gender                  100000 non-null  object 
 9   region                  100000 non-null  object 
 10  wikiID                  100000 non-null  object 
 11  bechdelOutcomeLabel     100000 non-null  object 
 12  makoMoriOutcomeLabel    100000 non-null  object 
 13  mpaRatingLabel          100000 non-null  object 
 14  tomatoScore          

In [132]:
# convert all scalars to categorical values (like with age)

wiki_features[["academyAwardCount"]].value_counts(), wiki_features[["academyNominationCount"]].value_counts()

    

(academyAwardCount
 0.0                  1518
 1.0                    85
 2.0                    26
 3.0                    15
 4.0                     9
 5.0                     7
 7.0                     7
 8.0                     5
 6.0                     4
 11.0                    2
 9.0                     1
 dtype: int64,
 academyNominationCount
 0.0                       1312
 1.0                        136
 2.0                         57
 4.0                         38
 3.0                         37
 7.0                         17
 5.0                         15
 6.0                         15
 8.0                         14
 10.0                        11
 9.0                          8
 11.0                         8
 12.0                         7
 13.0                         2
 14.0                         2
 dtype: int64)

In [133]:
# most academy award vals are 0 with steep decay. make 3 meaningful bins
def award2cat(count: float) -> str:
    if count == 0: return "0"
    if count < 3:  return "1-2"
    else:          return "3+"

def apply_duration_category(movie_duration: float) -> str:
    if movie_duration < 80:  return "80-"
    if movie_duration < 110: return "90-110"
    if movie_duration < 140: return "110-150"
    else:                    return "150+"

def apply_tomato_category(tomato_score: float) -> str:  
    if tomato_score > 0.89: return "90+"
    if tomato_score > 0.69: return "70-89"
    if tomato_score > 0.49: return "50-69"
    if tomato_score > 0.29: return "30-49"
    else:                   return "29-"


# items["duration_category"] = items["duration"].apply(apply_duration_category)

data_wiki["academyAwardCount"] = data_wiki["academyAwardCount"].apply(award2cat)
data_wiki["academyNominationCount"] = data_wiki["academyNominationCount"].apply(award2cat)
data_wiki["tomatoScore"] = data_wiki["tomatoScore"].apply(apply_tomato_category)
data_wiki["maxDuration"] = data_wiki["maxDuration"].apply(apply_duration_category)

In [134]:
# ensure categoricals have different names
duplicate_value_cols = [
    "makoMoriOutcomeLabel", "bechdelOutcomeLabel", "mpaRatingLabel",
    "maxDuration", "academyAwardCount", "academyNominationCount"
]
transform_categorical_column = lambda val,column: "{column}:{val}".format(column=column,val=val)
for i in duplicate_value_cols:
    data_wiki[i] = data_wiki[i].apply(lambda val: transform_categorical_column(val,i))

In [147]:
data_wiki.sample(5,random_state=SEED)

Unnamed: 0,userID,itemID,rating,title,genre,year,occupation,age_category,gender,region,wikiID,bechdelOutcomeLabel,makoMoriOutcomeLabel,mpaRatingLabel,tomatoScore,maxDuration,origin,academyAwardCount,academyNominationCount
17015,835,50,4.0,Star Wars (1977),Action|Adventure|Romance|Sci-Fi|War,1977,executive,32-45,F,1,Q17738,bechdelOutcomeLabel:fails,makoMoriOutcomeLabel:passes,mpaRatingLabel:PG,90+,maxDuration:110-150,United States of America,academyAwardCount:3+,academyNominationCount:3+
32255,13,651,5.0,Glory (1989),Action|Drama|War,1989,educator,45-50,M,2,Q862317,bechdelOutcomeLabel:fails,makoMoriOutcomeLabel:fails,mpaRatingLabel:UNK,90+,maxDuration:110-150,United States of America,academyAwardCount:3+,academyNominationCount:3+
93392,621,810,3.0,"Shadow, The (1994)",Action,1994,student,18-,M,6,Q1546938,bechdelOutcomeLabel:fails,makoMoriOutcomeLabel:UNK,mpaRatingLabel:UNK,30-49,maxDuration:90-110,United States of America,academyAwardCount:0,academyNominationCount:0
69003,159,588,2.0,Beauty and the Beast (1991),Animation|Children's|Musical,1991,student,18-24,F,5,Q179673,bechdelOutcomeLabel:passes,makoMoriOutcomeLabel:passes,mpaRatingLabel:G,90+,maxDuration:90-110,United States of America,academyAwardCount:1-2,academyNominationCount:3+
90986,244,743,5.0,"Crow: City of Angels, The (1996)",Action|Thriller,1996,technician,25-31,M,8,Q302181,bechdelOutcomeLabel:UNK,makoMoriOutcomeLabel:UNK,mpaRatingLabel:UNK,29-,maxDuration:90-110,United States of America,academyAwardCount:0,academyNominationCount:0


In [172]:
# user features are the same as grouplens
# get names of item features

movie_origin = [x.split('|') for x in data_wiki['origin']]
all_movie_origin = list(set(flatten(movie_origin)))
wikidata_item_cols_minus_origin = [
    'bechdelOutcomeLabel', 'makoMoriOutcomeLabel','mpaRatingLabel',
    'academyAwardCount','academyNominationCount','maxDuration','tomatoScore'
]

wikidata_feature_names = list(flatten(list(data_wiki[col].unique()) for col in wikidata_item_cols_minus_origin))

wiki_user_feature_names = grouplens_user_features
wiki_item_feature_names = wikidata_feature_names + all_movie_genre #+ all_movie_origin

In [173]:
# make a dataset object
ds_wiki = Dataset()
ds_wiki.fit(
    data_wiki['userID'], 
    data_wiki['itemID'], 
    item_features=wiki_item_feature_names,
    user_features=wiki_user_feature_names
) 

In [174]:
wiki_user_features_iter = ( # copy from the grouplens
    (x, [y,z,a,b]) for x, y, z, a, b in zip(
        data_grouplens_features.userID, 
        data_grouplens_features.occupation, 
        data_grouplens_features.region, 
        data_grouplens_features.age_category, 
        data_grouplens_features.gender
    )
)

wiki_item_features_iter = (
    (x, y + list(vals[1:]) ) 
    for x, y, vals in zip(
        data_wiki.itemID,
        movie_genre,
        #movie_origin,
        data_wiki[wikidata_item_cols_minus_origin].itertuples(name=None)
    )
)

wiki_user_features = ds_wiki.build_user_features(wiki_user_features_iter)
wiki_item_features = ds_wiki.build_item_features(wiki_item_features_iter)

In [175]:
interactions_wiki, weights_wiki = ds_wiki.build_interactions(data.iloc[:, 0:3].values)


train_interactions_wiki, test_interactions_wiki = cross_validation.random_train_test_split(
    interactions_wiki, 
    test_percentage=TEST_PERCENTAGE,
    random_state=np.random.RandomState(SEED)
)

In [176]:
m_wiki = LightFM(
    loss='warp', no_components=NO_COMPONENTS, 
    learning_rate=LEARNING_RATE, 
    item_alpha=ITEM_ALPHA,
    user_alpha=USER_ALPHA,
    random_state=np.random.RandomState(SEED)
)

In [177]:
m_wiki.fit(
    interactions=train_interactions_wiki,
    user_features=wiki_user_features,
    item_features=wiki_item_features,
    epochs=NO_EPOCHS
)

<lightfm.lightfm.LightFM at 0x133e4c890>

In [179]:

results_wiki = score_model(
    m_wiki, test_interactions_wiki, train_interactions_wiki, K, "wiki", wiki_user_features, wiki_item_features
)
    
print(
    f"Precision@K:\t{results_wiki['precision']:.3f}",
    f"Recall@K:\t{results_wiki['recall']:.3f}", 
    sep='\n'
)
# Precision@K:	0.171
# Recall@K:	0.064

Precision@K:	0.171
Recall@K:	0.064


In [210]:

results_df = pd.DataFrame([results_wiki,results_baseline,results_grouplens]).melt(
    id_vars="model_name",value_vars=["precision","recall"],var_name="metric"
)

alt.Chart(results_df).mark_bar().encode( # type:ignore
    x="metric:O",
    y="value:Q",
    color="metric:N",
    column="model_name:O"
).properties(title="Comparison of Recommender Model Performance")

In [623]:
scores = prepare_single_prediction(data_user_item, 813, ds._user_id_mapping, ds._item_feature_mapping, m2, 3, user_features, item_features)
scores = scores.sort_values(by="prediction",ascending=False)

id2movie = dict(data_wiki[["itemID","title"]].itertuples(index=False,name=None))
scores['title'] = scores['itemID'].map(id2movie)
scores.head(10)

Unnamed: 0,userID,itemID,prediction,title
304,813,419,-164.743622,Mary Poppins (1964)
253,813,313,-165.027313,Titanic (1997)
433,813,385,-165.228531,True Lies (1994)
423,813,432,-165.334717,Fantasia (1940)
309,813,216,-165.36882,When Harry Met Sally... (1989)
201,813,151,-165.373154,Willy Wonka and the Chocolate Factory (1971)
200,813,96,-165.376678,Terminator 2: Judgment Day (1991)
150,813,143,-165.410095,"Sound of Music, The (1965)"
447,813,568,-165.422287,Speed (1994)
63,813,258,-165.427765,Contact (1997)


In [622]:
def prepare_single_prediction(data, userID, uid_map, iid_map, model, num_threads, user_features=None, item_features=None ):

    users, items, preds = [], [], []  # noqa: F841
    item = list(data.itemID.unique())
    for user in [userID]:
        user = [user] * len(item)
        users.extend(user)
        items.extend(item)

    all_predictions = pd.DataFrame(data={"userID": users, "itemID": items})
    all_predictions["uid"] = all_predictions.userID.map(uid_map)
    all_predictions["iid"] = all_predictions.itemID.map(iid_map)

    all_predictions["prediction"] = all_predictions.apply(
        lambda x: model.predict(
            user_ids=np.array([x["uid"]], dtype=np.int32),
            item_ids=np.array([x["iid"]], dtype=np.int32),
            user_features=user_features,
            item_features=item_features,
            num_threads=num_threads,
        )[0],
        axis=1,
    )

    return all_predictions[["userID", "itemID", "prediction"]]


