In [1]:
# Neccessity
import numpy as np
import pandas as pd
import math
import random
from sklearn.model_selection import train_test_split
# from sklearn.metrics.pairwise import cosine_similarity

# Evaluator
from evaluate_component.evaluate import ModelEvaluator

np.set_printoptions(precision=3)

np.set_printoptions(suppress=True)

# Initialize

In [2]:
movies = pd.read_csv('ml-latest-small/movies.csv').set_index('movieId')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

## Split Data

In [3]:
ratings_train_df, ratings_test_df = train_test_split(ratings,
                                   stratify=ratings['userId'], 
                                   test_size=0.20,
                                   random_state=42)

print('Number of ratings on the Train set: %d' % len(ratings_train_df))
print('Number of ratings on the Test set: %d' % len(ratings_test_df))

Number of ratings on the Train set: 80668
Number of ratings on the Test set: 20168


In [4]:
#Indexing by userId to speed up the searches during evaluation
ratings_full_indexed_df = ratings.set_index('userId')
ratings_train_indexed_df = ratings_train_df.set_index('userId')
ratings_test_indexed_df = ratings_test_df.set_index('userId')

## Evaluator

In [5]:
model_evaluator = ModelEvaluator(ratings_train_indexed_df, ratings_test_indexed_df, movies) 

# Popularity Model

In [6]:
from models.popularity import PopularityRecommender

In [7]:
popular_model = PopularityRecommender('popularity')

In [8]:
popular_model.fit(ratings_train_indexed_df.reset_index(), movies)

<models.popularity.PopularityRecommender at 0x7f90fc82ae10>

In [9]:
model_evaluator.evaluate_model_for_user(popular_model, 2, top_n = 10, verbose = True)

Watched movies:
	Step Brothers (2008)
	Dark Knight Rises, The (2012)
	Shawshank Redemption, The (1994)
	Departed, The (2006)
	Town, The (2010)
	Gladiator (2000)
	Exit Through the Gift Shop (2010)
	Zombieland (2009)
	Talladega Nights: The Ballad of Ricky Bobby (2006)
	Django Unchained (2012)
	Inception (2010)
	Good Will Hunting (1997)
	The Jinx: The Life and Deaths of Robert Durst (2015)
	Shutter Island (2010)
	Mad Max: Fury Road (2015)
	Ex Machina (2015)
	Girl with the Dragon Tattoo, The (2011)
	Louis C.K.: Hilarious (2010)
	Tommy Boy (1995)
	Inglourious Basterds (2009)
	Whiplash (2014)
	Dark Knight, The (2008)
	Collateral (2004)
Relevant movies:
	Wolf of Wall Street, The (2013)
	Warrior (2011)
	Inside Job (2010)
	Interstellar (2014)
	Kill Bill: Vol. 1 (2003)

Recommendation:
	Forrest Gump (1994)
	Pulp Fiction (1994)
	Silence of the Lambs, The (1991)
	Matrix, The (1999)
	Star Wars: Episode IV - A New Hope (1977)
	Jurassic Park (1993)
	Braveheart (1995)
	Toy Story (1995)
	Seven (a.k.a. 

{'hits_at_5_count': 0,
 'hits_at_10_count': 0,
 'interacted_count': 5,
 'recall_at_5': 0.0,
 'recall_at_10': 0.0,
 'precision_at_5': 0.0,
 'precision_at_10': 0.0,
 'auc_at_10': 1.0}

In [10]:
model_evaluator.evaluate_model(popular_model, top_n = -1)

609 users processed


{'modelName': 'popularity',
 'recall_at_5': 0.03212874405818682,
 'recall_at_10': 0.051600710154057615,
 'agg_precision_at_5': 0.18393442622950928,
 'agg_precision_at_10': 0.09196721311475464,
 'agg_auc_at_10': 0.6660551448897788}

# Content Based Recommender

In [11]:
from models.contentBased import ContentBasedRecommender
from general_info.general_info import genre_list

In [12]:
cb = ContentBasedRecommender('ContentBased', genre_list)

In [13]:
cb.fit(ratings.set_index('userId'), movies,copy = True)

<models.contentBased.ContentBasedRecommender at 0x7f90fc7fe710>

In [14]:
cb.predict(2, verbose = True, predict_rating = True)

Men Who Stare at Goats, The (2009)    3.581081
Super (2010)                          3.577063
Burn After Reading (2008)             3.564801
Down Terrace (2009)                   3.562940
Middle Men (2009)                     3.562940
Choke (2008)                          3.554013
Berlin Calling (2008)                 3.554013
Synecdoche, New York (2008)           3.554013
Marley & Me (2008)                    3.554013
Sunshine Cleaning (2008)              3.554013
dtype: float64

In [15]:
model_evaluator.evaluate_model_for_user(cb, 2, top_n = 10, verbose = True)

Watched movies:
	Step Brothers (2008)
	Dark Knight Rises, The (2012)
	Shawshank Redemption, The (1994)
	Departed, The (2006)
	Town, The (2010)
	Gladiator (2000)
	Exit Through the Gift Shop (2010)
	Zombieland (2009)
	Talladega Nights: The Ballad of Ricky Bobby (2006)
	Django Unchained (2012)
	Inception (2010)
	Good Will Hunting (1997)
	The Jinx: The Life and Deaths of Robert Durst (2015)
	Shutter Island (2010)
	Mad Max: Fury Road (2015)
	Ex Machina (2015)
	Girl with the Dragon Tattoo, The (2011)
	Louis C.K.: Hilarious (2010)
	Tommy Boy (1995)
	Inglourious Basterds (2009)
	Whiplash (2014)
	Dark Knight, The (2008)
	Collateral (2004)
Relevant movies:
	Wolf of Wall Street, The (2013)
	Warrior (2011)
	Inside Job (2010)
	Interstellar (2014)
	Kill Bill: Vol. 1 (2003)

Recommendation:
	Men Who Stare at Goats, The (2009)
	Super (2010)
	Burn After Reading (2008)
	Down Terrace (2009)
	Middle Men (2009)
	Choke (2008)
	Berlin Calling (2008)
	Synecdoche, New York (2008)
	Marley & Me (2008)
	Sunshine 

{'hits_at_5_count': 0,
 'hits_at_10_count': 0,
 'interacted_count': 5,
 'recall_at_5': 0.0,
 'recall_at_10': 0.0,
 'precision_at_5': 0.0,
 'precision_at_10': 0.0,
 'auc_at_10': 1.0}

In [16]:
model_evaluator.evaluate_model(cb, top_n = -1)

609 users processed


{'modelName': 'ContentBased',
 'recall_at_5': 0.0,
 'recall_at_10': 0.0,
 'agg_precision_at_5': 0.0,
 'agg_precision_at_10': 0.0,
 'agg_auc_at_10': 0.9950819672131147}

# User Based Collaborative Filtering

In [17]:
from models.userBasedCF import UserBasedCollaborativeFiltering

In [18]:
user_based_cf = UserBasedCollaborativeFiltering('User-Based Collaborative Filtering', 10)

In [19]:
user_based_cf.fit(ratings_train_indexed_df, movies[['title']])

In [20]:
user_based_cf.predict(2, top_n = 10, verbose= True, predict_rating = True)

Lord of the Rings: The Fellowship of the Ring, The (2001)    1.511974
Up (2009)                                                    1.479737
Fight Club (1999)                                            1.411536
Prestige, The (2006)                                         1.261648
Iron Man (2008)                                              1.199666
Batman Begins (2005)                                         1.174567
Silence of the Lambs, The (1991)                             1.163648
The Imitation Game (2014)                                    1.057577
Interstellar (2014)                                          1.047449
Forrest Gump (1994)                                          0.905956
dtype: float64

In [21]:
model_evaluator.evaluate_model_for_user(user_based_cf, 2, top_n = 10, verbose = True)

Watched movies:
	Step Brothers (2008)
	Dark Knight Rises, The (2012)
	Shawshank Redemption, The (1994)
	Departed, The (2006)
	Town, The (2010)
	Gladiator (2000)
	Exit Through the Gift Shop (2010)
	Zombieland (2009)
	Talladega Nights: The Ballad of Ricky Bobby (2006)
	Django Unchained (2012)
	Inception (2010)
	Good Will Hunting (1997)
	The Jinx: The Life and Deaths of Robert Durst (2015)
	Shutter Island (2010)
	Mad Max: Fury Road (2015)
	Ex Machina (2015)
	Girl with the Dragon Tattoo, The (2011)
	Louis C.K.: Hilarious (2010)
	Tommy Boy (1995)
	Inglourious Basterds (2009)
	Whiplash (2014)
	Dark Knight, The (2008)
	Collateral (2004)
Relevant movies:
	Wolf of Wall Street, The (2013)
	Warrior (2011)
	Inside Job (2010)
	Interstellar (2014)
	Kill Bill: Vol. 1 (2003)

Recommendation:
	Lord of the Rings: The Fellowship of the Ring, The (2001)
	Up (2009)
	Fight Club (1999)
	Prestige, The (2006)
	Iron Man (2008)
	Batman Begins (2005)
	Silence of the Lambs, The (1991)
	The Imitation Game (2014)
	I

{'hits_at_5_count': 0,
 'hits_at_10_count': 1,
 'interacted_count': 5,
 'recall_at_5': 0.0,
 'recall_at_10': 0.2,
 'precision_at_5': 0.0,
 'precision_at_10': 0.0,
 'auc_at_10': 1.0}

In [22]:
model_evaluator.evaluate_model(user_based_cf, top_n = -1)

609 users processed


{'modelName': 'User-Based Collaborative Filtering',
 'recall_at_5': 0.0497107840329878,
 'recall_at_10': 0.08470305251703797,
 'agg_precision_at_5': 0.28459016393442543,
 'agg_precision_at_10': 0.14229508196721272,
 'agg_auc_at_10': 0.7677022677235874}

# Item Based Collaborative Filtering

In [23]:
from models.itemBasedCF import ItemBasedCollaborativeFiltering

In [24]:
item_based_cf = ItemBasedCollaborativeFiltering('Item-Based Collaborative Filtering')

In [25]:
item_based_cf.fit(ratings_train_indexed_df.reset_index(), movies[['title']])

In [26]:
model_evaluator.evaluate_model_for_user(item_based_cf, 2, top_n = 10, verbose = True)

Watched movies:
	Step Brothers (2008)
	Dark Knight Rises, The (2012)
	Shawshank Redemption, The (1994)
	Departed, The (2006)
	Town, The (2010)
	Gladiator (2000)
	Exit Through the Gift Shop (2010)
	Zombieland (2009)
	Talladega Nights: The Ballad of Ricky Bobby (2006)
	Django Unchained (2012)
	Inception (2010)
	Good Will Hunting (1997)
	The Jinx: The Life and Deaths of Robert Durst (2015)
	Shutter Island (2010)
	Mad Max: Fury Road (2015)
	Ex Machina (2015)
	Girl with the Dragon Tattoo, The (2011)
	Louis C.K.: Hilarious (2010)
	Tommy Boy (1995)
	Inglourious Basterds (2009)
	Whiplash (2014)
	Dark Knight, The (2008)
	Collateral (2004)
Relevant movies:
	Wolf of Wall Street, The (2013)
	Warrior (2011)
	Inside Job (2010)
	Interstellar (2014)
	Kill Bill: Vol. 1 (2003)

Recommendation:
	Batman Begins (2005)
	Lord of the Rings: The Return of the King, The (2003)
	Kill Bill: Vol. 1 (2003)
	Lord of the Rings: The Fellowship of the Ring, The (2001)
	Fight Club (1999)
	Avatar (2009)
	Zodiac (2007)
	U

{'hits_at_5_count': 1,
 'hits_at_10_count': 1,
 'interacted_count': 5,
 'recall_at_5': 0.2,
 'recall_at_10': 0.2,
 'precision_at_5': 0.2,
 'precision_at_10': 0.1,
 'auc_at_10': 1.0}

In [27]:
model_evaluator.evaluate_model(item_based_cf, top_n = -1)

609 users processed


{'modelName': 'Item-Based Collaborative Filtering',
 'recall_at_5': 0.034705916041463836,
 'recall_at_10': 0.055495103373231776,
 'agg_precision_at_5': 0.19868852459016495,
 'agg_precision_at_10': 0.09934426229508247,
 'agg_auc_at_10': 0.9130236361014801}

# Hybrid Explicit Feedback Recommender

In [28]:
from models.hybridExplicitFeedback import HybridRecommender

## Hybrid - User Based CF and Content Based

In [29]:
hybrid_user_based_cf_and_content_based = HybridRecommender(name = 'hybrid_user_based_cf_and_content_based', 
                                                           recommenders = [
                                                               UserBasedCollaborativeFiltering('User-Based Collaborative Filtering', 10), 
                                                               ContentBasedRecommender('ContentBased', genre_list)
                                                           ], 
                                                           weights = [0.7, 0.3])

In [30]:
hybrid_user_based_cf_and_content_based.fit(ratings = ratings_train_indexed_df, movies = movies)

In [31]:
model_evaluator.evaluate_model(hybrid_user_based_cf_and_content_based, top_n = -1)

609 users processed


{'modelName': 'hybrid_user_based_cf_and_content_based',
 'recall_at_5': 0.051772521619609414,
 'recall_at_10': 0.08653570814959051,
 'agg_precision_at_5': 0.2963934426229502,
 'agg_precision_at_10': 0.1481967213114751,
 'agg_auc_at_10': 0.9044870800526668}

## Hybrid - Item Based and Content Based

In [32]:
hybrid_item_based_cf_and_content_based = HybridRecommender(name = 'hybrid_item_based_cf_and_content_based', 
                                                           recommenders = [
                                                               ItemBasedCollaborativeFiltering('Item-Based Collaborative Filtering'), 
                                                               ContentBasedRecommender('ContentBased', genre_list)
                                                           ], 
                                                           weights = [0.7, 0.3])

In [33]:
hybrid_item_based_cf_and_content_based.fit(ratings = ratings_train_indexed_df, movies = movies)

In [34]:
model_evaluator.evaluate_model(hybrid_item_based_cf_and_content_based, top_n = -1)

609 users processed


{'modelName': 'hybrid_item_based_cf_and_content_based',
 'recall_at_5': 0.03235782601225588,
 'recall_at_10': 0.051944333085161214,
 'agg_precision_at_5': 0.1852459016393452,
 'agg_precision_at_10': 0.0926229508196726,
 'agg_auc_at_10': 0.9138782093601633}

## Hybrid - User Based and Item Based CF

In [35]:
hybrid_item_based_cf_and_user_based_cf = HybridRecommender(name = 'hybrid_item_based_cf_and_user_based_cf', 
                                                           recommenders = [
                                                               ItemBasedCollaborativeFiltering('Item-Based Collaborative Filtering'), 
                                                               UserBasedCollaborativeFiltering('User-Based Collaborative Filtering', 10),
                                                           ], 
                                                           weights = [0.5, 0.5])

In [36]:
hybrid_item_based_cf_and_user_based_cf.fit(ratings = ratings_train_indexed_df, movies = movies)

In [37]:
model_evaluator.evaluate_model(hybrid_item_based_cf_and_user_based_cf, top_n = -1)

609 users processed


{'modelName': 'hybrid_item_based_cf_and_user_based_cf',
 'recall_at_5': 0.051600710154057615,
 'recall_at_10': 0.08573392131034878,
 'agg_precision_at_5': 0.29540983606557303,
 'agg_precision_at_10': 0.14770491803278651,
 'agg_auc_at_10': 0.9298287026702459}

## Hybrid - Popularity and Content Based

In [38]:
hybrid_popularity_and_content_based = HybridRecommender(name = 'hybrid_popularity_and_content_based', 
                                                           recommenders = [
                                                               PopularityRecommender('PopularityRecommender'), 
                                                               ContentBasedRecommender('ContentBased', genre_list)
                                                           ], 
                                                           weights = [0.2, 0.8])

In [39]:
hybrid_item_based_cf_and_content_based.fit(ratings = ratings_train_indexed_df, movies = movies)

In [40]:
model_evaluator.evaluate_model(hybrid_item_based_cf_and_content_based, top_n = -1)

609 users processed


{'modelName': 'hybrid_item_based_cf_and_content_based',
 'recall_at_5': 0.03235782601225588,
 'recall_at_10': 0.051944333085161214,
 'agg_precision_at_5': 0.1852459016393452,
 'agg_precision_at_10': 0.0926229508196726,
 'agg_auc_at_10': 0.9138782093601633}

# Hybrid Implicit Feedback Recommender

In [41]:
# movies = pd.read_csv('ml-latest-small/movies.csv').set_index('movieId')
# ratings = pd.read_csv('ml-latest-small/ratings.csv')

# ratings_train_df, ratings_test_df = train_test_split(ratings,
#                                    stratify=ratings['userId'], 
#                                    test_size=0.20,
#                                    random_state=42)

# print('# ratings on Train set: %d' % len(ratings_train_df))
# print('# ratings on Test set: %d' % len(ratings_test_df))

In [42]:
from models.hybridImplicitFeedback import LightFMRecommender

In [43]:
lightfm_recommender = LightFMRecommender(name = 'LightFM',
                                                no_components=10, 
        k=5, 
        n=10,
        loss='warp',
        random_state = 0)

In [44]:
lightfm_recommender.fit(ratings_train_df, movies)

<models.hybridImplicitFeedback.LightFMRecommender at 0x7f90fa4dae90>

In [45]:
lightfm_recommender.predict(2, verbose = True)

['Piano, The (1993)',
 'Sense and Sensibility (1995)',
 'Jungle Book, The (1994)',
 'Before Sunrise (1995)',
 'Pride and Prejudice (1995)',
 'Leaving Las Vegas (1995)',
 'Angels and Insects (1995)',
 'Iron Will (1994)',
 'Remains of the Day, The (1993)',
 'Bridges of Madison County, The (1995)']

In [46]:
model_evaluator.evaluate_model_for_user(lightfm_recommender, 2, top_n = 10, verbose = True)

Watched movies:
	Step Brothers (2008)
	Dark Knight Rises, The (2012)
	Shawshank Redemption, The (1994)
	Departed, The (2006)
	Town, The (2010)
	Gladiator (2000)
	Exit Through the Gift Shop (2010)
	Zombieland (2009)
	Talladega Nights: The Ballad of Ricky Bobby (2006)
	Django Unchained (2012)
	Inception (2010)
	Good Will Hunting (1997)
	The Jinx: The Life and Deaths of Robert Durst (2015)
	Shutter Island (2010)
	Mad Max: Fury Road (2015)
	Ex Machina (2015)
	Girl with the Dragon Tattoo, The (2011)
	Louis C.K.: Hilarious (2010)
	Tommy Boy (1995)
	Inglourious Basterds (2009)
	Whiplash (2014)
	Dark Knight, The (2008)
	Collateral (2004)
Relevant movies:
	Wolf of Wall Street, The (2013)
	Warrior (2011)
	Inside Job (2010)
	Interstellar (2014)
	Kill Bill: Vol. 1 (2003)

Recommendation:
	Piano, The (1993)
	Sense and Sensibility (1995)
	Jungle Book, The (1994)
	Before Sunrise (1995)
	Pride and Prejudice (1995)
	Leaving Las Vegas (1995)
	Angels and Insects (1995)
	Iron Will (1994)
	Remains of the D

{'hits_at_5_count': 0,
 'hits_at_10_count': 0,
 'interacted_count': 5,
 'recall_at_5': 0.0,
 'recall_at_10': 0.0,
 'precision_at_5': 0.0,
 'precision_at_10': 0.0,
 'auc_at_10': 1.0}

In [47]:
model_evaluator.evaluate_model(lightfm_recommender, top_n = -1)

609 users processed


{'modelName': 'LightFM',
 'recall_at_5': 0.010938663306798007,
 'recall_at_10': 0.020903728308802473,
 'agg_precision_at_5': 0.06262295081967212,
 'agg_precision_at_10': 0.03131147540983606,
 'agg_auc_at_10': 0.8488023102120572}

# Conclusion

## Performance

- The performance of hybrid models is always beter then each of their component
- User-based CF has the best `precision` and `recall` score but bad `auc`. However, this can be improved by combine it with models having good `auc`. In this case, the combination of User-based and Item-based CF has the best result
- Content-based and popularity models alone have unacceptable scores. However, their combination is acceptable
- LightFM's performance is not good among the tested models, but it is acceptable

**Note:** The dataset is too small to jump to conclusion of which model is better. However, in this setting

## Usage

- **Main engine**: Hybrid of User-based and Item-based CF - since it has the best evaluation scores
- **Similar to just-watched-movie recommender**: Hybrid of Item-based CF and Content-based
- **Cold start**: Hybrid of Popularity and Content-based