# Playing with Recommenders

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [2]:
# redcarpet.py is kept in ../scripts. Copy it to ../notebooks (presumably the
# directory this notebook runs from -- confirm) so the plain import below works.
import shutil
shutil.copyfile("../scripts/redcarpet.py", "../notebooks/redcarpet.py")
from redcarpet import mat_to_sets, get_recs, write_kaggle_recs

## Load Data

In [3]:
# Load the pickled item catalogue: the item records together with
# COLUMN_LABELS, READABLE_LABELS and ATTRIBUTES (the first and last of these
# are used below as column selectors).
item_file = "../input/talent.pkl"
# SECURITY: unpickling can execute arbitrary code, so item_file must only ever
# point at a file from a trusted source.
# The context manager closes the file handle as soon as the payload is read,
# instead of leaving it open until garbage collection.
with open(item_file, "rb") as item_fh:
    item_records, COLUMN_LABELS, READABLE_LABELS, ATTRIBUTES = pickle.load(item_fh)
# Keep the ATTRIBUTES columns followed by the COLUMN_LABELS columns, and
# replace missing values with 0.
item_df = pd.DataFrame(item_records)[ATTRIBUTES + COLUMN_LABELS].fillna(value=0)
item_df.head()

Unnamed: 0,id,name,price,reactions,stars,joined,categories,in_13_reasons_why,in_90_day_fiance,in_actors,...,in_ufc,in_vanderpump_rules,in_venture_capitalists,in_viners,in_vlog_squad,in_voice_actors,in_winter_sports,in_writers,in_younow,in_youtubers
0,perezhilton,Perez Hilton,27.0,924,5.0,April 2018,"[Reality TV, Commentators, Featured]",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,andydick,Andy Dick,99.0,340,4.9,October 2018,"[Reality TV, Comedians, Featured, Actors]",0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,tjlavin,TJ Lavin,80.0,291,5.0,February 2018,"[Reality TV, Riders, Featured, Extreme Sports,...",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,carsonkressley,Carson Kressley,59.0,290,5.0,October 2018,"[Reality TV, Bravo, Stylists, Featured, Actors...",0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,riffraff,RiFF RAFF,75.0,402,4.7,December 2017,"[Rappers, Featured, Musicians]",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# The item ids become the column names of the like matrix.
ITEM_NAMES = item_df["id"].values
like_file = "../input/likes.pkl"
# SECURITY: unpickling can execute arbitrary code, so like_file must only ever
# point at a file from a trusted source.
# The context manager closes the file handle as soon as the payload is read,
# instead of leaving it open until garbage collection.
with open(like_file, "rb") as like_fh:
    like_csr = pickle.load(like_fh)
# Densify the loaded matrix (presumably a SciPy sparse matrix, judging by the
# name and the .todense() call -- confirm) and label one column per item id.
# NOTE(review): rows presumably correspond to users -- confirm.
like_mat = np.array(like_csr.todense())
like_df = pd.DataFrame(like_mat, columns=ITEM_NAMES)
like_df.head()

Unnamed: 0,perezhilton,andydick,tjlavin,carsonkressley,riffraff,chumlee,gilbertgottfried,icet,benhiggy,laturtle,...,chrisjaialex,voman,el_peego,thisannaisbananas,zachharper,johnoberg,zacpullam,kansasbowling,mattcirulnick,itsscaleb__
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Load the four pre-built collections used for evaluation: train, test, input
# and hidden. Per the redcarpet docs shown further down, these are lists of
# sets of item indices per user.
# SECURITY: unpickling can execute arbitrary code, so only load this file from
# a trusted source.
# The context manager closes the file handle as soon as the payload is read.
with open("../input/train_test_set.pkl", "rb") as split_fh:
    s_train, s_test, s_input, s_hidden = pickle.load(split_fh)
# Sanity check: print the size of each collection.
print(len(s_train), len(s_test), len(s_input), len(s_hidden))

4000 1000 1000 1000


In [6]:
# Convert the COLUMN_LABELS part of the item table into redcarpet's set form.
# Presumably this gives one set of non-zero column indices per item, matching
# the `items_train` argument of content_filter -- confirm in redcarpet.py.
item_label_matrix = item_df[COLUMN_LABELS].values
s_items = mat_to_sets(item_label_matrix)
# Number of item sets produced.
print(len(s_items))

5392


## Evaluation Metrics

In [7]:
from redcarpet import mapk_score, uhr_score

In [8]:
# Print the signature and docstring of mapk_score (mean average precision at k).
help(mapk_score)

Help on function mapk_score in module redcarpet:

mapk_score(recs_true, recs_pred, k=10)
    Computes the mean average precision at k (MAP@K) of recommendations.
    MAP@K = mean AP@K score over all users
    AP@K = (1 / m) * sum from 1 to k of (precision at i * relevance of ith item)
    Where m is the number of items in a user's hidden set
    Where k is the number of items recommended to each user
    params:
        recs_true: list of sets of hidden items for each user
        recs_pred: list of lists of recommended items, with each list
        k: number of recommendations to use in top set
    returns:
        float, range [0, 1], though score of 1 will be impossible if
        recs_true includes users who have more than k hidden items



In [9]:
# Print the signature and docstring of uhr_score (fraction of users with at
# least one recommended item in their hidden set).
help(uhr_score)

Help on function uhr_score in module redcarpet:

uhr_score(recs_true, recs_pred, k=10)
    Computes the user hit tate (UHR) score of recommendations.
    UHR = the fraction of users whose top list included at
    least one item also in their hidden set.
    params:
        recs_true: list of sets of hidden items for each user
        recs_pred: list of lists of recommended items, with each list
        k: number of recommendations to use in top set
    returns:
        float, range [0, 1]



## Similarity Measures

In [10]:
from redcarpet import jaccard_sim, cosine_sim

In [11]:
# Print the signature and docstring of jaccard_sim (set similarity used as the
# default sim_fn by both recommenders).
help(jaccard_sim)

Help on function jaccard_sim in module redcarpet:

jaccard_sim(u, v)
    Computes the Jaccard similarity between sets u and v.
    sim = intersection(u, v) / union(u, v)
    params:
        u, v: sets to compare
    returns:
        float between 0 and 1, where 1 represents perfect
            similarity and 0 represents no similarity



In [12]:
# Print the signature and docstring of cosine_sim (the alternative set
# similarity passed as sim_fn in the results section).
help(cosine_sim)

Help on function cosine_sim in module redcarpet:

cosine_sim(u, v)
    Computes the Cosine similarity between sets u and v.
    sim = intersection(u, v) / sqrt(|u| * |v|)
    Where |s| is the number of items in set s
    params:
        u, v: sets to compare
    returns:
        float between 0 and 1, where 1 represents perfect
            similarity and 0 represents no similarity



## Recommendation Algorithms

In [13]:
from redcarpet import collaborative_filter, content_filter

In [14]:
# Print the signature and docstring of collaborative_filter, including its
# defaults (k, j, sim_fn).
help(collaborative_filter)

Help on function collaborative_filter in module redcarpet:

collaborative_filter(recs_train, recs_input, k=10, j=3, sim_fn=<function jaccard_sim at 0x117691048>)
    Collaborative filtering recommender system.
    params:
        recs_train: list of sets of liked item indices for train data
        recs_input: list of sets of liked item indices for input data
        k: number of items to recommend for each user
        j: number of similar users to base recommendations on
        sim_fn(u, v): function that returns a float value representing
            the similarity between sets u and v
    returns:
        recs_pred: list of lists of tuples of recommendations where
            each tuple has (item index, relevance score) with the list
            of tuples sorted in order of decreasing relevance



In [15]:
# Print the signature and docstring of content_filter, including its defaults
# (k, sim_fn).
help(content_filter)

Help on function content_filter in module redcarpet:

content_filter(items_train, recs_input, k=10, sim_fn=<function jaccard_sim at 0x117691048>)
    Content-based filtering recommender system.
    params:
        items_train: list of sets of non-zero attribute indices for items
        recs_input: list of sets of liked item indices for input data
        k: number of items to recommend for each user
        sim_fn(u, v): function that returns a float value representing
            the similarity between sets u and v
    returns:
        recs_pred: list of lists of tuples of recommendations where
            each tuple has (item index, relevance score) with the list
            of tuples sorted in order of decreasing relevance



## Results

In [16]:
# Only the first n_pred users are scored in the results below. Each input set
# is paired with the hidden set at the same position.
n_pred = 10
s_input_sample = s_input[:n_pred]
s_hidden_sample = s_hidden[:n_pred]

In [17]:
# Collaborative filtering with the default similarity (jaccard_sim), scored on
# the sampled users.
print("Strategy: Collaborative")
print("Similarity: Jaccard")
user_recs = collaborative_filter(s_train, s_input_sample)
# Convert the recommendations once and reuse the result for both metrics,
# rather than calling get_recs twice on the same input.
# NOTE(review): assumes get_recs is side-effect free and returns a reusable
# list (not a one-shot iterator) -- confirm in redcarpet.py.
user_top_recs = get_recs(user_recs)
print("MAP = {0:.3f}".format(mapk_score(s_hidden_sample, user_top_recs, k=10)))
print("UHR = {0:.3f}".format(uhr_score(s_hidden_sample, user_top_recs, k=10)))

Strategy: Collaborative
Similarity: Jaccard
MAP = 0.095
UHR = 0.600


In [21]:
# Collaborative filtering with cosine similarity, scored on the sampled users.
print("Strategy: Collaborative")
print("Similarity: Cosine")
user_recs = collaborative_filter(s_train, s_input_sample, sim_fn=cosine_sim)
# Convert the recommendations once and reuse the result for both metrics,
# rather than calling get_recs twice on the same input.
# NOTE(review): assumes get_recs is side-effect free and returns a reusable
# list (not a one-shot iterator) -- confirm in redcarpet.py.
user_top_recs = get_recs(user_recs)
print("MAP = {0:.3f}".format(mapk_score(s_hidden_sample, user_top_recs, k=10)))
print("UHR = {0:.3f}".format(uhr_score(s_hidden_sample, user_top_recs, k=10)))

Strategy: Collaborative
Similarity: Cosine
MAP = 0.091
UHR = 0.500


In [22]:
# Content-based filtering over the item attribute sets with the default
# similarity (jaccard_sim), scored on the sampled users.
print("Strategy: Content-Based")
print("Similarity: Jaccard")
user_recs = content_filter(s_items, s_input_sample)
# Convert the recommendations once and reuse the result for both metrics,
# rather than calling get_recs twice on the same input.
# NOTE(review): assumes get_recs is side-effect free and returns a reusable
# list (not a one-shot iterator) -- confirm in redcarpet.py.
user_top_recs = get_recs(user_recs)
print("MAP = {0:.3f}".format(mapk_score(s_hidden_sample, user_top_recs, k=10)))
print("UHR = {0:.3f}".format(uhr_score(s_hidden_sample, user_top_recs, k=10)))

Strategy: Content-Based
Similarity: Jaccard
MAP = 0.058
UHR = 0.300


In [None]:
# Content-based filtering over the item attribute sets with cosine similarity,
# scored on the sampled users.
print("Strategy: Content-Based")
print("Similarity: Cosine")
user_recs = content_filter(s_items, s_input_sample, sim_fn=cosine_sim)
# Convert the recommendations once and reuse the result for both metrics,
# rather than calling get_recs twice on the same input.
# NOTE(review): assumes get_recs is side-effect free and returns a reusable
# list (not a one-shot iterator) -- confirm in redcarpet.py.
user_top_recs = get_recs(user_recs)
print("MAP = {0:.3f}".format(mapk_score(s_hidden_sample, user_top_recs, k=10)))
print("UHR = {0:.3f}".format(uhr_score(s_hidden_sample, user_top_recs, k=10)))

Strategy: Content-Based
Similarity: Cosine
MAP = 0.052
UHR = 0.300
