# Playing with Recommenders

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [2]:
# Make the shared helper module importable straight from ../scripts instead of
# copying it next to the notebook: a copied file goes stale whenever the
# original is edited and leaves a duplicate redcarpet.py in the notebooks folder.
import os
import sys

SCRIPTS_DIR = os.path.abspath("../scripts")
# Prepend so the scripts directory takes precedence over any leftover local copy.
if SCRIPTS_DIR not in sys.path:
    sys.path.insert(0, SCRIPTS_DIR)
from redcarpet import mat_to_sets

## Load Data

In [3]:
# Load the pickled talent catalogue: the raw item records plus three label lists.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles that come from a trusted source.
item_file = "../input/talent.pkl"
# Use a context manager so the file handle is closed as soon as loading finishes.
with open(item_file, "rb") as item_fh:
    item_records, COLUMN_LABELS, READABLE_LABELS, ATTRIBUTES = pickle.load(item_fh)
# Keep the attribute columns followed by the label columns; missing values become 0.
item_df = pd.DataFrame(item_records)[ATTRIBUTES + COLUMN_LABELS].fillna(value=0)
# Positional arrays used later to translate an item index into a name / id.
ITEM_NAMES = item_df["name"].values
ITEM_IDS = item_df["id"].values
item_df.head()

Unnamed: 0,id,name,price,reactions,stars,joined,categories,in_13_reasons_why,in_90_day_fiance,in_actors,...,in_ufc,in_vanderpump_rules,in_venture_capitalists,in_viners,in_vlog_squad,in_voice_actors,in_winter_sports,in_writers,in_younow,in_youtubers
0,perezhilton,Perez Hilton,27.0,924,5.0,April 2018,"[Reality TV, Commentators, Featured]",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,andydick,Andy Dick,99.0,340,4.9,October 2018,"[Reality TV, Comedians, Featured, Actors]",0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,tjlavin,TJ Lavin,80.0,291,5.0,February 2018,"[Reality TV, Riders, Featured, Extreme Sports,...",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,carsonkressley,Carson Kressley,59.0,290,5.0,October 2018,"[Reality TV, Bravo, Stylists, Featured, Actors...",0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,riffraff,RiFF RAFF,75.0,402,4.7,December 2017,"[Rappers, Featured, Musicians]",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Load the pickled like matrix and show it as a dense frame with one column per item.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles that come from a trusted source.
like_file = "../input/likes.pkl"
# Use a context manager so the file handle is closed as soon as loading finishes.
with open(like_file, "rb") as like_fh:
    like_csr = pickle.load(like_fh)
# Densify the loaded matrix into a plain ndarray (presumably a scipy sparse
# matrix, given the .todense() call — confirm against the pickle's producer).
like_mat = np.array(like_csr.todense())
# Columns are labelled with ITEM_NAMES, so column order is assumed to match item_df rows.
like_df = pd.DataFrame(like_mat, columns=ITEM_NAMES)
like_df.head()

Unnamed: 0,Perez Hilton,Andy Dick,TJ Lavin,Carson Kressley,RiFF RAFF,Chumlee,Gilbert Gottfried,Ice T,Ben Higgins,Evan Breen,...,Chris Jai Alex,Peter Dickson,Laura Perlongo,Anna del Gaizo,Zach Harper,John Oberg,Zac Pullam,Kansas Bowling,Matt Cirulnick,Caleb Senzel
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Load the four pre-built user splits: train, test, and the test users'
# visible ("input") and held-back ("hidden") likes.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles that come from a trusted source.
# Use a context manager so the file handle is closed as soon as loading finishes.
with open("../input/train_test_set.pkl", "rb") as split_fh:
    s_train, s_test, s_input, s_hidden = pickle.load(split_fh)
print(len(s_train), len(s_test), len(s_input), len(s_hidden))

3000 1000 1000 1000


In [6]:
# Turn the item-by-label matrix into one set per item with redcarpet's mat_to_sets
# (presumably the indices of each row's non-zero columns — confirm in redcarpet.py).
label_matrix = item_df[COLUMN_LABELS].values
s_items = mat_to_sets(label_matrix)
print(len(s_items))

5392


## Evaluation Metrics

In [7]:
from redcarpet import mapk_score, uhr_score

In [8]:
# Print redcarpet's documentation for the MAP@K metric used in model selection below.
help(mapk_score)

Help on function mapk_score in module redcarpet:

mapk_score(recs_true, recs_pred, k=10)
    Computes the mean average precision at k (MAP@K) of recommendations.
    MAP@K = mean AP@K score over all users
    AP@K = (1 / m) * sum from 1 to k of (precision at i * relevance of ith item)
    Where m is the number of items in a user's hidden set
    Where k is the number of items recommended to each user
    params:
        recs_true: list of sets of hidden items for each user
        recs_pred: list of lists of recommended items, with each list
        k: number of recommendations to use in top set
    returns:
        float, range [0, 1], though score of 1 will be impossible if
        recs_true includes users who have more than k hidden items



In [9]:
# Print redcarpet's documentation for the UHR metric used in model selection below.
help(uhr_score)

Help on function uhr_score in module redcarpet:

uhr_score(recs_true, recs_pred, k=10)
    Computes the user hit rate (UHR) score of recommendations.
    UHR = the fraction of users whose top list included at
    least one item also in their hidden set.
    params:
        recs_true: list of sets of hidden items for each user
        recs_pred: list of lists of recommended items, with each list
        k: number of recommendations to use in top set
    returns:
        float, range [0, 1]



## Similarity Measures

In [10]:
from redcarpet import jaccard_sim, cosine_sim

In [11]:
# Print redcarpet's documentation for the Jaccard set-similarity function.
help(jaccard_sim)

Help on function jaccard_sim in module redcarpet:

jaccard_sim(u, v)
    Computes the Jaccard similarity between sets u and v.
    sim = intersection(u, v) / union(u, v)
    params:
        u, v: sets to compare
    returns:
        float between 0 and 1, where 1 represents perfect
            similarity and 0 represents no similarity



In [12]:
# Print redcarpet's documentation for the Cosine set-similarity function.
help(cosine_sim)

Help on function cosine_sim in module redcarpet:

cosine_sim(u, v)
    Computes the Cosine similarity between sets u and v.
    sim = intersection(u, v) / sqrt(|u| * |v|)
    Where |s| is the number of items in set s
    params:
        u, v: sets to compare
    returns:
        float between 0 and 1, where 1 represents perfect
            similarity and 0 represents no similarity



## Recommendation Algorithms

In [13]:
from redcarpet import collaborative_filter, content_filter, weighted_hybrid

In [14]:
# Print redcarpet's documentation for the collaborative-filtering recommender.
help(collaborative_filter)

Help on function collaborative_filter in module redcarpet:

collaborative_filter(recs_train, recs_input, k=10, j=3, sim_fn=<function jaccard_sim at 0x1130120d0>)
    Collaborative filtering recommender system.
    params:
        recs_train: list of sets of liked item indices for train data
        recs_input: list of sets of liked item indices for input data
        k: number of items to recommend for each user
        j: number of similar users to base recommendations on
        sim_fn(u, v): function that returns a float value representing
            the similarity between sets u and v
    returns:
        recs_pred: list of lists of tuples of recommendations where
            each tuple has (item index, relevance score) with the list
            of tuples sorted in order of decreasing relevance



In [15]:
# Print redcarpet's documentation for the content-based recommender.
help(content_filter)

Help on function content_filter in module redcarpet:

content_filter(items_train, recs_input, k=10, sim_fn=<function jaccard_sim at 0x1130120d0>)
    Content-based filtering recommender system.
    params:
        items_train: list of sets of non-zero attribute indices for items
        recs_input: list of sets of liked item indices for input data
        k: number of items to recommend for each user
        sim_fn(u, v): function that returns a float value representing
            the similarity between sets u and v
    returns:
        recs_pred: list of lists of tuples of recommendations where
            each tuple has (item index, relevance score) with the list
            of tuples sorted in order of decreasing relevance



In [16]:
# Print redcarpet's documentation for the weighted hybrid recommender.
help(weighted_hybrid)

Help on function weighted_hybrid in module redcarpet:

weighted_hybrid(components, k=10)
    Hybrid recommender system using weights.
    params:
        components: list of tuples where each tuple has (recs list, weight)
            where recs_list is a list of tuples where each tuple has
            (item index, relevance score) and weight is the factor used to
            scale the relevance scores for this component
        k: number of items to recommend for each user
    returns:
        recs_pred: list of lists of tuples of recommendations where
            each tuple has (item index, relevance score) with the list
            of tuples sorted in order of decreasing relevance



## Model Selection

In [17]:
from redcarpet import get_recs

In [18]:
# Print redcarpet's documentation for get_recs, which strips scores from recommendations.
help(get_recs)

Help on function get_recs in module redcarpet:

get_recs(user_recs, k=None)
    Extracts recommended item indices, leaving out their scores.
    params:
        user_recs: list of lists of tuples of recommendations where
            each tuple has (item index, relevance score) with the
            list of tuples sorted in order of decreasing relevance
        k: maximum number of recommendations to include for each
            user, if None, include all recommendations
    returns:
        list of lists of recommendations where each
            list has the column indices of recommended items
            sorted in order they appeared in user_recs



In [19]:
# Evaluation settings shared by the model-selection cells below.
k_top = 10        # value passed as k to the MAP / UHR metrics
j_neighbors = 30  # value passed as j (similar users) to collaborative filtering
n_pred = 250      # users to evaluate; use len(s_input) for the full set
# Matching prefixes of the visible and held-back likes for the first n_pred users.
s_input_sample, s_hidden_sample = s_input[:n_pred], s_hidden[:n_pred]

In [20]:
# Collaborative filtering with Jaccard similarity, scored on the evaluation sample.
print("Strategy: Collaborative")
print("Similarity: Jaccard")
# Pass k=k_top explicitly so the number of recommendations produced always
# matches the k the metrics are computed at (the default k only matched by coincidence).
collab_jac = collaborative_filter(
    s_train, s_input_sample, k=k_top, j=j_neighbors, sim_fn=jaccard_sim
)
# Strip the scores once and reuse the result for both metrics.
collab_jac_recs = get_recs(collab_jac)
print("MAP = {0:.3f}".format(mapk_score(s_hidden_sample, collab_jac_recs, k=k_top)))
print("UHR = {0:.3f}".format(uhr_score(s_hidden_sample, collab_jac_recs, k=k_top)))

Strategy: Collaborative
Similarity: Jaccard
MAP = 0.156
UHR = 0.680


In [21]:
# Collaborative filtering with Cosine similarity, scored on the evaluation sample.
print("Strategy: Collaborative")
print("Similarity: Cosine")
# Pass k=k_top explicitly so the number of recommendations produced always
# matches the k the metrics are computed at (the default k only matched by coincidence).
collab_cos = collaborative_filter(
    s_train, s_input_sample, k=k_top, j=j_neighbors, sim_fn=cosine_sim
)
# Strip the scores once and reuse the result for both metrics.
collab_cos_recs = get_recs(collab_cos)
print("MAP = {0:.3f}".format(mapk_score(s_hidden_sample, collab_cos_recs, k=k_top)))
print("UHR = {0:.3f}".format(uhr_score(s_hidden_sample, collab_cos_recs, k=k_top)))

Strategy: Collaborative
Similarity: Cosine
MAP = 0.157
UHR = 0.676


In [22]:
# Blend the two collaborative models: Jaccard scores weighted 0.8, Cosine 0.2.
print("Strategy: Collaborative")
print("Similarity: Hybrid (0.8 * Jaccard + 0.2 * Cosine)")
# Pass k=k_top explicitly so the blended list length always matches the k
# used by the metrics instead of relying on weighted_hybrid's default.
collab_hybrid = weighted_hybrid([
    (collab_jac, 0.8),
    (collab_cos, 0.2)
], k=k_top)
# Strip the scores once and reuse the result for both metrics.
collab_hybrid_recs = get_recs(collab_hybrid)
print("MAP = {0:.3f}".format(mapk_score(s_hidden_sample, collab_hybrid_recs, k=k_top)))
print("UHR = {0:.3f}".format(uhr_score(s_hidden_sample, collab_hybrid_recs, k=k_top)))

Strategy: Collaborative
Similarity: Hybrid (0.8 * Jaccard + 0.2 * Cosine)
MAP = 0.156
UHR = 0.680


In [23]:
# Content-based filtering with Jaccard similarity, scored on the evaluation sample.
print("Strategy: Content-Based")
print("Similarity: Jaccard")
# Pass k=k_top explicitly so the number of recommendations produced always
# matches the k the metrics are computed at (the default k only matched by coincidence).
cont_jac = content_filter(s_items, s_input_sample, k=k_top, sim_fn=jaccard_sim)
# Strip the scores once and reuse the result for both metrics.
cont_jac_recs = get_recs(cont_jac)
print("MAP = {0:.3f}".format(mapk_score(s_hidden_sample, cont_jac_recs, k=k_top)))
print("UHR = {0:.3f}".format(uhr_score(s_hidden_sample, cont_jac_recs, k=k_top)))

Strategy: Content-Based
Similarity: Jaccard
MAP = 0.063
UHR = 0.380


In [24]:
# Content-based filtering with Cosine similarity, scored on the evaluation sample.
print("Strategy: Content-Based")
print("Similarity: Cosine")
# Pass k=k_top explicitly so the number of recommendations produced always
# matches the k the metrics are computed at (the default k only matched by coincidence).
cont_cos = content_filter(s_items, s_input_sample, k=k_top, sim_fn=cosine_sim)
# Strip the scores once and reuse the result for both metrics.
cont_cos_recs = get_recs(cont_cos)
print("MAP = {0:.3f}".format(mapk_score(s_hidden_sample, cont_cos_recs, k=k_top)))
print("UHR = {0:.3f}".format(uhr_score(s_hidden_sample, cont_cos_recs, k=k_top)))

Strategy: Content-Based
Similarity: Cosine
MAP = 0.067
UHR = 0.408


In [25]:
# Blend the two content-based models: Jaccard scores weighted 0.8, Cosine 0.2.
print("Strategy: Content-Based")
print("Similarity: Hybrid (0.8 * Jaccard + 0.2 * Cosine)")
# Pass k=k_top explicitly so the blended list length always matches the k
# used by the metrics instead of relying on weighted_hybrid's default.
cont_hybrid = weighted_hybrid([
    (cont_jac, 0.8),
    (cont_cos, 0.2)
], k=k_top)
# Strip the scores once and reuse the result for both metrics.
cont_hybrid_recs = get_recs(cont_hybrid)
print("MAP = {0:.3f}".format(mapk_score(s_hidden_sample, cont_hybrid_recs, k=k_top)))
print("UHR = {0:.3f}".format(uhr_score(s_hidden_sample, cont_hybrid_recs, k=k_top)))

Strategy: Content-Based
Similarity: Hybrid (0.8 * Jaccard + 0.2 * Cosine)
MAP = 0.065
UHR = 0.380


## Inspect Individual User

In [26]:
# Drill into a single user's recommendations from one model: what they were
# given, what was recommended, what they actually liked, and per-item scores.
uid = 13
all_recs = collab_jac
# all_recs comes from the evaluation sample, which may be shorter than
# s_input / s_hidden; fail early with a clear message instead of an IndexError
# halfway through the report.
assert 0 <= uid < len(all_recs), "uid must index a user covered by all_recs"
s_pred = get_recs(all_recs)
user_pred = s_pred[uid]
user_hidden = s_hidden[uid]
# Report the neighbour count actually configured rather than a hard-coded value.
print("Model: Collaborative Filtering with Jaccard Similarity (j={})".format(j_neighbors))
print("User: {}".format(uid))
print()
print("Given:       {}".format(sorted(s_input[uid])))
print("Recommended: {}".format(sorted(user_pred)))
print("Actual:      {}".format(sorted(user_hidden)))
# Items that were both recommended and present in the user's hidden set.
set_intersect = set(user_pred).intersection(set(user_hidden))
n_intersect = len(set_intersect)
# MAP@K over a one-user list is that user's average precision.
apk = mapk_score([user_hidden], [user_pred], k_top)
jacc = jaccard_sim(set(user_pred), set(user_hidden))
print()
print("Recommendation Hits = {}".format(n_intersect))
print("Average Precision   = {0:.3f}".format(apk))
print("Jaccard Similarity  = {0:.3f}".format(jacc))
print()
print("Successful Recommendations:")
# Sort the hits so the listing is deterministic (set iteration order is not).
for item_id in sorted(set_intersect):
    print("- {} ({})".format(ITEM_NAMES[item_id], "cameo.com/" + ITEM_IDS[item_id]))
print()
print("All Recommendation Scores:")
for rank, (item_id, score) in enumerate(all_recs[uid], start=1):
    hit = "Y" if item_id in user_hidden else " "
    print("{0}. [{3}] ({2:.3f}) {1}".format(str(rank).zfill(2), ITEM_NAMES[item_id], score, hit))

Model: Collaborative Filtering with Jaccard Similarity (j=30)
User: 13

Given:       [14, 95, 250, 555]
Recommended: [0, 9, 23, 129, 154, 178, 179, 194, 691, 3470]
Actual:      [53, 178]

Recommendation Hits = 1
Average Precision   = 0.500
Jaccard Similarity  = 0.091

Successful Recommendations:
- Nikki Blonsky (cameo.com/nikkiblonsky)

All Recommendation Scores:
01. [Y] (0.333) Nikki Blonsky
02. [ ] (0.233) Chris Rankin
03. [ ] (0.167) Taylor Louderman
04. [ ] (0.100) Evan Breen
05. [ ] (0.100) Kyle Massey
06. [ ] (0.100) Tom Felton
07. [ ] (0.100) Michael Rapaport
08. [ ] (0.067) Perez Hilton
09. [ ] (0.067) Chelcie Lynn
10. [ ] (0.067) Drew Gehling


## Submit to Kaggle

In [27]:
from redcarpet import write_kaggle_recs

In [28]:
# Print redcarpet's documentation for the Kaggle submission writer.
help(write_kaggle_recs)

Help on function write_kaggle_recs in module redcarpet:

write_kaggle_recs(recs_list, filename=None, headers=['Id', 'Predicted'])
    Writes recommendations to file in Kaggle submission format.
    params:
        recs_list: list of lists of recommendations where each
            list has the column indices of recommended items
            sorted in order of decreasing relevance
        filename: path to file for writing output
        headers: list of strings of output columns, defaults to
            submission columns: ["Id", "Predicted"]
    returns:
        int: number of non-header lines, where each line represents
            a user and the recommendations given to them



In [29]:
# Load hold out set
s_hold_input = pickle.load(open("../input/hold_set.pkl", "rb"))
print("Hold Out Set: N = {}".format(len(s_hold_input)))
s_all_input = s_input + s_hold_input
print("All Input:    N = {}".format(len(s_all_input)))

Hold Out Set: N = 1000
All Input:    N = 2000


In [30]:
# Produce the submission recommendations with the chosen model.
print("Final Model")
print("Strategy: Collaborative")
print("Similarity: Jaccard")
# Be sure to use the entire s_input (test + hold-out users), not the evaluation sample.
# Reuse the evaluated settings (j_neighbors, k_top) instead of repeating magic
# numbers, so the submitted model cannot silently diverge from the one scored above.
final_scores = collaborative_filter(
    s_train, s_all_input, k=k_top, j=j_neighbors, sim_fn=jaccard_sim
)
final_recs = get_recs(final_scores)

Final Model
Strategy: Collaborative
Similarity: Jaccard


In [31]:
# Write the final recommendations in Kaggle submission format and report
# how many user rows the writer says it produced.
outfile = "../output/kaggle_submission_collab_jaccard_j30.csv"
n_lines = write_kaggle_recs(final_recs, filename=outfile)
print("Wrote predictions for %s users to %s." % (n_lines, outfile))

Wrote predictions for 2000 users to ../output/kaggle_submission_collab_jaccard_j30.csv.
