# Prepare Train and Test Data

## Recommended Reading

- **Surprise!** A Python scikit for recommender systems. [Module Page](http://surpriselib.com/)
- **Kaggle:** [Recommender Systems in Python 101](https://www.kaggle.com/gspmoreira/recommender-systems-in-python-101)
- **Towards Data Science:** [Evaluation Metrics for Recommender Systems](https://towardsdatascience.com/evaluation-metrics-for-recommender-systems-df56c6611093)

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [2]:
item_file = "talent.pkl"
item_records, COLUMN_LABELS, READABLE_LABELS, ATTRIBUTES = pickle.load(open(item_file, "rb"))
item_df = pd.DataFrame(item_records)[ATTRIBUTES + COLUMN_LABELS].fillna(value=0)
item_df.head()

Unnamed: 0,id,name,price,reactions,stars,joined,categories,in_13_reasons_why,in_90_day_fiance,in_actors,...,in_ufc,in_vanderpump_rules,in_venture_capitalists,in_viners,in_vlog_squad,in_voice_actors,in_winter_sports,in_writers,in_younow,in_youtubers
0,perezhilton,Perez Hilton,27.0,924,5.0,April 2018,"[Reality TV, Commentators, Featured]",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,andydick,Andy Dick,99.0,340,4.9,October 2018,"[Reality TV, Comedians, Featured, Actors]",0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,tjlavin,TJ Lavin,80.0,291,5.0,February 2018,"[Reality TV, Riders, Featured, Extreme Sports,...",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,carsonkressley,Carson Kressley,59.0,290,5.0,October 2018,"[Reality TV, Bravo, Stylists, Featured, Actors...",0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,riffraff,RiFF RAFF,75.0,402,4.7,December 2017,"[Rappers, Featured, Musicians]",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
ITEM_NAMES = item_df["id"].values
like_file = "likes.pkl"
like_csr = pickle.load(open(like_file, "rb"))
like_mat = np.array(like_csr.todense())
like_df = pd.DataFrame(like_mat, columns=ITEM_NAMES)
like_df.head()

Unnamed: 0,perezhilton,andydick,tjlavin,carsonkressley,riffraff,chumlee,gilbertgottfried,icet,benhiggy,laturtle,...,chrisjaialex,voman,el_peego,thisannaisbananas,zachharper,johnoberg,zacpullam,kansasbowling,mattcirulnick,itsscaleb__
0,0,0,0,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Make Train/Test/Input/Hidden Data

In [4]:
from sklearn.model_selection import train_test_split


def nonzero_index_set(arr):
    """
    Returns a set of  indices corresponding to non-zero
    entries in a numpy array (or other list-like).
    """
    res = set()
    for i, val in enumerate(arr):
        if val > 0:
            res.add(i)
    return res


def mat_to_sets(mat):
    """
    Converts a numpy matrix into a list of sets of column
    indices corresponding to non-zero row entries.
    """
    return [nonzero_index_set(row) for row in mat]


def make_test_input(m_test, min_entries=3, seed=0):
    """
    Randomly removes non-zero entries from rows in a numpy matrix
    leaving exactly {min_entries} non-zero entries per row.
    params:
        m_test: numpy matrix of test data
        min_entries: number of entries to leave in each row
        seed: seed for pseudorandomness
    returns:
        m_input: new numpy matrix with some entries removed
        m_hidden: new numpy matrix of the removed entries
        s_input: list of sets with some entries removed
        s_hidden: list of sets of the removed entries
    """
    np.random.seed(seed)
    m_input = m_test.copy()
    m_hidden = np.zeros(m_input.shape)
    s_hidden = []
    for ridx, row in enumerate(m_input):
        idxs = nonzero_index_set(row)
        to_remove = len(idxs) - min_entries
        rems = np.random.choice(list(idxs), size=to_remove)
        for cidx in rems:
            m_input[ridx][cidx] = 0
            m_hidden[ridx][cidx] = 1
        s_hidden.append(set(rems))
    s_input = mat_to_sets(m_input)
    return m_input, m_hidden, s_input, s_hidden


def recs_train_test_split(mat, test_size=0.2, min_entries=3, seed=0):
    """
    Splits a transaction matrix into train and test data.
    params:
        mat: numpy matrix of all data
        test_size: proportion of dataset to hold for testing
        min_entries: number of entries to leave in each test row
        seed: seed for pseudorandomness
    returns:
        m_train: numpy matrix of train data
        m_test: numpy matrix of test data
        m_input: numpy matrix of test data, with some entries removed
        m_hidden: numpy matrix of entries removed from test data
        s_train: list of sets of train data
        s_test: list of sets of test data
        s_input: list of sets of test data, with some entries removed
        s_hidden: list of sets of entries removed from test data
    """
    m_train, m_test = train_test_split(mat, test_size=test_size, shuffle=True, random_state=seed)
    m_input, m_hidden, s_input, s_hidden = make_test_input(m_test, min_entries=min_entries, seed=seed)
    s_train = mat_to_sets(m_train)
    s_test = mat_to_sets(m_test)
    return m_train, m_test, m_input, m_hidden, s_train, s_test, s_input, s_hidden

In [5]:
min_likes = like_df.sum(axis=1).min()
print("Each user has liked at least {} talent.".format(min_likes))

Each user has liked at least 6 talent.


In [6]:
m_train, m_test, m_input, m_hidden, s_train, s_test, s_input, s_hidden = recs_train_test_split(
    like_mat, test_size=0.2, min_entries=3, seed=0
)
len(m_train), len(m_test), len(m_input), len(m_hidden), len(s_train), len(s_test), len(s_input), len(s_hidden)

(4000, 1000, 1000, 1000, 4000, 1000, 1000, 1000)

In [7]:
from scipy.sparse import csr_matrix
import pickle


mat_split = [csr_matrix(mat) for mat in (m_train, m_test, m_input, m_hidden)]
set_split = (s_train, s_test, s_input, s_hidden)
out_data = (mat_split, set_split)


mat_file = "train_test_mat.pkl"
pickle.dump(mat_split, open(mat_file, "wb"))
print("Mat Format: Wrote {} train records and {} test records to {}.".format(len(m_train), len(m_test), mat_file))


set_file = "train_test_set.pkl"
pickle.dump(set_split, open(set_file, "wb"))
print("Set Format: Wrote {} train records and {} test records to {}.".format(len(s_train), len(s_test), set_file))

Mat Format: Wrote 4000 train records and 1000 test records to train_test_mat.pkl.
Set Format: Wrote 4000 train records and 1000 test records to train_test_set.pkl.


In [8]:
def write_kaggle_recs_file(filename, headers, recs_list):
    lines = [",".join(headers)]
    for i, recs in enumerate(recs_list):
        lines.append("{},{}".format(i, " ".join([str(v) for v in recs])))
    text = "\n".join(lines)
    with open(filename, "w") as file:
        file.write(text)
    return len(lines) - 1

In [9]:
soln_headers = ["Id", "Expected"]
soln_file = "kaggle_solution.csv"
soln_n = write_kaggle_recs_file(soln_file, soln_headers, s_hidden)
print("Wrote solution file of {} sets to {}.".format(soln_n, soln_file))

Wrote solution file of 1000 sets to kaggle_solution.csv.


## Evaluation Metrics

In [10]:
def mtp_score(recs_true, recs_pred, t=10, adjusted=False):
    """
    Computes the Mean Top Precision (MTP) score of recommendations.
    For each user, top precision = # of correct recs / # of total recs
    Where # of total recommendations is limited to the first t.
    MTP is the mean of top precision across all users.
    params:
        recs_true: list of sets of hidden items for each user
        recs_pred: list of lists of recommended items, with each list
        sorted in order of decreasing relevance
        t: number of recommendations to use in top set
        adjusted: if True, adjust for the number of hidden items
    """
    if len(recs_true) != len(recs_pred):
        note = "Length of true list {} does not match length of recommended list {}."
        raise ValueError(note.format(len(recs_true), len(recs_pred)))
    scores = []
    for r_true, r_pred_orig in zip(recs_true, recs_pred):
        r_pred = list(r_pred_orig)[0:t]
        possible = t
        # If a user has less than t hidden recs, then the
        # maximum top precision cannot be reached
        if adjusted:
            possible = min(len(r_true), t)
        intersect = set(r_true).intersection(set(r_pred))
        prec = len(intersect) / possible
        scores.append(prec)
    return np.mean(scores)

In [11]:
def uhr_score(recs_true, recs_pred, t=10):
    """
    Computes the User Hit Rate (UHR) score of recommendations.
    UHR = the fraction of users whose top list included at
    least one item also in their hidden set.
    params:
        recs_true: list of sets of hidden items for each user
        recs_pred: list of lists of recommended items, with each list
        t: number of recommendations to use in top set
    """
    if len(recs_true) != len(recs_pred):
        note = "Length of true list {} does not match length of recommended list {}."
        raise ValueError(note.format(len(recs_true), len(recs_pred)))
    scores = []
    for r_true, r_pred_orig in zip(recs_true, recs_pred):
        r_pred = list(r_pred_orig)[0:t]
        intersect = set(r_true).intersection(set(r_pred))
        scores.append(1 if len(intersect) > 0 else 0)
    return np.mean(scores)

In [12]:
def collaborate_jaccard(recs_train, recs_input, n=10, k=3):
    """
    Collaborative filtering recommender system using Jaccard set similarity.
    params:
        recs_train: list of sets of liked item indices for train data
        recs_input: list of sets of liked item indices for input data
        n: number of items to recommend for each user
        k: number of similar users to base recommendations on
    returns:
        recs_pred: list of lists of recommended item indices,
        with each list sorted in order of decreasing relevance
    """
    recs_pred = []
    for src in recs_input:
        users = []
        for vec in recs_train:
            sim = len(vec.intersection(src)) / (len(vec.union(src)) + 1e-5)
            if sim > 0:
                users.append((sim, vec))
        k_users = min(len(users), k)
        if k_users > 0:
            top_users = sorted(users, key=lambda p: p[0], reverse=True)
            vecs = [vec for (sim, vec) in top_users[0:k_users]]
            opts = dict()
            for user_set in vecs:
                for item in user_set:
                    if item not in src:
                        if item not in opts:
                            opts[item] = 0
                        opts[item] += 1
            ranks = [(opts[i], i) for i in opts]
            top_ranks = sorted(ranks, reverse=True)
            n_recs = min(len(top_ranks), n)
            recs = [i for (s, i) in top_ranks[0:n_recs]]
            recs_pred.append(recs)
        else:
            recs_pred.append([])
    return recs_pred

In [13]:
import ml_metrics


n_pred = len(s_input)
n_top = 10
s_pred = collaborate_jaccard(s_train, s_input[0:n_pred], n=n_top, k=30)
mtp = mtp_score(s_hidden[0:n_pred], s_pred, t=n_top)
mtp_adj = mtp_score(s_hidden[0:n_pred], s_pred, t=n_top, adjusted=True)
uhr = uhr_score(s_hidden[0:n_pred], s_pred, t=n_top)
mapk = ml_metrics.mapk(s_hidden[0:n_pred], s_pred, n_top)
print("MTP  = {0:.3f}".format(mtp))
print("MTP* = {0:.3f}".format(mtp_adj))
print("UHR  = {0:.3f}".format(uhr))
print("MAP  = {0:.3f}".format(mapk))

MTP  = 0.115
MTP* = 0.278
UHR  = 0.694
MAP  = 0.152


In [14]:
pred_headers = ["Id", "Predicted"]
pred_file = "kaggle_submission.csv"
pred_n = write_kaggle_recs_file(pred_file, pred_headers, s_pred)
print("Wrote sample submission file of {} recommendations to {}.".format(pred_n, pred_file))

Wrote sample submission file of 1000 recommendations to kaggle_submission.csv.
