# Prepare Train and Test Data

## Recommended Reading

- **Surprise!** A Python scikit for recommender systems. [Module Page](http://surpriselib.com/)
- **Kaggle:** [Recommender Systems in Python 101](https://www.kaggle.com/gspmoreira/recommender-systems-in-python-101)
- **Towards Data Science:** [Evaluation Metrics for Recommender Systems](https://towardsdatascience.com/evaluation-metrics-for-recommender-systems-df56c6611093)

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [2]:
# Load module from another directory
import shutil
shutil.copyfile(src="../scripts/redcarpet.py", dst="../notebooks/redcarpet.py")
from redcarpet import nonzero_index_set, mat_to_sets, write_kaggle_recs

In [3]:
item_file = "../input/talent.pkl"
item_records, COLUMN_LABELS, READABLE_LABELS, ATTRIBUTES = pickle.load(open(item_file, "rb"))
item_df = pd.DataFrame(item_records)[ATTRIBUTES + COLUMN_LABELS].fillna(value=0)
ITEM_NAMES = item_df["name"].values
ITEM_IDS = item_df["id"].values
item_df.head()

Unnamed: 0,id,name,price,reactions,stars,joined,categories,in_13_reasons_why,in_90_day_fiance,in_actors,...,in_ufc,in_vanderpump_rules,in_venture_capitalists,in_viners,in_vlog_squad,in_voice_actors,in_winter_sports,in_writers,in_younow,in_youtubers
0,perezhilton,Perez Hilton,27.0,924,5.0,April 2018,"[Reality TV, Commentators, Featured]",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,andydick,Andy Dick,99.0,340,4.9,October 2018,"[Reality TV, Comedians, Featured, Actors]",0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,tjlavin,TJ Lavin,80.0,291,5.0,February 2018,"[Reality TV, Riders, Featured, Extreme Sports,...",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,carsonkressley,Carson Kressley,59.0,290,5.0,October 2018,"[Reality TV, Bravo, Stylists, Featured, Actors...",0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,riffraff,RiFF RAFF,75.0,402,4.7,December 2017,"[Rappers, Featured, Musicians]",0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
like_file = "../input/likes.pkl"
like_csr = pickle.load(open(like_file, "rb"))
like_mat = np.array(like_csr.todense())
like_df = pd.DataFrame(like_mat, columns=ITEM_NAMES)
like_df.head()

Unnamed: 0,Perez Hilton,Andy Dick,TJ Lavin,Carson Kressley,RiFF RAFF,Chumlee,Gilbert Gottfried,Ice T,Ben Higgins,Evan Breen,...,Chris Jai Alex,Peter Dickson,Laura Perlongo,Anna del Gaizo,Zach Harper,John Oberg,Zac Pullam,Kansas Bowling,Matt Cirulnick,Caleb Senzel
0,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Make Train/Test/Input/Hidden Data

In [5]:
from sklearn.model_selection import train_test_split


def make_test_input(m_test, min_entries=3, seed=0):
    """
    Randomly removes non-zero entries from rows in a numpy matrix
    leaving exactly {min_entries} non-zero entries per row.
    params:
        m_test: numpy matrix of test data
        min_entries: number of entries to leave in each row
        seed: seed for pseudorandomness
    returns:
        m_input: new numpy matrix with some entries removed
        m_hidden: new numpy matrix of the removed entries
        s_input: list of sets with some entries removed
        s_hidden: list of sets of the removed entries
    """
    np.random.seed(seed)
    m_input = m_test.copy()
    m_hidden = np.zeros(m_input.shape)
    s_hidden = []
    for ridx, row in enumerate(m_input):
        idxs = nonzero_index_set(row)
        to_remove = len(idxs) - min_entries
        rems = np.random.choice(list(idxs), size=to_remove)
        for cidx in rems:
            m_input[ridx][cidx] = 0
            m_hidden[ridx][cidx] = 1
        s_hidden.append(set(rems))
    s_input = mat_to_sets(m_input)
    return m_input, m_hidden, s_input, s_hidden


def recs_train_test_split(mat, test_size=0.2, min_entries=3, seed=0):
    """
    Splits a transaction matrix into train and test data.
    params:
        mat: numpy matrix of all data
        test_size: proportion of dataset to hold for testing
        min_entries: number of entries to leave in each test row
        seed: seed for pseudorandomness
    returns:
        m_train: numpy matrix of train data
        m_test: numpy matrix of test data
        m_input: numpy matrix of test data, with some entries removed
        m_hidden: numpy matrix of entries removed from test data
        s_train: list of sets of train data
        s_test: list of sets of test data
        s_input: list of sets of test data, with some entries removed
        s_hidden: list of sets of entries removed from test data
    """
    m_train, m_test = train_test_split(mat, test_size=test_size, shuffle=True, random_state=seed)
    m_input, m_hidden, s_input, s_hidden = make_test_input(m_test, min_entries=min_entries, seed=seed)
    s_train = mat_to_sets(m_train)
    s_test = mat_to_sets(m_test)
    return m_train, m_test, m_input, m_hidden, s_train, s_test, s_input, s_hidden

In [6]:
min_likes = like_df.sum(axis=1).min()
print("Each user has liked at least {} talent.".format(min_likes))

Each user has liked at least 11 talent.


In [7]:
print("Make Hold Out Split")
m_train, m_hold, m_hold_input, m_hold_hidden, s_train, s_hold, s_hold_input, s_hold_hidden = recs_train_test_split(
    like_mat, test_size=0.2, min_entries=3, seed=0
)
print("Type", "Train", "Hold", "Input", "Hidden")
print("Matrix",len(m_train), len(m_hold), len(m_hold_input), len(m_hold_hidden))
print("Set", len(s_train), len(s_hold), len(s_hold_input), len(s_hold_hidden))
print()

print("Make Train/Test Split")
m_train, m_test, m_input, m_hidden, s_train, s_test, s_input, s_hidden = recs_train_test_split(
    m_train, test_size=0.25, min_entries=3, seed=0
)
print("Type", "Test", "Hold", "Input", "Hidden")
print("Train",len(m_train), len(m_test), len(m_input), len(m_hidden))
print("Test", len(s_train), len(s_test), len(s_input), len(s_hidden))

Make Hold Out Split
Type Train Hold Input Hidden
Matrix 4000 1000 1000 1000
Set 4000 1000 1000 1000

Make Train/Test Split
Type Test Hold Input Hidden
Train 3000 1000 1000 1000
Test 3000 1000 1000 1000


In [8]:
from scipy.sparse import csr_matrix
import pickle


mat_split = [csr_matrix(mat) for mat in (m_train, m_test, m_input, m_hidden)]
set_split = (s_train, s_test, s_input, s_hidden)

mat_file = "../input/train_test_mat.pkl"
pickle.dump(mat_split, open(mat_file, "wb"))
print("Mat Format: Wrote {} train records and {} test records to {}.".format(len(m_train), len(m_test), mat_file))

set_file = "../input/train_test_set.pkl"
pickle.dump(set_split, open(set_file, "wb"))
print("Set Format: Wrote {} train records and {} test records to {}.".format(len(s_train), len(s_test), set_file))

Mat Format: Wrote 3000 train records and 1000 test records to ../input/train_test_mat.pkl.
Set Format: Wrote 3000 train records and 1000 test records to ../input/train_test_set.pkl.


In [9]:
mat_hold_file = "../input/hold_mat.pkl"
pickle.dump(csr_matrix(m_hold_input), open(mat_hold_file, "wb"))
print("Mat Format: Wrote {} hold out input records to {}.".format(len(m_hold_input), mat_hold_file))

set_hold_file = "../input/hold_set.pkl"
pickle.dump(s_hold_input, open(set_hold_file, "wb"))
print("Set Format: Wrote {} hold out input records to {}.".format(len(s_hold_input), set_hold_file))

Mat Format: Wrote 1000 hold out input records to ../input/hold_mat.pkl.
Set Format: Wrote 1000 hold out input records to ../input/hold_set.pkl.


In [10]:
mat_hold_soln_split = [csr_matrix(mat) for mat in (m_hold, m_hold_input, m_hold_hidden)]
set_hold_soln_split = (s_hold, s_hold_input, s_hold_hidden)

mat_hold_soln_file = "../input/hold_soln_mat.pkl"
pickle.dump(mat_hold_soln_split, open(mat_hold_soln_file, "wb"))
print("Mat Format: Wrote {} hold out input records to {}.".format(len(m_hold), mat_hold_soln_file))

set_hold_soln_file = "../input/hold_soln_set.pkl"
pickle.dump(set_hold_soln_split, open(set_hold_soln_file, "wb"))
print("Set Format: Wrote {} hold out input records to {}.".format(len(s_hold), set_hold_soln_file))

Mat Format: Wrote 1000 hold out input records to ../input/hold_soln_mat.pkl.
Set Format: Wrote 1000 hold out input records to ../input/hold_soln_set.pkl.


In [11]:
soln_headers = ["Id", "Expected"]
soln_file = "../output/kaggle_solution_test.csv"
soln_n = write_kaggle_recs(s_hidden, soln_file, soln_headers)
print("Wrote test set solution file for {} users to {}.".format(soln_n, soln_file))

Wrote test set solution file for 1000 users to ../output/kaggle_solution_test.csv.


In [12]:
hold_headers = ["Id", "Expected"]
hold_file = "../output/kaggle_solution_hold.csv"
hold_n = write_kaggle_recs(s_hold_hidden, hold_file, hold_headers)
print("Wrote hold out set solution file for {} users to {}.".format(hold_n, hold_file))

Wrote hold out set solution file for 1000 users to ../output/kaggle_solution_hold.csv.


In [13]:
# Write joint Kaggle file for public and private leaderboard
full_file = "../output/kaggle_solution_leaderboard.csv"
full_headers = ["Id", "Expected", "Usage"]
full_lines = [",".join(full_headers)]
all_hidden = s_hidden + s_hold_hidden
all_usage = ["Public" for _ in s_hidden] + ["Private" for _ in s_hold_hidden]
for i, (recs, usage) in enumerate(zip(all_hidden, all_usage)):
    user_recs = " ".join([str(v) for v in recs])
    line = "{},{},{}".format(i, user_recs, usage)
    full_lines.append(line)
full_text = "\n".join(full_lines)
with open(full_file, "w") as file:
    file.write(full_text)
full_n = len(full_lines) - 1
print("Wrote leaderboard solution file for {} users to {}.".format(full_n, full_file))

Wrote leaderboard solution file for 2000 users to ../output/kaggle_solution_leaderboard.csv.


## Evaluation Metrics

In [14]:
def mtp_score(recs_true, recs_pred, k=10, adjusted=False):
    """
    Computes the Mean Top Precision (MTP) score of recommendations.
    For each user, top precision = # of correct recs / # of total recs
    Where # of total recommendations is limited to the first t.
    MTP is the mean of top precision across all users.
    params:
        recs_true: list of sets of hidden items for each user
        recs_pred: list of lists of recommended items, with each list
        sorted in order of decreasing relevance
        k: number of recommendations to use in top set
        adjusted: if True, adjust for the number of hidden items
    """
    if len(recs_true) != len(recs_pred):
        note = "Length of true list {} does not match length of recommended list {}."
        raise ValueError(note.format(len(recs_true), len(recs_pred)))
    scores = []
    for r_true, r_pred_orig in zip(recs_true, recs_pred):
        r_pred = list(r_pred_orig)[0:k]
        possible = k
        # If a user has less than t hidden recs, then the
        # maximum top precision cannot be reached
        if adjusted:
            possible = min(len(r_true), k)
        intersect = set(r_true).intersection(set(r_pred))
        prec = len(intersect) / possible
        scores.append(prec)
    return np.mean(scores)

In [15]:
from redcarpet import get_recs, mapk_score, uhr_score, collaborative_filter, jaccard_sim


n_pred = len(s_input)
n_top = 10
s_scores = collaborative_filter(s_train, s_input[0:n_pred], sim_fn=jaccard_sim, k=n_top, j=30)
s_pred = get_recs(s_scores)
mtp = mtp_score(s_hidden[0:n_pred], s_pred, k=n_top)
mtp_adj = mtp_score(s_hidden[0:n_pred], s_pred, k=n_top, adjusted=True)
uhr = uhr_score(s_hidden[0:n_pred], s_pred, k=n_top)
mapk = mapk_score(s_hidden[0:n_pred], s_pred, k=n_top)
print("MTP  = {0:.3f}".format(mtp))
print("MTP* = {0:.3f}".format(mtp_adj))
print("UHR  = {0:.3f}".format(uhr))
print("MAP  = {0:.3f}".format(mapk))

MTP  = 0.461
MTP* = 0.462
UHR  = 0.884
MAP  = 0.364


In [16]:
all_input = s_input + s_hold_input
full_scores = collaborative_filter(s_train, all_input, sim_fn=jaccard_sim, k=n_top, j=30)
full_pred = get_recs(full_scores)
pred_headers = ["Id", "Predicted"]
pred_file = "../output/kaggle_submission_sample.csv"
pred_n = write_kaggle_recs(full_pred, pred_file, pred_headers)
print("Wrote sample submission file of recommendations for {} users to {}.".format(pred_n, pred_file))

Wrote sample submission file of recommendations for 2000 users to ../output/kaggle_submission_sample.csv.
