# Problem 1

In this specific scenario, the calculation of a user-item pair's score is based on the weighted sum of features, encompassing both user and item characteristics. When arranging items for a single user, the user-specific part of the prediction remains constant, and the list of sorted items is solely influenced by item features. Surprisingly, the sorted list for another user turns out to be the same due to the consistent nature of the user-specific component. This model, therefore, lacks the capacity for personalization.

# Problem 2

In [62]:
from pathlib import Path
import gdown
DATA_PATH = Path("./data")
DATA_PATH.mkdir(exist_ok=True)

ANIMES_DATA_PATH = DATA_PATH / "animes.gz"
PROFILES_DATA_PATH = DATA_PATH / "profiles.gz"
REVIEWS_DATA_PATH = DATA_PATH / "reviews.gz"

urls = {
    "animes.gz": "https://drive.google.com/uc?id=1Z_nyJUEY8gqcQArAGatB6LsScGirEb6u",
    "profiles.gz": "https://drive.google.com/uc?id=1CLWRy-mv-bZzlzlCFN07h2anl2E-CtXU",
    "reviews.gz": "https://drive.google.com/uc?id=1XQW-OQyo64RqsYEo5_gzfgygN-5O1pdn"
}

for file, url in urls.items():
    gdown.download(url, str(DATA_PATH / file))

Downloading...
From: https://drive.google.com/uc?id=1Z_nyJUEY8gqcQArAGatB6LsScGirEb6u
To: c:\Users\Aleksey Ryabykin\Documents\GitHub\hse_courses\2nd_year\term1\recsys\hw1\data\animes.gz
100%|██████████| 3.60M/3.60M [00:00<00:00, 10.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1CLWRy-mv-bZzlzlCFN07h2anl2E-CtXU
To: c:\Users\Aleksey Ryabykin\Documents\GitHub\hse_courses\2nd_year\term1\recsys\hw1\data\profiles.gz
100%|██████████| 1.04M/1.04M [00:00<00:00, 4.49MB/s]
Downloading...
From (uriginal): https://drive.google.com/uc?id=1XQW-OQyo64RqsYEo5_gzfgygN-5O1pdn
From (redirected): https://drive.google.com/uc?id=1XQW-OQyo64RqsYEo5_gzfgygN-5O1pdn&confirm=t&uuid=9560bdbc-5a4f-407c-ab77-33a0c0395efc
To: c:\Users\Aleksey Ryabykin\Documents\GitHub\hse_courses\2nd_year\term1\recsys\hw1\data\reviews.gz
100%|██████████| 119M/119M [00:12<00:00, 9.64MB/s] 


In [63]:
import ast
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.pipeline import Pipeline, TransformerMixin
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

USER_COL = "user_id"
ITEM_COL = "anime_id"
RELEVANCE_COL = "score"

animes_df = pd.read_csv(ANIMES_DATA_PATH, na_filter=False)
reviews_df = pd.read_csv(REVIEWS_DATA_PATH)
profiles_df = pd.read_csv(PROFILES_DATA_PATH, converters={'favorites_anime': ast.literal_eval})

In [64]:
print("Rows: {}\nColunms: {}".format(*profiles_df.shape))
print("NaNs: {}".format(profiles_df.isna().sum().sum()))
profiles_df.sample(3)

Rows: 37458
Colunms: 5
NaNs: 22846


Unnamed: 0,user_id,gender,birthday,favorites_anime,link
19202,W3lkin830,Male,,"[32, 849, 2001, 7311, 12189, 32281, 33489, 352...",https://myanimelist.net/profile/W3lkin830
1487,yniverse,Female,Apr 29,"[7054, 20507, 21603, 22199, 28999, 32189, 33028]",https://myanimelist.net/profile/yniverse
9404,ukiAY,,,"[2942, 10232, 13759, 21273, 31953, 34798, 3554...",https://myanimelist.net/profile/ukiAY


In [65]:
print("Rows: {}\nColunms: {}".format(*reviews_df.shape))
print("NaNs: {}".format(reviews_df.isna().sum().sum()))
print("Duplicates: {}".format(reviews_df.duplicated().sum()))
reviews_df.sample(3)

Rows: 109297
Colunms: 7
NaNs: 0
Duplicates: 0


Unnamed: 0,uid,user_id,anime_id,text,score,scores,link
93035,272622,Mar33p,1793,It now being 2018 and seeing this anime for th...,6,"{'Overall': '6', 'Story': '6', 'Animation': '6...",https://myanimelist.net/reviews.php?id=272622
40977,124599,Kundalini,18153,I'd rate this somewhere between 6 and 7 out of...,6,"{'Overall': '6', 'Story': '7', 'Animation': '1...",https://myanimelist.net/reviews.php?id=124599
16010,199283,UrsulaCallistis,23623,Non Non Biyori Repeat returns once more to Ren...,9,"{'Overall': '9', 'Story': '8', 'Animation': '1...",https://myanimelist.net/reviews.php?id=199283


In [66]:
print("Rows: {}\nColunms: {}".format(*animes_df.shape))
print("NaNs: {}".format(animes_df.isna().sum().sum()))
print("Duplicates: {}".format(animes_df.duplicated().sum()))
animes_df.sample(3)

Rows: 16216
Colunms: 12
NaNs: 0
Duplicates: 0


Unnamed: 0,anime_id,title,synopsis,genre,aired,episodes,members,popularity,ranked,score,img_url,link
4537,6351,"Clannad: After Story - Mou Hitotsu no Sekai, K...",Included in the 8th and final DVD of Clannad ~...,"Drama, Romance, School","Jul 1, 2009",1,191025,567,663.0,7.93,https://cdn.myanimelist.net/images/anime/10/19...,https://myanimelist.net/anime/6351/Clannad__Af...
7392,16458,Perrine Monogatari Movie,Movie version of the TV series Perrine Monogat...,"Drama, Historical, Shoujo, Slice of Life","Jun 30, 1990",1,283,12343,13535.0,6.4,https://cdn.myanimelist.net/images/anime/3/444...,https://myanimelist.net/anime/16458/Perrine_Mo...
1232,1374,Kyoushoku Soukou Guyver (2005),"Sho Fukamachi, a normal teenager accidentally ...","Adventure, Sci-Fi, Shounen","Aug 6, 2005 to Feb 18, 2006",26,16662,3495,2462.0,7.32,https://cdn.myanimelist.net/images/anime/6/369...,https://myanimelist.net/anime/1374/Kyoushoku_S...


Let's take function from the seminar to obtain the test users who have at least 5 interactions.

In [67]:
def get_test_pairs(reviews, favorites, n_pairs, score_cutoff, seed):
    '''
    Construct a dataset consisting of pairs of liked and disliked animes. The likes and dislikes
    are defined by the ratings value: everything below threshold is a dislike, the rest are likes.

    The function ensures that the amount of likes and dislikes is the same per each user in data.
    The users that do not contain enough likes or dislikes are discarded from the result.
    The result is to be used for evaluating the quality of recommendations by some algorithms.
    Hence, user favorites are excluded to ensure that there is no trivial solution.
    '''
    rng = np.random.default_rng(seed)
    def strict_sample_no_favs(series):
        # sample `n_pairs` elements from `series`, if not enough data - return empty list,
        # discard favorites, otherwise the evaluation on test pairs against favorites makes no sense
        above_cutoff, user_id = series.name
        allowed_items = np.setdiff1d(series.values, favorites.loc[user_id])
        return rng.choice(allowed_items, n_pairs, replace=False) if len(allowed_items)>=n_pairs else []

    test_pairs = (
        reviews
         # split by likes and dislikes, group by users
        .groupby([(reviews["score"] >= score_cutoff), 'user_id'])
        # sample `n_pairs` items (both likes and dislikes), disregard user favorites
        ['anime_id'].apply(strict_sample_no_favs)
         # disregard users that have not enough items
        .loc[lambda x: x.apply(len) > 0]
         # make two columns of likes and dislikes
        .unstack('score')
        # ensure each user has both likes and dislikes
        .dropna()
         # rename by rule `score >= score_cutoff`
        .rename(columns={False: 'dislikes', True: 'likes'})
    )
    return test_pairs

In [68]:
favorites = profiles_df.set_index('user_id')['favorites_anime']
favorites_scores = pd.merge(
    favorites.explode().rename('anime_id').reset_index(),
    reviews_df[['user_id', 'anime_id', 'score']],
    on = ['user_id', 'anime_id'],
    how = 'left'
)['score']

In [69]:
test_pairs = get_test_pairs(reviews_df, favorites, 3, 5, 0)

Let's write function for model evaluation:

In [87]:
from typing import Iterable, Dict
def model_evaluate(
        recommended_items: Iterable,
        holdout_items: Iterable,
        total_items: int=len(reviews_df['anime_id'].unique()),
        top_n: int=10
):
    hits_mask = recommended_items[:, :top_n] == holdout_items[:, None]
    hr = np.mean(hits_mask.any(axis=1))
    n_test_users = recommended_items.shape[0]
    hit_rank = np.where(hits_mask)[1] + 1.
    mrr = np.sum(1 / hit_rank) / n_test_users
    coverage_score = len(np.unique(recommended_items)) / total_items
    return hr, mrr, coverage_score

In [112]:
def split_train_test_data(reviews, favorites, test_pairs, seed):
    rng = np.random.default_rng(seed)

    test_items = test_pairs['likes'].apply(lambda x: rng.choice(x))

    train_items = (
        reviews.query('user_id in @test_pairs.index')
        .groupby('user_id')
        ['anime_id'].apply(
            lambda row: np.setdiff1d(
                row.values, 
                favorites.loc[row.name] + [test_items.loc[row.name]]
            )
        )
    )

    return train_items, test_items

train_items, test_items = split_train_test_data(reviews_df, favorites, test_pairs, 0)

Let's use the pipeline from the seminar

In [141]:
class DenseTransformer(TransformerMixin):
    """
    Convert sparse matrix to dense np array to apply standard scaler with mean.
    """

    def fit(self, X, y=None, **fit_params):
        return self

    def transform(self, X, y=None, **fit_params):
        return X.toarray()
    



def build_cb_model(config, trainset, trainset_description, logistic=False, binary_vectorizer=True):
    """
    Config and fit cb model
    """
    feature_matrix, word_vectorizer = generate_features(config, trainset, trainset_description, binary_vectorizer)
    if logistic:
        regressor = LogisticRegression
    elif 'alpha' in config['model']:
        regressor = Ridge
    else:
        regressor = LinearRegression
    target_column = trainset_description['feedback']
    model = regressor(**config['model']).fit(feature_matrix, trainset[target_column])
    return model, word_vectorizer

def generate_features(config, trainset, trainset_description, binary_vectorizer):
    """
    Config and fit text vectorizer
    """
    if binary_vectorizer:
        word_vectorizer = CountVectorizer(**config['vectorizer']['binary'])
    else:
        word_vectorizer = Pipeline([("tfidf", TfidfVectorizer(**config['vectorizer']['tfidf'])), 
                                    ('dense', DenseTransformer()), 
                                    ("scaler", StandardScaler())])
    features_column = trainset_description['item_features']
    feature_matrix = word_vectorizer.fit_transform(trainset[features_column])
    return feature_matrix, word_vectorizer


def transform_predict(params, tokens):
    """
    Get recommendations from either classification or regression model
    """
    model, word_vectorizer = params
    tokens_encoded = word_vectorizer.transform(tokens)
    try: # handle classification models
        predictor = model.predict_proba
    except AttributeError:
        predictor = model.predict
    scores = predictor(tokens_encoded)
    if scores.ndim > 1: # handle classification
        scores = scores[:, 1] # take class 1
    return scores

def cb_model_scoring(params, testset, testset_description):
    """
    Select necessary features and get recommendations with the fitted pipeline
    """
    tokens = testset[testset_description['item_features']]
    scores = transform_predict(params, tokens)
    return scores

In [180]:
cb_config = {
    "model": dict(),
    "vectorizer":{
        "binary": dict( # simple binary token encoder
            min_df = 1,
            max_df = 0.9,
            strip_accents='unicode',
            stop_words = 'english',
            analyzer = 'word',
            binary = True,
        ),
        "tfidf": dict( # TfIDF Vectorizer
            min_df = 1,
            max_df = 0.9,
            strip_accents='unicode',
            stop_words = 'english',
            analyzer = 'word',
            use_idf = True,
            smooth_idf = True,
            sublinear_tf = True,
            binary = False,
            norm="l2",
        ),
    }
}
# we also define a general representation of our dataset
anime_description = {
    'feedback' : "score",
    "item_features": "tokens"
}

In [172]:
tokenize = lambda x: ' '.join(x.strip().split(', '))
animes_df['tokens'] = animes_df['genre'].apply(tokenize)

In [176]:
def get_predictions_for_user(reviews, anime_data, user_id, train_item_ids, all_item_ids, k=10, tokens='tokens', binary_vectorizer=True):
    # Step 1: Filter user-specific reviews and fetch anime tokens
    user_reviews = reviews[(reviews['user_id'] == user_id) & (reviews['anime_id'].isin(train_item_ids))]
    user_reviews = user_reviews.merge(anime_data[['anime_id', tokens]], on='anime_id', how='left')

    # Step 2: Build a content-based recommendation model
    cb_model = build_cb_model(cb_config, user_reviews, anime_description, binary_vectorizer=binary_vectorizer)

    # Step 3: Find items to score
    items_to_score = np.setdiff1d(all_item_ids, train_item_ids)
    features_for_scoring = anime_data[anime_data['anime_id'].isin(items_to_score)].copy()

    # Step 4: Make predictions and sort by predicted score
    features_for_scoring['predicted_score'] = cb_model_scoring(cb_model, features_for_scoring, anime_description)
    top_k_anime_ids = features_for_scoring.sort_values(by='predicted_score', ascending=False)['anime_id'].head(k).tolist()

    return top_k_anime_ids

def make_predictions_for_all_users(reviews, anime_data, train_items, **kwargs):
    all_animes = reviews['anime_id'].unique()
    predictions = []

    for user_id, train_animes in tqdm(
        train_items.items(), 
        total=len(train_items)
    ):
        predictions.append(
            get_predictions_for_user(
                reviews, anime_data, user_id, 
                train_animes, all_animes, **kwargs
            )
        )

    return np.array(predictions)

In [177]:
from collections import defaultdict
scores = defaultdict(dict)

In [178]:
%%time
predictions = make_predictions_for_all_users(reviews_df, animes_df, train_items)
hr, mrr, coverage = model_evaluate(predictions, test_items.values)
scores["genre"] = {
    "hr@10": hr,
    "mrr@10": mrr,
    "coverage@10": coverage
}
scores['genre']

100%|██████████| 696/696 [00:37<00:00, 18.46it/s]

CPU times: total: 25.4 s
Wall time: 37.7 s





{'hr@10': 0.004310344827586207,
 'mrr@10': 0.001955619412515964,
 'coverage@10': 0.35620743844944996}

In [181]:
%%time
predictions = make_predictions_for_all_users(reviews_df, animes_df, train_items, binary_vectorizer=False)
hr, mrr, coverage = model_evaluate(predictions, test_items.values)
scores["genre_tf_idf"] = {
    "hr@10": hr,
    "mrr@10": mrr,
    "coverage@10": coverage
}
scores['genre_tf_idf']

100%|██████████| 696/696 [00:36<00:00, 19.12it/s]

CPU times: total: 23.4 s
Wall time: 36.4 s





{'hr@10': 0.004310344827586207,
 'mrr@10': 0.001125478927203065,
 'coverage@10': 0.4848088004190676}

More diverse

In [182]:
cb_config["model"] = {"alpha": 20000, "random_state": 0xDEAD}

In [183]:
%%time
predictions = make_predictions_for_all_users(reviews_df, animes_df, train_items, binary_vectorizer=False)
hr, mrr, coverage = model_evaluate(predictions, test_items.values)
scores["genre_tf_idf_A2000"] = {
    "hr@10": hr,
    "mrr@10": mrr,
    "coverage@10": coverage
}
scores['genre_tf_idf_A2000']

100%|██████████| 696/696 [00:38<00:00, 17.87it/s]

CPU times: total: 24.3 s
Wall time: 39 s





{'hr@10': 0.011494252873563218,
 'mrr@10': 0.004395867542419266,
 'coverage@10': 0.41618648507071765}

Much better with regularization!

In [184]:
animes_df['advanced_tokens'] = animes_df['tokens'] + animes_df['synopsis'].apply(tokenize) + animes_df['title'].apply(tokenize)

In [186]:
anime_description = {
    'feedback' : "score",
    "item_features": "advanced_tokens"
}

In [193]:
%%time
predictions = make_predictions_for_all_users(reviews_df, animes_df, train_items, tokens="advanced_tokens", binary_vectorizer=True)
hr, mrr, coverage = model_evaluate(predictions, test_items.values)
scores["advanced_A2000"] = {
    "hr@10": hr,
    "mrr@10": mrr,
    "coverage@10": coverage
}
scores['advanced_A2000']

  0%|          | 0/696 [00:00<?, ?it/s]

100%|██████████| 696/696 [05:25<00:00,  2.14it/s]

CPU times: total: 3min 39s
Wall time: 5min 25s





{'hr@10': 0.035919540229885055,
 'mrr@10': 0.014890644955300127,
 'coverage@10': 0.33800419067574644}

## POP/RANDOM BASELINE

In [189]:
def sample_random_animes(reviews, anime_data, n_users, k, seed):
    rng = np.random.default_rng(seed)
    animes_with_reviews = reviews['anime_id'].unique()
    anime_with_ratings = anime_data.loc[anime_data['anime_id'].isin(animes_with_reviews)][['anime_id', 'score']]
    sampled_animes = rng.choice(anime_with_ratings['anime_id'], size=(n_users, k), replace=True)
    return sampled_animes

def sample_popular_animes(reviews, anime_data, n_users, k, seed):
    rng = np.random.default_rng(seed)
    animes_with_reviews = reviews['anime_id'].unique()
    anime_with_ratings = anime_data.loc[anime_data['anime_id'].isin(animes_with_reviews)][['anime_id', 'score']]
    weights = anime_with_ratings['score'] / anime_with_ratings['score'].sum()
    sampled_animes = rng.choice(anime_with_ratings['anime_id'], size=(n_users, k), replace=True, p=weights)
    return sampled_animes

In [196]:
seeds = np.random.SeedSequence(0xDEAD).spawn(100)
hr_sum, mrr_sum, coverage_sum = 0, 0, 0
for seed in seeds:
    popular = sample_popular_animes(reviews_df, animes_df, len(test_items), 10, seed)
    metrics = model_evaluate(popular, test_items.values)
    hr_sum += metrics[0]
    mrr_sum += metrics[1]
    coverage_sum += metrics[2]


scores["pop"] = {
    "hr@10": hr_sum / len(seeds),
    "mrr@10": mrr_sum / len(seeds),
    "coverage@10": coverage_sum / len(seeds),
}

scores["pop"]

{'hr@10': 0.0016666666666666679,
 'mrr@10': 0.0004989509213647146,
 'coverage@10': 0.5949122577265588}

In [197]:
hr_sum, mrr_sum, coverage_sum = 0, 0, 0
for seed in seeds:
    popular = sample_random_animes(reviews_df, animes_df, len(test_items), 10, seed)
    metrics = model_evaluate(popular, test_items.values)
    hr_sum += metrics[0]
    mrr_sum += metrics[1]
    coverage_sum += metrics[2]


scores["random"] = {
    "hr@10": hr_sum / len(seeds),
    "mrr@10": mrr_sum / len(seeds),
    "coverage@10": coverage_sum / len(seeds),
}

scores["random"]

{'hr@10': 0.0015517241379310342,
 'mrr@10': 0.0004420954205436964,
 'coverage@10': 0.5982700366684127}

In [198]:
data = {
    'genre': scores['genre'],
    'genre_tf_idf': scores['genre_tf_idf'],
    'genre_tf_idf_A2000': scores['genre_tf_idf_A2000'],
    'pop': scores['pop'],
    'random': scores['random'],
    'advanced_A2000': scores['advanced_A2000']
}

df = pd.DataFrame(data)

In [200]:
df

Unnamed: 0,genre,genre_tf_idf,genre_tf_idf_A2000,pop,random,advanced_A2000
hr@10,0.00431,0.00431,0.011494,0.001667,0.001552,0.03592
mrr@10,0.001956,0.001125,0.004396,0.000499,0.000442,0.014891
coverage@10,0.356207,0.484809,0.416186,0.594912,0.59827,0.338004
