**rec_ratings**:

Tests the functionality of using multiple inputs with multiple assigned ratings.

In [1]:
# gensim had to be installed separately (setup.py wasn't working)

from wikirec import data_utils, model, utils
import os
import json
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


# Import existing data 

In [3]:
topic = "books"

In [4]:
# Make sure to extract the .zip file containing enwiki_books.ndjson
with open("./enwiki_books.ndjson/enwiki_books.ndjson", "r") as fin:
    books = [json.loads(l) for l in fin]

print(f"Found a total of {len(books)} books.")

Found a total of 41234 books.


In [5]:
titles = [m[0] for m in books] # Titles of each book
texts = [m[1] for m in books] # The text from the English Wiki Articles of each page 

In [6]:
if os.path.isfile("./book_corpus_idxs.pkl"):
    print(f"Loading book corpus and selected indexes")
    with open(f"./book_corpus_idxs.pkl", "rb") as f:
        text_corpus, selected_idxs = pickle.load(f)
        selected_titles = [titles[i] for i in selected_idxs]

else:
    print(f"Creating book corpus and selected indexes")
    text_corpus, selected_idxs = data_utils.clean(
        texts=texts,
        language="en",
        min_token_freq=5,  # 0 for Bert
        min_token_len=3,  # 0 for Bert
        min_tokens=50,
        max_token_index=-1,
        min_ngram_count=3,
        remove_stopwords=True,  # False for Bert
        ignore_words=None,
        remove_names=True,
        sample_size=1,
        verbose=True,
    )

    selected_titles = [titles[i] for i in selected_idxs]

    with open("./book_corpus_idxs.pkl", "wb") as f:
        print("Pickling book corpus and selected indexes")
        pickle.dump([text_corpus, selected_idxs], f, protocol=4)

Loading book corpus and selected indexes


# Preparing TFIDF model for recommendations

In [8]:
def load_or_create_sim_matrix(
    method,
    corpus,
    metric,
    topic,
    path="./",
    bert_st_model="xlm-r-bert-base-nli-stsb-mean-tokens",
    **kwargs,
):
    """
    Loads or creats a similarity matrix to deliver recommendations
    
    NOTE: the .pkl files made are 5-10GB or more in size
    """
    if os.path.isfile(f"{path}{topic}_{metric}_{method}_sim_matrix.pkl"):
        print(f"Loading {method} {topic} {metric} similarity matrix")
        with open(f"{path}{topic}_{metric}_{method}_sim_matrix.pkl", "rb") as f:
            sim_matrix = pickle.load(f)

    else:
        print(f"Creating {method} {topic} {metric} similarity matrix")
        embeddings = model.gen_embeddings(
            method=method, corpus=corpus, bert_st_model=bert_st_model, **kwargs,
        )
        sim_matrix = model.gen_sim_matrix(
            method=method, metric=metric, embeddings=embeddings,
        )

        with open(f"{path}{topic}_{metric}_{method}_sim_matrix.pkl", "wb") as f:
            print(f"Pickling {method} {topic} {metric} similarity matrix")
            pickle.dump(sim_matrix, f, protocol=4)

    return sim_matrix

In [92]:
def recommend(
    inputs=None, ratings = None, titles=None, sim_matrix=None, metric="cosine", n=10,
):
    """
    Recommends similar items given an input or list of inputs of interest.

    Parameters
    ----------
        inputs : str or list (default=None)
            The name of an item or items of interest

        titles : lists (default=None)
            The titles of the articles

        sim_matrix : gensim.interfaces.TransformedCorpus or np.ndarray (default=None)
            The similarity sim_matrix for the corpus from the given model

        n : int (default=10)
            The number of items to recommend

        metric : str (default=cosine)
            The metric to be used when comparing vectorized corpus entries

            Options include: cosine and euclidean

    Returns
    -------
        recommendations : list of lists
            Those items that are most similar to the inputs and their similarity scores
    """
    if isinstance(inputs, str):
        inputs = [inputs]
        
    if ratings:
        # compute weights based on number of inputs
        weights = np.divide(ratings, sum(ratings))

    first_input = True
    for r, inpt in enumerate(inputs):
        checked = 0
        num_missing = 0
        for i, t in enumerate(titles):
            if t == inpt:
                if first_input:
                    sims = sim_matrix[i]

                    first_input = False
                    
                    if ratings:
                        sims = sims * weights[0]

                else:
                    if ratings:
                        sims = [np.mean([s, weights[r] * sim_matrix[i][j]]) for j, s in enumerate(sims)]
                        
                        # scale the ratings to get comparable similarity scores 
                        if r == (len(ratings) - 1):
                            sims = [s*len(ratings) for s in sims]
                    else:
                        sims = [np.mean([s, sim_matrix[i][j]]) for j, s in enumerate(sims)]
                

            else:
                checked += 1
                if checked == len(titles):
                    num_missing += 1
                    print(f"{inpt} not available")
                    utils._check_str_args(arguments=inpt, valid_args=titles)

                    if num_missing == len(inputs):
                        ValueError(
                            "None of the provided inputs were found in the index. Please check them and reference Wikipedia for valid inputs via article names."
                        )

    titles_and_scores = [[t, sims[i]] for i, t in enumerate(titles)]

    if metric == "cosine":
        # Cosine similarities have been used (higher is better)
        recommendations = sorted(titles_and_scores, key=lambda x: x[1], reverse=True)
    elif metric == "euclidean":
        # Euclidean distances have been used (lower is better)
        recommendations = sorted(titles_and_scores, key=lambda x: x[1], reverse=False)

    recommendations = [r for r in recommendations if r[0] not in inputs][:n]

    return recommendations

In [37]:
tfidf_sim_matrix = load_or_create_sim_matrix(
    method="tfidf",
    corpus=text_corpus,
    metric="cosine",  # euclidean
    topic=topic,
    path="./",
    max_features=None,
    norm='l2',
)

Loading tfidf books cosine similarity matrix


# Utilizing the ratings for making recommendations

## No ratings (simply averages them)

In [70]:
recommend(
    inputs=["Harry Potter and the Philosopher's Stone", "The Hobbit"],
    ratings = None,
    titles=selected_titles,
    sim_matrix=tfidf_sim_matrix,
    n=10,
    metric="cosine",
)

[['The History of The Hobbit', 0.39543687196383887],
 ['The Annotated Hobbit', 0.3363038160377437],
 ['Harry Potter and the Deathly Hallows', 0.31365133665137784],
 ['Harry Potter and the Chamber of Secrets', 0.30697017204471655],
 ['Harry Potter and the Order of the Phoenix', 0.3050990193809162],
 ['Harry Potter and the Goblet of Fire', 0.3008483408564587],
 ['Harry Potter and the Half-Blood Prince', 0.29072541606015917],
 ['The Magical Worlds of Harry Potter', 0.27557913845105425],
 ['The Lord of the Rings', 0.2705902834440923],
 ['Harry Potter and the Prisoner of Azkaban', 0.2645807400342994]]

In [93]:
recommend(
    inputs=["Harry Potter and the Philosopher's Stone", "The Hobbit"],
    ratings = [1,1],
    titles=selected_titles,
    sim_matrix=tfidf_sim_matrix,
    n=10,
    metric="cosine",
)

[['The History of The Hobbit', 0.39543687196383887],
 ['The Annotated Hobbit', 0.3363038160377437],
 ['Harry Potter and the Deathly Hallows', 0.31365133665137784],
 ['Harry Potter and the Chamber of Secrets', 0.30697017204471655],
 ['Harry Potter and the Order of the Phoenix', 0.3050990193809162],
 ['Harry Potter and the Goblet of Fire', 0.3008483408564587],
 ['Harry Potter and the Half-Blood Prince', 0.29072541606015917],
 ['The Magical Worlds of Harry Potter', 0.27557913845105425],
 ['The Lord of the Rings', 0.2705902834440923],
 ['Harry Potter and the Prisoner of Azkaban', 0.2645807400342994]]

## Rating them with a slight preference
Notice the slight change in order, with preference for Harry Potter books being shifted higher.

In [50]:
recommend(
    inputs=["Harry Potter and the Philosopher's Stone", "The Hobbit"],
    ratings=[10, 7],
    titles=selected_titles,
    sim_matrix=tfidf_sim_matrix,
    n=10,
    metric="cosine",
)

[['Harry Potter and the Deathly Hallows', 0.18033784648608733],
 ['Harry Potter and the Chamber of Secrets', 0.17594624462307532],
 ['Harry Potter and the Order of the Phoenix', 0.1756540960132673],
 ['Harry Potter and the Goblet of Fire', 0.1726741972850122],
 ['Harry Potter and the Half-Blood Prince', 0.16668867872859788],
 ['The History of The Hobbit', 0.16547755013570214],
 ['The Magical Worlds of Harry Potter', 0.1563955007984641],
 ['Harry Potter and the Prisoner of Azkaban', 0.15039618037214678],
 ['The Annotated Hobbit', 0.14085849302448103],
 ['Fantastic Beasts and Where to Find Them', 0.14042896585808473]]

## Completely different ratings
The recommendations get dominated by Harry Potter-related books. 

In [94]:
recommend(
    inputs=["Harry Potter and the Philosopher's Stone", "The Hobbit"],
    ratings=[10, 1],
    titles=selected_titles,
    sim_matrix=tfidf_sim_matrix,
    n=10,
    metric="cosine",
)

[['Harry Potter and the Deathly Hallows', 0.5316733523205266],
 ['Harry Potter and the Order of the Phoenix', 0.5193415471015106],
 ['Harry Potter and the Chamber of Secrets', 0.5152463699786378],
 ['Harry Potter and the Goblet of Fire', 0.5071667717102636],
 ['Harry Potter and the Half-Blood Prince', 0.48847532617369244],
 ['The Magical Worlds of Harry Potter', 0.44810686758192414],
 ['Harry Potter and the Prisoner of Azkaban', 0.4324709815079089],
 ['Harry Potter and the Methods of Rationality', 0.40968395333697355],
 ['Fantastic Beasts and Where to Find Them', 0.40658222086493995],
 ['Harry, A History', 0.39851750751938997]]

## Let's try out more books and ratings

In [95]:
recommend(
    inputs=["Harry Potter and the Philosopher's Stone", "The Hobbit", "The Hunger Games"],
    ratings=[10, 3, 7],
    titles=selected_titles,
    sim_matrix=tfidf_sim_matrix,
    n=20,
    metric="cosine",
)

[['Mockingjay', 0.31480874258417063],
 ['Catching Fire', 0.2994814770252149],
 ['Harry Potter and the Deathly Hallows', 0.2570596127501137],
 ['Harry Potter and the Order of the Phoenix', 0.2555220410683682],
 ['Harry Potter and the Goblet of Fire', 0.24831425745082442],
 ['Harry Potter and the Chamber of Secrets', 0.24626256775293565],
 ['Harry Potter and the Half-Blood Prince', 0.23938628740774218],
 ['Harry Potter and the Prisoner of Azkaban', 0.22674636201363085],
 ['The Ballad of Songbirds and Snakes', 0.2211152005365305],
 ['The Magical Worlds of Harry Potter', 0.21119311863026252],
 ['Fantastic Beasts and Where to Find Them', 0.19749803207545133],
 ['The Bone Season', 0.19459012004314707],
 ['Harry Potter and the Cursed Child', 0.19273396967259301],
 ['Harry, A History', 0.1896708634434452],
 ['Harry Potter and the Methods of Rationality', 0.1895936627076112],
 ['Divergent', 0.18482139471796297],
 ['The Casual Vacancy', 0.1820582051350734],
 ['The Ickabog', 0.16790599281721247],