**rec_ratings**:

Tests making recommendations from multiple inputs, each with its own assigned rating.

In [1]:
from wikirec import data_utils, model, utils
import os
import json
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


# Import existing data 

In [2]:
# Topic key; passed to load_or_create_sim_matrix below, where it becomes part of
# the similarity-matrix pickle's filename.
topic = "books"

In [3]:
# Make sure to extract the .zip file containing enwiki_books.ndjson
# NDJSON holds one JSON document per line. The encoding is passed explicitly so
# the read does not depend on the platform's default locale encoding, and stray
# blank lines are skipped instead of being handed to json.loads (which raises
# on an empty document).
with open("./enwiki_books.ndjson/enwiki_books.ndjson", "r", encoding="utf-8") as fin:
    books = [json.loads(line) for line in fin if line.strip()]

print(f"Found a total of {len(books)} books.")

Found a total of 41234 books.


In [4]:
# Each record is indexed positionally: element 0 is the title of the book and
# element 1 is the text from the English Wiki article of its page.
titles, texts = [], []
for book in books:
    titles.append(book[0])
    texts.append(book[1])

In [5]:
# The cleaned corpus and the indexes of the texts that were kept are cached in a
# pickle so that later runs can skip data_utils.clean.
corpus_cache_path = "./book_corpus_idxs.pkl"

if os.path.isfile(corpus_cache_path):
    print("Loading book corpus and selected indexes")
    # NOTE: pickle.load can execute arbitrary code, so only load a cache file
    # that was produced by this notebook or comes from a trusted source.
    with open(corpus_cache_path, "rb") as f:
        text_corpus, selected_idxs = pickle.load(f)

else:
    print("Creating book corpus and selected indexes")
    text_corpus, selected_idxs = data_utils.clean(
        texts=texts,
        language="en",
        min_token_freq=5,  # 0 for Bert
        min_token_len=3,  # 0 for Bert
        min_tokens=50,
        max_token_index=-1,
        min_ngram_count=3,
        remove_stopwords=True,  # False for Bert
        ignore_words=None,
        remove_names=True,
        sample_size=1,
        verbose=True,
    )

    with open(corpus_cache_path, "wb") as f:
        print("Pickling book corpus and selected indexes")
        pickle.dump([text_corpus, selected_idxs], f, protocol=4)

# Titles at the selected indexes. Both branches need them, so they are derived
# once here instead of separately in each branch.
selected_titles = [titles[i] for i in selected_idxs]

Loading book corpus and selected indexes


# Preparing a TF-IDF model for recommendations

In [6]:
def load_or_create_sim_matrix(
    method,
    corpus,
    metric,
    topic,
    path="./",
    bert_st_model="xlm-r-bert-base-nli-stsb-mean-tokens",
    **kwargs,
):
    """
    Loads or creates a similarity matrix to deliver recommendations

    The matrix is cached in the file ``{topic}_{metric}_{method}_sim_matrix.pkl``
    inside ``path``. If that file exists it is unpickled and returned; otherwise
    embeddings are generated from ``corpus``, turned into a similarity matrix,
    pickled to that file and returned.

    NOTE: the .pkl files made are 5-10GB or more in size

    Parameters
    ----------
        method : str
            The embedding method; forwarded to model.gen_embeddings and
            model.gen_sim_matrix, and used in the cache filename

        corpus
            The texts to embed; forwarded to model.gen_embeddings
            (only used when the matrix has to be created)

        metric : str
            The similarity metric; forwarded to model.gen_sim_matrix and
            used in the cache filename

        topic : str
            A label for the data set; only used in the cache filename

        path : str (default="./")
            The directory of the cache file, with or without a trailing
            path separator

        bert_st_model : str
            Forwarded to model.gen_embeddings

        **kwargs
            Further keyword arguments forwarded to model.gen_embeddings

    Returns
    -------
        sim_matrix
            The unpickled matrix, or the result of model.gen_sim_matrix
    """
    # os.path.join only inserts a separator when one is needed, so ``path``
    # works with or without a trailing slash. Plain string concatenation would
    # silently produce a wrong filename for e.g. path="./data".
    sim_matrix_path = os.path.join(path, f"{topic}_{metric}_{method}_sim_matrix.pkl")

    if os.path.isfile(sim_matrix_path):
        print(f"Loading {method} {topic} {metric} similarity matrix")
        # NOTE: pickle.load can execute arbitrary code, so only point ``path``
        # at a directory whose .pkl files come from a trusted source.
        with open(sim_matrix_path, "rb") as f:
            sim_matrix = pickle.load(f)

    else:
        print(f"Creating {method} {topic} {metric} similarity matrix")
        embeddings = model.gen_embeddings(
            method=method, corpus=corpus, bert_st_model=bert_st_model, **kwargs,
        )
        sim_matrix = model.gen_sim_matrix(
            method=method, metric=metric, embeddings=embeddings,
        )

        with open(sim_matrix_path, "wb") as f:
            print(f"Pickling {method} {topic} {metric} similarity matrix")
            pickle.dump(sim_matrix, f, protocol=4)

    return sim_matrix

In [7]:
# These options are not named parameters of load_or_create_sim_matrix; they
# travel through its **kwargs to model.gen_embeddings, so they only matter when
# the matrix is created rather than loaded from its pickle.
tfidf_options = {"max_features": None, "norm": "l2"}

tfidf_sim_matrix = load_or_create_sim_matrix(
    method="tfidf",
    corpus=text_corpus,
    metric="cosine",  # euclidean
    topic=topic,
    path="./",
    **tfidf_options,
)

Loading tfidf books cosine similarity matrix


# Utilizing the ratings for making recommendations
Ratings for each input must be between 0 and 10, inclusive.

## No ratings 
When no ratings are given, the inputs are treated equally and their similarity scores are simply averaged.

In [11]:
# Two input titles and no ratings (ratings=None); the 10 best matches are shown.
model.recommend(
    inputs=[
        "Harry Potter and the Philosopher's Stone",
        "The Hobbit",
    ],
    ratings=None,
    titles=selected_titles,
    sim_matrix=tfidf_sim_matrix,
    n=10,
    metric="cosine",
)

[['The History of The Hobbit', 0.39543687196383887],
 ['The Annotated Hobbit', 0.3363038160377437],
 ['Harry Potter and the Deathly Hallows', 0.31365133665137784],
 ['Harry Potter and the Chamber of Secrets', 0.30697017204471655],
 ['Harry Potter and the Order of the Phoenix', 0.3050990193809162],
 ['Harry Potter and the Goblet of Fire', 0.3008483408564587],
 ['Harry Potter and the Half-Blood Prince', 0.29072541606015917],
 ['The Magical Worlds of Harry Potter', 0.27557913845105425],
 ['The Lord of the Rings', 0.2705902834440923],
 ['Harry Potter and the Prisoner of Azkaban', 0.2645807400342994]]

## Rating them with a slight preference
Notice the slight change in order: the Harry Potter books are shifted higher.

In [10]:
# Same two inputs; ratings are listed in the same order as the inputs, so the
# first title gets 10 and the second gets 7.
model.recommend(
    inputs=[
        "Harry Potter and the Philosopher's Stone",
        "The Hobbit",
    ],
    ratings=[10, 7],
    titles=selected_titles,
    sim_matrix=tfidf_sim_matrix,
    n=10,
    metric="cosine",
)

[['Harry Potter and the Deathly Hallows', 0.30657433902634845],
 ['Harry Potter and the Chamber of Secrets', 0.299108615859228],
 ['Harry Potter and the Order of the Phoenix', 0.2986119632225544],
 ['Harry Potter and the Goblet of Fire', 0.2935461353845208],
 ['Harry Potter and the Half-Blood Prince', 0.28337075383861643],
 ['The History of The Hobbit', 0.28131183523069364],
 ['The Magical Worlds of Harry Potter', 0.2658723513573889],
 ['Harry Potter and the Prisoner of Azkaban', 0.25567350663264954],
 ['The Annotated Hobbit', 0.23945943814161777],
 ['Fantastic Beasts and Where to Find Them', 0.23872924195874404]]

## Completely different ratings
The recommendations become dominated by Harry Potter-related books. 

In [12]:
# Same two inputs with a much larger gap between the ratings (10 vs. 2), and 15
# results requested instead of 10.
model.recommend(
    inputs=[
        "Harry Potter and the Philosopher's Stone",
        "The Hobbit",
    ],
    ratings=[10, 2],
    titles=selected_titles,
    sim_matrix=tfidf_sim_matrix,
    n=15,
    metric="cosine",
)

[['Harry Potter and the Deathly Hallows', 0.29477934298463276],
 ['Harry Potter and the Order of the Phoenix', 0.2878002029586181],
 ['Harry Potter and the Chamber of Secrets', 0.28600602221674704],
 ['Harry Potter and the Goblet of Fire', 0.28137579293129095],
 ['Harry Potter and the Half-Blood Prince', 0.27111298346937845],
 ['The Magical Worlds of Harry Potter', 0.2496943728679467],
 ['Harry Potter and the Prisoner of Azkaban', 0.24082811762989983],
 ['Harry Potter and the Methods of Rationality', 0.22732611579756462],
 ['Fantastic Beasts and Where to Find Them', 0.22613839155622148],
 ['Harry, A History', 0.2210389981400308],
 ['Harry Potter and the Cursed Child', 0.22078239641328692],
 ['The Casual Vacancy', 0.1933741447502036],
 ['The Ickabog', 0.18729998944935433],
 ['Pollomuhku ja Posityyhtynen', 0.1701209697837757],
 ['Quidditch Through the Ages', 0.14644881625297318]]

## Let's try out more books and ratings

In [13]:
# Three inputs, each with its own rating (listed in the same order), and 20
# results requested.
model.recommend(
    inputs=[
        "Harry Potter and the Philosopher's Stone",
        "The Hobbit",
        "The Hunger Games",
    ],
    ratings=[7, 5, 9],
    titles=selected_titles,
    sim_matrix=tfidf_sim_matrix,
    n=20,
    metric="cosine",
)

[['Mockingjay', 0.2833900475601968],
 ['Catching Fire', 0.2667058570158859],
 ['Harry Potter and the Deathly Hallows', 0.2441572424724626],
 ['Harry Potter and the Order of the Phoenix', 0.24185900215519326],
 ['Harry Potter and the Goblet of Fire', 0.2361172970905478],
 ['Harry Potter and the Chamber of Secrets', 0.2351746504405977],
 ['Harry Potter and the Half-Blood Prince', 0.2279300124518356],
 ['Harry Potter and the Prisoner of Azkaban', 0.21689561851501843],
 ['The History of The Hobbit', 0.21330459347629357],
 ['The Magical Worlds of Harry Potter', 0.20485904169643493],
 ['The Ballad of Songbirds and Snakes', 0.1969163655963662],
 ['Fantastic Beasts and Where to Find Them', 0.18965280996309142],
 ['Harry Potter and the Cursed Child', 0.18417542815018734],
 ['Harry Potter and the Methods of Rationality', 0.18141303011897664],
 ['Harry, A History', 0.1806894597581523],
 ['The Bone Season', 0.1805092769052663],
 ['The Annotated Hobbit', 0.18036065589770353],
 ['The Lord of the Rin