<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Download-and-Clean-Data" data-toc-modified-id="Download-and-Clean-Data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Download and Clean Data</a></span></li><li><span><a href="#Making-Recommendations" data-toc-modified-id="Making-Recommendations-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Making Recommendations</a></span><ul class="toc-item"><li><span><a href="#BERT" data-toc-modified-id="BERT-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>BERT</a></span></li><li><span><a href="#Doc2vec" data-toc-modified-id="Doc2vec-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Doc2vec</a></span></li><li><span><a href="#LDA" data-toc-modified-id="LDA-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>LDA</a></span></li><li><span><a href="#TFIDF" data-toc-modified-id="TFIDF-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>TFIDF</a></span></li><li><span><a href="#WikilinkNN" data-toc-modified-id="WikilinkNN-2.5"><span class="toc-item-num">2.5&nbsp;&nbsp;</span>WikilinkNN</a></span></li><li><span><a href="#Weighted-Model" data-toc-modified-id="Weighted-Model-2.6"><span class="toc-item-num">2.6&nbsp;&nbsp;</span>Weighted Model</a></span></li></ul></li></ul></div>

**rec_movies**

Downloads an English Wikipedia dump and parses it for all available movies. All available models are then run to compare recommendation efficacy.

If using this notebook in [Google Colab](https://colab.research.google.com/github/andrewtavis/wikirec/blob/main/examples/rec_movies.ipynb), you can activate GPUs by following `Edit > Notebook settings > Hardware accelerator` and selecting `GPU`.

In [None]:
# pip install wikirec -U

The following gensim update might be necessary in Google Colab, as the version installed there by default is quite old.

In [None]:
# pip install gensim -U

In Colab you'll also need to download nltk's names data.

In [None]:
# import nltk
# nltk.download("names")

In [None]:
import os
import json
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="darkgrid")
sns.set(rc={"figure.figsize": (15, 5)})

from wikirec import data_utils, model, utils

# `display` and `HTML` are part of the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Widen the notebook container so that wide outputs use almost the full page width.
display(HTML("<style>.container { width:99% !important; }</style>"))

# Download and Clean Data

In [None]:
# Download the English Wikipedia dump files into ./enwiki_dump.
# NOTE(review): file_limit=-1 presumably means "no limit" (the full dump) and
# dump_id=False presumably selects the default dump - confirm against the wikirec docs.
files = data_utils.download_wiki(
    language="en", target_dir="./enwiki_dump", file_limit=-1, dump_id=False
)
# Show how many dump files were returned.
len(files)

In [None]:
topic = "movies"

In [None]:
# Parse the downloaded dump in ./enwiki_dump for articles matching `topic` and
# write the results to ./enwiki_movies.ndjson, which the following cell reads back.
# NOTE(review): delete_parsed_files=True presumably removes dump files once they
# have been parsed, so a re-run would need a fresh download - confirm before running.
data_utils.parse_to_ndjson(
    topics=topic,
    output_path="./enwiki_movies.ndjson",
    input_dir="./enwiki_dump",
    partitions_dir="./enwiki_movie_partitions",
    limit=None,
    delete_parsed_files=True,
    multicore=True,
    verbose=True,
)

In [None]:
# Read the parsed articles back in: the file holds one JSON document per line.
# The encoding is given explicitly so the read does not depend on the platform's
# default locale encoding.
with open("./enwiki_movies.ndjson", "r", encoding="utf-8") as fin:
    movies = [json.loads(line) for line in fin]

print(f"Found a total of {len(movies)} movies.")

In [None]:
# Split every parsed record into three parallel lists by position.
# NOTE(review): assumes each record is laid out as [title, article text, wikilinks],
# as the variable names suggest - confirm against the parse_to_ndjson output format.
titles = [m[0] for m in movies]
texts = [m[1] for m in movies]
wikilinks = [m[2] for m in movies]

In [None]:
# The cleaned corpus is cached on disk because cleaning the full text set is slow.
corpus_cache_path = "./movie_corpus_idxs.pkl"

if os.path.isfile(corpus_cache_path):
    print("Loading movie corpus and selected indexes")
    # NOTE: unpickling can execute arbitrary code - only load a cache file that
    # this notebook created itself.
    with open(corpus_cache_path, "rb") as f:
        text_corpus, selected_idxs = pickle.load(f)

else:
    print("Creating movie corpus and selected indexes")
    text_corpus, selected_idxs = data_utils.clean(
        texts=texts,
        language="en",
        min_token_freq=5,  # 0 for Bert
        min_token_len=3,  # 0 for Bert
        min_tokens=50,
        max_token_index=-1,
        min_ngram_count=3,
        remove_stopwords=True,  # False for Bert
        ignore_words=None,
        remove_names=True,
        sample_size=1,
        verbose=True,
    )

    with open(corpus_cache_path, "wb") as f:
        print("Pickling movie corpus and selected indexes")
        pickle.dump([text_corpus, selected_idxs], f, protocol=4)

# Derive the titles on BOTH branches. Previously this only happened when the corpus
# was freshly created, so a run that loaded the cache left `selected_titles`
# undefined and every later `model.recommend` call failed with a NameError.
selected_titles = [titles[i] for i in selected_idxs]

# Making Recommendations

In [None]:
# Seed titles used to query every model below: two single-title queries and one
# combined query containing both titles.
single_input_0 = "The Godfather"
single_input_1 = "The Dark Knight"
# Named `multiple_inputs` to match the name the recommendation cells read (the
# previous spelling `mutliple_inputs` caused a NameError in those cells).
multiple_inputs = [single_input_0, single_input_1]

In [None]:
def load_or_create_sim_matrix(
    method,
    corpus,
    metric,
    topic,
    path="./",
    bert_st_model="xlm-r-bert-base-nli-stsb-mean-tokens",
    **kwargs,
):
    """
    Loads or creates a similarity matrix to deliver recommendations

    The matrix is cached as ``{topic}_{metric}_{method}_sim_matrix.pkl`` inside
    ``path``. If that file exists it is unpickled and returned; otherwise the
    embeddings and the similarity matrix are generated and the matrix is pickled
    to that file before being returned.

    NOTE: the .pkl files made are 10-20GB or more in size

    Parameters
    ----------
        method : str
            The model name passed to model.gen_embeddings and model.gen_sim_matrix

        corpus : list
            The texts passed to model.gen_embeddings

        metric : str
            The similarity metric passed to model.gen_sim_matrix

        topic : str
            A label used only in the cache file name and the progress messages

        path : str (default=./)
            The directory that holds the cache file (a trailing separator is optional)

        bert_st_model : str
            Forwarded to model.gen_embeddings

        **kwargs : keyword arguments
            Forwarded unchanged to model.gen_embeddings

    Returns
    -------
        sim_matrix : the loaded or newly generated similarity matrix
    """
    # Build the cache path once. os.path.join gives the same result as plain
    # concatenation when `path` ends with a separator (or is empty), and also
    # works when the caller passes a directory without a trailing separator.
    cache_file = os.path.join(path, f"{topic}_{metric}_{method}_sim_matrix.pkl")

    if os.path.isfile(cache_file):
        print(f"Loading {method} {topic} {metric} similarity matrix")
        # NOTE: unpickling can execute arbitrary code - only load cache files
        # that were produced by this function.
        with open(cache_file, "rb") as f:
            sim_matrix = pickle.load(f)

    else:
        print(f"Creating {method} {topic} {metric} similarity matrix")
        embeddings = model.gen_embeddings(
            method=method, corpus=corpus, bert_st_model=bert_st_model, **kwargs,
        )
        sim_matrix = model.gen_sim_matrix(
            method=method, metric=metric, embeddings=embeddings,
        )

        with open(cache_file, "wb") as f:
            print(f"Pickling {method} {topic} {metric} similarity matrix")
            # Protocol 4 supports objects larger than 4GB, which these matrices can be.
            pickle.dump(sim_matrix, f, protocol=4)

    return sim_matrix

## BERT

In [None]:
# Remove n-grams for BERT training
corpus_no_ngrams = [
    " ".join([t for t in text.split(" ") if "_" not in t]) for text in text_corpus
]

In [None]:
# We can pass kwargs for sentence_transformers.SentenceTransformer.encode
bert_sim_matrix = load_or_create_sim_matrix(
    method="bert",
    corpus=corpus_no_ngrams,
    metric="cosine",  # euclidean
    topic=topic,
    path="./",
    bert_st_model="xlm-r-bert-base-nli-stsb-mean-tokens",
    show_progress_bar=True,
    batch_size=32,
)

In [None]:
model.recommend(
    inputs=single_input_0,
    titles=selected_titles,
    sim_matrix=bert_sim_matrix,
    n=10,
    metric="cosine",
)

In [None]:
model.recommend(
    inputs=single_input_1,
    titles=selected_titles,
    sim_matrix=bert_sim_matrix,
    n=10,
    metric="cosine",
)

In [None]:
model.recommend(
    inputs=multiple_inputs,
    titles=selected_titles,
    sim_matrix=bert_sim_matrix,
    n=10,
    metric="cosine",
)

## Doc2vec

Note: Doc2vec wasn't run because of runtime considerations.

In [None]:
# We can pass kwargs for gensim.models.doc2vec.Doc2Vec
doc2vec_sim_matrix = load_or_create_sim_matrix(
    method="doc2vec",
    corpus=text_corpus,
    metric="cosine",  # euclidean
    topic=topic,
    path="./",
    vector_size=100,
    epochs=10,
    alpha=0.025,
)

In [None]:
model.recommend(
    inputs=single_input_0,
    titles=selected_titles,
    sim_matrix=doc2vec_sim_matrix,
    n=10,
    metric="cosine",
)

In [None]:
model.recommend(
    inputs=single_input_1,
    titles=selected_titles,
    sim_matrix=doc2vec_sim_matrix,
    n=10,
    metric="cosine",
)

In [None]:
model.recommend(
    inputs=multiple_inputs,
    titles=selected_titles,
    sim_matrix=doc2vec_sim_matrix,
    n=10,
    metric="cosine",
)

## LDA

In [None]:
# Candidate numbers of topics to evaluate for the LDA model.
topic_nums_to_compare = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# Graph the evaluation of an LDA model for each candidate topic count.
# NOTE(review): presumably this graph is what motivates the `num_topics` value
# used for the LDA similarity matrix in the next cell - confirm.
# We can pass kwargs for gensim.models.ldamulticore.LdaMulticore
utils.graph_lda_topic_evals(
    corpus=text_corpus,
    num_topic_words=10,
    topic_nums_to_compare=topic_nums_to_compare,
    metrics=True,
    verbose=True,
)

plt.show()

In [None]:
# We can pass kwargs for gensim.models.ldamulticore.LdaMulticore
lda_sim_matrix = load_or_create_sim_matrix(
    method="lda",
    corpus=text_corpus,
    metric="cosine",  # euclidean not an option at this time
    topic=topic,
    path="./",
    num_topics=90,
    passes=10,
    decay=0.5,
)

In [None]:
model.recommend(
    inputs=single_input_0,
    titles=selected_titles,
    sim_matrix=lda_sim_matrix,
    n=10,
    metric="cosine",
)

In [None]:
model.recommend(
    inputs=single_input_1,
    titles=selected_titles,
    sim_matrix=lda_sim_matrix,
    n=10,
    metric="cosine",
)

In [None]:
model.recommend(
    inputs=multiple_inputs,
    titles=selected_titles,
    sim_matrix=lda_sim_matrix,
    n=10,
    metric="cosine",
)

## TFIDF

In [None]:
# We can pass kwargs for sklearn.feature_extraction.text.TfidfVectorizer
tfidf_sim_matrix = load_or_create_sim_matrix(
    method="tfidf",
    corpus=text_corpus,
    metric="cosine",  # euclidean
    topic=topic,
    path="./",
    max_features=None,
    norm='l2',
)

In [None]:
model.recommend(
    inputs=single_input_0,
    titles=selected_titles,
    sim_matrix=tfidf_sim_matrix,
    n=10,
    metric="cosine",
)

In [None]:
# TFIDF recommendations for the second seed title. This cell previously repeated
# `single_input_0`, so "The Dark Knight" was never queried with the TFIDF model,
# unlike in every other model section.
model.recommend(
    inputs=single_input_1,
    titles=selected_titles,
    sim_matrix=tfidf_sim_matrix,
    n=10,
    metric="cosine",
)

In [None]:
model.recommend(
    inputs=multiple_inputs,
    titles=selected_titles,
    sim_matrix=tfidf_sim_matrix,
    n=10,
    metric="cosine",
)

## WikilinkNN

In [None]:
# We can pass kwargs for the WikilinkNN Keras model
wikilink_sim_matrix = load_or_create_sim_matrix(
    method="wikilinknn",
    corpus=text_corpus,
    metric="cosine",  # euclidean
    topic=topic,
    path="./",
    path_to_json="./enwiki_books.ndjson",
    path_to_embedding_model="books_embedding_model.h5",
    embedding_size=50,
    epochs=20,
    verbose=True,
)

In [None]:
model.recommend(
    inputs=single_input_0,
    titles=selected_titles,
    sim_matrix=wikilink_sim_matrix,
    n=10,
    metric="cosine",
)

In [None]:
model.recommend(
    inputs=single_input_1,
    titles=selected_titles,
    sim_matrix=wikilink_sim_matrix,
    n=10,
    metric="cosine",
)

In [None]:
model.recommend(
    inputs=multiple_inputs,
    titles=selected_titles,
    sim_matrix=wikilink_sim_matrix,
    n=10,
    metric="cosine",
)

## Weighted Model

In [None]:
# wikilink_sims_copy = wikilink_sims.copy()
# not_selected_idxs = [i for i in range(len(titles)) if i not in selected_idxs]

# wikilink_sims_copy = np.delete(wikilink_sims_copy, not_selected_idxs, axis=0)
# wikilink_sims_copy = np.delete(wikilink_sims_copy, not_selected_idxs, axis=1)

In [None]:
# Blend the TFIDF and BERT similarity matrices with weights that sum to 1
# (bert_weight is defined as the remainder of tfidf_weight).
# NOTE(review): assumes both matrices have the same shape and row/column order,
# i.e. both were built over the same selected corpus - confirm.
tfidf_weight = 0.35
bert_weight = 1.0 - tfidf_weight
bert_tfidf_sim_matrix = tfidf_weight * tfidf_sim_matrix + bert_weight * bert_sim_matrix

In [None]:
model.recommend(
    inputs=single_input_0,
    titles=selected_titles,
    sim_matrix=bert_tfidf_sim_matrix,
    n=10,
    metric="cosine",
)

In [None]:
model.recommend(
    inputs=single_input_1,
    titles=selected_titles,
    sim_matrix=bert_tfidf_sim_matrix,
    n=10,
    metric="cosine",
)

In [None]:
model.recommend(
    inputs=multiple_inputs,
    titles=selected_titles,
    sim_matrix=bert_tfidf_sim_matrix,
    n=10,
    metric="cosine",
)