# Imports

In [8]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import gensim.downloader as api
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Data Loading

In [39]:
# X_reviews = pd.read_csv("../data/proc_data/X_raw_1000_jsonlines.csv")
X_book_description = pd.read_json("./../data/raw_data/raw_book/metadata.json", lines=True)

In [40]:
X_book_description.drop(columns=["url", "authors", "lang", "img", "year"], inplace=True)

In [41]:
def Cleaner_light(df, list_stop_words=None, see_evolution=False):
    """Cleaner_light

    Args:
        df (pd.DataFrame): need a column txt for the process
        list_stop_words (list(str), optional): a list for remove a few word. Defaults to None.
        see_evolution (bool, optional): print the evolution of the function. Defaults to False.

    Returns:
        pd.DataFrame: return the DataFrame processed
    """
    # Initialisation
    if see_evolution:
        print("\nStart Cleaner_light ... 🏃\nInitialisation ...\n")

    # Liste de ponctuation conservée -> !&'(),-.:=?`
    # Liste de ponctuation supprimée -> "#$%*+/;<>@[\]^_`{|}~
    punctuation = string.punctuation + ""
    punctuation.replace("!&'(),-.:=?`", "")

    # Transformation des stop words, copie et ajout d'une majuscule :
    # film --> film, Film
    if list_stop_words:
        list_stop_words_process = []
        for word in list_stop_words:
            list_stop_words_process.append(word)
            list_stop_words_process.append(word.capitalize())

    # Run cleaner
    if see_evolution:
        print("Run process ...")

    df.replace({r"[^\x00-\x7F]+":""}, regex=True, inplace=True)
    df.replace(punctuation, "")
    df.replace("\n", "")


    if list_stop_words:
        df.txt = [word_tokenize(text) for text in df.txt]
        out_list = []
        for text in df.txt:
            out_text = []
            for word in text:
                if not word in list_stop_words_process:
                    out_text.append(word)
            out_list.append(" ".join(out_text))
        df.txt = out_list

    if see_evolution:
        print("\n✅ Cleaner_light is done !\n")

    return df


In [42]:
X_book_description.rename(columns={"description": "txt"},inplace=True)

In [43]:
X_book_description.dropna(subset=['txt'], inplace=True)

In [44]:
X_book_description_cleaned = Cleaner_light(X_book_description)

In [59]:
X_book_description_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9309 entries, 0 to 9373
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   item_id  9309 non-null   int64 
 1   title    9309 non-null   object
 2   txt      9309 non-null   object
dtypes: int64(1), object(2)
memory usage: 548.9+ KB


In [58]:
X_book_description_cleaned["txt"].apply(lambda x: type(x)).value_counts()

<class 'str'>    9309
Name: txt, dtype: int64

# Bert embedding


In [35]:
from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering, Birch, BisectingKMeans
from sentence_transformers import SentenceTransformer # Make sure you have done a "pip install -e ." to have SentenceTransformer package installed

  from .autonotebook import tqdm as notebook_tqdm
2023-03-13 12:07:27.935276: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-13 12:07:28.615129: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-03-13 12:07:29.802939: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-03-13 12:07:29.803150: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7

In [36]:
bert = SentenceTransformer('all-MiniLM-L6-v2') 

In [46]:
bert_embeddings = bert.encode(X_book_description_cleaned["txt"]) 
np.shape(bert_embeddings)

KeyError: 4901

# Agglomerative clustering

In [None]:
N_CLUSTER = 5_000

In [None]:
clustering_bert = AgglomerativeClustering(n_clusters=N_CLUSTER).fit(bert_embeddings)

In [None]:
X_reviews["clustering_label_bert"] = clustering_bert.labels_
X_reviews["vector"] = bert_embeddings # TODO doesn't work

# Loading metadata

In [None]:
metadata_movies = pd.read_json("./../data/raw_data/raw_movies/metadata.json", lines=True)
metadata_books = pd.read_json("./../data/raw_data/raw_book/metadata.json", lines=True)

In [None]:
metadata_movies.rename({"item_id":"item_id_movie", "title":"movie_title"}, axis='columns',inplace=True)
metadata_books.rename({"item_id":"item_id_book", "title":"book_title"}, axis='columns',inplace=True)

In [None]:
merged_movies_complete = pd.merge(X_reviews, metadata_movies, on="item_id_movie", how="left")
merged_all_bert = pd.merge(merged_movies_complete, metadata_books, on="item_id_book", how="left")

In [None]:
clustered = merged_all_bert[["movie_title","book_title","clustering_label_bert","is_movie", "item_id_movie", "item_id_book" ]]
clustered = clustered.fillna("")

# Recommendation