# Imports

In [None]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import gensim.downloader as api
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [None]:
# Inspection cell: displays the current value of X_book_description.
# NOTE(review): in the visible notebook this name is first assigned in the
# "Data Loading" section below, so on a fresh kernel this cell presumably
# raises NameError — confirm whether it is a leftover and should move/go.
X_book_description

# Data Loading

In [None]:
# X_reviews = pd.read_csv("../data/proc_data/X_raw_1000_jsonlines.csv")
X_book_description = pd.read_json("./../data/raw_data/raw_book/metadata.json", lines=True)

In [None]:
X_movie_description = pd.read_csv("./../data/raw_data/raw_movies/df_overview.csv")

In [None]:
X_movie_description.drop(columns="imdb_id", inplace=True)

In [None]:
X_movie_description.rename(columns={"overview": "txt", "id":"item_id"},inplace=True)

In [None]:
X_movie_description.dropna(subset=['txt'], inplace=True)

In [None]:
X_book_description.drop(columns=["url", "authors", "lang", "img", "year"], inplace=True)

In [None]:
def Cleaner_light(df, list_stop_words=None, see_evolution=False):
    """Lightly clean the text of a DataFrame, in place.

    Steps, in order:
      1. Strip every run of non-ASCII characters from all string cells.
      2. Drop the rows that contain a missing value.
      3. Remove the unwanted punctuation characters from the ``txt`` column
         (skipped when the frame has no ``txt`` column).
      4. If ``list_stop_words`` is given, tokenize ``txt`` with
         ``word_tokenize``, drop the stop words and re-join the remaining
         tokens with single spaces.

    Args:
        df (pd.DataFrame): frame to clean. A ``txt`` column is required when
            ``list_stop_words`` is given. The frame is modified in place.
        list_stop_words (list(str), optional): words to remove from ``txt``.
            Each word is matched as given and in its ``str.capitalize()``
            form (film --> film, Film). Defaults to None (no removal).
        see_evolution (bool, optional): print progress messages.
            Defaults to False.

    Returns:
        pd.DataFrame: the same ``df`` object, cleaned.
    """
    # Initialisation
    if see_evolution:
        print("\nStart Cleaner_light ... 🏃\nInitialisation ...\n")

    # Punctuation that is kept   -> !&'(),-.:=?`
    # Punctuation that is removed -> "#$%*+/;<>@[\]^_{|}~
    # NOTE(review): the original comments listed the backtick in both groups;
    # it is kept here because the original code listed it among the kept
    # characters — confirm.
    kept_punctuation = "!&'(),-.:=?`"
    removed_punctuation = "".join(
        char for char in string.punctuation if char not in kept_punctuation
    )
    # One translation table deletes all the unwanted characters in one pass.
    deletion_table = str.maketrans("", "", removed_punctuation)

    # Stop words are matched as given and capitalized: film --> film, Film.
    # A set gives O(1) membership tests in the token filter below.
    stop_words = set()
    if list_stop_words:
        for word in list_stop_words:
            stop_words.add(word)
            stop_words.add(word.capitalize())

    # Run cleaner
    if see_evolution:
        print("Run process ...")

    # Strip non-ASCII characters from every string cell of the frame.
    df.replace({r"[^\x00-\x7F]+": ""}, regex=True, inplace=True)

    # Drop incomplete rows before the per-text steps so that a missing text
    # cannot reach the tokenizer. None of the steps below creates missing
    # values, so the surviving rows are the same as when dropping last.
    df.dropna(inplace=True)

    if "txt" in df.columns:
        # Non-string values are left untouched rather than raising.
        df["txt"] = df["txt"].map(
            lambda text: text.translate(deletion_table)
            if isinstance(text, str)
            else text
        )

    if stop_words:
        # Tokenizing and re-joining also normalises the spacing around the
        # tokens produced by word_tokenize.
        df["txt"] = [
            " ".join(
                token for token in word_tokenize(text)
                if token not in stop_words
            )
            for text in df["txt"]
        ]

    if see_evolution:
        print("\n✅ Cleaner_light is done !\n")

    return df



In [None]:
def flatten_txt(data,id="item_id",colname="txt"):
    """
    Concatenate the texts of a DataFrame per identifier.

    The rows of `data` are grouped by the `id` column and, within each group,
    the `colname` values are joined with a single space. The result has one
    row per distinct `id` value and only the `id` and `colname` columns.
    """
    per_item = data.groupby(id, as_index=False)
    aggregation = {colname: " ".join}
    return per_item.agg(aggregation)

In [None]:
X_book_description.rename(columns={"description": "txt"},inplace=True)

In [None]:
X_book_description.dropna(subset=['txt'], inplace=True)

In [None]:
chunk_flat=flatten_txt(data=X_book_description,id="item_id",colname="txt")
chunk_flat_clean=Cleaner_light(chunk_flat, see_evolution=True)

In [None]:
chunk_flat_movie=flatten_txt(data=X_movie_description,id="item_id",colname="txt")
movie_description_cleaned=Cleaner_light(chunk_flat_movie, see_evolution=True)

In [None]:
movie_description_cleaned.head()

In [None]:
chunk_flat_clean.head()

In [None]:
movie_description_cleaned["is_movie"] = 1
chunk_flat_clean["is_movie"] = 0

In [None]:
X_description = pd.concat([chunk_flat_clean, movie_description_cleaned])

# BERT embedding


In [None]:
from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering, Birch, BisectingKMeans
from sentence_transformers import SentenceTransformer # Make sure you have done a "pip install -e ." to have SentenceTransformer package installed

In [None]:
bert = SentenceTransformer('all-MiniLM-L6-v2') 

In [None]:
bert_embeddings = bert.encode(movie_description_cleaned["txt"]) 
np.shape(bert_embeddings)

In [None]:
bert_embeddings_books = bert.encode(chunk_flat_clean["txt"]) 
np.shape(bert_embeddings)

In [None]:
bert_embedding_complete = np.vstack((bert_embeddings_books,bert_embeddings))

# Agglomerative clustering

In [None]:
N_CLUSTER = 200

In [None]:
clustering_bert = AgglomerativeClustering(n_clusters=N_CLUSTER).fit(bert_embedding_complete)

In [None]:
X_description["clustering_label_bert"] = clustering_bert.labels_

# Loading metadata

In [None]:
metadata_movies = pd.read_json("./../data/raw_data/raw_movies/metadata.json", lines=True)
metadata_books = pd.read_json("./../data/raw_data/raw_book/metadata.json", lines=True)

In [None]:
metadata_movies.rename({"item_id":"item_id_movie", "title":"movie_title"}, axis='columns',inplace=True)
metadata_books.rename({"item_id":"item_id_book", "title":"book_title"}, axis='columns',inplace=True)

In [None]:
merged_movies_complete = pd.merge(X_reviews, metadata_movies, on="item_id_movie", how="left")
merged_all_bert = pd.merge(merged_movies_complete, metadata_books, on="item_id_book", how="left")

In [None]:
clustered = merged_all_bert[["movie_title","book_title","clustering_label_bert","is_movie", "item_id_movie", "item_id_book" ]]
clustered = clustered.fillna("")

# Recommendation