# Imports

In [None]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import gensim.downloader as api
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

In [None]:
# NOTE: `X_book_description` is only created in the "Data Loading" section below.
# Evaluating it here raises NameError on a fresh kernel (Restart & Run All),
# so this premature display is disabled; inspect the frame after it is loaded.
# X_book_description

# Data Loading

In [None]:
# Disabled load of `X_reviews`.
# NOTE(review): a later merge cell still references `X_reviews`; with this line
# commented out that name is never defined in the visible notebook - confirm.
# X_reviews = pd.read_csv("../data/proc_data/X_raw_1000_jsonlines.csv")
# Book metadata; lines=True makes pandas parse the file as JSON Lines (one record per line).
X_book_description = pd.read_json("./../data/raw_data/raw_book/metadata.json", lines=True)

In [None]:
X_movie_description = pd.read_csv("./../data/raw_data/raw_movies/df_overview.csv")

In [None]:
X_movie_description.drop(columns="imdb_id", inplace=True)

In [None]:
X_movie_description.rename(columns={"overview": "txt", "id":"item_id"},inplace=True)

In [None]:
X_movie_description.dropna(subset=['txt'], inplace=True)

In [None]:
X_book_description.drop(columns=["url", "authors", "lang", "img", "year"], inplace=True)

In [None]:
def Cleaner_light(df, list_stop_words=None, see_evolution=False):
    """Lightly clean the text held in a DataFrame.

    Steps, in order:
      1. remove every non-ASCII character from the string cells of ``df``;
      2. remove the ASCII punctuation characters that are not in the keep-list
         (see the comments in the body) from the string cells of ``df``;
      3. if ``list_stop_words`` is given, tokenize ``df.txt`` and drop each stop
         word, matched both as given and capitalized (film -> film, Film);
      4. drop the rows that contain a missing value.

    The frame is modified in place and the same object is returned.

    Args:
        df (pd.DataFrame): needs a ``txt`` column for the stop-word step.
        list_stop_words (list(str), optional): words to remove from ``txt``.
            Defaults to None (no stop-word removal, no tokenization).
        see_evolution (bool, optional): print progress messages. Defaults to False.
    Returns:
        pd.DataFrame: the processed DataFrame (same object as ``df``).
    """
    import re  # local import so the cell does not depend on an extra top-level import

    # Initialisation
    if see_evolution:
        print("\nStart Cleaner_light ... 🏃\nInitialisation ...\n")

    # Punctuation kept     -> !&'(),-.:=?`
    # Punctuation stripped -> "#$%*+/;<>@[\]^_{|}~
    # The stripped set is derived character by character: the previous
    # str.replace() call looked for the keep-list as one contiguous substring
    # (never present) and discarded its result, so nothing was ever stripped.
    kept_punctuation = "!&'(),-.:=?`"
    removed_punctuation = "".join(
        char for char in string.punctuation if char not in kept_punctuation
    )
    punctuation_pattern = "[" + re.escape(removed_punctuation) + "]"

    # Stop words are matched as given and capitalized: film --> film, Film.
    # A set gives constant-time membership tests in the filtering loop.
    stop_words = set()
    if list_stop_words:
        for word in list_stop_words:
            stop_words.add(word)
            stop_words.add(word.capitalize())

    # Run cleaner
    if see_evolution:
        print("Run process ...")

    # Both passes are regex substitutions applied in place to the frame.
    df.replace({r"[^\x00-\x7F]+": ""}, regex=True, inplace=True)
    df.replace({punctuation_pattern: ""}, regex=True, inplace=True)

    if list_stop_words:
        # Tokenize, filter, then re-join with single spaces.
        df["txt"] = [
            " ".join(token for token in word_tokenize(text) if token not in stop_words)
            for text in df["txt"]
        ]

    df.dropna(inplace=True)

    if see_evolution:
        print("\n✅ Cleaner_light is done !\n")

    return df



In [None]:
def flatten_txt(data, id="item_id", colname="txt"):
    """Concatenate the text rows of each item into a single row.

    Rows of ``data`` are grouped by the ``id`` column and the strings found in
    ``colname`` are joined with a single space, so the returned DataFrame has
    one row per distinct ``id`` value.
    """
    space_join = " ".join
    per_item = data.groupby(id, as_index=False)
    return per_item.aggregate({colname: space_join})

In [None]:
X_book_description.rename(columns={"description": "txt"},inplace=True)

In [None]:
X_book_description.dropna(subset=['txt'], inplace=True)

In [None]:
chunk_flat=flatten_txt(data=X_book_description,id="item_id",colname="txt")
chunk_flat_clean=Cleaner_light(chunk_flat, see_evolution=True)

In [None]:
chunk_flat_movie=flatten_txt(data=X_movie_description,id="item_id",colname="txt")
movie_description_cleaned=Cleaner_light(chunk_flat_movie, see_evolution=True)

In [None]:
movie_description_cleaned.head()

In [None]:
chunk_flat_clean.head()

In [None]:
movie_description_cleaned["is_movie"] = 1
chunk_flat_clean["is_movie"] = 0

In [None]:
X_description = pd.concat([chunk_flat_clean, movie_description_cleaned])

# BERT embeddings


In [None]:
from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering, Birch, BisectingKMeans
from sentence_transformers import SentenceTransformer # Make sure you have done a "pip install -e ." to have SentenceTransformer package installed

In [None]:
bert = SentenceTransformer('all-MiniLM-L6-v2') 

In [None]:
# Encode the movie descriptions. A plain list is passed instead of the Series:
# the encoder indexes its input by integer position, and a pandas Series indexes
# by label, which breaks as soon as the frame's index has gaps.
bert_embeddings = bert.encode(movie_description_cleaned["txt"].tolist())
np.shape(bert_embeddings)

In [None]:
# Encode the book descriptions. A plain list is passed instead of the Series:
# the encoder indexes its input by integer position, and a pandas Series indexes
# by label, which breaks as soon as the frame's index has gaps.
bert_embeddings_books = bert.encode(chunk_flat_clean["txt"].tolist())
# Echo the shape of the book embeddings (previously echoed `bert_embeddings`).
np.shape(bert_embeddings_books)

In [None]:
bert_embedding_complete = np.vstack((bert_embeddings_books,bert_embeddings))

# Agglomerative clustering

In [None]:
N_CLUSTER = 200

In [None]:
clustering_bert = AgglomerativeClustering(n_clusters=N_CLUSTER).fit(bert_embedding_complete)

In [None]:
# The labels are assigned positionally, one per row of `bert_embedding_complete`.
# That matrix stacks the book embeddings first, then the movie embeddings, which is
# the same order in which `X_description` was concatenated; the assignment is only
# correct while both orders stay in sync.
X_description["clustering_label_bert"] = clustering_bert.labels_

# Loading metadata

In [None]:
metadata_movies = pd.read_json("./../data/raw_data/raw_movies/metadata.json", lines=True)
metadata_books = pd.read_json("./../data/raw_data/raw_book/metadata.json", lines=True)

In [None]:
metadata_movies.rename({"item_id":"item_id_movie", "title":"movie_title"}, axis='columns',inplace=True)
metadata_books.rename({"item_id":"item_id_book", "title":"book_title"}, axis='columns',inplace=True)

In [None]:
# Attach movie metadata, then book metadata, with left joins on the renamed id columns.
# NOTE(review): `X_reviews` is not defined by any cell shown in this notebook (its
# read_csv is commented out in the Data Loading section) - confirm its source.
# NOTE(review): `clustering_label_bert` and `is_movie`, selected from the merge result
# in the next cell, were added to `X_description`, not `X_reviews` - verify the input.
merged_movies_complete = pd.merge(X_reviews, metadata_movies, on="item_id_movie", how="left")
merged_all_bert = pd.merge(merged_movies_complete, metadata_books, on="item_id_book", how="left")

In [None]:
clustered = merged_all_bert[["movie_title","book_title","clustering_label_bert","is_movie", "item_id_movie", "item_id_book" ]]
clustered = clustered.fillna("")

# Pickle files for faster loading

In [1]:
import pandas as pd
import pickle

In [11]:
from bookmatch.params import *
from pathlib import Path

In [12]:
filename1=Path(LOCAL_CSV_POSTPROCESS_PATH).joinpath("X_all.pickle")

In [16]:
with open(filename1, 'rb') as handle:
    X_all = pickle.load(handle)

In [4]:
!pwd

/home/arostagnat/code/arostagnat/BookMatch/notebooks


In [7]:
X_all = pd.read_csv("./../data/post_process_data/X_all.csv")

In [10]:
# # Store data (serialize)
with open('./../data/post_process_data/X_all.pickle', 'wb') as handle:
     pickle.dump(X_all, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [2]:
# Load data (deserialize)
with open('./../data/post_process_data/X_all.pickle', 'rb') as handle:
    X_all = pickle.load(handle)

In [3]:
X_all.head()

Unnamed: 0.1,Unnamed: 0,item_id_movie,is_movie,item_id_book,clustering_label_bert,vector,title_movie,title_book
0,0,-1.0,0.0,49566885.0,2048,[-6.75622448e-02 -1.38096279e-02 8.74609575e-...,,"The Contract (The Contract, #1)"
1,1,-1.0,0.0,50941457.0,2048,[ 2.08302843e-03 -1.07728921e-01 2.19612103e-...,,Everything for Her (For Her #1)
2,2,-1.0,0.0,51750675.0,2048,[-1.08228497e-01 -5.80898300e-02 1.58334207e-...,,"The Gender Game (The Gender Game, #1)"
3,3,-1.0,0.0,48125855.0,769,[-2.13483274e-02 -1.30382717e-01 6.59563318e-...,,"The One Real Thing (Hart's Boardwalk, #1)"
4,4,-1.0,0.0,48358625.0,769,[-1.14925623e-01 -5.53144850e-02 1.05959505e-...,,The Wall of Winnipeg and Me


In [12]:
X_vect_b = pd.read_csv("./../data/post_process_data/X_vect_b.csv")

In [4]:
# Load the pickled book vectors.
# NOTE(review): run top to bottom, this overwrites the `X_vect_b` just read from CSV
# in the previous cell and requires X_vect_b.pickle to exist already, although that
# file is only written further down - confirm which of the two loads should remain.
with open('./../data/post_process_data/X_vect_b.pickle', 'rb') as handle:
    X_vect_b = pickle.load(handle)

In [6]:
X_vect_b.head()

Unnamed: 0,item_id_book,0,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,49566885.0,-0.067562,-0.01381,0.087461,0.010315,-0.03419,0.044712,-0.042676,-0.003757,-0.01911,...,0.038768,-0.013075,-0.029805,0.044801,-0.076599,0.070121,0.072632,0.016956,-0.016898,-0.009537
1,50941457.0,0.002083,-0.107729,0.021961,0.127054,-0.030593,0.014972,0.008145,-0.008593,0.053529,...,0.010641,-0.033515,-0.027384,0.058879,0.00933,0.029805,0.09359,-0.032974,-0.018067,-0.058421
2,51750675.0,-0.108228,-0.05809,0.015833,0.080706,-0.050521,0.051526,0.011741,-0.030256,0.043681,...,0.034389,0.027473,-0.00489,0.086042,-0.056711,0.059688,0.047612,-0.023292,-0.055406,0.059669
3,48125855.0,-0.021348,-0.130383,0.065956,0.012416,0.047087,0.028791,-0.077499,-0.062951,0.042728,...,0.043824,-0.011231,-0.09278,0.053753,-0.033295,0.061573,0.027644,-0.020969,-0.093529,0.010398
4,48358625.0,-0.114926,-0.055314,0.10596,-0.048345,0.0077,0.013659,-0.016277,0.022655,0.075105,...,0.035695,0.000972,-0.056826,0.052611,-0.052656,0.041356,0.056243,0.010263,-0.106852,-0.02132


In [7]:
X_vect_b=X_vect_b.set_index("item_id_book")

In [8]:
X_vect_b.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
item_id_book,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
49566885.0,-0.067562,-0.01381,0.087461,0.010315,-0.03419,0.044712,-0.042676,-0.003757,-0.01911,-0.066859,...,0.038768,-0.013075,-0.029805,0.044801,-0.076599,0.070121,0.072632,0.016956,-0.016898,-0.009537
50941457.0,0.002083,-0.107729,0.021961,0.127054,-0.030593,0.014972,0.008145,-0.008593,0.053529,-0.00784,...,0.010641,-0.033515,-0.027384,0.058879,0.00933,0.029805,0.09359,-0.032974,-0.018067,-0.058421
51750675.0,-0.108228,-0.05809,0.015833,0.080706,-0.050521,0.051526,0.011741,-0.030256,0.043681,0.054876,...,0.034389,0.027473,-0.00489,0.086042,-0.056711,0.059688,0.047612,-0.023292,-0.055406,0.059669
48125855.0,-0.021348,-0.130383,0.065956,0.012416,0.047087,0.028791,-0.077499,-0.062951,0.042728,0.001619,...,0.043824,-0.011231,-0.09278,0.053753,-0.033295,0.061573,0.027644,-0.020969,-0.093529,0.010398
48358625.0,-0.114926,-0.055314,0.10596,-0.048345,0.0077,0.013659,-0.016277,0.022655,0.075105,0.005886,...,0.035695,0.000972,-0.056826,0.052611,-0.052656,0.041356,0.056243,0.010263,-0.106852,-0.02132


In [13]:
with open('./../data/post_process_data/X_vect_b.pickle', 'wb') as handle:
     pickle.dump(X_vect_b, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [23]:
# Load the pickled movie vectors.
# NOTE(review): this load comes before the cells that read X_vect_m.csv and write
# X_vect_m.pickle, so on a fresh run the pickle must already exist on disk - confirm
# the intended cell order.
with open('./../data/post_process_data/X_vect_m.pickle', 'rb') as handle:
    X_vect_m = pickle.load(handle)

In [14]:
X_vect_m = pd.read_csv("./../data/post_process_data/X_vect_m.csv")

In [15]:
with open('./../data/post_process_data/X_vect_m.pickle', 'wb') as handle:
     pickle.dump(X_vect_m, handle, protocol=pickle.HIGHEST_PROTOCOL)