# Imports

In [1]:
import ast
import itertools
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

In [2]:
# from tensorflow.keras.preprocessing.text import text_to_word_sequence
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
import gensim.downloader as api

# Data loading

In [3]:
# Read the cleaned review samples, one CSV per category (movies first, then books).
X_movies, X_books = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./../data/sample_movies_reviews_clean.csv",
        "./../data/sample_books_reviews_clean.csv",
    )
)

### Extracting the lists out of the strings and removing unnecessary columns

In [4]:
# The "txt" column holds each review's list serialized as a string (see the heading above).
# ast.literal_eval only parses Python literals, so unlike eval() it cannot execute
# code that happens to be embedded in the CSV.
X_books["txt_flatten"] = X_books["txt"].apply(ast.literal_eval)
X_movies["txt_flatten"] = X_movies["txt"].apply(ast.literal_eval)
# Drop the raw string column now that it is parsed, plus "Unnamed: 0"
# (presumably the index column written when the CSV was saved).
X_movies.drop(columns=["Unnamed: 0", "txt"], inplace=True)
X_books.drop(columns=["Unnamed: 0", "txt"], inplace=True)

In [5]:
# Collapse to one row per item. Summing the list-valued "txt_flatten" column
# concatenates the lists of all reviews belonging to the same item.
X_books_agg = (
    X_books.groupby("item_id").sum()
    .assign(is_movie=0)            # category flag: 0 = book
    .reset_index()                 # keep item_id as a regular column
    .rename(columns={"item_id": "item_id_book"})
)
X_movies_agg = (
    X_movies.groupby("item_id").sum()
    .assign(is_movie=1)            # category flag: 1 = movie
    .reset_index()
    .rename(columns={"item_id": "item_id_movie"})
)
# ignore_index=True gives the stacked frame a unique 0..n-1 index instead of
# repeating the labels of both inputs (same convention as X_reviews_bert).
X_reviews = pd.concat([X_movies_agg, X_books_agg], ignore_index=True)

### Grouping the reviews by movies_id and book_id
### Adding a "is_movie" column to keep track of the category of each item
### Resetting index in order to keep the item_id for each book and movie

In [None]:
# Number of review rows per movie, one histogram per remaining column.
reviews_per_movie = X_movies.groupby("item_id").count()
reviews_per_movie.hist(bins=list(range(20)))

In [None]:
# Number of review rows per book, one histogram per remaining column.
reviews_per_book = X_books.groupby("item_id").count()
reviews_per_book.hist(bins=list(range(40)))

### Looking at the distribution of review lengths

In [None]:
# Length of the aggregated list of each movie.
X_movies_agg["length_txt"] = X_movies_agg["txt_flatten"].map(len)

In [None]:
# Histogram of the aggregated lengths with bin edges 0, 10, ..., 240.
X_movies_agg["length_txt"].hist(bins=list(range(0, 250, 10)))
plt.title("# of words per movie review")
plt.ylabel("Frequency")
plt.xlabel("length_txt");

In [None]:
# Length of the aggregated list of each book.
X_books_agg["length_txt"] = X_books_agg["txt_flatten"].map(len)

In [None]:
# Histogram of the aggregated lengths with bin edges 0, 100, ..., 1900.
X_books_agg["length_txt"].hist(bins=list(range(0, 2000, 100)))
plt.title("# of words per book review")
plt.ylabel("Frequency")
plt.xlabel("length_txt");

In [None]:
# X_reviews was concatenated before "length_txt" was added to the per-category
# frames, so it has no such column (KeyError on a fresh run). Derive the lengths
# directly from the lists instead.
X_reviews["txt_flatten"].apply(len).hist(bins=50)
plt.xlabel("length_txt")
plt.ylabel("Frequency")
plt.title("# of words per review (book + movie)");

# Word2Vec

In [None]:
# Show the names of the pretrained models offered by gensim's downloader
available_models = api.info()['models']
print(list(available_models))

In [None]:
#TODO use another word2vec
#TODO use BERT --> preprocessing may not be needed?

In [None]:
# Load pretrained vectors through gensim's downloader. Despite the variable name,
# the requested model is "glove-wiki-gigaword-300" (GloVe vectors), not a Word2Vec model.
word2vec_transfer = api.load("glove-wiki-gigaword-300")

In [None]:
# Serialize the loaded embedding model so later sessions can reuse it
with open('./../notebook_temp/word2vec.pickle', 'wb') as cache_file:
    pickle.dump(word2vec_transfer, cache_file, pickle.HIGHEST_PROTOCOL)

# To restore it later (deserialize):
# with open('filename.pickle', 'rb') as handle:
#     unserialized_data = pickle.load(handle)

In [None]:
# Embedding dimension, then vocabulary size, one per line
print(
    word2vec_transfer.vector_size,
    len(word2vec_transfer.key_to_index),
    sep="\n",
)

## Old embedding version

In [None]:
def embed_sentence_with_TF(word2vec, sentence):
    """Stack the embedding vectors of the words of ``sentence`` known to ``word2vec``.

    Words that are not in ``word2vec`` are skipped. The result is a NumPy array
    with one row per known word, in sentence order; it is an empty 1-D array
    when no word is known.
    """
    return np.array([word2vec[word] for word in sentence if word in word2vec])


def embedding(word2vec, sentences):
    """Embed every sentence; returns a list with one array per sentence, in input order."""
    return [embed_sentence_with_TF(word2vec, sentence) for sentence in sentences]

In [None]:
# Embed the training and test sentences
# X_embed = embedding(word2vec_transfer, X_reviews["txt_flatten"])

In [None]:
# Pad the training and test embedded sentences
# TODO: maxlen could be increased
# X_pad = pad_sequences(X_embed, dtype='float32', padding='post', maxlen=1000, value=0)

In [None]:
# Disabled: X_embed is only produced by the commented-out embedding() call above,
# so this cell raises NameError on a fresh "Restart & Run All".
# np.shape(X_embed)

In [None]:
# Disabled: X_embed is only produced by the commented-out embedding() call above,
# so this cell raises NameError on a fresh "Restart & Run All".
# np.shape(X_embed[3])

## Vectorizing

In [None]:
def vectorize(list_of_docs, model):
    """Average the word vectors of each document into one vector per document.

    Args:
        list_of_docs: Iterable of documents, each an iterable of tokens.
        model: Word-embedding lookup supporting ``token in model``,
            ``model[token]`` and a ``vector_size`` attribute
            (e.g. Gensim's KeyedVectors).

    Returns:
        List with one 1-D array per document: the mean of the vectors of the
        tokens known to ``model``, or a zero vector of length
        ``model.vector_size`` when the document contains no known token.
    """
    features = []

    for tokens in list_of_docs:
        # The membership test already filters out unknown tokens, so the lookup
        # needs no extra KeyError handling.
        vectors = [model[token] for token in tokens if token in model]
        if vectors:
            features.append(np.asarray(vectors).mean(axis=0))
        else:
            # Allocate the fallback only when needed, and freshly each time so
            # no two documents share the same array.
            features.append(np.zeros(model.vector_size))
    return features


In [None]:
# One averaged embedding vector per row of X_reviews (movies and books together).
vectorized_docs = vectorize(X_reviews["txt_flatten"], model=word2vec_transfer)

In [None]:
# (number of document vectors, length of the first document vector)
len(vectorized_docs), len(vectorized_docs[0])

### BERT embedding

In [None]:
from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering, Birch, BisectingKMeans
from sentence_transformers import SentenceTransformer

In [None]:
# SentenceTransformer model used for the BERT embeddings.
# Tested with "all-MiniLM-L6-v2"; the author notes "all-mpnet-base-v2" as the most powerful model.
# NOTE: pickle.load can execute arbitrary code from the file - only load caches you created yourself.
try:
    # Load data (deserialize)
    with open('./../notebook_temp/bert.pickle', 'rb') as handle:
        bert = pickle.load(handle)
except FileNotFoundError:
    # No cache yet (e.g. fresh checkout): build the model, then store it (serialize)
    # so the next run can load it from disk.
    bert = SentenceTransformer('all-MiniLM-L6-v2')
    with open('./../notebook_temp/bert.pickle', 'wb') as handle:
        pickle.dump(bert, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import glob
import os

In [None]:
# Pattern matching every CSV file of the processed book data
joined_files = os.path.join("./../data/proc_data/proc_book/", "*.csv")

# glob returns paths in arbitrary order; sort them so the concatenated row order
# is the same on every run (the pickled BERT embeddings are matched to rows by position).
joined_list = sorted(glob.glob(joined_files))


In [None]:
  
# Read every listed CSV and stack them into one frame with a fresh 0..n-1 index
X_books_bert = pd.concat(
    [pd.read_csv(csv_path) for csv_path in joined_list],
    ignore_index=True,
)

In [None]:
# Display the concatenated book frame for a visual check.
X_books_bert

In [None]:
# NOTE(review): only the single file "mov_chunk1_raw.csv" is read for movies, whereas the
# book data concatenates every CSV found in its directory - confirm this is intentional.
X_movies_bert = pd.read_csv("./../data/proc_data/proc_movies/mov_chunk1_raw.csv")

In [None]:
# Display the movie frame for a visual check.
X_movies_bert

In [None]:
# Tag each frame with its category (0 = book, 1 = movie) and give item_id a
# category-specific name so both id columns survive the concatenation.
X_books_bert = X_books_bert.assign(is_movie=0).rename(columns={"item_id": "item_id_book"})
X_movies_bert = X_movies_bert.assign(is_movie=1).rename(columns={"item_id": "item_id_movie"})
# No reset_index needed on the inputs: ignore_index=True renumbers the stacked rows.
X_reviews_bert = pd.concat([X_movies_bert, X_books_bert], ignore_index=True)

In [None]:
# Pass a plain list of strings, the input type documented for SentenceTransformer.encode,
# rather than a pandas Series.
# Open question from the author: encode the raw text, or the vectorized docs?
bert_embeddings = bert.encode(X_reviews_bert["txt"].tolist())

In [None]:
# Cache the computed embeddings on disk (serialize)
with open('./../notebook_temp/bert_embeddings.pickle', 'wb') as cache_file:
    pickle.dump(bert_embeddings, cache_file, pickle.HIGHEST_PROTOCOL)

In [None]:
# Restore the cached embeddings from disk (deserialize)
with open('./../notebook_temp/bert_embeddings.pickle', 'rb') as cache_file:
    bert_embeddings = pickle.load(cache_file)

In [None]:
# Shape of the embedding array - presumably (number of rows in X_reviews_bert, embedding size).
np.shape(bert_embeddings)

In [None]:
# Number of clusters shared by the KMeans and BisectingKMeans fits below.
# The AgglomerativeClustering cells hard-code their own cluster counts.
N_CLUSTERS = 50

### MiniBatchKmeans (Old)

In [None]:
# km = MiniBatchKMeans(n_clusters=75, batch_size=500).fit(vectorized_docs)

In [None]:
# Disabled: `km` is only created by the commented-out MiniBatchKMeans fit above,
# so this assignment raises NameError on a fresh "Restart & Run All".
# X_reviews["batch_label"] = km.labels_

### Kmeans

In [None]:
# Fix the seed so the cluster labels are reproducible across runs
# (same random_state as the BisectingKMeans fits).
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=1).fit(vectorized_docs)

In [None]:
# Fix the seed so the cluster labels are reproducible across runs
# (same random_state as the BisectingKMeans fits).
kmeans_bert = KMeans(n_clusters=N_CLUSTERS, random_state=1).fit(bert_embeddings)

In [None]:
# Attach the KMeans cluster label of every row to its source frame
# (labels are arrays, so they are matched to rows by position).
X_reviews_bert["Kmeans_label_bert"] = kmeans_bert.labels_
X_reviews["Kmeans_label"] = kmeans.labels_

### Agglomerative clustering

In [None]:
# Agglomerative clustering of the averaged word vectors into 100 clusters,
# using the estimator's default linkage and metric.
clustering = AgglomerativeClustering(n_clusters=100).fit(vectorized_docs)

In [None]:
# Agglomerative clustering of the BERT embeddings into 1500 clusters,
# using the estimator's default linkage and metric.
clustering_bert = AgglomerativeClustering(n_clusters=1500).fit(bert_embeddings)

In [None]:
# Attach the agglomerative cluster label of every row to its source frame
# (labels are arrays, so they are matched to rows by position).
X_reviews_bert["clustering_label_bert"] = clustering_bert.labels_
X_reviews["clustering_label"] = clustering.labels_

#### Silhouette scoring

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Silhouette score of the agglomerative clustering on the averaged word vectors.
silhouette_score(vectorized_docs, clustering.labels_)

In [None]:
# Silhouette score of the agglomerative clustering on the BERT embeddings.
silhouette_score(bert_embeddings, clustering_bert.labels_)

In [None]:
# Candidate values, presumably for an AgglomerativeClustering parameter search
# (the lists are not used by the cells visible here).
linkage = ["ward", "complete", "average", "single"]
n_clusters = [50, 75, 100, 125, 150]
# scikit-learn spells this metric "euclidean"; the previous "euclidian" is not a valid name.
metric = ["euclidean", "l1", "l2", "manhattan", "cosine"]

In [None]:
# Note: this rebinds clustering_bert (previously the 1500-cluster fit).
clustering_bert = AgglomerativeClustering(n_clusters=3000, metric="cosine", linkage="complete").fit(bert_embeddings)
# Score with the same distance the clusters were built with; silhouette_score
# defaults to the euclidean metric, which would not match the cosine clustering.
silhouette_score(bert_embeddings, clustering_bert.labels_, metric="cosine")

In [None]:
# For each candidate cluster count, keep both the fitted model and its silhouette score.
opti_k = {}

for cluster in (50, 100, 200):
    fitted = AgglomerativeClustering(n_clusters=cluster).fit(bert_embeddings)
    opti_k[f"clustering_bert_{cluster}"] = fitted
    opti_k[f"silhouette_score_{cluster}"] = silhouette_score(bert_embeddings, fitted.labels_)

opti_k

In [None]:
# Same sweep as above for larger cluster counts: store each fitted model and its silhouette score.
opti_k2 = {}

for cluster in (1500, 2000, 2500, 3000):
    fitted = AgglomerativeClustering(n_clusters=cluster).fit(bert_embeddings)
    opti_k2[f"clustering_bert_{cluster}"] = fitted
    opti_k2[f"silhouette_score_{cluster}"] = silhouette_score(bert_embeddings, fitted.labels_)

opti_k2

### BisectingKMeans 

In [None]:
# Fit one seeded BisectingKMeans per feature set: averaged word vectors first, then BERT embeddings.
bisect_means, bisect_means_bert = (
    BisectingKMeans(n_clusters=N_CLUSTERS, random_state=1).fit(features)
    for features in (vectorized_docs, bert_embeddings)
)

In [None]:
# Attach the BisectingKMeans cluster label of every row to its source frame
# (labels are arrays, so they are matched to rows by position).
X_reviews_bert["bisectings_bert"] = bisect_means_bert.labels_
X_reviews["bisectings"] = bisect_means.labels_

### Loading metadata and merging it

In [None]:
# Read the line-delimited JSON metadata, movies first and then books.
metadata_movies, metadata_books = (
    pd.read_json(json_path, lines=True)
    for json_path in (
        "./../data/raw_data/raw_movies/metadata.json",
        "./../data/raw_data/raw_book/metadata.json",
    )
)

In [None]:
# Give the id and title columns category-specific names so they line up with the
# review frames and do not collide once both metadata sets are merged.
metadata_movies = metadata_movies.rename(
    columns={"item_id": "item_id_movie", "title": "movie_title"}
)
metadata_books = metadata_books.rename(
    columns={"item_id": "item_id_book", "title": "book_title"}
)

In [None]:
# Left-join the movie metadata, then the book metadata, keeping every review row.
merged_movies = X_reviews.merge(metadata_movies, on="item_id_movie", how="left")
merged_all = merged_movies.merge(metadata_books, on="item_id_book", how="left")

In [None]:
# Same two left joins for the BERT frame: movie metadata first, then book metadata.
merged_movies_bert = X_reviews_bert.merge(metadata_movies, on="item_id_movie", how="left")
merged_all_bert = merged_movies_bert.merge(metadata_books, on="item_id_book", how="left")

In [None]:
# Keep only the titles, the three cluster labels and the category flag;
# blank out missing values so the title columns can be searched as strings.
final_columns = ["movie_title","book_title","clustering_label", "Kmeans_label", "bisectings", "is_movie" ]
final_df = merged_all[final_columns].fillna("")
# Cluster sizes of the agglomerative clustering
final_df["clustering_label"].value_counts()

In [None]:
# NOTE(review): this rebinds `bert`, which held the SentenceTransformer model until here;
# re-running the encoding cell after this one would call .encode on a DataFrame.
# Consider a distinct name for this frame (the cells below would need the same rename).
bert_columns = ["movie_title","book_title","clustering_label_bert", "Kmeans_label_bert", "bisectings_bert", "is_movie" ]
bert = merged_all_bert[bert_columns].fillna("")

In [None]:
# Rows whose movie title mentions "potter" (case-insensitive)
is_potter_movie = final_df["movie_title"].str.contains("potter", case=False)
final_df[is_potter_movie]

In [None]:
# Rows whose book title mentions "potter" (case-insensitive)
is_potter_book = final_df["book_title"].str.contains("potter", case=False)
final_df[is_potter_book]

In [None]:
# BERT results: rows whose movie title mentions "potter" (case-insensitive)
is_potter_movie_bert = bert["movie_title"].str.contains("potter", case=False)
bert[is_potter_movie_bert]

In [None]:
# BERT results: rows whose book title mentions "potter" (case-insensitive)
is_potter_book_bert = bert["book_title"].str.contains("potter", case=False)
bert[is_potter_book_bert]