<table><tr>
<td> <img src="https://upload.wikimedia.org/wikipedia/fr/thumb/e/e5/Logo_%C3%A9cole_des_ponts_paristech.svg/676px-Logo_%C3%A9cole_des_ponts_paristech.svg.png" width="200"  height="200" hspace="200"/> </td>
<td> <img src="https://pbs.twimg.com/profile_images/1156541928193896448/5ihYIbCQ_200x200.png" width="200" height="200" /> </td>
</tr></table>

<br/>

<h1><center>Session 6 - Unsupervised modeling</center></h1>



<font size="3">This session is divided into **3** parts:
- **Loading data**
- **Clustering**
- **Topic modeling**

In each of these parts, some **guidelines** and **hints** are given for each task. 
Do not hesitate to check the links to documentation to understand the functions you use. 
    
The goal of this session is to **implement different unsupervised models** to **create clusters** of movies and to see **which topics emerge** from the movie descriptions.
</font>

# 0 - Useful libraries

In [None]:
import pandas as pd
import numpy as np
import random as rd
import json
from matplotlib import pyplot as plt

# 1. Loading data

In [None]:
# Load dataset for unsupervised modeling
data = pd.read_csv('dataset_unsupervised.csv')

# Load tagline data.
# The encoding is passed explicitly: open() otherwise falls back to the platform default, which is not UTF-8
# everywhere and can garble or reject accented characters.
with open('movies_tagline.json', 'r', encoding='utf-8') as f:
    tagline = json.load(f)
# orient='index' builds one row per JSON key; reset_index() then exposes those keys as a regular column
df_tagline = (
    pd.DataFrame.from_dict(tagline, orient='index', columns=['tagline'])
    .reset_index()
    .rename(columns={'index': 'id'})
)
# JSON object keys are always strings: cast them to int (presumably to match the dtype of data['id'] for the merge)
df_tagline['id'] = df_tagline['id'].astype(int)

# Load overview data (same layout and same treatment as the tagline file)
with open('movies_overview.json', 'r', encoding='utf-8') as f:
    overview = json.load(f)
df_overview = (
    pd.DataFrame.from_dict(overview, orient='index', columns=['overview'])
    .reset_index()
    .rename(columns={'index': 'id'})
)
df_overview['id'] = df_overview['id'].astype(int)

# Merge datasets: left joins keep every row of data, even when a movie has no tagline or no overview
data = data.merge(df_tagline, on='id', how='left').merge(df_overview, on='id', how='left')

# Create dictionary to associate a movie ID to its title (will be used for exploration later)
dict_title = data[['id', 'title']].set_index('id').to_dict()['title']

# Drop variables useless for the modeling part (clustering and topic modeling)
data = data.drop(columns=['release_date', 'title', 'index'])

In [None]:
data.head()

# 2. Clustering

In [None]:
# For movie clustering, textual variables are not useful, so let's drop them
data_clustering = data.drop(['overview', 'tagline'], axis=1)

In [None]:
# Normalize data

In [None]:
# Split the dataset into a train part and a test part (with a ratio 80/20 for example)
# Hint: take a look at the train_test_split() function from sklearn 
# (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)
from sklearn.model_selection import train_test_split
X_train, X_test = 

## 2.1. K-means

In [None]:
from sklearn.cluster import KMeans

In [None]:
X_test_kmeans = X_test.copy()

In [None]:
# Choose a number k of clusters 
k = 

In [None]:
# Initialize a K-means model

In [None]:
# Fit the K-means model to your data

In [None]:
# Predict on your test set
predictions = 

In [None]:
# Explore your results
# Hint: merge your predictions with your initial dataset X_test_kmeans and use the dict_title object to retrieve
# the title associated with each movie ID. From there you will be able to see which movies are in each cluster
X_test_kmeans['prediction'] = 
X_test_kmeans['title'] = 

In [None]:
# Print the first 10 movies (identified by their titles) associated with each cluster
# Hint: use a for loop on the number of clusters and use .loc to find movies that are related to the given cluster

In [None]:
# Try the elbow method to find the optimal number k of clusters. Train a new K-means model with this number, make 
# predictions and explore the results
# Hint: use the function plot_elbow() to determine the optimal k
X_test_kmeans = X_test.copy()

def plot_elbow(X_train, K):
    """
    Plot the K-means inertia obtained for every number of clusters from 1 to K - 1

    Parameters
    ----------
    X_train:
        data each K-means model is fitted on
    K: int
        exclusive upper bound of the numbers of clusters to try
    """
    candidate_ks = range(1, K)
    # One model per candidate k; random_state is fixed so that the curve is the same on every run
    inertias = [
        KMeans(n_clusters=n_clusters, random_state=0).fit(X_train).inertia_
        for n_clusters in candidate_ks
    ]
    plt.plot(candidate_ks, inertias, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow Method For Optimal k')
    plt.show()
    


In [None]:
# Define optimal k and re-train a K-means model
optimal_k = 

In [None]:
# Make predictions on test set and add corresponding title



In [None]:
# Print the first 10 movies associated with each cluster



# 3. Topic modeling

In [None]:
import wordcloud
from nlpretext import Preprocessor
from nlpretext.basic.preprocess import (
    fix_bad_unicode, lower_text, remove_eol_characters, remove_accents, remove_punct, remove_stopwords,
    normalize_whitespace
)

In [None]:
#!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
#!pip install https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.3.0/fr_core_news_sm-2.3.0.tar.gz

In [None]:
custom_stopwords = ['vie', 'ans', 'jeune', 'film', 'femme', 'homme', 'famille', 'pere', 'fille', 'mere', 'monde',
                    'jour', 'ete']

def preprocess(df_text, custom_stopwords=custom_stopwords):
    """
    Clean the 'text' column of a dataframe and split it into tokens

    Rows whose text is a single space are dropped. The caller's dataframe is left untouched.

    Parameters
    ----------
    df_text: pd.DataFrame
        dataframe with a 'text' column
    custom_stopwords: list of str
        extra stopwords passed to remove_stopwords (defaults to the list defined above)

    Returns
    -------
    pd.DataFrame
        the kept rows, with two extra columns: 'text_prepro' (cleaned text) and 'tokens' (cleaned text split on
        whitespace)
    """
    # .copy() makes the filtered rows an independent dataframe: adding columns to a bare slice is a chained
    # assignment, which triggers pandas' SettingWithCopyWarning
    df_text = df_text.loc[df_text['text'] != ' '].copy()

    text_prepro = df_text['text'].map(fix_bad_unicode)
    for clean in (lower_text, remove_eol_characters, remove_accents, remove_punct):
        text_prepro = text_prepro.map(clean)
    # Stopwords are removed after lowercasing and accent removal (the custom list above is unaccented lowercase)
    text_prepro = text_prepro.map(lambda x: remove_stopwords(
        x, lang='fr', custom_stopwords=custom_stopwords))
    df_text['text_prepro'] = text_prepro.map(normalize_whitespace)
    df_text['tokens'] = df_text['text_prepro'].map(lambda x: x.split())
    return df_text


def make_word_cloud(text_df: pd.DataFrame):
    """
    Function that computes and displays a word cloud from tokens

    Parameters
    ----------
    text_df: pd.DataFrame
        dataframe with a 'tokens' column holding, for each row, a list of str tokens

    Raises
    ------
    TypeError
        if the tokens of a row cannot be joined as strings (e.g. a token is not a str)
    """
    # The tokens of all rows are joined into one text. str.join raises TypeError when given something else
    # than strings, which is re-raised with an explicit message.
    try:
        text = ' '.join(' '.join(tokens) for tokens in text_df['tokens'])
    except TypeError as err:
        raise TypeError('text_df contains non str values') from err
    wordcloud_ = wordcloud.WordCloud(background_color='white', width=700, height=500).generate(text)
    plt.imshow(wordcloud_)
    plt.axis("off")
    plt.show()

In [None]:
# Create a 'text' column that is a combination of the tagline and the overview
# Example:
# - tagline = 'Il reprend du service.'
# - overview = 'Arthur Bishop pensait qu'il avait mis son passé de tueur à gages derrière lui. ... etc'
# -> text = 'Il reprend du service. Arthur Bishop pensait qu'il avait mis son passé de tueur à gages derrière lui. ... etc'
data['text'] = 

In [None]:
# Check the preprocessing on the first movie only: raw text, then cleaned text, then tokens
preprocessed_text = preprocess(data.iloc[0].to_frame().T)
print(f"Raw text: {data['text'][0]}")
print()
print(f"Preprocessed text: {preprocessed_text['text_prepro'][0]}")
print()
print(f"Preprocessed tokens: {preprocessed_text['tokens'][0]}")

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Preprocess the 'text' column on the entire dataframe:
# - transform the text to lowercase
# - remove end of line characters
# - remove accents
# - remove punctuation
# - remove stopwords
# - split the preprocessed text into words (i.e. tokens) and store it into a new column named 'tokens'
# Hint: use the 'preprocess' function defined above

data = 

## 3.1 LDA

In [None]:
import gensim
import gensim.corpora as corpora

In [None]:
# Create a list of all tokens associated to each movie and store it into the variable data_words
# Hint: during the preprocessing, a new column 'tokens' has been created in the dataframe 'data'
data_words = 

In [None]:
# Create a Dictionary based on all the words per movie
# Hint: explore the gensim library (specifically the 'corpora' section...)

In [None]:
# Create a Corpus for each text associated to each movie

In [None]:
# Define a number of topics
num_topics = 

In [None]:
# Train an LDA model
# Hint: explore the gensim library (specifically the 'models' section...)
lda_model = 

In [None]:
# Print the keywords in the n topics
# Hint: maybe the gensim LDA model has a built-in function to do so...

In [None]:
import pyLDAvis
import pyLDAvis.gensim

pyLDAvis.enable_notebook()

# Visualize the topics
# Hint: use pyLDAvis




In [None]:
# What are your conclusions regarding the topics? (coherence, stability, ...)

## 3.2 To go further on topic modeling: Top2Vec

In [None]:
from top2vec import Top2Vec
# Pre-requisites for top2vec: keras_applications, keras_preprocessing

### 3.2.1 Topics interpretation

In [None]:
# Read the already trained Top2Vec model back from disk
model_bis = Top2Vec.load('top2vec_imdb.pickle')

# Summary of the model: how many topics were found, and the size of each of them
n_topics_found = model_bis.get_num_topics()
topic_sizes, topic_nums = model_bis.get_topic_sizes()

print(f'Number of topics found: {n_topics_found}')
print('Topic sizes:')
for n in range(n_topics_found):
    # One line per topic followed by a separator line
    print(f'Topic  {n}  - Size:  {topic_sizes[n]}', ' ----- ', sep='\n')

In [None]:
# Results exploration, part 1: read the texts with the best scores in a cluster

# "nb" selects the cluster to explore: change it between 0 and 17
# Clusters are sorted according to the number of texts they contain (exploring clusters between 0 and 10 may be
# enough to have a good overview of what the main clusters represent)
nb = 0

# "num_docs" sets the number of texts displayed for the cluster
num_docs = 7

documents, document_scores, document_ids = model_bis.search_documents_by_topic(topic_num=nb, num_docs=num_docs)
for doc, score, doc_id in zip(documents, document_scores, document_ids):
    # Header line, then the text itself, then a separator
    print(f"Document: {doc_id}, Score: {score}", doc, "-----------", sep="\n")

In [None]:
# Results exploration, part 2: draw the word cloud of each cluster
# A word cloud gives a quick overview of the most important words of a topic,
# which helps to interpret that topic

import warnings
warnings.filterwarnings('ignore')

for nb in range(11):
    # Fetch every text of the topic (num_docs is set to the topic size), then preprocess them into tokens
    documents, document_scores, document_ids = model_bis.search_documents_by_topic(
        topic_num=nb, num_docs=topic_sizes[nb])
    df_text = preprocess(pd.DataFrame({'text': documents}))
    print(f'Topic {nb}')
    make_word_cloud(df_text)

In [None]:
# What would be your interpretation of each topic?
# Topic 0: 
# Topic 1: 
# Topic 2: 
# Topic 3: 
# Topic 4: 
# Topic 5: 
# Topic 6: 
# Topic 7: 
# Topic 8: 
# Topic 9: 
# Topic 10: 

### 3.2.2 To go further: train your own Top2Vec model

In [None]:
# Create a new dataset for top2vec model, from the 'data' dataframe, do not take movies with empty text 
# into account
data_top2vec = 

# Make a list from the 'text' column that will contain all texts associated to all movies
all_texts = 

In [None]:
# Train a top2vec model
# Hint: use the "speed" argument to make the training faster
model = 

In [None]:
# Explore the results:
# - number of topics found
# - topics sizes
# Hint: check the documentation about top2vec to see what are the attributes of the trained model

In [None]:
# Save your model for later if you want to explore it in more details
# model.save('top2vec_imdb_bis.pickle')