In [58]:
# Standard library
import json
import random as rd
from pprint import pprint

# Third-party
import gensim
import gensim.corpora as corpora
import hdbscan
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
import wordcloud
from matplotlib import pyplot as plt
from scipy.special import softmax
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
# Pre-requisites for top2vec: keras_applications, keras_preprocessing
from top2vec import Top2Vec

# [OPTIONAL]
from nlpretext import Preprocessor
from nlpretext.basic.preprocess import *

# 1. Load data

In [200]:
def _read_json(path):
    """Return the parsed content of the JSON file at `path`.

    The file is decoded explicitly as UTF-8 (the encoding mandated for JSON)
    instead of relying on the platform default, which may not be UTF-8.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _text_frame(mapping, column):
    """Turn a dict keyed by movie ID into a DataFrame with columns ['id', column].

    The dict keys become the integer 'id' column and the dict values are
    stored under `column`.
    """
    frame = (
        pd.DataFrame.from_dict(mapping, orient='index', columns=[column])
        .reset_index()
        .rename({'index': 'id'}, axis=1)
    )
    # Dict keys come back as the index; cast them to int so they can be
    # merged on `data['id']` below.
    frame['id'] = frame['id'].astype(int)
    return frame


# Load dataset for unsupervised modeling
data = pd.read_csv('dataset_unsupervised.csv')

# Load tagline data
tagline = _read_json('movies_tagline.json')
df_tagline = _text_frame(tagline, 'tagline')

# Load overview data
overview = _read_json('movies_overview.json')
df_overview = _text_frame(overview, 'overview')

# Merge datasets (left joins: every row of `data` is kept, even when a movie has no tagline/overview)
data = data.merge(df_tagline, on='id', how='left').merge(df_overview, on='id', how='left')

# Create dictionary to associate a movie ID to its title (will be used for exploration later)
dict_title = data.set_index('id')['title'].to_dict()

# Drop variables useless for the modeling part (clustering and topic modeling)
data = data.drop(columns=['release_date', 'title', 'index'])

In [201]:
# Preview the first rows of the merged dataset
data.head()

Unnamed: 0,year,sales,is_part_of_collection,budget,runtime,original_lang_en,original_lang_es,original_lang_fr,original_lang_it,original_lang_ja,...,mean_3_popularity,mean_5_popularity,actor_1_sales,actor_2_sales,actor_3_sales,mean_sales_actor,max_sales_actor,id,tagline,overview
0,2000,139087,0,25000000.0,120.0,0,0,1,0,0,...,0.489158,0.373572,0.0,0.0,0.0,0.0,0.0,2475,,Comme les Mousquetaires dont elles possèdent l...
1,2000,66228,0,22000000.0,142.0,1,0,0,0,0,...,1.404085,1.229533,0.0,0.0,0.0,0.0,0.0,2870,,"New York, été 1977. Alors que la ville connait..."
2,2000,1463152,0,25000000.0,77.0,0,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2706,,
3,2000,32954,0,25000000.0,116.0,1,0,0,0,0,...,0.465217,0.346425,0.0,0.0,0.0,0.0,0.0,11980,,"Félicia, dix-sept ans, traverse la mer d'Irlan..."
4,2000,223564,1,40000000.0,99.0,1,0,0,0,0,...,2.802817,2.044138,0.0,0.0,0.0,0.0,0.0,2480,Il reprend du service.,Arthur Bishop pensait qu'il avait mis son pass...


# 2. Clustering

In [204]:
# Textual variables are not useful for movie clustering, so leave them out
data_clustering = data.drop(columns=['overview', 'tagline'])

In [205]:
# Split the dataset into a train part and a test part (with a ratio 80/20 for example)
# Hint: take a look at the train_test_split() function from sklearn 
# (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)
X_train, X_test = 

## 2.1. K-means

In [247]:
# Work on a copy of the test set so changes made to X_test_kmeans do not modify X_test
X_test_kmeans = X_test.copy()

In [248]:
# Choose a number k of clusters 
k = 

In [249]:
# Initialize a K-means model

In [250]:
# Fit the K-means model to your data

In [251]:
# Predict on your test set
predictions = 

In [252]:
# Explore your results
# Hint: merge your predictions with your initial dataset X_test_kmeans and use the dict_title object to retrieve 
# which title is associated to each movie ID. From there you will be able to know which movies are in each cluster

In [1]:
# Print the first 10 movies associated with each cluster

In [3]:
# To go further:
# Try the elbow method to find the optimal number k of clusters. Train a new K-means model with this number, make predictions and explore the
# results
# Hint: use the function plot_elbow() to determine the optimal k
# Re-create X_test_kmeans as a fresh copy of the test set
X_test_kmeans = X_test.copy()

def plot_elbow(X_train, K):
    """
    Plot the K-means inertia against the number of clusters (elbow method)

    One KMeans model (random_state=0) is fitted on X_train for every
    k in range(1, K), i.e. K itself is NOT tried.

    Parameters
    ----------
    X_train:
        training data passed to KMeans.fit
    K: int
        exclusive upper bound of the numbers of clusters to try
    """
    candidate_ks = range(1, K)
    inertias = [
        KMeans(n_clusters=n_clusters, random_state=0).fit(X_train).inertia_
        for n_clusters in candidate_ks
    ]

    plt.plot(candidate_ks, inertias, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow Method For Optimal k')
    plt.show()

In [255]:
# Define optimal k and re-train a K-means model
optimal_k = 


# Make predictions on test set and add corresponding title


# Print the first 10 movies associated with each cluster


## 2.2. To go further on clustering: HDBSCAN
HDBSCAN is another clustering model. 
Its advantage is that it determines automatically the number of clusters, it does not need to be defined beforehand.

In [257]:
# Work on a copy of the test set so changes made to X_test_hdbscan do not modify X_test
X_test_hdbscan = X_test.copy()

In [4]:
# Initialize an HDBSCAN model
# Hint: take a look at the hdbscan package (https://hdbscan.readthedocs.io/en/latest/)

In [5]:
# Find the number of clusters that HDBSCAN has determined

In [260]:
# Make predictions on the test set
# Hint: check the methods associated to an HDBSCAN model 
# (https://hdbscan.readthedocs.io/en/latest/api.html#hdbscan)

In [261]:
# Append predictions and scores to the test set


# Add the title associated to each movie


In [6]:
# Explore results

# 3. Topic modeling

In [193]:
def make_word_cloud(text_df: pd.DataFrame, cmp=None):
    """
    Function that computes and displays a word cloud from tokens

    Parameters
    ----------
    text_df: pd.DataFrame
        dataframe with a 'tokens' column; each entry must be an iterable
        of str (the tokens of one document)
    cmp: matplotlib colormap or str, optional
        colormap for the wordcloud; when None (default) the wordcloud
        keeps its default colors

    Raises
    ------
    TypeError
        if an entry of the 'tokens' column is not an iterable of str
        (e.g. a missing value or a list containing numbers)
    """
    # str.join raises TypeError on non-iterable entries (e.g. NaN) and on
    # non-str tokens; surface that with the message this function intends.
    try:
        text = ' '.join(' '.join(tokens) for tokens in text_df['tokens'])
    except TypeError as err:
        raise TypeError('text_df contains non str values') from err

    wordcloud_ = wordcloud.WordCloud(background_color='white', width=700, height=500).generate(text)
    if cmp is not None:
        wordcloud_.recolor(colormap=cmp)

    plt.imshow(wordcloud_)
    plt.axis("off")
    plt.show()

In [268]:
# Create a 'text' column that is a combination of the tagline and the overview
# Example:
# - tagline = 'Il reprend du service.'
# - overview = 'Arthur Bishop pensait qu'il avait mis son passé de tueur à gages derrière lui. ... etc'
# -> text = 'Il reprend du service. Arthur Bishop pensait qu'il avait mis son passé de tueur à gages derrière lui. ... etc'
data['text'] = 

In [7]:
# Preprocess the 'text' column:
# - transform the text to lowercase
# - remove end of line characters
# - remove accents
# - remove punctuation
# - remove stopwords
# - split the preprocessed text into words (i.e. tokens) and store it into a new column named 'tokens'

## 3.1 LDA

In [None]:
# Create a list of all tokens associated to each movie and store it into the variable data_words
data_words = 

In [None]:
# Create a Dictionary based on all the words per movie
# Hint: explore the gensim library (specifically the 'corpora' section...)

In [None]:
# Create a Corpus for each text associated to each movie

In [None]:
# Define a number of topics
num_topics = 

In [None]:
# Train a LDA model
# Hint: explore the gensim library (specifically the 'models' section...)
lda_model = 

In [146]:
# Print the keywords in the n topics
# Hint: maybe the gensim LDA model has a built-in function to do so...

[(0,
  '0.004*"histoire" + 0.003*"fils" + 0.002*"part" + 0.002*"guerre" + '
  '0.002*"enfants" + 0.002*"amour" + 0.002*"roi" + 0.002*"voyage" + '
  '0.002*"face" + 0.002*"temps"'),
 (1,
  '0.004*"amour" + 0.004*"histoire" + 0.004*"grand" + 0.004*"parents" + '
  '0.003*"nouvelle" + 0.003*"annees" + 0.003*"amis" + 0.003*"passe" + '
  '0.002*"ami" + 0.002*"maison"'),
 (2,
  '0.004*"petit" + 0.003*"grand" + 0.003*"decide" + 0.003*"histoire" + '
  '0.003*"passe" + 0.003*"petite" + 0.003*"fils" + 0.003*"temps" + '
  '0.003*"decouvre" + 0.003*"vient"'),
 (3,
  '0.005*"enfants" + 0.003*"rencontre" + 0.003*"ville" + 0.002*"maison" + '
  '0.002*"amis" + 0.002*"jamais" + 0.002*"aventure" + 0.002*"petit" + '
  '0.002*"parents" + 0.002*"mari"'),
 (4,
  '0.005*"annees" + 0.004*"histoire" + 0.003*"retrouve" + 0.003*"rencontre" + '
  '0.003*"parents" + 0.003*"petit" + 0.003*"temps" + 0.003*"fils" + '
  '0.003*"decide" + 0.003*"grand"'),
 (5,
  '0.005*"ville" + 0.003*"decide" + 0.003*"rencontre" + 0.00

In [147]:
# Visualize the topics
# Hint: use pyLDAvis

In [None]:
# What are your conclusions regarding the topics? (coherence, stability, ...)

## 3.2 To go further on topic modeling: Top2Vec

In [273]:
# Create a new dataset for top2vec model, from the 'data' dataframe, do not take movies with empty text 
# into account
data_top2vec = 

# Make a list from the 'text' column that will contain all texts associated to all movies

In [None]:
# Train a top2vec model
# Hint: use the "speed" argument to make the training faster

In [149]:
# Explore the results part 1: 
# - number of topics found
# - topics sizes
# Hint: check the documentation about top2vec to see what are the attributes of the trained model

Number of topics found: 4
Topic sizes:
Topic  0  - Size:  5596
 ----- 
Topic  1  - Size:  322
 ----- 
Topic  2  - Size:  245
 ----- 
Topic  3  - Size:  232
 ----- 


In [None]:
# What would be your interpretation for each cluster ?

In [None]:
# Topic 0: action
# Topic 1: comédie romantique
# Topic 2: drame
# Topic 3: Court-metrage
# Topic 4: histoire de jeunes, duos
# Topic 5: histoire de famille
# Topic 6: flic, los angeles
# Topic 7: art, culture
# Topic 8: noël
# Topic 9: aventure
# Topic 10: amour

In [155]:
# Save your model for later if you want to explore it in more details
# model.save('top2vec_imdb_bis.pickle')