In [None]:
import json
import pandas as pd 
import re
import os

from wordcloud import WordCloud

import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import gensim.corpora as corpora
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

from pprint import pprint

import pickle 
import pyLDAvis
import pyLDAvis.gensim_models

import matplotlib.pyplot as plt
import numpy as np




import networkx as nx

In [None]:
def load_data(name):
    """Load a JSON file into a DataFrame.

    Args:
        name: path of the JSON file to read.

    Returns:
        pandas.DataFrame built from the parsed JSON and then transposed, so
        for a dict-of-dicts document the top-level keys become the row index.
    """
    # The context manager closes the file handle even when parsing fails
    with open(name) as f:
        data = json.load(f)
    return pd.DataFrame(data).transpose()

In [None]:
def preprocess_data_1(df_data):
    """Remove basic punctuation and convert to lowercase.

    Args:
        df_data: pandas Series of texts (every element must be a string).

    Returns:
        A new Series where the characters , . ! ? / : ; have been removed
        from each text and the remaining text is lowercased.
    """
    # Raw string so that '\.' reaches the regex engine untouched instead of
    # being parsed as an (invalid) string escape sequence
    punctuation = re.compile(r'[,\.!?/:;]')

    # Strip punctuation and lowercase in a single pass over the Series
    return df_data.map(lambda text: punctuation.sub('', text).lower())


In [None]:
def sentence2word(df_data):
    """Remove stop words, tokenize and return a list of lists of words

    Args:
        df_data: iterable of documents (e.g. a Series of texts); each item is
            converted with str() before being tokenized.

    Returns:
        list of list: the tokens of each text as produced by gensim's
        simple_preprocess, without English stop words and a few extra terms
    """
    # A set gives constant-time membership tests inside the nested loop below
    stop_words = set(stopwords.words('english'))
    stop_words.update(['from', 'subject', 're', 'edu', 'use','test'])

    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_words]
        for doc in df_data
    ]

# simple_preprocess is a function from gensim.utils.
# It converts a document into a list of tokens:
# it lowercases, tokenizes and (optionally) de-accents the text.
# The output is a list of final tokens (unicode strings) that are not processed any further.

In [None]:
# Build the path with os.path.join so the notebook also runs outside Windows
# (a literal backslash is not a path separator on Linux/macOS)
data = preprocess_data_1(
    load_data(os.path.join('data', 'navigation_data_cloud.txt'))['video_description']
)
data2word = sentence2word(data)


In [None]:
def create_cloud(list_list):
    """
    Build a word cloud from tokenized documents.

    Args:
        list_list: list of lists of words (one inner list per document).

    Returns:
        The WordCloud object after generate() has been called on all the words.
    """
    # Flatten every document into one comma-separated string
    all_words = []
    for row in list_list:
        all_words.extend(row)
    text = ','.join(all_words)

    cloud = WordCloud(
        background_color="white",
        max_words=5000,
        contour_width=3,
        contour_color='steelblue',
    )
    cloud.generate(text)
    return cloud

In [None]:
# Build the word cloud from the tokenized descriptions
w = create_cloud(data2word)
# Last expression of the cell: its value is rendered as the cell output
w.to_image()

The word cloud is a visual representation of the most common words. It is key to understanding the data, checking that we are on the right track, and deciding whether any more preprocessing is necessary before training the model.

## LSA Analysis

Next, let’s transform the textual data into a format that can serve as input for training the topic models (LSA in this section, LDA further below). We start by tokenizing the text, removing stop words and stemming. Next, we convert the tokenized documents into a dictionary and a corpus.

In [None]:
def preprocess_data(doc_set,lang = 'English'):
    """
    Input  : doc_set : document list (iterable of strings)
             lang    : kept for backward compatibility; currently unused, English
                       and French stop words are always both removed
    Purpose: preprocess text (lowercasing, tokenizing, removing stopwords, and stemming)
    Output : list with one numpy array of stemmed tokens (strings) per document
    """
    # initialize regex tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # English stop words plus corpus-specific noise (URL fragments, platform names, ...)
    stop_words_en = stopwords.words('english')
    stop_words_en.extend(['from', 'subject', 're', 'edu', 'use','test','http','by','the','to','and','instagram','tiktok','ly','bit','com','www','at'])

    # French stop words plus a few extra terms
    stop_words_fr = stopwords.words('french')
    stop_words_fr.extend(['a','http','ly','fait'])

    # One combined set: a token is dropped when it is a stop word in either
    # language. Filtering against a single set also guarantees that the
    # English filter cannot be silently overwritten by the French one.
    stop_words = set(stop_words_en) | set(stop_words_fr)

    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    texts = []
    for doc in doc_set:
        # clean and tokenize document string
        tokens = tokenizer.tokenize(doc.lower())

        # remove stop words before stemming, so a stop word whose stem is not
        # itself in the stop list is still removed
        kept_tokens = [token for token in tokens if token not in stop_words]

        # stem tokens
        stems = [p_stemmer.stem(token) for token in kept_tokens]

        # filter once more after stemming: a stem may itself be a listed stop
        # word; empty strings are discarded as well
        stems = [stem for stem in stems if stem and stem not in stop_words]

        # dtype=str keeps a document without any remaining token as an empty
        # *string* array (np.array([]) alone would be a float array)
        texts.append(np.array(stems, dtype=str))
    return texts

In [None]:
def prepare_corpus(doc_clean):
    """
    Input  : doc_clean, the cleaned documents (one sequence of tokens per document)
    Purpose: build the term dictionary of the corpus and encode every document
             with it to obtain the Document Term Matrix
    Output : tuple (term dictionary, Document Term Matrix)
    """
    # Term dictionary of the corpus: every unique term is assigned an index
    dictionary = corpora.Dictionary(doc_clean)
    # Document Term Matrix: one doc2bow encoding per document, using the dictionary above
    doc_term_matrix = list(map(dictionary.doc2bow, doc_clean))
    return dictionary,doc_term_matrix


In [None]:
def create_gensim_lsa_model(doc_clean,number_of_topics,words):
    """
    Input  : doc_clean (cleaned documents), number_of_topics (topics to extract)
             and words (number of words displayed for each topic)
    Purpose: train an LSA model with gensim's LsiModel and print its topics
    Output : the trained LSA model
    """
    id2word, bow_corpus = prepare_corpus(doc_clean)
    # train model
    lsamodel = gensim.models.LsiModel(
        bow_corpus,
        num_topics=number_of_topics,
        id2word=id2word,
    )
    topics = lsamodel.print_topics(num_topics=number_of_topics, num_words=words)
    print(topics)
    return lsamodel

In [None]:
def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
    """
    Input   : dictionary : Gensim dictionary
              doc_term_matrix : Gensim corpus (bag-of-words documents)
              doc_clean : List of tokenized input texts
              stop : upper bound (exclusive) of the number of topics
              start : first number of topics to try
              step : increment between two tried numbers of topics
    purpose : Compute c_v coherence for each number of topics in range(start, stop, step)
    Output  : model_list : List of LSA topic models
              coherence_values : Coherence value of each LSA model, in the same order
    """
    model_list = []
    coherence_values = []
    for topic_count in range(start, stop, step):
        # train one LSA model for this number of topics
        lsa = gensim.models.LsiModel(doc_term_matrix, num_topics=topic_count, id2word=dictionary)
        # score it with the c_v coherence measure
        scorer = gensim.models.CoherenceModel(model=lsa, texts=doc_clean, dictionary=dictionary, coherence='c_v')
        model_list.append(lsa)
        coherence_values.append(scorer.get_coherence())
    return model_list, coherence_values

In [None]:
def plot_graph(doc_clean,start, stop, step):
    """Plot the coherence score against the number of topics and pick the best model.

    Args:
        doc_clean: cleaned (tokenized) documents.
        start, stop, step: the numbers of topics tried are range(start, stop, step).

    Returns:
        tuple (number of topics with the highest coherence, corresponding model)

    Raises:
        ValueError: if range(start, stop, step) is empty.
    """
    x = range(start, stop, step)
    if len(x) == 0:
        # Fail before any training/plotting; argmax of an empty list would
        # raise the same exception type later with a less helpful message
        raise ValueError("range(start, stop, step) is empty: no number of topics to evaluate")

    dictionary,doc_term_matrix=prepare_corpus(doc_clean)
    model_list, coherence_values = compute_coherence_values(dictionary, doc_term_matrix,doc_clean,
                                                            stop, start, step)
    # Show graph
    # The label is attached to the line itself: passing a bare string to
    # legend() would be iterated character by character and label the curve "c"
    plt.plot(x, coherence_values, label="coherence_values")
    plt.xlabel("Number of Topics")
    plt.ylabel("Coherence score")
    plt.legend(loc='best')
    plt.show()

    # Index of the highest coherence score, computed once
    best = int(np.argmax(coherence_values))
    return x[best],model_list[best]



In [None]:
# os.path.join keeps the data path portable (a literal backslash only works on Windows)
doc_clean = preprocess_data(
    load_data(os.path.join('data', 'navigation_data_cloud.txt'))['video_description'].values
)

# Numbers of topics to evaluate: range(start, stop, step)
start,stop,step=2,6,1
number_of_topics_opt,model_opt = plot_graph(doc_clean,start,stop,step)

In [None]:
# LSA Model
# Number of words displayed for each topic
words=10
# Topics of the model returned by plot_graph; the result is the cell output
model_opt.print_topics(num_topics=number_of_topics_opt, num_words=words)
# Alternative kept for reference (disabled): train a fresh model with create_gensim_lsa_model
# clean_text=preprocess_data(df_data['video_description_processed'])
# model=create_gensim_lsa_model(clean_text,number_of_topics_opt,words)

## LDA

In [None]:
# os.path.join keeps the data path portable (a literal backslash only works on Windows)
preprocessed_data = preprocess_data(
    load_data(os.path.join('data', 'navigation_data_cloud.txt'))['video_description'].values
)
dictionary,doc_term_matrix = prepare_corpus(preprocessed_data)


In [None]:
# preprocessed_data[5]

In [None]:
# number of topics: reuse the value selected with the LSA coherence scores
num_topics = number_of_topics_opt
# Build LDA model
# LDA training is randomized; a fixed seed makes re-runs comparable
# NOTE(review): with several workers, results may still vary slightly between runs - confirm
lda_model = gensim.models.LdaMulticore(corpus=doc_term_matrix,
                                       id2word=dictionary,
                                       num_topics=num_topics,
                                       random_state=42)
# Print the keywords of the topics
pprint(lda_model.print_topics())
# Apply the trained model to the whole corpus
doc_lda = lda_model[doc_term_matrix]

Now that we have a trained model, let’s visualize the topics for interpretability. To do so, we’ll use a popular visualization package, pyLDAvis, which is designed to help interactively with:

1. Better understanding and interpreting individual topics, and
2. Better understanding the relationships between the topics.

For (1), you can manually select each topic to view its top most frequent and/or “relevant” terms, using different values of the λ parameter. This can help when you’re trying to assign a human interpretable name or “meaning” to each topic.

For (2), exploring the Intertopic Distance Plot can help you learn about how topics relate to each other, including potential higher-level structure between groups of topics.

In [None]:
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = 'prepared_' + str(num_topics)

# Preparing the visualization is a bit time consuming: set this flag to False
# to reuse the data pickled by a previous run instead of recomputing it.
# Caution: the cache file name only depends on num_topics, so a cached file is
# stale as soon as the model was retrained with the same number of topics.
RECOMPUTE_LDAVIS = True
if RECOMPUTE_LDAVIS:
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, doc_term_matrix, dictionary)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk
# NOTE: only unpickle files written by this notebook - pickle.load can execute
# arbitrary code from an untrusted file
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')
# Last expression of the cell: displayed as the cell output
LDAvis_prepared