## LDA and Gabriel García Márquez
 - A transgression to the literary world

In [None]:
# Cell #1: Import requirements and declare some helper functions, 
# yeah I know what you're thinking... this could be part of a library

import urllib
import json
import string
from IPython.display import IFrame
from IPython.core.display import display, HTML

# NLP libraries
import spacy
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# spaCy is used below for sentence splitting and lemmas only, so its other
# components are disabled to speed things up.
nlp = spacy.load('es', disable=['tagger', 'ner'])

# The novel is Spanish text with accented characters, so decode it explicitly
# instead of relying on the platform's default encoding
# (assumes the file is stored as UTF-8 -- TODO confirm).
# Line breaks are replaced with a space rather than removed, so the last word
# of one line is not glued to the first word of the next one. The text is then
# split into chunks on the '----' marker.
with open('../data/100_años_de_soledad.txt', 'r', encoding='utf-8') as file:
    corpus = file.read().replace('\n', ' ').split('----')

# Helper function
def flatten(top_list):
    """Lazily yield the leaf items of a nested list/tuple structure.

    Only ``list`` and ``tuple`` instances are descended into; any other value
    (strings included) is yielded unchanged. Items come out depth-first, in
    their original left-to-right order.
    """
    for item in top_list:
        if not isinstance(item, (list, tuple)):
            yield item
            continue
        # Nested container: delegate to a recursive generator.
        yield from flatten(item)

# Function to clean up the documents; lemmatizes words to their base form.
def clean_sentences(doc):
    """Split a raw text into sentences and reduce each one to its content lemmas.

    Args:
        doc: Raw text (str); it is parsed with the module-level spaCy
            pipeline ``nlp``.

    Returns:
        A list with one entry per sentence found by spaCy. Each entry is a
        list of stripped, lower-cased lemma strings from which punctuation,
        whitespace, the '-pron-' placeholder lemma, stop words and
        single-character tokens have been removed. An entry may be empty.
    """
    stop_words = nlp.Defaults.stop_words
    processed_sentences = []
    for sentence in nlp(doc).sents:
        lemmas = []
        for token in sentence:
            # `lemma_ in string.punctuation` is a substring test against ASCII
            # punctuation only, so on its own it lets multi-character marks
            # such as '...' or '--' through; token.is_punct covers those.
            if token.is_punct or token.is_space or token.lemma_ in string.punctuation:
                continue
            lemma = token.lemma_.strip().lower()
            if lemma == '-pron-' or len(lemma) <= 1 or lemma in stop_words:
                continue
            lemmas.append(lemma)
        processed_sentences.append(lemmas)
    return processed_sentences

In [None]:
# Cell #2: clean every page of the corpus and train the bigram phraser

# Initialised here but not referenced by any of the other visible cells.
gensim_unigram_documents = []

# document_list holds one record per page: 'text' is the list of cleaned
# sentences, 'bigrams' starts out empty.
document_list = [{'text': clean_sentences(page), 'bigrams': ''} for page in corpus]

# All cleaned sentences of all pages in one flat list: the training input
# for the bigram phraser.
unigram_sentences = [sentence for record in document_list for sentence in record['text']]

bigram_model = Phrases(unigram_sentences)
bigram_phraser = Phraser(bigram_model)


In [None]:
# Cell #3: run the bigram phraser over every page

# bigram_corpus gets one flat token list per page, produced from the phraser output
bigram_corpus = []

for doc in document_list:
    # Every sentence becomes one space-separated string of phraser output tokens.
    doc['bigrams'] = [' '.join(bigram_phraser[sentence]) for sentence in doc['text']]
    # Join the page's sentences and split on whitespace to get the page's token list.
    bigram_corpus.append(' '.join(doc['bigrams']).split())

# Print one page: 'text' holds the cleaned unigram sentences,
# 'bigrams' the same sentences after the phraser was applied.
print(document_list[4])



### Building the LDA model using Gensim, a library for topic modeling. The output is a list of the topics present in our corpus.

In [None]:
# Cell 4: Using GENSIM to do topic modelling, this cell takes some time... hang on.

# num_passes should be adjusted; 10 is just a guesstimate of when convergence will be achieved.
num_passes = 10
num_topics = 13
words_per_topic = 7
num_workers = 8
# Fixed seed so that a re-run of this cell starts training from the same state.
# NOTE(review): with several workers the result may still vary slightly between runs -- confirm.
random_seed = 100
print_topics = False
filename = 'topics' + str(num_topics) + '.html'


# Vocabulary over the bigram corpus; tokens that appear in fewer than 2 pages
# or in more than 80% of the pages are dropped.
dictionary = corpora.Dictionary(bigram_corpus)
dictionary.filter_extremes(no_below=2, no_above=0.8)
lda_corpus = [dictionary.doc2bow(text) for text in bigram_corpus]

lda_model = LdaMulticore(lda_corpus,
                         num_topics=num_topics,
                         id2word=dictionary,
                         passes=num_passes,
                         workers=num_workers,
                         random_state=random_seed
                        )

topics = lda_model.print_topics(num_topics=num_topics, num_words=words_per_topic)
if print_topics:
    print ("Topic List: \n")
    for topic_id, topic_words in topics:
        # Shown 1-based; print the topic string itself rather than a one-element tuple.
        print('Topic ' + str(int(topic_id) + 1) + ': ', topic_words)

import warnings
warnings.filterwarnings('ignore')

import pyLDAvis.gensim
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()
ldaviz = pyLDAvis.gensim.prepare(corpus=lda_corpus,
                        topic_model=lda_model,
                        dictionary=dictionary,
                        sort_topics=False)

# Write the standalone HTML file before showing the link that points to it.
pyLDAvis.save_html(ldaviz, filename)

print ("\nPyLDAVis: \n")
print('link to file: ')
display(HTML('<a href="{}" target="_blank">PyLDAviz</a> '.format(filename)))
pyLDAvis.display(ldaviz)


### Now that we have a trained model we can classify a new unseen document.

In [None]:
# Cell 5: Score a document the GENSIM model has not seen before

# Excerpt from another García Márquez book, Love in the Time of Cholera

unseen_document = """
Lo más absurdo de la situación de ambos era que nunca parecieron tan felices en público 
como en aquellos años de infortunio. Pues en realidad fueron los años de sus victorias mayores 
sobre la hostilidad soterrada de un medio que no se resignaba a admitirlos como eran: distintos y novedosos, 
y por tanto transgresores del orden tradicional.
 """

# Same cleaning as the training pages, flattened to a single token list.
unseen_sentences = clean_sentences(unseen_document)
parsed_doc = list(flatten(unseen_sentences))

# Bag-of-words vector over the training dictionary, then the model's topic weights for it.
vec = dictionary.doc2bow(parsed_doc)

# Topic ids are shifted by one so they are reported 1-based.
predicted_topics = [(topic_id + 1, weight) for topic_id, weight in lda_model[vec]]
print(predicted_topics)

## Now let's try some Scikit-Learn

In [None]:
# Cell 6: Building LDA with scikit-learn

from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Here every bigram-processed sentence (a space-separated string) is one document.
scikit_corpus = [sent for doc in document_list for sent in doc['bigrams']]


vectorizer = CountVectorizer(analyzer='word',
                             min_df=10,                        # keep tokens found in at least 10 documents
                             lowercase=True,                   # convert all words to lowercase
                             # Tokens of 3+ word characters. \w matches accented
                             # letters and '_', so Spanish words are not cut at
                             # 'á'/'ñ' and '_'-joined tokens stay in one piece
                             # (an ASCII-only [a-zA-Z0-9] class breaks both).
                             token_pattern=r'(?u)\b\w{3,}\b',
                             max_features=50000,               # max number of unique words
                            )

data_vectorized = vectorizer.fit_transform(scikit_corpus)

# Build LDA Model
# NOTE: this rebinds `lda_model`, which until now referred to the gensim model
# from Cell 4; re-run Cell 4 before re-running Cell 5.
lda_model = LatentDirichletAllocation(n_components=10,           # Number of topics (formerly `n_topics`)
                                      max_iter=10,               # Max learning iterations
                                      learning_method='online',
                                      random_state=100,          # Random state
                                      batch_size=128,            # n docs in each learning iter
                                      evaluate_every = -1,       # compute perplexity every n iters, default: Don't
                                      n_jobs = -1,               # Use all available CPUs
                                     )
# Document-topic matrix: one row per sentence, one column per topic.
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

In [None]:
# Cell 7: Plotting our model using Singular Value Decomposition

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
%matplotlib inline
clusters = KMeans(n_clusters=13, random_state=100).fit_predict(lda_output)

# Build the Singular Value Decomposition(SVD) model
svd_model = TruncatedSVD(n_components=2)  # 2 components
lda_output_svd = svd_model.fit_transform(lda_output)

# X and Y axes of the plot using SVD decomposition
x = lda_output_svd[:, 0]
y = lda_output_svd[:, 1]


# Plot
plt.figure(figsize=(12, 12))
plt.scatter(x, y, c=clusters)
plt.ylabel('Component 2')
plt.xlabel('Component 1')
plt.title("Segregation of Topic Clusters of Gabriel García Marquéz's 100 Years of solitude", )