In [1]:
import numpy as np
import pickle
import dill

In [2]:
def read_sinopsis(txt):
    """Clean a raw synopsis string and return it wrapped in a one-element list.

    Cleaning steps, in order:
      1. Remove every parenthesised span that starts with an uppercase letter
         (presumably actor credits such as "(Henry Fonda)").
      2. Delete commas, single quotes and double quotes.
      3. Turn line breaks into single spaces.
      4. Drop every character that is not in ``string.printable``.

    Args:
        txt: Raw synopsis text.

    Returns:
        list[str]: A list with exactly one element, the cleaned text.
    """
    # Imports are local; presumably so the function stays self-contained when
    # it is serialized with dill and loaded in another session.
    import string
    import re

    # Remove actor credits.
    sinopsis = re.sub(r'\([A-Z].*?\)', '', txt)

    # Remove punctuation that is not wanted downstream.
    for unwanted in (',', '\'', '"'):
        sinopsis = sinopsis.replace(unwanted, '')

    # A line break separates words, so it must become a space; deleting it
    # would glue the last word of one line to the first word of the next.
    sinopsis = sinopsis.replace('\n', ' ')

    # Drop non-printable characters (set lookup instead of scanning the
    # string.printable string for every character).
    printable = set(string.printable)
    sinopsis = ''.join(char for char in sinopsis if char in printable)

    return [sinopsis]

In [3]:
# Serialize read_sinopsis with dill; the context manager guarantees the file
# is flushed and closed instead of leaving the handle to the garbage collector.
with open('read_sinopsis.pkl', 'wb') as pkl_file:
    dill.dump(read_sinopsis, pkl_file)

----

In [4]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim.

    Each item is cast to ``str`` and passed to ``simple_preprocess`` with
    ``deacc=True``. Per the gensim documentation, ``simple_preprocess``
    lowercases and tokenizes the text (punctuation is not kept as tokens) and
    ``deacc=True`` additionally strips accent marks.

    Args:
        sentences: Iterable of texts.

    Yields:
        The token list produced for each item, in input order.
    """
    from gensim.utils import simple_preprocess

    for raw_sentence in sentences:
        tokens = simple_preprocess(str(raw_sentence), deacc=True)
        yield tokens

In [5]:
# Serialize sent_to_words with dill; the context manager closes the file
# deterministically once the dump has been written.
with open('sent_to_words.pkl', 'wb') as pkl_file:
    dill.dump(sent_to_words, pkl_file)

---

In [6]:
import nltk
from nltk.corpus import stopwords

# Load the English stop-word list from the NLTK corpus.
# NOTE: this rebinds the name `stopwords` (imported above as the corpus
# reader) to a plain list of words; later cells rely on that list.
stopwords = nltk.corpus.stopwords.words('english')

# Load first names from names-first.txt (assumes one name per line — confirm),
# dropping the trailing newline and lowercasing each entry. The context
# manager closes the file as soon as it has been read.
with open('names-first.txt', "r") as names_file:
    sw_firstnames = [line.strip('\n').lower() for line in names_file]

# Treat first names as stop words too.
stopwords.extend(sw_firstnames)

In [7]:
# Persist the combined stop-word list; the context manager guarantees the
# file is flushed and closed.
with open('stopwords.pkl', 'wb') as pkl_file:
    pickle.dump(stopwords, pkl_file)

-----

In [8]:
# Helper to remove stop words
def remove_stopwords(texts, stopwords):
    """Re-tokenize each document and drop every token found in ``stopwords``.

    Each document is cast to ``str`` and run through gensim's
    ``simple_preprocess`` before filtering, so a document may be either a
    string or an already-tokenized list (its ``str()`` form is re-tokenized).

    Args:
        texts: Iterable of documents.
        stopwords: Iterable of hashable tokens to remove.

    Returns:
        list[list[str]]: One filtered token list per document, in input order.
    """
    from gensim.utils import simple_preprocess

    # Materialize the stop words once: membership tests become O(1) instead of
    # a linear scan per token, and a one-shot iterable is not exhausted after
    # the first lookup.
    stopword_set = frozenset(stopwords)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stopword_set]
        for doc in texts
    ]

In [9]:
# Serialize remove_stopwords with dill; the context manager closes the file
# deterministically once the dump has been written.
with open('remove_stopwords.pkl', 'wb') as pkl_file:
    dill.dump(remove_stopwords, pkl_file)

----

In [10]:
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'ADV']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    See https://spacy.io/api/annotation for the POS tag set.

    Each document (a sequence of tokens) is joined with spaces, parsed with a
    spaCy English pipeline (parser and NER disabled), and rebuilt from the
    lemmas of the tokens whose ``pos_`` is in ``allowed_postags``. A lemma
    equal to ``'-PRON-'`` is replaced by an empty string.

    Args:
        texts: Iterable of token sequences.
        allowed_postags: Container of POS tags to keep. Defaults to
            ``['NOUN', 'ADJ', 'ADV']``; the default list is only read, never
            mutated.

    Returns:
        list[str]: One space-joined string of lemmas per input document.

    Raises:
        OSError: If neither the ``'en'`` shortcut nor ``'en_core_web_sm'`` can
            be loaded by spaCy.
    """
    import spacy

    try:
        nlp = spacy.load('en', disable=['parser', 'ner'])
    except OSError:
        # The 'en' shortcut link is unavailable (spaCy v3 removed shortcut
        # links); fall back to the full package name of the small English model.
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        lemmas = [
            token.lemma_ if token.lemma_ != '-PRON-' else ''
            for token in doc
            if token.pos_ in allowed_postags
        ]
        texts_out.append(" ".join(lemmas))
    return texts_out

In [11]:
# Serialize lemmatization with dill; the context manager closes the file
# deterministically once the dump has been written.
with open('lemmatization.pkl', 'wb') as pkl_file:
    dill.dump(lemmatization, pkl_file)

---

In [12]:
def document_topic_genre_words(model, model_output, vectorizer, n_words=15): 
    """Summarize the topic distribution of a single document.

    Builds a one-row DataFrame of topic weights (rounded to 2 decimals) for the
    document, labels it with its dominant topic and a hand-assigned genre name,
    and extracts the top keywords of that dominant topic.

    Args:
        model: Topic model exposing ``n_components`` and ``components_``.
        model_output: Topic weights for exactly one document (the row index is
            hard-coded to ``"Sinopsis"``), one column per topic.
        vectorizer: Vectorizer exposing ``get_feature_names_out()`` or the
            legacy ``get_feature_names()``.
        n_words: Number of top keywords to keep per topic.

    Returns:
        tuple: ``(df_topic_relevance, genre, df_dominant_keywords)`` where
        ``df_topic_relevance`` has columns ``Topic0..TopicN``,
        ``dominant_topic`` and ``genre``; ``genre`` is the genre name of the
        dominant topic; ``df_dominant_keywords`` is the one-row DataFrame of
        the dominant topic's keywords (columns ``Word 0..``).

    Raises:
        IndexError: If the dominant topic index is not covered by the
            hard-coded list of 10 genre names.
    """
    # Local imports keep the function self-contained, like the other helpers
    # in this notebook that are serialized with dill.
    import numpy as np
    import pandas as pd

    # Column names and row index.
    topicnames = ["Topic" + str(i) for i in range(model.n_components)]
    docnames = ["Sinopsis"]

    # Topic relevance for the document.
    df_topic_relevance = pd.DataFrame(np.round(model_output, 2), columns=topicnames, index=docnames)

    # Dominant topic of the document. argmax returns a 1-element array; index
    # it explicitly because int() on a non-0-d array is deprecated in NumPy.
    dominant_topic = np.argmax(df_topic_relevance.values, axis=1)
    df_topic_relevance['dominant_topic'] = dominant_topic
    dominant_idx = int(dominant_topic[0])

    # Hand-assigned genre name for each topic index.
    generos = ['historical', 'thriller', 'horror', 'comedy', 'crime', 'drama', 'fantasy', 'fiction', 'romance', 'action']
    genre = generos[dominant_idx]
    df_topic_relevance['genre'] = genre

    # Vocabulary: prefer the current scikit-learn API and fall back to the
    # legacy method name for older vectorizers.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    keywords = np.array(feature_names)

    # Top n_words keywords per topic, highest weight first.
    topic_keywords = [
        keywords.take((-topic_weights).argsort()[:n_words])
        for topic_weights in model.components_
    ]

    # Topic - keywords DataFrame.
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word ' + str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = list(range(df_topic_keywords.shape[0]))

    return df_topic_relevance, genre, df_topic_keywords.iloc[[dominant_idx]]

In [13]:
# Serialize document_topic_genre_words with dill; the context manager closes
# the file deterministically once the dump has been written.
with open('document_topic_genre_words.pkl', 'wb') as pkl_file:
    dill.dump(document_topic_genre_words, pkl_file)

---

In [14]:
def metrics(model, data_vectorized):
    """Return the model's score and perplexity for the given data.

    The first value is ``model.score(data_vectorized)`` (log-likelihood:
    higher is better) and the second is ``model.perplexity(data_vectorized)``
    (lower is better; perplexity = exp(-1 * log-likelihood per word)).

    Args:
        model: Object exposing ``score`` and ``perplexity`` methods.
        data_vectorized: Data forwarded unchanged to both methods.

    Returns:
        tuple: ``(log_likelihood, perplexity)``.
    """
    log_likelihood = model.score(data_vectorized)
    perplexity_value = model.perplexity(data_vectorized)
    return log_likelihood, perplexity_value

In [15]:
# Serialize metrics with dill; the context manager closes the file
# deterministically once the dump has been written.
with open('metrics.pkl', 'wb') as pkl_file:
    dill.dump(metrics, pkl_file)

---

In [16]:
def lda_plot(model, data_vectorized, vectorizer, mds='tsne'):
    """Build a pyLDAvis panel for a topic model and save it as an HTML file.

    Calls ``pyLDAvis.sklearn.prepare`` with the given model, vectorized data
    and vectorizer, then writes the resulting panel to ``lda_pkl.html``
    (a relative path). Nothing is returned.

    Args:
        model: Topic model forwarded to ``pyLDAvis.sklearn.prepare``.
        data_vectorized: Vectorized documents forwarded to ``prepare``.
        vectorizer: Vectorizer forwarded to ``prepare``.
        mds: Value passed through as the ``mds`` argument of ``prepare``
            (default ``'tsne'``).
    """
    # Plotting tools
    import pyLDAvis
    import pyLDAvis.sklearn
    # NOTE(review): `plt` is not referenced again inside this function.
    import matplotlib.pyplot as plt
    # IPython line magic: this body is only valid when the function is defined
    # in an IPython/Jupyter session.
    %matplotlib inline
    # Warnings
    import warnings
    # NOTE(review): installs an "ignore" filter with no category restriction
    # and never restores the previous filters, so it appears to silence
    # warnings for the rest of the session — confirm this is intended.
    warnings.filterwarnings("ignore")

    pyLDAvis.enable_notebook()
    # n_jobs=-1 is forwarded to prepare as-is.
    panel = pyLDAvis.sklearn.prepare(model, data_vectorized, vectorizer, mds=mds, n_jobs=-1)
    pyLDAvis.save_html(panel, 'lda_pkl.html') # Save the chart as HTML

In [17]:
# Serialize lda_plot with dill; the context manager closes the file
# deterministically once the dump has been written.
with open('lda_plot.pkl', 'wb') as pkl_file:
    dill.dump(lda_plot, pkl_file)

---
---
---

In [18]:
def _load_artifact(path, loader=dill):
    """Deserialize one artifact from ``path`` and close the file afterwards.

    Args:
        path: Path of the serialized file.
        loader: Module exposing ``load(file)``; ``dill`` by default, pass
            ``pickle`` for plain pickles.

    Returns:
        The deserialized object.
    """
    # SECURITY: unpickling executes arbitrary code; only load files you trust.
    with open(path, 'rb') as artifact_file:
        return loader.load(artifact_file)


# Helper functions serialized earlier in this notebook.
rs   = _load_artifact('read_sinopsis.pkl')
sw   = _load_artifact('sent_to_words.pkl')
stop = _load_artifact('stopwords.pkl', loader=pickle)
rsw  = _load_artifact('remove_stopwords.pkl')
lm   = _load_artifact('lemmatization.pkl')
# vectorizer.pkl and lda_model.pkl are not written by any cell visible here;
# presumably they come from a separate training step.
vec  = _load_artifact('vectorizer.pkl', loader=pickle)
mod  = _load_artifact('lda_model.pkl', loader=pickle)
dtgw = _load_artifact('document_topic_genre_words.pkl')
met  = _load_artifact('metrics.pkl')
plot = _load_artifact('lda_plot.pkl')

In [19]:
# Read the synopsis file. The context manager closes it; the previous bare
# `f.close` (no parentheses) only referenced the method and never called it.
with open('1957_12_Angry_Men_[Drama].txt', 'r') as f:
    sinopsis = f.read()

# Clean the raw text with the reloaded read_sinopsis helper.
data = rs(sinopsis)
data

['In a New York City courthouse an eighteen-year-old boy from a slum is on trial for allegedly stabbing his father to death. Final closing arguments having been presented a visibly bored judge instructs the jury to decide whether the boy is guilty of murder. If there is any reasonable doubt of his guilt they are to return a verdict of not guilty. The judge further informs them that a guilty verdict will be accompanied by a mandatory death sentence.The jury retires to a private room where the jurors spend a short while getting acquainted before they begin deliberating. It is immediately apparent that the jurors have already decided that the boy is guilty and that they plan to return their verdict without taking time for discussion with the sole exception of Juror 8  who is the only not guilty vote in a preliminary tally. He explains that there is too much at stake for him to go along with the verdict without at least talking about it first. His vote annoys the other jurors especially Ju

In [20]:
# Tokenize with the reloaded helper (`sw`), consistent with the other steps of
# this section, so the cell does not depend on the in-session definition of
# sent_to_words.
data_words = list(sw(data))
print(data_words)

[['in', 'new', 'york', 'city', 'courthouse', 'an', 'eighteen', 'year', 'old', 'boy', 'from', 'slum', 'is', 'on', 'trial', 'for', 'allegedly', 'stabbing', 'his', 'father', 'to', 'death', 'final', 'closing', 'arguments', 'having', 'been', 'presented', 'visibly', 'bored', 'judge', 'instructs', 'the', 'jury', 'to', 'decide', 'whether', 'the', 'boy', 'is', 'guilty', 'of', 'murder', 'if', 'there', 'is', 'any', 'reasonable', 'doubt', 'of', 'his', 'guilt', 'they', 'are', 'to', 'return', 'verdict', 'of', 'not', 'guilty', 'the', 'judge', 'further', 'informs', 'them', 'that', 'guilty', 'verdict', 'will', 'be', 'accompanied', 'by', 'mandatory', 'death', 'sentence', 'the', 'jury', 'retires', 'to', 'private', 'room', 'where', 'the', 'jurors', 'spend', 'short', 'while', 'getting', 'acquainted', 'before', 'they', 'begin', 'deliberating', 'it', 'is', 'immediately', 'apparent', 'that', 'the', 'jurors', 'have', 'already', 'decided', 'that', 'the', 'boy', 'is', 'guilty', 'and', 'that', 'they', 'plan', 'to

In [21]:
# Drop stop words using the reloaded remove_stopwords helper and the reloaded
# stop-word list.
data_words_nonstop = rsw(data_words, stop)
print(data_words_nonstop)

[['new', 'york', 'city', 'courthouse', 'eighteen', 'year', 'old', 'boy', 'slum', 'trial', 'allegedly', 'stabbing', 'father', 'death', 'final', 'closing', 'arguments', 'presented', 'visibly', 'bored', 'judge', 'instructs', 'jury', 'decide', 'whether', 'boy', 'guilty', 'murder', 'reasonable', 'doubt', 'guilt', 'return', 'verdict', 'guilty', 'judge', 'informs', 'guilty', 'verdict', 'accompanied', 'mandatory', 'death', 'sentence', 'jury', 'retires', 'private', 'room', 'jurors', 'spend', 'short', 'getting', 'acquainted', 'begin', 'deliberating', 'immediately', 'apparent', 'jurors', 'already', 'decided', 'boy', 'guilty', 'plan', 'return', 'verdict', 'without', 'taking', 'time', 'discussion', 'exception', 'juror', 'guilty', 'vote', 'preliminary', 'explains', 'much', 'stake', 'go', 'along', 'verdict', 'without', 'least', 'talking', 'first', 'vote', 'annoys', 'jurors', 'especially', 'juror', 'tickets', 'baseball', 'game', 'evening', 'juror', 'believes', 'people', 'slum', 'backgrounds', 'liars',

In [22]:
# Lemmatize with the reloaded helper; 'VERB' is kept here in addition to the
# helper's default tags ('NOUN', 'ADJ', 'ADV').
data_lemmatized = lm(data_words_nonstop, allowed_postags=['NOUN', 'ADJ', 'ADV', 'VERB'])
print(data_lemmatized)

['courthouse year old boy slum trial allegedly stab father death final closing argument present visibly bored instruct jury decide boy guilty murder reasonable doubt guilt return verdict guilty judge inform guilty verdict accompany mandatory death sentence jury retire private room juror spend short get acquaint begin deliberate immediately apparent juror already decide boy guilty plan return verdict take time discussion exception guilty vote preliminary explain much stake go verdict least talk first vote annoy juror especially game evening believe people slum background liar wild dangerous rest focus jurys difficulty reach unanimous verdict several juror harbor personal prejudice maintain evidence present case circumstantial boy deserve fair deliberation call question accuracy reliability witness murder rarity murder weapon common identical copy overall questionable circumstance argue can good conscience vote guilty feel reasonable doubt boy guilt argue several point get favorable resp

In [23]:
# Vectorize the lemmatized text with the vectorizer loaded from vectorizer.pkl
# (presumably already fitted, since only transform is called here).
data_vectorized = vec.transform(data_lemmatized)

In [24]:
# Apply the model loaded from lda_model.pkl to the vectorized document; the
# result is passed to the topic/genre summary helper below.
lda_output = mod.transform(data_vectorized)

In [25]:
# Summarize the document: topic relevance table, genre of the dominant topic,
# and that topic's keywords.
df_relevance, genre, df_topic_keywords = dtgw(mod, lda_output, vec)

In [26]:
# Display the topic relevance table for the document.
df_relevance

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic,genre
Sinopsis,0.0,0.04,0.0,0.18,0.41,0.17,0.0,0.0,0.2,0.0,4,crime


In [27]:
# Display the genre name assigned to the dominant topic.
genre

'crime'

In [28]:
# Display the keywords of the dominant topic.
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
4,police,car,money,gun,man,murder,officer,drug,later,shoot,cop,dead,case,death,phone


In [29]:
# Compute score (log-likelihood) and perplexity of the model on this single
# vectorized document using the reloaded metrics helper.
loglikelihood, perplexity = met(mod, data_vectorized)
print(loglikelihood, perplexity)

-152441.02409168426 5.723486579907784e+105


In [30]:
# Build the pyLDAvis panel with the reloaded lda_plot helper, which saves it
# to 'lda_pkl.html'.
plot(mod, data_vectorized, vec)