In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pickle
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np

In [11]:
# Load the pre-processed corpus artifacts.
# NOTE(review): pickle.load can execute arbitrary code; these files are assumed to be
# artifacts produced by this project, not untrusted input.
def _read_pickle(path):
    """Open ``path`` in binary mode and return the unpickled object."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


tokenized_documents = _read_pickle('files/tokenized_documents.pkl')
lemmatized_documents = _read_pickle('files/lemmatized_documents.pkl')
feature_names = _read_pickle('files/feature_names.pkl')

In [3]:
no_features = 1000


def _vocabulary_of(vectorizer):
    """Return the fitted vectorizer's feature names as a list.

    scikit-learn 1.0 introduced ``get_feature_names_out`` and 1.2 removed
    ``get_feature_names``; call whichever one the installed version provides.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()


# tf-idf for NMF, since it is a mathematical model based on linear algebra.
# The documents are already token lists, so the preprocessor joins each document's
# tokens into a single string before the tf-idf matrix is built.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, preprocessor=' '.join,
                                   max_features=no_features, stop_words='english', lowercase=False)
tfidf = tfidf_vectorizer.fit_transform(lemmatized_documents)
tfidf_feature_names = _vocabulary_of(tfidf_vectorizer)

# Raw term counts for LDA, because it is a probabilistic model.
# Same preprocessor as for tf-idf, since the documents are tokenized.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, preprocessor=' '.join, lowercase=False,
                                 max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(lemmatized_documents)
tf_feature_names = _vocabulary_of(tf_vectorizer)

In [4]:
# Load the pre-trained topic models.
# NOTE(review): pickle.load can execute arbitrary code; these files are assumed to be
# artifacts produced by this project, not untrusted input.

# NMF model (later applied to the tf-idf matrix).
with open('files/nmf.pkl', 'rb') as nmf_file:
    nmf = pickle.load(nmf_file)

# LDA model (later applied to the term-count matrix).
with open('files/lda.pkl', 'rb') as lda_file:
    lda = pickle.load(lda_file)


In [5]:
# Helper that prints the topics each model has produced.
def display_topics(model, feature_names, no_top_words):
    """Print every topic of ``model`` with its highest-weighted words.

    Args:
        model: object exposing ``components_``, an iterable of per-topic weight
            arrays (each must support ``argsort``).
        feature_names: sequence mapping a feature index to its word.
        no_top_words: how many words to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # Indices sorted by descending weight, truncated to the requested count.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_words = [feature_names[position] for position in ranked]
        print("Topic %d:" % (topic_idx))
        print(" ".join(top_words))

In [8]:
no_top_words = 10
# Print the top words per topic, first for NMF (tf-idf vocabulary) and then for
# LDA (term-count vocabulary).
for _heading, _fitted_model, _vocabulary in (
    ("NMF topics:\n", nmf, tfidf_feature_names),
    ("\nLDA topics:\n", lda, tf_feature_names),
):
    print(_heading)
    display_topics(_fitted_model, _vocabulary, no_top_words)

NMF topics:

Topic 0:
like said people dont time think make way good know
Topic 1:
said united government states officials military state islamic american china
Topic 2:
game games team players season points scored coach teams goal
Topic 3:
percent company billion million companies investors business market said year
Topic 4:
trump republican trumps mr campaign republicans donald presidential nominee voters
Topic 5:
graduated dr father university married mother york couple daughter received
Topic 6:
mr said party president redstone interview political office campaign executive
Topic 7:
ms said women woman mother shes family husband female children
Topic 8:
european union britain british europe vote london leave party economic
Topic 9:
art new museum york street city building artists space park
Topic 10:
court judge justice case law federal supreme prosecutors lawyers trial
Topic 11:
police said officers officer shooting department gun killed people city
Topic 12:
yankees mets season ru

In [9]:
# Log likelihood: higher is better.
lda_log_likelihood = lda.score(tf)
# Perplexity: lower is better. Perplexity = exp(-1. * log-likelihood per word)
lda_perplexity = lda.perplexity(tf)

print("Log Likelihood: ", lda_log_likelihood)
print("Perplexity: ", lda_perplexity)
# Model parameters
print(lda.get_params())

Log Likelihood:  -9959773.95389847
Perplexity:  494.94090095110624
{'batch_size': 128, 'doc_topic_prior': None, 'evaluate_every': -1, 'learning_decay': 0.5, 'learning_method': 'online', 'learning_offset': 50.0, 'max_doc_update_iter': 100, 'max_iter': 5, 'mean_change_tol': 0.001, 'n_components': 15, 'n_jobs': None, 'perp_tol': 0.1, 'random_state': 0, 'topic_word_prior': None, 'total_samples': 1000000.0, 'verbose': 0}


In [11]:
# Grid search over LDA hyper-parameters.
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}
# Base estimator for the search. It deliberately gets its own name instead of
# rebinding `lda`: GridSearchCV fits clones and leaves the estimator it is given
# unfitted, so overwriting the pre-trained `lda` loaded earlier would break the
# later cells that call `lda.transform(tf)` and read `lda.components_`.
lda_search_base = LatentDirichletAllocation(max_iter=5, learning_method='online',
                                            learning_offset=50., random_state=0)
# Set up the grid search.
model = GridSearchCV(lda_search_base, param_grid=search_params, verbose=2, n_jobs=5)
# Fit every parameter combination on the term-count matrix.
model.fit(tf)

In [12]:
# Best estimator found by the grid search.
best_lda_model = model.best_estimator_
# Report, in order: the winning parameters, the search's best score (labelled as a
# log-likelihood score) and the perplexity of the best model on the full matrix.
for _label, _value in (
    ("Best Model's Params: ", model.best_params_),
    ("Best Log Likelihood Score: ", model.best_score_),
    ("Model Perplexity: ", best_lda_model.perplexity(tf)),
):
    print(_label, _value)

In [12]:
# Document-topic matrix for LDA: one row per document, one column per topic.
lda_output = lda.transform(tf)
# Column labels, one per LDA topic.
topicnames = ["Topic" + str(i) for i in range(lda.n_components)]
# Row labels are derived from the matrix itself so they always match its row count,
# rather than from a separately loaded list whose length could differ.
docnames = ["Doc" + str(i) for i in range(lda_output.shape[0])]
# Build the DataFrame; weights are rounded to two decimals for display only.
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Take the dominant topic from the unrounded weights: after rounding, near-equal
# weights tie and argmax would silently fall back to the lowest topic index.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
# Style values in green so the dominant topics are easier to spot.
def color_green(val):
    """Return a CSS ``color`` rule: green when ``val`` is above 0.1, black otherwise."""
    return 'color: {col}'.format(col='green' if val > .1 else 'black')
def make_bold(val):
    """Return a CSS ``font-weight`` rule: 700 when ``val`` is above 0.1, 400 otherwise."""
    return 'font-weight: {weight}'.format(weight=700 if val > .1 else 400)
# Apply the styles to the first 15 documents, restricted to the topic-weight columns:
# `dominant_topic` holds a topic index, so the > .1 weight threshold is meaningless there.
df_document_topics = df_document_topic.head(15).style
# pandas 2.1 renamed Styler.applymap to Styler.map; use whichever this version has.
_elementwise = 'map' if hasattr(df_document_topics, 'map') else 'applymap'
for _css_rule in (color_green, make_bold):
    df_document_topics = getattr(df_document_topics, _elementwise)(_css_rule, subset=topicnames)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,dominant_topic
Doc0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
Doc1,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.0,0.08,0.14,0.11,0.0,0.27,0.0,0.27,12
Doc2,0.0,0.2,0.05,0.02,0.0,0.0,0.0,0.01,0.0,0.13,0.0,0.01,0.0,0.0,0.58,14
Doc3,0.0,0.24,0.0,0.0,0.0,0.14,0.47,0.0,0.0,0.12,0.0,0.0,0.0,0.0,0.02,6
Doc4,0.0,0.0,0.23,0.33,0.13,0.25,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,3
Doc5,0.0,0.0,0.0,0.0,0.43,0.43,0.0,0.0,0.0,0.12,0.0,0.0,0.0,0.0,0.0,4
Doc6,0.35,0.0,0.03,0.03,0.0,0.09,0.44,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,6
Doc7,0.0,0.3,0.0,0.03,0.0,0.21,0.0,0.0,0.03,0.0,0.07,0.0,0.28,0.0,0.08,1
Doc8,0.04,0.0,0.0,0.72,0.0,0.22,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,3
Doc9,0.0,0.25,0.0,0.0,0.09,0.0,0.0,0.24,0.0,0.28,0.0,0.0,0.04,0.0,0.11,9


In [13]:
# Same document-topic table, this time for the NMF model on the tf-idf matrix.
nmf_output = nmf.transform(tfidf)
# Column labels, one per NMF topic.
topicnames = ["Topic" + str(i) for i in range(nmf.n_components)]
# Row labels are derived from the matrix itself so they always match its row count,
# rather than from a separately loaded list whose length could differ.
docnames = ["Doc" + str(i) for i in range(nmf_output.shape[0])]
# Build the DataFrame; weights are rounded to two decimals for display only.
df_document_topic = pd.DataFrame(np.round(nmf_output, 2), columns=topicnames, index=docnames)
# Take the dominant topic from the unrounded weights: NMF weights are small, so after
# rounding several topics often tie and argmax would fall back to the lowest index.
dominant_topic = np.argmax(nmf_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
# Style values in green so the dominant topics are easier to spot.
# NOTE(review): this redefines the color_green already defined in the LDA
# document-topic cell with the same behavior.
def color_green(val):
    """Return a CSS ``color`` rule: green when ``val`` is above 0.1, black otherwise."""
    if val > .1:
        return 'color: {col}'.format(col='green')
    return 'color: {col}'.format(col='black')
# NOTE(review): this redefines the make_bold already defined in the LDA
# document-topic cell with the same behavior.
def make_bold(val):
    """Return a CSS ``font-weight`` rule: 700 when ``val`` is above 0.1, 400 otherwise."""
    if val > .1:
        return 'font-weight: {weight}'.format(weight=700)
    return 'font-weight: {weight}'.format(weight=400)
# Apply the styles to the first 15 documents, restricted to the topic-weight columns:
# `dominant_topic` holds a topic index, so the > .1 weight threshold is meaningless there.
df_document_topics = df_document_topic.head(15).style
# pandas 2.1 renamed Styler.applymap to Styler.map; use whichever this version has.
_elementwise = 'map' if hasattr(df_document_topics, 'map') else 'applymap'
for _css_rule in (color_green, make_bold):
    df_document_topics = getattr(df_document_topics, _elementwise)(_css_rule, subset=topicnames)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,dominant_topic
Doc0,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,12
Doc1,0.0,0.0,0.0,0.01,0.0,0.0,0.03,0.04,0.0,0.01,0.02,0.05,0.0,0.01,0.0,11
Doc2,0.0,0.01,0.0,0.0,0.0,0.0,0.06,0.0,0.0,0.03,0.0,0.05,0.0,0.01,0.0,6
Doc3,0.02,0.0,0.0,0.03,0.0,0.0,0.01,0.01,0.0,0.02,0.0,0.01,0.0,0.0,0.0,3
Doc4,0.03,0.04,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
Doc5,0.04,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Doc6,0.02,0.0,0.0,0.05,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
Doc7,0.03,0.03,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.03,0.01,0.01,0.0,0.0,0.0,0
Doc8,0.04,0.01,0.06,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0,2
Doc9,0.0,0.02,0.01,0.0,0.01,0.01,0.05,0.0,0.0,0.03,0.0,0.01,0.0,0.0,0.01,6


In [14]:
# Topic-word matrix: one row per LDA topic, one column per vocabulary feature.
df_topic_keywords = pd.DataFrame(lda.components_)
# Columns are labelled with the pickled feature names.
# NOTE(review): presumably this is the same vocabulary, in the same order, that `lda`
# was trained on; confirm against the training code.
df_topic_keywords.columns = feature_names
# Rows are labelled from the LDA model itself. The shared `topicnames` variable was
# last rebuilt from the NMF model, so it only matches when both models happen to
# have the same number of topics.
df_topic_keywords.index = ["Topic" + str(i) for i in range(lda.components_.shape[0])]
# Show the first 5 topics.
df_topic_keywords.head()

Unnamed: 0,_____,ability,able,access,according,account,accused,act,action,actually,...,written,wrong,wrote,yankees,year,yearold,years,york,young,youre
Topic0,0.066667,43.7832,44.707071,0.710417,595.13311,1.523547,0.066667,1.084235,26.136708,0.442474,...,0.066729,0.066691,2.012129,0.066667,2139.470914,0.066667,524.001906,341.8389,0.066727,1.640066
Topic1,0.066667,0.066667,23.951247,70.167332,284.912952,0.066667,0.066667,0.066667,0.066667,0.070462,...,0.066667,0.066667,0.067607,0.066667,708.919959,61.55855,1034.747961,3076.258527,49.220272,0.066667
Topic2,0.066667,86.892114,104.262491,173.864847,470.587083,69.746967,169.74393,26.923575,152.8038,0.066667,...,6.76044,0.066667,84.682779,0.066667,698.205056,31.354826,695.381747,6.555391,56.171568,0.066667
Topic3,0.066667,75.815524,258.494466,0.066667,79.660457,6.09366,0.06667,13.761367,20.448813,34.190499,...,0.066667,41.361981,3.472349,1287.805822,960.694958,88.597406,698.647649,67.127182,222.680753,158.262106
Topic4,0.066667,45.052703,87.024193,0.066667,0.066667,0.066667,0.066667,0.10385,12.323469,3.281715,...,0.066667,8.555874,0.066667,0.066667,498.875335,162.406291,347.670248,0.066667,11.031696,17.41101


In [15]:
def show_topics(vectorizer, lda_model, n_words):
    """Return the ``n_words`` highest-weighted words of every topic.

    Args:
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()``
            (scikit-learn >= 1.0) or the older ``get_feature_names()``.
        lda_model: object exposing ``components_``, an iterable of per-topic
            weight arrays.
        n_words: number of words to keep per topic.

    Returns:
        A list with one array of words per topic, ordered by descending weight.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # and fall back so older installations keep working.
    names_getter = getattr(vectorizer, 'get_feature_names_out', None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    keywords = np.array(names_getter())

    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negate so that an ascending argsort yields the largest weights first.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
        
# Ten most representative words of each LDA topic.
topic_keywords = show_topics(tf_vectorizer, lda, 10)

# One row per topic, one column per word rank.
df_topic_keywords = pd.DataFrame(topic_keywords)
_topic_count, _word_count = df_topic_keywords.shape
df_topic_keywords.columns = ['Word '+str(rank) for rank in range(_word_count)]
df_topic_keywords.index = ['Topic '+str(topic) for topic in range(_topic_count)]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,percent,million,company,said,year,billion,oil,companies,business,chief
Topic 1,new,city,york,art,street,said,building,park,museum,water
Topic 2,said,united,states,government,american,officials,state,military,president,china
Topic 3,said,season,team,players,game,league,games,yankees,teams,hit
Topic 4,game,said,series,points,scored,second,win,play,goal,games
Topic 5,like,new,people,time,dont,way,think,make,good,work
Topic 6,said,company,companies,like,new,facebook,media,people,technology,online
Topic 7,trump,mr,clinton,campaign,republican,mrs,voters,sanders,democratic,senator
Topic 8,ms,said,school,students,family,children,mother,father,university,life
Topic 9,mr,said,president,house,obama,interview,people,time,like,years


In [17]:
# Add a column with the name assigned by hand to each topic, based on its top words.
# The list is positional: entry i labels the row of topic i.
Topics = [
    "Empresas",
    "Ciudades",
    "Gobierno",
    "Deportes",
    "Deportes",
    "Otros",
    "Empresas Tech",
    "Política",
    "Familia",
    "Presidente",
    "Ley",
    "Economía",
    "Salud",
    "Europa",
    "Policía",
]
df_topic_keywords["Topics"] = Topics

# Save the table to disk; per the original note, the program's main class reads this
# file to assign topics.
with open('files/df_topic_keywords.pkl', 'wb') as out_file:
    pickle.dump(df_topic_keywords, out_file)
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Topics
Topic 0,percent,million,company,said,year,billion,oil,companies,business,chief,Empresas
Topic 1,new,city,york,art,street,said,building,park,museum,water,Ciudades
Topic 2,said,united,states,government,american,officials,state,military,president,china,Gobierno
Topic 3,said,season,team,players,game,league,games,yankees,teams,hit,Deportes
Topic 4,game,said,series,points,scored,second,win,play,goal,games,Deportes
Topic 5,like,new,people,time,dont,way,think,make,good,work,Otros
Topic 6,said,company,companies,like,new,facebook,media,people,technology,online,Empresas Tech
Topic 7,trump,mr,clinton,campaign,republican,mrs,voters,sanders,democratic,senator,Política
Topic 8,ms,said,school,students,family,children,mother,father,university,life,Familia
Topic 9,mr,said,president,house,obama,interview,people,time,like,years,Presidente
