In [1]:
import os 
import json as js
import pandas as pd
import re # El paquete para tratar texto. Expresiones regulares
from sklearn.feature_extraction.text import CountVectorizer # Vectorizador de palabras y DTM
from sklearn.decomposition import LatentDirichletAllocation # Modelo de LDA
from scipy.sparse import csr_matrix # Para tratar Sparse Matrix
import numpy as np
from wordcloud import WordCloud #importo la función
import time
import datetime
# LDA, tSNE
from sklearn.manifold import TSNE
from bokeh.palettes import all_palettes
from sklearn.decomposition import PCA
from matplotlib import cm

<matplotlib.colors.ListedColormap at 0x222262e3c18>

# LDA Model
Latent Dirichlet Allocation (LDA) is a generative statistical model that allows sets of observations to be explained by unobserved groups, which in turn explains why some of the observations are similar to one another.

## Process 

### Pre Process Data

- Process the documents that will be used, get them all in a standard form
- Eliminate very short articles 
- Beware that some metadata or unicode characters could slip through.

#### Words

Since LDA uses a bag-of-words methodology, it is very important to have the right words in the documents. We want to eliminate very common words, since they might not carry any special meaning, as well as words that are only used once. We could also check for different things or focus on different aspects:

- **wordlength** (number of letters in the word) or (n-gram)
- **Stoplist** (words that don't add much meaning to the document, such as general terms)
- **Lemmatization** (treat all the different conjugations of the same verb as the same word)
- **Parts of Speech**:
    + Noun
    + Verb
    + Adverb
    + Adjective
    + Preposition
    
### The Model

The following link gives a very good description of the intuition behind the model and what it tries to do:
https://towardsdatascience.com/lda-topic-modeling-an-explanation-e184c90aadcd




In [2]:
def topics_word_distribution(model, count_vectorizer):
    """Return the topic-word weight matrix of a fitted LDA model as a DataFrame.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``, a 2-D array with one row per
        topic and one column per vocabulary term.
    count_vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or, on older
        scikit-learn releases, ``get_feature_names()``.

    Returns
    -------
    pandas.DataFrame
        Rows are labelled "Topic #0", "Topic #1", ...; columns are the
        vocabulary terms. Values are taken from ``model.components_`` as-is:
        no normalisation is applied here, so a row only sums to 1 if the
        model already stores it that way.
    """
    num_topics, _num_words = model.components_.shape
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # accessor and fall back to the legacy one so both versions work.
    get_names = getattr(count_vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = count_vectorizer.get_feature_names
    topicos = ["Topic #" + str(i) for i in range(num_topics)]
    return pd.DataFrame(model.components_, index=topicos, columns=list(get_names()))

In [3]:
def most_important_words_per_topic(model, count_vectorizer, n_words):
    """Return the ``n_words`` highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_`` (one row per topic, one column
        per vocabulary term).
    count_vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or the legacy
        ``get_feature_names()``.
    n_words : int
        Number of terms wanted per topic.

    Returns
    -------
    pandas.DataFrame
        One row per topic ("Topic #0", ...) and columns "Word1".."Word<n_words>".
        "Word1" holds the term with the largest weight, "Word2" the next one,
        and so on. If ``n_words`` exceeds the vocabulary size the remaining
        cells are filled with " ".
    """
    num_topics, num_words = model.components_.shape
    print(num_topics)  # progress output preserved from the original implementation

    # get_feature_names() was removed in scikit-learn 1.2; support both APIs.
    get_names = getattr(count_vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = count_vectorizer.get_feature_names
    words = list(get_names())

    topicos = ["Topic #" + str(i) for i in range(num_topics)]
    columnas = ["Word" + str(i) for i in range(1, n_words + 1)]

    n_top = min(n_words, num_words)
    filas = []
    for pesos in model.components_:
        # argsort is ascending: reverse it and keep the first n_top indices so
        # the single most important word is included and comes first.
        mejores = pesos.argsort()[::-1][:n_top]
        fila = [words[j] for j in mejores]
        fila.extend([" "] * (n_words - n_top))  # pad when the vocabulary is too small
        filas.append(fila)
    return pd.DataFrame(filas, index=topicos, columns=columnas)

In [4]:
def lda_model(date_init, date_end, k_topics, n_vocab=350, data_dir=None):
    """Fit an LDA topic model on the lemmatized news files dated within a period.

    Parameters
    ----------
    date_init, date_end : anything accepted by ``pandas.to_datetime``
        Inclusive bounds of the period; a file is used when the date parsed
        from its name lies between them.
    k_topics : int
        Number of topics (``n_components`` of the LDA model).
    n_vocab : int, default 350
        Maximum vocabulary size (``max_features`` of the CountVectorizer).
    data_dir : str, optional
        Folder containing the ``process_text`` sub-folder and
        ``spanish_stopwords.json``. Defaults to the location that used to be
        hard-coded in this function.

    Returns
    -------
    tuple
        ``(lda_fit, lda_output, df, tf_vectorizer)``: the fitted LDA model,
        a DataFrame with the document-topic matrix (rows "doc<i>", columns
        "topic <j>"), the concatenated articles DataFrame and the fitted
        CountVectorizer.

    Raises
    ------
    ValueError
        If no file falls inside the requested period.
    """
    if data_dir is None:
        data_dir = 'C:\\Users\\usuario\\Desktop\\Universidad\\Proyectos Personales' + "\\Text Mining"
    process_dir = os.path.join(data_dir, "process_text")

    # Keep only the files whose name contains an "E" followed by at least one
    # more character.
    file_names = [name for name in os.listdir(process_dir) if re.search("E.+", name)]
    lemmatized_files = pd.DataFrame({"lemmatized_files": file_names})
    # The first run of digits/hyphens in each name is parsed as the file date.
    lemmatized_files_dates = pd.to_datetime(
        [re.findall("[0-9-]+", name)[0] for name in file_names], format="%Y-%m-%d"
    )
    lemmatized_files["dates"] = lemmatized_files_dates
    date_init = pd.to_datetime(date_init)
    date_end = pd.to_datetime(date_end)

    in_period = (date_init <= lemmatized_files_dates) & (date_end >= lemmatized_files_dates)
    selected_files = lemmatized_files.loc[in_period, "lemmatized_files"]
    if selected_files.empty:
        raise ValueError(
            "No lemmatized files found between {} and {} in {}".format(date_init, date_end, process_dir)
        )

    # Read every selected file once and concatenate in a single step instead
    # of growing the DataFrame with DataFrame.append (quadratic, removed in
    # pandas 2.0).
    frames = []
    for file in selected_files:
        with open(os.path.join(process_dir, file), "r", encoding="utf-8") as handle:
            frames.append(pd.DataFrame(js.load(handle)))
    df = pd.concat(frames, sort=False)
    # Same column layout the original empty template produced: the five
    # expected columns first, then any extra keys found in the files.
    base_columns = ["categoria", "contenido", "link", "titulo", "process_time"]
    extra_columns = [col for col in df.columns if col not in base_columns]
    df = df.reindex(columns=base_columns + extra_columns)

    # Load the stop words from their JSON file.
    with open(os.path.join(data_dir, "spanish_stopwords.json"), "r", encoding="utf-8") as handle:
        stop_words = js.load(handle)["words"]

    # Document-term matrix: one row per document, one column per term
    # (unigrams to trigrams), limited to n_vocab terms.
    tf_vectorizer = CountVectorizer(max_df=0.8, min_df=2, max_features=n_vocab,
                                    stop_words=stop_words, ngram_range=(1, 3))
    tf = tf_vectorizer.fit_transform(df.contenido)

    # Build and fit the LDA model, then project the documents onto the topics.
    lda = LatentDirichletAllocation(n_components=k_topics, max_iter=20, doc_topic_prior=0.1,
                                    topic_word_prior=0.1, n_jobs=-1, random_state=23)
    lda_fit = lda.fit(tf)
    lda_output = lda_fit.transform(tf)
    docs = ['doc' + str(i) for i in range(lda_output.shape[0])]  # row labels
    topics = ['topic ' + str(i) for i in range(lda_output.shape[1])]  # column labels
    lda_output = pd.DataFrame(lda_output, index=docs, columns=topics)
    return (lda_fit, lda_output, df, tf_vectorizer)

In [5]:
def tsne(lda_output,df):
    """Embed the document-topic matrix in two dimensions with t-SNE.

    ``lda_output`` is the DataFrame of per-document topic weights. Its values
    are projected to 2-D coordinates ("x_tsne", "y_tsne"), the index of the
    largest topic weight of each row is stored as "topico_dominante", and the
    three new columns are appended column-wise to ``df`` (whose index is
    reset first). The combined DataFrame is returned.
    """
    reducer = TSNE(random_state=2017, perplexity=30, early_exaggeration=120)
    n_rows, n_cols = lda_output.shape

    topic_matrix = np.array(lda_output.iloc[:, 0:n_cols])

    coords = pd.DataFrame(reducer.fit_transform(topic_matrix), columns=['x_tsne', 'y_tsne'])
    # Dominant topic = column position of the largest weight in each row.
    coords['topico_dominante'] = topic_matrix.argmax(axis=1)

    return pd.concat([df.reset_index(drop=True), coords], axis=1)

In [6]:
def pca(lda_output,df2):
    """Project the document-topic matrix onto its first two principal components.

    The values of ``lda_output`` are reduced with a 2-component PCA; the
    resulting coordinates are stored as "x_pca" and "y_pca" and appended
    column-wise to ``df2`` (whose index is reset first). The combined
    DataFrame is returned.
    """
    reducer = PCA(n_components=2)
    n_rows, n_cols = lda_output.shape
    topic_matrix = np.array(lda_output.iloc[:, 0:n_cols])
    components = reducer.fit_transform(topic_matrix)
    coords = pd.DataFrame(components, columns=["x_pca", "y_pca"])
    return pd.concat([df2.reset_index(drop=True), coords], axis=1)

In [20]:
def generate_word_cloud(tf_vectorizer,filename,df):
    """Render a word cloud of all article texts and save it as an image.

    Parameters
    ----------
    tf_vectorizer : object
        Vectorizer whose ``get_stop_words()`` result is excluded from the cloud.
    filename : str
        Path of the image file to write.
    df : pandas.DataFrame
        Must contain a ``contenido`` column with the article texts; the texts
        are joined with "." before being passed to the generator.
    """
    # Pass the colormap by name: matplotlib.cm.get_cmap was deprecated in
    # Matplotlib 3.7 and removed in 3.9, and WordCloud accepts a name directly.
    cloud = WordCloud(background_color='white', width=375, height=375, max_words=50,
                      max_font_size=225, stopwords=tf_vectorizer.get_stop_words(),
                      colormap="inferno", random_state=123)  # build the cloud generator
    cloud.generate('.'.join(list(df.contenido)))  # generate the cloud
    cloud.to_file(filename)

In [15]:
def crear_escenario(list_fechas, num_topics):
    """Build one scenario: an LDA model for a date range and a topic count.

    ``list_fechas[0]`` and ``list_fechas[1]`` are the start and end dates
    handed to ``lda_model``. The document-topic output is then embedded with
    ``tsne`` and ``pca``, and the topic-word matrix is extracted.

    Returns the tuple ``(df_tot, output, topic_word_df, tf)``: the articles
    with both 2-D embeddings, the document-topic DataFrame, the topic-word
    DataFrame and the fitted vectorizer.
    """
    fecha_inicio, fecha_fin = list_fechas[0], list_fechas[1]
    modelo, doc_topic, articulos, vectorizer = lda_model(fecha_inicio, fecha_fin, num_topics)

    con_tsne = tsne(doc_topic, articulos)
    con_pca = pca(doc_topic, con_tsne)
    topic_word_df = topics_word_distribution(modelo, vectorizer)

    return (con_pca, doc_topic, topic_word_df, vectorizer)

In [14]:
def crear_escenarios(lista_fechas, base_dir=None):
    """Build and persist a scenario for every date range and topic count.

    For each ``[start, end]`` pair in ``lista_fechas`` a folder named
    ``<start>__<end>`` is created under ``base_dir``; inside it, one
    sub-folder per topic count (4, 6, 8, 10, 12) receives ``word_cloud.jpg``,
    ``df_tot.csv``, ``lda_output.csv`` and ``topic_word_df.csv``.

    Parameters
    ----------
    lista_fechas : iterable of [start, end] pairs
    base_dir : str, optional
        Root folder for the scenarios. Defaults to the location that used to
        be hard-coded in this function.

    Notes
    -----
    Folders are created with ``exist_ok=True`` so the function can be re-run;
    files from a previous run in the same folder are overwritten.
    """
    if base_dir is None:
        base_dir = "C:\\Users\\usuario\\Desktop\\Universidad\\Proyectos Personales\\Proyectos-Personales\\Dash App\\Escenarios"
    for rango in lista_fechas:
        print(rango)
        rango_dir = os.path.join(base_dir, str(rango[0]) + "__" + str(rango[1]))
        # makedirs(exist_ok=True) instead of mkdir: a second run no longer
        # aborts with FileExistsError on the first folder.
        os.makedirs(rango_dir, exist_ok=True)
        for num_topics in range(4, 14, 2):
            print(num_topics)
            escenario_dir = os.path.join(rango_dir, str(num_topics))
            os.makedirs(escenario_dir, exist_ok=True)
            df_tot, lda_output, topic_word_df, tf = crear_escenario(rango, num_topics)
            generate_word_cloud(tf, os.path.join(escenario_dir, "word_cloud.jpg"), df_tot)
            df_tot.to_csv(os.path.join(escenario_dir, "df_tot.csv"), encoding="utf-8")
            lda_output.to_csv(os.path.join(escenario_dir, "lda_output.csv"), encoding="utf-8")
            topic_word_df.to_csv(os.path.join(escenario_dir, "topic_word_df.csv"), encoding="utf-8")
            


In [17]:
# Build every [start, end] date pair for which a scenario will be created.
# Start and end candidates both advance in 14-day steps; a pair is kept only
# when the start date is strictly before the end date.
fecha_inicial = datetime.date(2020, 4, 13)
fecha_final = datetime.date(2020, 4, 26)
fechas_iniciales = [fecha_inicial + datetime.timedelta(days=14 * paso) for paso in range(8)]
fechas_finales = [fecha_final + datetime.timedelta(days=14 * paso) for paso in range(8)]
lista_fechas = [
    [inicio, fin]
    for inicio in fechas_iniciales
    for fin in fechas_finales
    if inicio < fin
]
    

In [21]:
# Display the generated [start, end] date pairs.
lista_fechas

[[datetime.date(2020, 4, 13), datetime.date(2020, 4, 26)],
 [datetime.date(2020, 4, 13), datetime.date(2020, 5, 10)],
 [datetime.date(2020, 4, 13), datetime.date(2020, 5, 24)],
 [datetime.date(2020, 4, 13), datetime.date(2020, 6, 7)],
 [datetime.date(2020, 4, 13), datetime.date(2020, 6, 21)],
 [datetime.date(2020, 4, 13), datetime.date(2020, 7, 5)],
 [datetime.date(2020, 4, 13), datetime.date(2020, 7, 19)],
 [datetime.date(2020, 4, 13), datetime.date(2020, 8, 2)],
 [datetime.date(2020, 4, 27), datetime.date(2020, 5, 10)],
 [datetime.date(2020, 4, 27), datetime.date(2020, 5, 24)],
 [datetime.date(2020, 4, 27), datetime.date(2020, 6, 7)],
 [datetime.date(2020, 4, 27), datetime.date(2020, 6, 21)],
 [datetime.date(2020, 4, 27), datetime.date(2020, 7, 5)],
 [datetime.date(2020, 4, 27), datetime.date(2020, 7, 19)],
 [datetime.date(2020, 4, 27), datetime.date(2020, 8, 2)],
 [datetime.date(2020, 5, 11), datetime.date(2020, 5, 24)],
 [datetime.date(2020, 5, 11), datetime.date(2020, 6, 7)],
 [da

In [22]:
# Build every scenario (one folder per date range and topic count) and write
# its artifacts to disk; each range and topic count is printed as it starts.
crear_escenarios(lista_fechas)

[datetime.date(2020, 4, 13), datetime.date(2020, 4, 26)]
4
6
8
10
12
[datetime.date(2020, 4, 13), datetime.date(2020, 5, 10)]
4
6
8
10
12
[datetime.date(2020, 4, 13), datetime.date(2020, 5, 24)]
4
6
8
10
12
[datetime.date(2020, 4, 13), datetime.date(2020, 6, 7)]
4
6
8
10
12
[datetime.date(2020, 4, 13), datetime.date(2020, 6, 21)]
4
6
8
10
12
[datetime.date(2020, 4, 13), datetime.date(2020, 7, 5)]
4
6
8
10
12
[datetime.date(2020, 4, 13), datetime.date(2020, 7, 19)]
4
6
8
10
12
[datetime.date(2020, 4, 13), datetime.date(2020, 8, 2)]
4
6
8
10
12
[datetime.date(2020, 4, 27), datetime.date(2020, 5, 10)]
4
6
8
10
12
[datetime.date(2020, 4, 27), datetime.date(2020, 5, 24)]
4
6
8
10
12
[datetime.date(2020, 4, 27), datetime.date(2020, 6, 7)]
4
6
8
10
12
[datetime.date(2020, 4, 27), datetime.date(2020, 6, 21)]
4
6
8
10
12
[datetime.date(2020, 4, 27), datetime.date(2020, 7, 5)]
4
6
8
10
12
[datetime.date(2020, 4, 27), datetime.date(2020, 7, 19)]
4
6
8
10
12
[datetime.date(2020, 4, 27), datetime.d

In [25]:
# Scratch check: a slice with a negative stop excludes the trailing elements
# (here the last two items are dropped).
a=[1,2,3,4,5]
a[0:-2]

[1, 2, 3]