### 1. Modelado de Topics

El objetivo principal de este ejercicio es realizar un **análisis exploratorio** - etapa principal en cualquier problema de analítica, ML, DL y, por supuesto, NLP - de alguno de los datasets disponibles (tweets o reviews de Amazon).

Además del análisis exploratorio, se pide que el alumno realice un **modelado de topics** identificando los principales temas que aparecen en los corpus, así como los tokens que los componen.

Será muy valorable si se incluyen **gráficos descriptivos** de los corpus utilizados.

In [None]:
# Importamos lo que vayamos a necesitar
import pandas as pd
import cudf
import numpy as np

import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel

import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

- importar varios archivos
- sacar el reviewText y el overall (rating) y un nuevo campo (clase o similar) que señale el archivo del que viene el df
- crear varias etiquetas para el rating
    - binario: positivo negativo
    - ternario: positivo neutro negativo
    - n-ario: 1 por clase (siendo el ceil(n/2) el neutro)
- combinar N documentos de cada dataset y hacer un shuffle
- Preprocesar el conjunto y sacar el diccionario
- sacar el LDA

In [None]:
def extract_df(file_name, main_category, limit=10000):
    """Load a JSON-lines reviews file into a shuffled, size-capped DataFrame.

    Parameters
    ----------
    file_name : path of the JSON-lines file, handed to ``pd.read_json``.
    main_category : value written to the ``category`` column of every row.
    limit : maximum number of rows in the returned frame.

    Returns
    -------
    pandas.DataFrame with the columns ``review``, ``rating``, ``helpful`` and
    ``category``, rows in random order, indexed 0..n-1.
    """
    raw = pd.read_json(file_name, lines=True)
    # rename() returns a new frame, so nothing is modified in place on a
    # column slice of `raw`.
    data = raw[['reviewText', 'overall', 'helpful']].rename(
        columns={"reviewText": "review", "overall": "rating"}
    )
    data['category'] = main_category

    # Turn each two-element `helpful` entry into the ratio first / second,
    # using 0 when the second element is 0 to avoid dividing by zero.
    data['helpful'] = np.fromiter(
        (votes[0] / votes[1] if votes[1] else 0.0 for votes in data['helpful']),
        dtype=float,
        count=len(data),
    )

    # Draw a random row order, keep at most `limit` rows, and rebuild the
    # index so callers get a clean 0..n-1 range instead of shuffled labels.
    order = np.random.permutation(len(data))[:limit]
    return data.iloc[order].reset_index(drop=True)

In [None]:


# Datasets to load: each entry supplies the keyword arguments for extract_df.
# Commented-out entries are datasets currently excluded from the run.
files_to_read = [
    { 'file_name': './datasets/reviews_CDs_and_Vinyl_5.json.gz', 'main_category': 'Music CD/Vinyl'},
#     { 'file_name': './datasets/reviews_Electronics_5.json.gz', 'main_category': 'Electronics'},
#     { 'file_name': './datasets/reviews_Movies_and_TV_5.json.gz', 'main_category': 'Movies/TV'},
#     { 'file_name': './datasets/reviews_Musical_Instruments_5.json.gz', 'main_category': 'Musical instruments'}
]

# extract_df builds its frames with pandas, so they are combined with pandas
# as well; ignore_index gives the combined frame one continuous 0..n-1 index.
frames = [extract_df(**f) for f in files_to_read]
result = pd.concat(frames, ignore_index=True)
result

In [None]:
# Display the combined frame (the variable assigned in the loading cell is `result`).
result

In [None]:
# NOTE(review): scratch cell — `df` and `limit` are not defined in any visible
# cell, and `data` is only assigned in a later cell; the same
# shuffle-and-truncate steps appear inside extract_df. Confirm whether this
# cell is still needed.
index_list = np.array(data.index)
# The shuffle is applied to a (-1, 1) reshape whose result is not captured;
# presumably this relies on the reshape being a view of index_list — confirm.
np.random.shuffle(np.reshape(index_list, (-1, 1)))
# Select at most `limit` rows by the shuffled labels.
shuffled_df = df.loc[index_list[:limit], :]

In [None]:
# Notebook shell escape: installs the cudf package with pip.
# NOTE(review): cudf is imported in the first code cell, so this install has to
# run before that import — confirm the intended cell order.
!pip install cudf

In [None]:
# Load the Musical Instruments reviews file (JSON-lines; gzip-compressed judging
# by its extension) into a pandas DataFrame.
data = pd.read_json('./datasets/reviews_Musical_Instruments_5.json.gz', lines=True)

In [None]:
# Preview the first rows of the loaded reviews.
data.head()

In [None]:
# Keep only the review text for topic extraction and drop rows whose text is
# missing. dropna() on the selection returns a new frame instead of editing a
# slice of `data` in place, and the index is rebuilt so label lookups such as
# reviews['reviewText'][0] stay valid even when rows were removed.
reviews = data[['reviewText']].dropna().reset_index(drop=True)

In [None]:
# Display the review-text frame.
reviews

In [None]:
# Show the text of the review stored under index label 0.
reviews['reviewText'][0]

In [None]:
def text_preprocessing(text):
    """Tokenize *text*, dropping stopwords and short tokens.

    The text is tokenized with gensim's ``simple_preprocess``; a token is kept
    only when it is longer than three characters and is not in gensim's
    ``STOPWORDS`` collection. Returns the kept tokens as a list, in the order
    the tokenizer produced them.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 3 and token not in stopwords
    ]

In [None]:
# Print one review before and after preprocessing to sanity-check the tokenizer.
sample_review = reviews['reviewText'][0]
print('Original text:\n{}\n\n'.format(sample_review))
print('Processed text:\n{}'.format(text_preprocessing(sample_review)))

In [None]:
# Tokenize the first 5000 reviews; each entry is the token list of one review.
processed_texts = [
    text_preprocessing(review) for review in reviews['reviewText'][:5000]
]


In [None]:
# Build the gensim Dictionary from the tokenized reviews.
dictionary = Dictionary(processed_texts)

In [None]:
# Convert every tokenized review with dictionary.doc2bow; the cells below read
# each entry as (token id, count) pairs.
corpus = [dictionary.doc2bow(doc) for doc in processed_texts]

In [None]:
# Inspect the dictionary contents as a list of its items() pairs.
list(dictionary.items())

In [None]:
# Inspect the doc2bow output for the first document.
corpus[0:1]

In [None]:
# For the first document, print each token next to its count in that document.
for bow in corpus[:1]:
    for token_id, count in bow:
        print(dictionary[token_id], count)

In [None]:
# Fit a first LDA model with a fixed, small number of topics.
# NOTE(review): `lda_model` is not referenced by any later visible cell; the
# coherence search below trains its own models.
num_topics = 3

lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    # NOTE(review): iterations=5 looks low for convergence — confirm it is intentional.
    iterations=5,
    passes=10,
    # presumably asks gensim to learn the document-topic prior — confirm against the gensim docs
    alpha='auto'
)

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3,
                             random_state=None):
    """
    Train one LDA model per candidate topic count and score it with c_v coherence.

    Candidate topic counts are ``range(start, limit, step)``.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of tokenized input texts, used by the coherence measure
    limit : Upper bound on the number of topics (exclusive)
    start : First number of topics to try
    step : Increment between consecutive numbers of topics
    random_state : Seed forwarded to every LDA model so repeated runs can be
        reproduced; None (the default) leaves the models unseeded

    Returns:
    -------
    model_list : List of LDA topic models, one per candidate topic count
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        # Build the LDA model for this candidate topic count
        model = gensim.models.ldamulticore.LdaMulticore(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=random_state,
        )
        model_list.append(model)

        # Score the model; the two lists stay aligned by position
        coherencemodel = CoherenceModel(
            model=model, texts=texts, dictionary=dictionary, coherence='c_v'
        )
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Topic counts to evaluate: start_ is the first candidate, end_ the exclusive
# upper bound and step_ the increment (used later as range(start_, end_, step_)).
start_ = 4
end_ = 25
step_ = 1

In [None]:
import time

In [None]:
# Record the start time so a later cell can report the elapsed seconds.
start_time = time.time()

In [None]:
# Train one LDA model per candidate topic count and collect its c_v coherence;
# model_list and coherence_values are aligned by position.
model_list, coherence_values = compute_coherence_values(
    dictionary=dictionary,
    corpus=corpus,
    texts=processed_texts,
    start=start_,
    limit=end_,
    step=step_
)

In [None]:
# Seconds elapsed since start_time was recorded.
time.time()  - start_time

In [None]:
# Position (within coherence_values / model_list) of the highest coherence score.
optimal_id = np.argmax(coherence_values)

In [None]:
# Plot coherence against the number of topics and mark the best candidate.
x = range(start_, end_, step_)
plt.plot(x, coherence_values, label='coherence_values')
# x[optimal_id] maps the argmax position back to a topic count for any step_,
# not only step_ == 1.
plt.axvline(x[optimal_id], c='g', ls='--', alpha=0.8)
plt.xlabel('Num Topics')
plt.ylabel('Coherence score')
# The label is attached to the line itself: a bare ('coherence_values') is a
# plain string, which legend() would split into one label per character.
plt.legend(loc='best')
plt.show()

In [None]:
# Pick the model whose coherence score was the highest.
optimal_model = model_list[optimal_id]

In [None]:
# Tabulate the 20 top terms of every topic of the selected model, one column
# per topic, labelled "Topic #01", "Topic #02", ...
word_dict = {
    'Topic #' + '{:02d}'.format(topic_idx + 1): [
        entry[0] for entry in optimal_model.show_topic(topic_idx, topn=20)
    ]
    for topic_idx in range(len(optimal_model.get_topics()))
}
pd.DataFrame(word_dict)