### 1. Modelado de Topics

El objetivo principal de este ejercicio es el de realizar un **análisis exploratorio** - etapa principal en cualquier problema de analítica, ML, DL y, por supuesto, NLP - de alguno de los datasets disponibles (tweets o reviews de Amazon).

Además del análisis exploratorio, se pide que el alumno realice un **modelado de topics** identificando los principales temas que aparecen en los corpus, así como los tokens que los componen.

Será muy valorable si se incluyen **gráficos descriptivos** que describan los corpus utilizados.

In [1]:
# Importamos lo que vayamos a necesitar
import pandas as pd
import numpy as np
import cudf
import string

import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel

import pyLDAvis
import pyLDAvis.gensim

from collections import Counter
import matplotlib.pyplot as plt
from stop_words import get_stop_words
from nltk.stem.snowball import EnglishStemmer
from nltk.probability import FreqDist
from wordcloud import WordCloud

# Nuestras funciones del archivo utils, para el procesado del texto
from utils import file_to_dict, process_text

# borrar
from multiprocessing.pool import Pool
from functools import partial
from os import cpu_count

  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""
  from collections import Sequence, defaultdict


## 1. Extraccion y procesado de datos

In [2]:
data = pd.read_csv('./datasets/reviews.csv')

In [3]:
data.head()

Unnamed: 0,review,rating,helpful,category
0,A fun way to bling up your desk and make sure ...,4,1.0,Office products
1,I continue to love this show. Raylan and the r...,5,0.0,Amazon instant videos
2,Arrived in super flash time. Like another rev...,5,0.0,Patio lawn/garden
3,This treat ball works as expected. I used Temp...,4,0.5,Pet supplies
4,I know it's extrange but it works! It is easy ...,4,0.0,Baby


In [4]:
# Sacamos la lista de las categorias que luego usaremos como número de topics
categories = data['category'].unique()

In [5]:
# Extraemos las reviews y las convertimos en una lista que posteriormente usaremos para entrenar nuestro modelo
reviews = data[['review']]
if reviews.isna().values.any():
    reviews.dropna(inplace=True)
reviews_list = reviews['review'].tolist()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [6]:
# Constants
lemas = file_to_dict('./datasets/lemmatization-en.txt')
min_length = 50
en_stop_words = gensim.parsing.preprocessing.STOPWORDS
translate_table = dict((ord(char), None) for char in string.punctuation)  
lang='en'
stemmer = EnglishStemmer(ignore_stopwords=True)

In [None]:
# Funcion para procesar el texto
def process_text(min_length=50, lemas_dict={},translate_table={}, stop_words=[], lang='en', stemmer=None, review=None):
    if not review or len(review) < min_length:
        return None
    
    #Sustituimos si hay algun caracter &#DD
    review = re.sub(r'(&#\d+) | (&\w+)', '', review)
    
    words = []
    # Pocessamos nuestra review valida
    for word in re.split(r'[;,.\'\s]\s*', review):
        # lematizamos
        word = lemas_dict.get(word) or word
        # stemmer
        word = stemmer.stem(word) if stemmer else word
        # Quitamos los signos de puntuacion
        word = word.translate(translate_table)
        # Comprobamos que tenga algun valor
        # Comprobamos que tenga una longitud minima de 3 caracteres
        # Comprobamos que no sea un stopword
        if word and len(word) > 3 and word not in stop_words:
            # Comprobamos si es un numero y lo sutituimos si tenemos el idioma
            if lang and word.isdigit() :
                word = num2words(word, lang=lang, ordinal=False)
            # Añadimos a la lista
            # La pasamos a minuscula
            words.append(word.lower())

    return words

In [11]:
def process_row(row):
    if type(row['review']) is str:    
        aux = process_text(min_length, lemas, translate_table, en_stop_words, None, None, row['review'])
        if aux:
            return {
                'review': ' '.join(aux),
                'review_processed': aux,
                'category': row['category']
            }

In [42]:
df = data.apply(lambda row: process_row(row), axis=1)
df.dropna(inplace=True)
df = pd.DataFrame(list(df))

In [43]:
df

Unnamed: 0,review,review_processed,category
0,ways bling desks making surest ones guys wants...,"[ways, bling, desks, making, surest, ones, guy...",Office products
1,continuing loving shows raylan rests character...,"[continuing, loving, shows, raylan, rests, cha...",Amazon instant videos
2,arrived super flashing timing like reviewers h...,"[arrived, super, flashing, timing, like, revie...",Patio lawn/garden
3,this treats balls works expected tempations ca...,"[this, treats, balls, works, expected, tempati...",Pet supplies
4,knows extrange buts works easiest cleans eight...,"[knows, extrange, buts, works, easiest, cleans...",Baby
...,...,...,...
177121,second years happiest straps locks great products,"[second, years, happiest, straps, locks, great...",Musical instruments
177122,this units says evens tried outs friends dodge...,"[this, units, says, evens, tried, outs, friend...",Automotive
177123,years smead hangings folders they plastics rem...,"[years, smead, hangings, folders, they, plasti...",Office products
177124,pros dons needs touching deads looks cons silv...,"[pros, dons, needs, touching, deads, looks, co...",Patio lawn/garden


In [41]:
pool = Pool(cpu_count())

res = [it for it in pool.map(aux, data.head(n=2).iterrows()) if it]
pool.close()

<class 'tuple'>
<class 'tuple'>


TypeError: 'int' object is not iterable

In [22]:
pool = Pool(cpu_count())
func = partial(process_text, min_length, {}, translate_table, en_stop_words, None, None)
res = [it for it in pool.map(func, reviews_list) if it]

In [23]:
res[0]

['bling',
 'desk',
 'sure',
 'want',
 'steal',
 'stapler',
 'returning',
 'immediately',
 'stickers',
 'stayed',
 'offered',
 'personalization',
 'vibrant',
 'great',
 'teen',
 'collegeaged',
 'girl']

In [None]:
dictionary = Dictionary(res)

In [None]:
corpus = [dictionary.doc2bow(doc) for doc in res]

In [None]:
## Poner alguna grafica aqui que pinte datos

In [None]:
word_frequences = [(dictionary[w], f) for t in corpus for w, f in t]

In [None]:
max(aux[0])

In [None]:
most_frequences = [(w,f) for w,f in word_frequences if f > 10]

In [None]:
words = [w for w, f in most_frequences]
freqs = [f for w, f in most_frequences]


In [None]:
# Create and generate a word cloud image:
wordcloud = WordCloud().generate(' '.join(words))

# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
words

In [None]:
plt.barh(words, freqs)
plt.title('Bigram frequencies')
plt.show()

### 2. Extraccion de topics

In [None]:
def extract_topics(corpus, dictionary, texts, num_topics):
    lda_model = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        iterations=5,
        passes=10,
        alpha='auto'
    )
    
    coherence = CoherenceModel(
        model=lda_model, 
        texts=texts, 
        dictionary=dictionary, 
        coherence='c_v').get_coherence()
    
    perplexity = lda_model.log_perplexity(corpus)
    
    return lda_model, coherence, perplexity

In [None]:
def view_topics(lda_model, topn=20):
    word_dict = {};
    for i in range(len(lda_model.get_topics())):
        words = lda_model.show_topic(i, topn = topn)
        word_dict['Topic #' + '{:02d}'.format(i+1)] = [i[0] for i in words]
    return pd.DataFrame(word_dict)

In [None]:
def create_pyLDAvis(lda_model, corpus, dictionary, export=False):
    pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
    if export:
        pyLDAvis.save_html(vis, f'./datasets/lda_model_vis_{len(lda_model.get_topics())}_topics.html')
    return vis

In [None]:
num_topics = len(categories)

lda_model, coherence, perplexity = extract_topics(corpus, dictionary, res, num_topics)

print(f"Para el modelo con {num_topics} topics, tiene una coherecia de {coherence} y una perplejidad de {perplexity}")

In [None]:
# Nuestras categorias iniciales son estas, vamos a ver como las comparamos con las extraidas del lda_model
for i,c in enumerate(categories):
    print(f"Categoria {i}: {c}")

In [None]:
view_topics(lda_model, 10)

### Observamos que podemos clasificar los topics de la siguiente manera:

<span style="color:green">**Topic#01** => </span> Al tener palabras como <span style="color:blue">*songs*, *tracks*, *lyrics* </span>podemos definir que se corresponde con nuestra categoria de <span style="color:orange">*Digital Music*</span>

<span style="color:green">**Topic#02** => </span> Al tener palabras como <span style="color:blue">*taping*, *paper*, *printer* </span>podemos definir que se corresponde con nuestra categoria de <span style="color:orange">*Office products*</span>

<span style="color:green">**Topic#03** => </span> Al tener palabras como <span style="color:blue">*babies*, *cleans*, *seats*, *washing* </span>podemos definir que se corresponde con nuestra categoria de <span style="color:orange">*Baby*</span>, a mi parecer puede ser por por productos para bebe que sean faciles de limpiar

In [None]:
create_pyLDAvis(lda_model, corpus, dictionary, True)

In [None]:
num_topics = 24

lda_model_2, coherence_2, perplexity_2 = extract_topics(corpus, dictionary, res, num_topics)

print(f"Para el modelo con {num_topics} topics, tiene una coherecia de {coherence_2} y una perplejidad de {perplexity_2} ")

In [None]:
view_topics(lda_model_2, 10)

In [None]:
create_pyLDAvis(lda_model_2, corpus, dictionary, True)

In [None]:
from nltk.stem.snowball import EnglishStemmer
stemmer = EnglishStemmer(ignore_stopwords=True)

In [None]:
stemmer

In [None]:
print('{0:15}{1:10}'.format('Token' ,'Stem'))
for word in res[0]:
    print('{0:15}{1:10}'.format(word, stemmer.stem(word)))