In [1]:
import statistics
import json
import pdb
import re

import nltk
from nltk.corpus import stopwords

from senticnet.babelsenticnet import BabelSenticNet
import spacy

In [2]:
file = '/home/accg14/Documents/FIng/Taller ARS/Entrega/AdD-2021-Instagram-research/gyms/sources/crossfitdelsur/crossfitdelsur.json'
UNWANTED_CHARS = ['!', ',', '"', '-', '...','–','XD', 'xD', '¿', '?', '—', '\n', "#", '¡', ':', "“", '.', '(', ')',"¬¬", "\('.')/", "*", '\n', '»', '\x97', '\x85']
SPANISH_STOP_WORDS = stopwords.words('spanish')

# Pre processing stage
En esta etapa, se recuperan los comentarios de las publicaciones de las instituciones (gimnasios) y son procesados, lo cual incluye:
1. Recuperar comentarios (utilizar yield permite mejora de performance) y agruparlos en una estructura.
2. Remover caracteres que generan inputs de baja calidad, emojis, no ascii, signos de exclamación, de interrogación entre otros (todos aquellos no alfabeticos).

In [3]:
def get_posts_comments(file):
    with open(file) as json_file:
        institute_posts = json.load(json_file)
        for posts in institute_posts['GraphImages']:
            comments_data = posts['comments']
            for comment_data in comments_data['data']:
                yield(comment_data['text'])

In [4]:
comments = []
for comment in get_posts_comments(file):
    comments.append(comment)

In [5]:
def remove_unwanted_chars(token):
    for char in UNWANTED_CHARS:
        token = token.replace(char, ' ')
        token = re.sub('@\w*', '', token)
        token = re.sub('\$', ' ', token)
        token = re.sub('\d', '', token)
        
        return token

def remove_emoji(token):
    emoji_pattern = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', token)

def remove_non_ascii(token):
    try:
        _ = token.encode('ascii')
        return token
    except:
        return None

def remove_spanish_stop_words(token):
    if (token in stopwords.words('spanish')):
        return None
    else:
        return token


In [6]:
def sanitize_comment(comment):
    comment = comment.split(" ")

    comment = list(map(remove_emoji, comment))
    comment = list(map(remove_unwanted_chars, comment))

    comment = list(filter(lambda x: len(x)>0, comment))
    comment = list(map(remove_spanish_stop_words, comment))
    comment = list(filter(lambda x: x is not None, comment))
    comment = list(map(remove_non_ascii, comment))
    comment = list(filter(lambda x: x is not None, comment))
    comment = list(map(lambda x: x.lower(), comment))
    comment = list(map(remove_unwanted_chars, comment))
    comment = list(map(lambda x: x.replace(" ", ""), comment))
    return comment

In [7]:
sanitized_comments = []
for comment in comments:
    sanitized_comment = sanitize_comment(comment)
    if len(sanitized_comment)>0:
        sanitized_comments.append(sanitized_comment)
        print(sanitized_comment)

['deseando', 'volver']
['todos', 'queremos', 'volver']
['si', 'por', 'favor']
['minutos', 'seguro']
['mucho']
['clases', 'gente']
['se', 'gente,', 'volveremos']
['estaremos', 'pronto']
['es', 'dar', 'sigo', 'pasos']
['despegado']
['vamo', 'vamoooo', 'siempre', 'apoyando', 'movimiento']
['crack']
['genios', 'vamos', 'am']
['grande', 'javi', '']
['am', 'vamos']
['vamo', 'arriba']
['vamo', 'arriba', 'javi', 'estaremos', 'firmes', 'rulo', 'estatua']
['...', 'apoyo', 'comunidad,', 'abrazo', 'grande', 'seguir', 'empujando', "pa'delante", 'crossfitdelsur']
['grandes', 'genial', 'iniciativa']
['chicos', 'pueden', 'poner', 'link', 'canal', 'youtube?', 'me', 'costando', 'encontrarlo']
['que', 'tremendo']
['excelente,', 'paso', 'eso.', 'mismo,', 'mas', 'q', 'agradecida']
['por', 'suerte', 'procesos', 'caidas', 'uds', 'sostenernos', 'recordarnos', 'si', 'puede']
['es', 'parte', 'camino,', 'importa', 'cuantas', 'veces', 'quedes', 'rodillas,', 'importa', 'veces', 'levantas', 'rock']
['feliz', 'chica

# Processing stage (encoding)
En esta etapa, se obtiene el sentimiento relacionado a cada palabra del comentario de la siguiente forma:
1. Si la palabra aparece en el conjunto, se recuperan su etiqueta y valor de polaridad.
2. Si la palabra no aparece en el conjunto, se obtiene (cuando existe) el lema de la misma y se retornan etiqueta y valor de polaridad del lema (si existe).
3. Cuando ninguna de las anteriores condiciones se cumple, se suprime la palabra del comentario.

Algunos comentarios: el estudio del español a nivel computacional no ha llegado al estadio del ingles, por lo que no todas las palabras tienen un lema asociado, y expresiones tipicas de ciertos lugares (en este contexto UY, quedan por fuera del alcance).

In [8]:
nlp = spacy.load('es_core_news_sm')

In [9]:
def get_token_lemma(token):
    token = nlp(token)
    token_lemma = [t.lemma_ for t in token]
    if len(token_lemma)>0:
        return token_lemma[0]
    else:
        return [token]

In [10]:
def classify_token_polarity(token, token_lemma):
    bsn = BabelSenticNet('es')
    try:
        polarity_label = bsn.polarity_label(token)
        polarity_value = bsn.polarity_value(token)
        return polarity_label, polarity_value
    except:
        try:
            polarity_label = bsn.polarity_label(token_lemma)
            polarity_value = bsn.polarity_value(token_lemma)
            return polarity_label, polarity_value
        except:        
            return 'neutral', 0

In [11]:
comments_polarity_label = []
comments_polarity_value = []

In [16]:
def get_token_classification(token):
    token_lemma = get_token_lemma(token)
    token_polarity_label, token_polarity_value = classify_token_polarity(token, token_lemma)
    return token_polarity_label, token_polarity_value

def get_comment_classification(comment_polarity_label, comment_polarity_value, variance=False):
    polarity_label = max(set(comment_polarity_label), key = comment_polarity_label.count)
    polarity_value = statistics.mean(comment_polarity_value)
    if variance:
        polarity_variance = statistics.variance(comment_polarity_value)
        return polarity_label, polarity_value, polarity_variance
    else:
        return polarity_label, polarity_value

In [17]:
for sanitized_comment in sanitized_comments:
    comment_polarity_label = []
    comment_polarity_value = []
    for token in sanitized_comment:
        token_polarity_label, token_polarity_value = get_token_classification(token)

        comment_polarity_label.append(token_polarity_label)
        comment_polarity_value.append(token_polarity_value)

    label, value = get_comment_classification(comment_polarity_label, comment_polarity_value, False)
    comments_polarity_label.append(label)
    comments_polarity_value.append(value)

final_label, final_value, final_variance = get_comment_classification(comments_polarity_label, comments_polarity_value, True)

In [21]:
print(final_label)
print(final_value)
print(final_variance)

positive
0.09664844209288653
0.05220099494668499
