# Introducción

Librerías

In [1]:
import statistics
import spacy
import nltk
import json
import pdb
import re

from nltk.corpus import stopwords
from senticnet.babelsenticnet import BabelSenticNet
from gyms import GYMS

Utilidades

In [2]:
UNWANTED_CHARS = ['!', ',', '"', '-', '...','–','XD', 'xD', '¿', '?', '—', '\n', "#", '¡', ':', "“", '.', '(', ')',"¬¬", "\('.')/", "*", '\n', '»', '\x97', '\x85']
SPANISH_STOP_WORDS = stopwords.words('spanish')

file = '/home/accg14/Documents/FIng/Taller ARS/Entrega/AdD-2021-Instagram-research/gyms/sources/crossfitdelsur/crossfitdelsur.json'


# Preprocesamiento
En esta etapa, se recuperan los comentarios y cantidad de _likes_ de las publicaciones de las instituciones (gimnasios).


## Extracción desde las fuentes 🗄
La extracción de los metadatos (comentarios, cantidad de _likes_, _timpestamps_ entre otros) de los _posts_ se obtiene mediante el repositorio [instagram-scraper](https://github.com/arc298/instagram-scraper).
Para el actual estudio, se recuperan 30 _posts_ (los más actualizados) de cada gimnasio.

Extracción de los comentarios.

In [3]:
def get_posts_comments(file):
    with open(file) as json_file:
        institute_posts = json.load(json_file)
        for posts in institute_posts['GraphImages']:
            comments_data = posts['comments']
            for comment_data in comments_data['data']:
                yield(comment_data['text'])

Extracción de número de _likes_

In [4]:
def get_posts_likes(file):
    with open(file) as json_file:
        institute_posts = json.load(json_file)
        for posts in institute_posts['GraphImages']:
            likes_received = posts['edge_media_preview_like']['count']
            yield(likes_received)

In [5]:
def get_posts_metadata(file):
    posts_comments = []
    for post_comment in get_posts_comments(file):
        posts_comments.append(post_comment)
    
    posts_likes = []
    for post_like in get_posts_likes(file):
        posts_likes.append(post_like)
    
    return posts_comments, posts_likes

In [6]:
for gym in GYMS.keys():
    gym_file_path = GYMS[gym]['file_path']
    posts_comments, posts_likes = get_posts_metadata(gym_file_path)
    
    GYMS[gym]['posts_comments'] = posts_comments
    GYMS[gym]['posts_likes'] = posts_likes

## Curación de los datos 🧹️
En esta etapa, se remueven caracteres que generan inputs de baja calidad, incluyendo: emojis, caracteres no ascii, signos de exclamación y de interrogación entre otros (todos aquellos no alfabeticos).

In [7]:
def remove_unwanted_chars(token):
    for char in UNWANTED_CHARS:
        token = token.replace(char, ' ')
        token = re.sub('@\w*', '', token)
        token = re.sub('\$', ' ', token)
        token = re.sub('\d', '', token)
        
        return token

In [8]:
def remove_emoji(token):
    emoji_pattern = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', token)

In [9]:
def remove_non_ascii(token):
    try:
        _ = token.encode('ascii')
        return token
    except:
        return None

In [10]:
def remove_spanish_stop_words(token):
    if (token in stopwords.words('spanish')):
        return None
    else:
        return token

In [11]:
def sanitize_comment(comment):
    comment = comment.split(" ")

    comment = list(map(remove_emoji, comment))
    comment = list(map(remove_unwanted_chars, comment))

    comment = list(filter(lambda x: len(x)>0, comment))
    comment = list(map(remove_spanish_stop_words, comment))
    comment = list(filter(lambda x: x is not None, comment))
    comment = list(map(remove_non_ascii, comment))
    comment = list(filter(lambda x: x is not None, comment))
    comment = list(map(lambda x: x.lower(), comment))
    comment = list(map(remove_unwanted_chars, comment))
    comment = list(map(lambda x: x.replace(" ", ""), comment))
    return comment

In [12]:
def get_sanitized_posts_comments(comments):
    sanitized_comments = []
    for comment in comments:
        sanitized_comment = sanitize_comment(comment)
        if len(sanitized_comment)>0:
            sanitized_comments.append(sanitized_comment)  
    return sanitized_comments

In [13]:
for gym in GYMS.keys():
    gym_posts_comments = GYMS[gym]['posts_comments']
    gym_sanitizez_posts_comments = get_sanitized_posts_comments(gym_posts_comments)    
    GYMS[gym]['posts_comments'] = gym_sanitizez_posts_comments

# Obtención de la polaridad 😠😄
En esta etapa se obtiene el sentimiento relacionado a cada palabra (positivo | neutro | negativo) del comentario de la siguiente forma:
1. Si la palabra aparece en el conjunto, se recuperan su etiqueta y valor de polaridad.
2. Si la palabra no aparece en el conjunto, se obtiene (cuando existe) el lema de la misma y se retornan etiqueta y valor de polaridad del lema (si existe).
3. Cuando ninguna de las anteriores condiciones se cumple, se suprime la palabra del comentario.

Luego, la etiqueta de polarización de cada comentario es la instancia con más de repeticiones de las etiquetas de las palabras que lo componen. De igual forma, el valor asociado a dicha polarización es la media de sus palabras.

Al mismo tiempo, se obtiene la media de cantidad de _likes_ como un insumo de información extra.

**Observación**: el estudio del español a nivel computacional no ha llegado al estadio del ingles, por lo que no todas las palabras tienen un lema asociado, y expresiones típicas de ciertos lugares (en este contexto UY, quedan por fuera del alcance).

In [14]:
nlp = spacy.load('es_core_news_sm')

In [15]:
def lemmatize(token):
    token = nlp(token)
    token_lemma = [t.lemma_ for t in token]
    if len(token_lemma)>0:
        return token_lemma[0]
    else:
        return [token]

In [16]:
def classify_token_polarity(token, token_lemma):
    bsn = BabelSenticNet('es')
    try:
        polarity_label = bsn.polarity_label(token)
        polarity_value = bsn.polarity_value(token)
        return polarity_label, polarity_value
    except:
        try:
            polarity_label = bsn.polarity_label(token_lemma)
            polarity_value = bsn.polarity_value(token_lemma)
            return polarity_label, polarity_value
        except:        
            return 'neutral', 0

In [17]:
def classify_token(token):
    token_lemma = lemmatize(token)
    token_polarity_label, token_polarity_value = classify_token_polarity(token, token_lemma)
    return token_polarity_label, token_polarity_value

In [18]:
def classify_polarities(comment_polarity_labels, comment_polarity_values):
    polarity_label = max(set(comment_polarity_labels), key = comment_polarity_labels.count)
    polarity_value = statistics.mean(comment_polarity_values)
    if len(comment_polarity_values)>1:
        polarity_variance = statistics.variance(comment_polarity_values)
    else:
        polarity_variance = 0
    return polarity_label, polarity_value, polarity_variance

In [19]:
def get_metrics(comments_polarities):
    comments_polarity_labels = []
    comments_polarity_values = []
    comments_polarity_variances = []
    
    for comment_polarity in comments_polarities:
        comment_polarity_labels = comment_polarity[0]
        comment_polarity_values = comment_polarity[1]
        polarity_label, polarity_value, polarity_variance = classify_polarities(comment_polarity_labels, comment_polarity_values)
        
        comments_polarity_labels.append(polarity_label)
        comments_polarity_values.append(polarity_value)
        comments_polarity_variances.append(polarity_variance)

    individual_mean_variance = statistics.variance(comments_polarity_variances)
    comments_polarity, comments_value, comments_variance = classify_polarities(comments_polarity_labels, comments_polarity_values)

    return comments_polarity, comments_value, comments_variance, individual_mean_variance

In [20]:
def classify_comments(comments):
    comments_polarity_labels = []
    comments_polarity_values = []

    for comment in comments:
        comment_polarity_labels = []
        comment_polarity_values = []
        for token in comment:
            token_polarity_label, token_polarity_value = classify_token(token)
            comment_polarity_labels.append(token_polarity_label)
            comment_polarity_values.append(token_polarity_value)

        comments_polarity_labels.append(comment_polarity_labels)
        comments_polarity_values.append(comment_polarity_values)

    comments_polarities = list(zip(comments_polarity_labels, comments_polarity_values))
    comments_polarity, comments_value, comments_variance, individual_mean_variance = get_metrics(comments_polarities)

    return comments_polarity, comments_value, comments_variance, individual_mean_variance

In [21]:
for gym in GYMS.keys():
    gym_posts_comments = GYMS[gym]['posts_comments']
    comments_polarity, comments_value, comments_variance, individual_mean_variance = classify_comments(gym_posts_comments)
    gym_mean_posts_likes = statistics.mean(GYMS[gym]['posts_likes'])

    GYMS[gym]['gym_sentiments'] = {
        'comments_polarity': comments_polarity,
        'comments_value': str(comments_value),
        'comments_variance': str(comments_variance),
        'individual_mean_variance': str(individual_mean_variance),
        'mean_posts_likes': str(gym_mean_posts_likes)
    }

In [22]:
print("\033[1m"+"Clasificación final de los sentimientos asociados a los gimnasios: "+"\033[0m")
for gym in GYMS.keys():
    gym_sentiments = GYMS[gym]['gym_sentiments']
    print("     GYM: "+"\033[1m"+gym+"\033[0m")
    print("          Etiqueta: "+"\033[1m"+gym_sentiments['comments_polarity']+"\033[0m")
    print("          Valor: "+"\033[1m"+gym_sentiments['comments_value']+"\033[0m")
    print("          Varianza total: "+"\033[1m"+gym_sentiments['comments_variance']+"\033[0m")
    print("          Varianza media: "+"\033[1m"+gym_sentiments['individual_mean_variance']+"\033[0m")
    print("          Likes media: "+"\033[1m"+gym_sentiments['mean_posts_likes']+"\033[0m")
    print("-----------------------------------------------")
    

[1mClasificación final de los sentimientos asociados a los gimnasios: [0m
     GYM: [1macptrainingcenter[0m
          Etiqueta: [1mpositive[0m
          Valor: [1m0.14256150793650793[0m
          Varianza total: [1m0.03936698501812341[0m
          Varianza media: [1m0.04719250910449502[0m
          Likes media: [1m35.8[0m
-----------------------------------------------
     GYM: [1mavenidacrossfit[0m
          Etiqueta: [1mpositive[0m
          Valor: [1m0.04432234498543487[0m
          Varianza total: [1m0.03612181207513422[0m
          Varianza media: [1m0.043584599147544784[0m
          Likes media: [1m93.46666666666667[0m
-----------------------------------------------
     GYM: [1mcrossfitdelsur[0m
          Etiqueta: [1mpositive[0m
          Valor: [1m0.09269698412698413[0m
          Varianza total: [1m0.05404152206400625[0m
          Varianza media: [1m0.016636835910456636[0m
          Likes media: [1m86.76666666666667[0m
-------------------

# Conclusión 🤝

# Trabajo futuro 📓

# Perfiles estudiados 📱

* [@acptrainingcenter](https://www.instagram.com/acptrainingcenter/)
* [@avenidacrossfit](https://www.instagram.com/avenidacrossfit/)
* [@crossfitdelsur](https://www.instagram.com/crossfitdelsur/)
* [@instintocf](https://www.instagram.com/instintocf/)
* [@montevideoknockout](https://www.instagram.com/montevideoknockout/)
* [@mvdrustybox](https://www.instagram.com/mvdrustybox/)
* [@sensefit](https://www.instagram.com/sensefit/)
* [@tekoacrossfit](https://www.instagram.com/tekoacrossfit/)