# Estadísticas del etiquetado de emociones

In [427]:
import pandas as pd
import numpy as np
import plotly.express as px
import json
import matplotlib.pyplot as plt

In [428]:
with open('./jazmin flores 18-8-21.json') as json_file:
    jazmin = json.load(json_file)
    
with open('./omar hernandez 18-8-21.json') as json_file:
    mara = json.load(json_file)
    
with open('./mara vazquez 18-8-21.json') as json_file:
    omar = json.load(json_file)

In [429]:
psicologos = {'Jazmín Flores':jazmin, 'Mara Vázquez':mara, 'Omar Hernández':omar}

## Filtro un rango de fechas

In [430]:
# formato YYYY-MM-DD

import datetime
format = "%Y-%m-%d %H:%M:%S"

start_date = datetime.date(2021, 8, 1)
end_date = datetime.date(2021, 8, 20)

In [431]:
# hago un diccionario que contenga 1 df por cada psicólogo
df_dict = {}

### El id de los audios es la URL

In [432]:
for k in psicologos.keys():

    data = psicologos[k]
    df = pd.DataFrame(columns = ['id', 'url', 'date', 'emotion', 'intensity', 'confidence', 'updated', 'channel', 'lead_time', 'time_interval'])

    i = 0
    for d in data:
        identity = d['id']
        url = d['data']['url'].split('/')[-1]
        channel = int(d['data']['url'][-10])
        lead_time = d['annotations'][0]['lead_time']

        # para calcular el campo 'updated'
        created_str = d['annotations'][0]['created_at'].replace('T', ' ')[:-8]
        updated_str = d['annotations'][0]['updated_at'].replace('T', ' ')[:-8]
        created_time = datetime.datetime.strptime(created_str, format)
        updated_time = datetime.datetime.strptime(updated_str, format)
        time_interval = (updated_time - created_time).total_seconds()
        
        # primero filtro por fecha
        date = created_time.date()
        
        # si no está en el rango, la paso de largo
        if date < start_date or date > end_date:
#             print('fuera_de_rango', date, start_date, end_date)
            continue

        updated = 0
        if time_interval>0:
            updated = 1

        # completo las columnas
        df.loc[i, 'id'] = identity
        df.loc[i, 'url'] = url
        df.loc[i, 'updated'] = updated
        df.loc[i, 'date'] = date
        df.loc[i, 'channel'] = channel
        df.loc[i, 'lead_time'] = lead_time
        df.loc[i, 'time_interval'] = time_interval

        # completo los 3 campos restantes
        results = d['annotations'][0]['result']
        for r in results:
            field_name = r['from_name']
            df.loc[i, field_name] = r['value']['choices'][0]

        i = i+1
        
    df_dict[k] = df

## 1) Del total de los audios de una semana, en cuántos hubo updates?

In [433]:
update_ratio = {} # ratio cantidad de updates sobre cantidad de etiquetas
n_labels = {} # cuántos audios etiquetó

for k in psicologos.keys():
    
#     print(k)
#     print(df.head())
    
    df = df_dict[k]
    
    n_updates = df['updated'].sum() # cuántos updates hizo la persona
    N = df.shape[0] # cuántos audios etiquetó
    
    update_ratio[k] = n_updates/float(N)
    n_labels[k] = N

fig = px.histogram(x=update_ratio.keys(), y=update_ratio.values(), title="Ratio de updates sobre cantidad de audios", width=600, height=400).update_xaxes(categoryorder="total descending")
fig.update_layout(xaxis_title="Psicólogo", yaxis_title="Ratio")
fig.show()

## 2) Cuántas etiquetas hubo en el período seleccionado?

In [434]:
fig = px.histogram(x=n_labels.keys(), y=n_labels.values(), title="Cantidad de etiquetas por persona en el período seleccionado", width=600, height=400).update_xaxes(categoryorder="total descending")
fig.update_layout(xaxis_title="Psicólogo", yaxis_title="Cantidad de etiquetas")
fig.show()

## 3) Distribución de confianza por persona

In [435]:
for k in psicologos.keys():
    df = df_dict[k]
    
    df_agg = df[['id', 'confidence']].groupby(['confidence']).count()
    df_agg.reset_index(inplace=True)

    fig = px.histogram(df_agg, x=df_agg['confidence'], y=df_agg['id'], title=k, width=600, height=400).update_xaxes(categoryorder="total descending")
    fig.update_layout(xaxis_title="Confianza", yaxis_title="Cantidad de etiquetas")
    fig.show()

## 4) Tiempo total de etiquetado por persona
Sumar lead_time de todos los audios, por persona, por semana.

Me falta agregarlo a los df.

In [436]:
lead_time = {} # tiempo total por persona
for k in psicologos.keys():
    data = psicologos[k]
    l = [] # lead_time de cada audio
    for i in range(len(data)):
        l.append(data[i]['annotations'][0]['lead_time'])
    lead_time[k] = sum(l)/3600.0
    
fig = px.histogram(x=lead_time.keys(), y=lead_time.values(), title='Tiempo total de etiquetado por persona (en horas)', width=600, height=400).update_xaxes(categoryorder="total descending")
fig.update_layout(xaxis_title="Psicólogo", yaxis_title="Tiempo (h)")
fig.show()

## 5) Cantidad de updates por cada emoción

In [437]:
for k in psicologos.keys():
    df = df_dict[k]
    
    df_agg = df[['emotion', 'updated']].groupby(['emotion']).sum()
    df_agg.reset_index(inplace=True)

    fig = px.histogram(df_agg, x=df_agg['emotion'], y=df_agg['updated'], title=k, width=600, height=400)
    fig.show()

## Cuántos etiquetados por día hizo cada persona?

In [438]:
for k in psicologos.keys():
    df = df_dict[k]
    
    df_agg = df.groupby(['date']).count()
    df_agg.reset_index(inplace=True)

#    fig = px.histogram(df_agg, x=df_agg['date'], y=df_agg['id'], title=k, width=600, height=400)
    fig = px.line(df_agg, x=df_agg['date'], y=df_agg['id'], title=k, width=600, height=400, markers=True)
    fig.update_layout(xaxis_title="Fecha", yaxis_title="Cantidad de etiquetas")
    fig.show()

## Cantidad de etiquetas por cada emoción

In [439]:
for k in psicologos.keys():
    df = df_dict[k]

    fig = px.histogram(df, x="emotion", color="channel", title=k, width=800, height=600).update_xaxes(categoryorder="total descending")
    fig.show()

## Cantidad por cada valor de intensidad

In [440]:
for k in psicologos.keys():
    df = df_dict[k]

    fig = px.histogram(df, x="intensity", color="channel", title=k, width=600, height=400)
    fig.show()

## Cantidad por cada valor de confianza

In [441]:
for k in psicologos.keys():
    df = df_dict[k]

    fig = px.histogram(df, x="confidence", color="channel", title=k, width=600, height=400)
    fig.show()

## Lead time

In [442]:
for k in psicologos.keys():
    df = df_dict[k]

#    fig = px.histogram(df, x="lead_time", color="channel", title=k, width=600, height=400)
    fig = px.strip(df, x="lead_time", color="channel", title=k, width=600, height=400)
    fig.update_layout(xaxis_title="Lead time (s)", yaxis_title="Channel")
    fig.show()

## Tiempo entre create y update

In [443]:
for k in psicologos.keys():
    df = df_dict[k]

#    fig = px.histogram(df, x="time_interval", color="channel", title=k, width=600, height=400)
    fig = px.strip(df, x="time_interval", color="channel", title=k, width=600, height=400)
    fig.show()

## Coincidencias en el tipo de emoción

In [444]:
for k in psicologos.keys():
    df = df_dict[k]
    
    #print(df.sort_values(by=['url']).head())

In [445]:
# me quedo con el channel de uno solo, porque debería ser igual en todos
df_tmp = pd.merge(df_dict['Jazmín Flores'][['url', 'id', 'emotion', 'channel']], df_dict['Omar Hernández'][['url', 'id', 'emotion']], on='url', how='inner')
#df_merge.head()

In [446]:
df_tmp.rename(columns = {'id_x':'id_jazmin', 'emotion_x':'emotion_jazmin',
                              'id_y':'id_omar', 'emotion_y':'emotion_omar'}, inplace = True)
#df_tmp.head()

In [447]:
df_merge = pd.merge(df_tmp, df_dict['Mara Vázquez'][['url', 'id', 'emotion']], on='url', how='inner')
#df_merge.head()

In [448]:
df_merge.rename(columns = {'id':'id_mara', 'emotion':'emotion_mara'}, inplace = True)
#df_merge.head()

In [449]:
df_merge['acuerdo_jazmin_omar'] = df_merge['emotion_jazmin']==df_merge['emotion_omar']
df_merge['acuerdo_jazmin_mara'] = df_merge['emotion_jazmin']==df_merge['emotion_mara']
df_merge['acuerdo_mara_omar'] = df_merge['emotion_mara']==df_merge['emotion_omar']
df_merge['acuerdo_todos'] = ((df_merge['emotion_mara']==df_merge['emotion_omar']) & (df_merge['emotion_mara']==df_merge['emotion_jazmin']))
df_merge['acuerdo_al_menos_2'] = df_merge['acuerdo_jazmin_omar'] | df_merge['acuerdo_jazmin_mara'] | df_merge['acuerdo_mara_omar']
#df_merge.head()

### Coincidencias de a pares

In [450]:
acuerdo_cols = [col for col in df_merge.columns if 'acuerdo' in col]
acuerdo_dict = {}

for c in acuerdo_cols:
    acuerdo_dict[c] = df_merge[c].value_counts()[True]

In [451]:
#acuerdo_dict

In [452]:
fig = px.histogram(x=acuerdo_dict.keys(), y=acuerdo_dict.values(), title='Coincidencias en el tipo de emoción', width=600, height=500).update_xaxes(categoryorder="total descending")
fig.update_layout(xaxis_title="Acuerdo", yaxis_title="Count")
fig.show()

### Coincidencias por cada tipo de emoción

In [453]:
# listo las emociones

emotion_values = []

for k in psicologos.keys():
    df = df_dict[k]
    #print(k)
    #print(df.emotion.unique())
    for e in df.emotion.unique():
        emotion_values.append(e)
    
# print(emotion_values)
print(set(emotion_values))

{nan, 'Neutro', 'Disgusto', 'Miedo', 'Ira', 'Sorpresa', 'Alegría', 'No_valorable', 'Tristeza'}


In [454]:
#df_dict['Jazmín Flores'][pd.isnull(df_dict['Jazmín Flores']['emotion'])].head()

### Qué son los NaN de arriba?

In [455]:
for e in set(emotion_values):
    try:
        df_acuerdo = df_merge[['url', 'emotion_jazmin', 'emotion_omar', 'emotion_mara']].copy()
        df_acuerdo['acuerdo_jazmin_omar'] = (df_merge['emotion_jazmin']==e) & (df_merge['emotion_jazmin']==df_merge['emotion_omar'])
        df_acuerdo['acuerdo_jazmin_mara'] = (df_merge['emotion_jazmin']==e) & (df_merge['emotion_jazmin']==df_merge['emotion_mara'])
        df_acuerdo['acuerdo_mara_omar'] = (df_merge['emotion_mara']==e) & (df_merge['emotion_mara']==df_merge['emotion_omar'])
        df_acuerdo['acuerdo_al_menos_2'] = df_acuerdo['acuerdo_jazmin_omar'] | df_acuerdo['acuerdo_jazmin_mara'] | df_acuerdo['acuerdo_mara_omar']
        df_acuerdo['acuerdo_todos'] = (df_merge['emotion_jazmin']==e) & ((df_merge['emotion_mara']==df_merge['emotion_omar']) & (df_merge['emotion_mara']==df_merge['emotion_jazmin']))

        acuerdo_cols = [col for col in df_acuerdo.columns if 'acuerdo' in col]
        acuerdo_dict = {}
        for c in acuerdo_cols:
            acuerdo_dict[c] = df_acuerdo[c].value_counts()[True]

        fig = px.histogram(x=acuerdo_dict.keys(), y=acuerdo_dict.values(), title='Coincidencias en el etiquetado - '+e, width=600, height=500).update_xaxes(categoryorder="total descending")
        fig.update_layout(xaxis_title="Acuerdo", yaxis_title="Count")
        fig.show()
    except:
        continue

In [456]:
#df_acuerdo[df_acuerdo['acuerdo_al_menos_2'] == True].head()

## Desacuerdo

In [457]:
#df_acuerdo.info()

In [458]:
df_acuerdo['acuerdo_todos'].value_counts()

False    4512
True        4
Name: acuerdo_todos, dtype: int64

In [459]:
#df_acuerdo.head()

In [460]:
for e in set(emotion_values):
    #print(e)
    try:
        # en este caso hay que quedarse sólo con las filas en las que al menos uno de los 3 eligió la emoción e
        df_tmp = df_merge[['url', 'emotion_jazmin', 'emotion_omar', 'emotion_mara']].copy()
        ind = (df_tmp['emotion_jazmin'] == e) | (df_tmp['emotion_omar'] == e) | (df_tmp['emotion_mara'] == e)
        df_tmp_2 = df_tmp.loc[ind]
        
        df_desacuerdo = pd.DataFrame()
        df_desacuerdo['desacuerdo_jazmin_omar'] = (df_tmp_2['emotion_jazmin']==e) & (df_tmp_2['emotion_jazmin']==df_tmp_2['emotion_omar'])
        df_desacuerdo['desacuerdo_jazmin_mara'] = (df_tmp_2['emotion_jazmin']==e) & (df_tmp_2['emotion_jazmin']==df_tmp_2['emotion_mara'])
        df_desacuerdo['desacuerdo_mara_omar'] = (df_tmp_2['emotion_mara']==e) & (df_tmp_2['emotion_mara']==df_tmp_2['emotion_omar'])
        df_desacuerdo['desacuerdo_todos'] = (df_tmp_2['emotion_jazmin']==e) & ((df_tmp_2['emotion_mara']==df_tmp_2['emotion_omar']) & (df_tmp_2['emotion_mara']==df_tmp_2['emotion_jazmin']))

        desacuerdo_cols = [col for col in df_desacuerdo.columns if 'desacuerdo' in col]
        desacuerdo_dict = {}
        for c in desacuerdo_cols:
            desacuerdo_dict[c] = df_desacuerdo[c].value_counts()[False]

        fig = px.histogram(x=desacuerdo_dict.keys(), y=desacuerdo_dict.values(), title='Desacuerdo en el etiquetado - '+e, width=600, height=500).update_xaxes(categoryorder="total descending")
        fig.update_layout(xaxis_title="Desacuerdo", yaxis_title="Count")
        fig.show()
        
    except:
        continue

## Voting

In [461]:
df_voting = df_merge[['url', 'id_jazmin', 'id_omar', 'id_mara', 'emotion_mara', 'emotion_jazmin', 'emotion_omar', 'channel']]
#df_voting.head()

In [463]:
df_voting['vote'] = df_voting[['emotion_mara', 'emotion_jazmin', 'emotion_omar']].mode(axis=1)[0].copy()
df_voting.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,url,id_jazmin,id_omar,id_mara,emotion_mara,emotion_jazmin,emotion_omar,channel,vote
0,0e79ea1b-07c0-4e16-af94-3214ef65fb6e_000000000...,88080,70014,83564,No_valorable,Sorpresa,Sorpresa,0,Sorpresa
1,fd2b8fe1-a92a-4141-9fe8-9d16e62b288d_000000000...,92281,74215,87765,Alegría,Neutro,Alegría,1,Alegría
2,fd2b8fe1-a92a-4141-9fe8-9d16e62b288d_000000000...,92280,74214,87764,Alegría,Neutro,Alegría,1,Alegría
3,fd2b8fe1-a92a-4141-9fe8-9d16e62b288d_000000000...,92279,74213,87763,Alegría,Neutro,Miedo,1,Alegría
4,fd2b8fe1-a92a-4141-9fe8-9d16e62b288d_000000000...,92278,74212,87762,Disgusto,Miedo,Miedo,0,Miedo


In [464]:
fig = px.histogram(df_voting, x="vote", color="channel", title='Distribución de la votación', width=800, height=600).update_xaxes(categoryorder="total descending")
fig.show()