In [1]:
import pandas as pd

In [2]:
def evaluar_recomendaciones(ruta_recomendaciones, ruta_playlist, ruta_test_data, ruta_train_data):
    # Leer recomendaciones desde archivo JSON
    recomendaciones = pd.read_json(ruta_recomendaciones).T
    recomendaciones['pid'] = recomendaciones.index 

    # Leer y agrupar los datos del playlist
    playlist = pd.read_csv(ruta_playlist)
    playlist = playlist.groupby('pid').agg(list)

    # Leer datos de entrenamiento y prueba
    test_data = pd.read_csv(ruta_test_data)
    train_data = pd.read_csv(ruta_train_data)

    # Agrupar los datos por 'pid'
    playlist_train = train_data.groupby('pid').agg(list)
    playlist_test = test_data.groupby('pid').agg(list)

    # Fusionar las recomendaciones con los datos de prueba
    merged = pd.merge(recomendaciones, playlist_test, on='pid')
    merged.drop(columns=['pos'], inplace=True)

    # Calcular las recomendaciones acertadas
    merged['recomendaciones_acertadas@10'] = merged.apply(lambda x: [i for i in x['top_10'] if i in x['tid']], axis=1)
    merged['recomendaciones_acertadas@20'] = merged.apply(lambda x: [i for i in x['top_20'] if i in x['tid']], axis=1)
    
    return merged

ruta_playlist = "data\\playlist.csv"
ruta_test_data = "data\\test_data.csv"
ruta_train_data = "data\\train_data.csv"

In [3]:
r_models = {'ann' : 'data\\recommendations_als.json','lsh' : 'data\\recommendations_lsh.json','als' : 'data\\recommendations_lsh.json', 'random' : 'data\\recommendations_random.json'}
df_recomendaciones = {}
for modelos in r_models.keys():
    tops = evaluar_recomendaciones(r_models[modelos], ruta_playlist, ruta_test_data, ruta_train_data)
    df_recomendaciones[modelos] = tops

In [4]:
df_recomendaciones.keys()
df_recomendaciones['ann']

Unnamed: 0,top_10,top_20,pid,tid,recomendaciones_acertadas@10,recomendaciones_acertadas@20
0,"[4158, 7457, 128, 7357, 8884, 5173, 6066, 4931...","[4158, 7457, 128, 7357, 8884, 5173, 6066, 4931...",0,"[22, 34, 20, 30, 26, 35, 18, 49, 31, 29]",[],[]
1,"[5974, 7357, 4380, 824, 4177, 5939, 286, 3236,...","[5974, 7357, 4380, 824, 4177, 5939, 286, 3236,...",1,"[84, 74, 64, 53, 77, 76, 57, 86]",[],[]
2,"[5974, 7357, 5173, 7457, 1406, 6066, 4158, 323...","[5974, 7357, 5173, 7457, 1406, 6066, 4158, 323...",2,"[148, 146, 107, 108, 150, 132, 93, 115, 94, 11...",[],[]
3,"[7457, 5173, 2879, 6066, 1871, 4158, 9921, 128...","[7457, 5173, 2879, 6066, 1871, 4158, 9921, 128...",3,"[200, 203, 216, 202, 237, 277, 232, 161, 265, ...",[],[]
4,"[1871, 5053, 128, 7215, 594, 3097, 9921, 5173,...","[1871, 5053, 128, 7215, 594, 3097, 9921, 5173,...",4,"[280, 282, 281]",[],[]
...,...,...,...,...,...,...
9995,"[2470, 1418, 1848, 2787, 853, 4361, 5191, 235,...","[2470, 1418, 1848, 2787, 853, 4361, 5191, 235,...",106995,"[2131, 169633, 3084, 169631, 11192, 7208, 1763...",[],[]
9996,"[2470, 1418, 1848, 2787, 853, 4361, 5191, 235,...","[2470, 1418, 1848, 2787, 853, 4361, 5191, 235,...",106996,"[19773, 4697, 169637, 6663, 36628, 57389, 1977...",[],[]
9997,"[2470, 1418, 1848, 2787, 853, 4361, 5191, 235,...","[2470, 1418, 1848, 2787, 853, 4361, 5191, 235,...",106997,"[1613, 169646, 169649]",[],[]
9998,"[2470, 1418, 1848, 2787, 853, 4361, 5191, 235,...","[2470, 1418, 1848, 2787, 853, 4361, 5191, 235,...",106998,"[77504, 4821, 5982, 3606, 1333, 38799, 12931, ...",[],[]


# Aciertos dentro de recomendaciones correctas

In [6]:
def view_aciertos(df):
    #filtra los aciertos, es decir, listas con largo mayor a 0
    top_10 = df.loc[df['recomendaciones_acertadas@10'].apply(len) > 0]
    top_20 = df.loc[df['recomendaciones_acertadas@20'].apply(len) > 0]

    return top_10, top_20

In [7]:
top_10_als,top_20_als = view_aciertos(df_recomendaciones['als'])
top_10_ann,top_20_ann = view_aciertos(df_recomendaciones['ann'])
top_10_lsh,top_20_lsh = view_aciertos(df_recomendaciones['lsh'])
top_10_random,top_20_random = view_aciertos(df_recomendaciones['random'])

top_10_ann.drop(columns=['recomendaciones_acertadas@20'], inplace=True)

top_20_ann.drop(columns=['recomendaciones_acertadas@10'], inplace=True)

top_10_lsh.drop(columns=['recomendaciones_acertadas@20'], inplace=True)

top_20_lsh.drop(columns=['recomendaciones_acertadas@10'], inplace=True)

top_10_als.drop(columns=['recomendaciones_acertadas@20'], inplace=True)

top_20_als.drop(columns=['recomendaciones_acertadas@10'], inplace=True)

top_10_random.drop(columns=['recomendaciones_acertadas@20'], inplace=True)

top_20_random.drop(columns=['recomendaciones_acertadas@10'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_10_ann.drop(columns=['recomendaciones_acertadas@20'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_20_ann.drop(columns=['recomendaciones_acertadas@10'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_10_lsh.drop(columns=['recomendaciones_acertadas@20'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide

### Se tiene que dropear la columna que no corresponda al top de recomendaciones

In [8]:
def calculo_aciertos(df):
    if 'recomendaciones_acertadas@10' in df.columns:
        df['porcentaje_aciertos@10'] = df.apply(lambda row: len(row['recomendaciones_acertadas@10']) / len(row['tid']) * 100, axis=1)
    if 'recomendaciones_acertadas@20' in df.columns:
        df['porcentaje_aciertos@20'] = df.apply(lambda row: len(row['recomendaciones_acertadas@20']) / len(row['tid']) * 100, axis=1)

# Aplicar la función a los DataFrames de recomendaciones
calculo_aciertos(top_10_als)
calculo_aciertos(top_20_als)
calculo_aciertos(top_10_ann)
calculo_aciertos(top_20_ann)
calculo_aciertos(top_10_lsh)
calculo_aciertos(top_20_lsh)
calculo_aciertos(top_10_random)
calculo_aciertos(top_20_random)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['porcentaje_aciertos@10'] = df.apply(lambda row: len(row['recomendaciones_acertadas@10']) / len(row['tid']) * 100, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['porcentaje_aciertos@20'] = df.apply(lambda row: len(row['recomendaciones_acertadas@20']) / len(row['tid']) * 100, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/

In [9]:
als = {'top_10': top_10_als, 'top_20': top_20_als}
ann = {'top_10': top_10_ann, 'top_20': top_20_ann}
lsh = {'top_10': top_10_lsh, 'top_20': top_20_lsh}
random = {'top_10': top_10_random, 'top_20': top_20_random}

In [10]:
top_20_ann['porcentaje_aciertos@20'].mean()

10.097851891596434

In [11]:
aciertos_in = {}

In [12]:
for i,model in zip(['als','ann','lsh', 'random'],[als,ann,lsh,random]): 
    for key in model.keys():
        df = model[key]
        # calculemos la media de los porcentajes de aciertos
        if 'porcentaje_aciertos@10' in df.columns:
            x = 'porcentaje_aciertos@10'
            print(f"Porcentaje de aciertos @10 para {key} es: {round(df['porcentaje_aciertos@10'].mean(),4)}%")
        if 'porcentaje_aciertos@20' in df.columns:
            x = 'porcentaje_aciertos@20'
            print(f"Porcentaje de aciertos @20 para {key} es: {round(df['porcentaje_aciertos@20'].mean(),3)}%")
        print("\n")

        if i not in aciertos_in:
            aciertos_in[i] = {}
        aciertos_in[i][key] = df[x].mean()

Porcentaje de aciertos @10 para top_10 es: 9.3348%


Porcentaje de aciertos @20 para top_20 es: 9.323%


Porcentaje de aciertos @10 para top_10 es: 8.5921%


Porcentaje de aciertos @20 para top_20 es: 10.098%


Porcentaje de aciertos @10 para top_10 es: 9.3348%


Porcentaje de aciertos @20 para top_20 es: 9.323%


Porcentaje de aciertos @10 para top_10 es: 9.0272%


Porcentaje de aciertos @20 para top_20 es: 8.317%




In [13]:
aciertos_in

{'als': {'top_10': 9.334752747252747, 'top_20': 9.323358189429616},
 'ann': {'top_10': 8.592110771132663, 'top_20': 10.097851891596434},
 'lsh': {'top_10': 9.334752747252747, 'top_20': 9.323358189429616},
 'random': {'top_10': 9.027160704792283, 'top_20': 8.317446659508853}}

# Playlist en las que al menos se acerto una vez

In [15]:
total_pid = 10000

per_pid_aciertos = {}

for i,model in zip(['als','ann','lsh', 'random'],[als,ann,lsh,random]): 
    for key in model.keys():
        df = model[key]
        largo = df.shape[0]
        print(f"El total de playlist en {key} es: {largo}")
        if i not in per_pid_aciertos:
            per_pid_aciertos[i] = {}
        per_pid_aciertos[i][key] = largo/total_pid

El total de playlist en top_10 es: 10
El total de playlist en top_20 es: 14
El total de playlist en top_10 es: 54
El total de playlist en top_20 es: 106
El total de playlist en top_10 es: 10
El total de playlist en top_20 es: 14
El total de playlist en top_10 es: 6
El total de playlist en top_20 es: 13


In [16]:
per_pid_aciertos

{'als': {'top_10': 0.001, 'top_20': 0.0014},
 'ann': {'top_10': 0.0054, 'top_20': 0.0106},
 'lsh': {'top_10': 0.001, 'top_20': 0.0014},
 'random': {'top_10': 0.0006, 'top_20': 0.0013}}

In [19]:
# Función para calcular las métricas para una fila dada
def calcular_metricas_fila(row, k):
    # Definir la columna de recomendaciones acertadas
    columna_acertadas = f'recomendaciones_acertadas@{k}'
    # Obtener el número de recomendaciones acertadas
    num_acertadas = len(row[columna_acertadas])
    # Obtener el número total de recomendaciones y de canciones reales
    num_recomendaciones = min(len(row[f'top_{k}']), k)
    num_reales = len(row['tid'])
    
    # Calcular precisión y recuperación
    precision = num_acertadas / num_recomendaciones if num_recomendaciones > 0 else 0
    recall = num_acertadas / num_reales if num_reales > 0 else 0

    # Calcular F1-score
    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0

    return precision, recall, f1_score

# Función para aplicar las métricas a todo el DataFrame
def calcular_metricas(df, k):
    precision_list = []
    recall_list = []
    f1_score_list = []
    
    for _, row in df.iterrows():
        precision, recall, f1_score = calcular_metricas_fila(row, k)
        precision_list.append(precision)
        recall_list.append(recall)
        f1_score_list.append(f1_score)
    
    # Calcular métricas promedio
    precision_promedio = sum(precision_list) / len(precision_list)
    recall_promedio = sum(recall_list) / len(recall_list)
    f1_score_promedio = sum(f1_score_list) / len(f1_score_list)
    
    return precision_promedio, recall_promedio, f1_score_promedio

# Calcular métricas para top 10 y top 20
metricas = {}
for i,model in zip(['als','ann','lsh','random'],[als,ann,lsh,random]):
    df = df_recomendaciones[i]
    precision_10, recall_10, f1_score_10 = calcular_metricas(df, 10)
    precision_20, recall_20, f1_score_20 = calcular_metricas(df, 20)

    if i not in metricas:
        metricas[i] = {}
    metricas[i]['precision_10'] = precision_10
    metricas[i]['recall_10'] = recall_10
    metricas[i]['f1_score_10'] = f1_score_10
    metricas[i]['precision_20'] = precision_20
    metricas[i]['recall_20'] = recall_20
    metricas[i]['f1_score_20'] = f1_score_20
    

# Mostrar resultados
print(f"Top 10 - Precisión: {precision_10:.4f}, Recuperación: {recall_10:.4f}, F1-Score: {f1_score_10:.4f}")
print(f"Top 20 - Precisión: {precision_20:.4f}, Recuperación: {recall_20:.4f}, F1-Score: {f1_score_20:.4f}")

Top 10 - Precisión: 0.0001, Recuperación: 0.0001, F1-Score: 0.0001
Top 20 - Precisión: 0.0001, Recuperación: 0.0001, F1-Score: 0.0001
