# Libraries

In [1]:
import json

# User path config

In [2]:
# Location of the JSON file that the Processing section reads with json.load.
# The path is relative, so it is resolved against the kernel's working directory.
DATA_PATH = "data/output/amazon-reviews-data-with-response-phi3-prompt.json"

# Functions

In [3]:
def evaluate_metrics(output, model_response, k=10):
    """
    Score one recommendation against its ground truth with precision@k, recall@k, and reciprocal rank.

    Both inputs are comma-separated strings. Only tokens containing "item_" are
    treated as items; everything else (for example '<|endoftext|>' padding) is ignored.

    Args:
        output (str): The ground truth output containing the actual items (comma-separated string).
        model_response (str): The predicted items by the model (comma-separated string).
        k (int): The number of top items to consider for precision@k and recall@k.

    Returns:
        dict: A dictionary with the keys "precision@{k}", "recall@{k}" and "mrr".
            "mrr" holds the reciprocal rank of the first correct prediction for
            this single sample (0.0 when nothing matches); averaging it over many
            samples gives the mean reciprocal rank.
            Every metric is 0.0 when its denominator would be zero (no usable
            predictions, no ground-truth items, or k <= 0).
    """
    # Parse the output and model_response strings into lists of item tokens.
    actual_items = [item.strip() for item in output.split(",") if "item_" in item]
    # Only the first k item tokens of the response are considered.
    predicted_items = [item.strip() for item in model_response.split(",") if "item_" in item][:k]

    # Remove duplicates in predictions while keeping rank order. Note that the
    # truncation above happens first, so repeated items still use up top-k slots.
    predicted_items = list(dict.fromkeys(predicted_items))

    # Relevant items are treated as a set: a ground truth that repeats an item
    # must not make a perfect prediction look incomplete.
    relevant_items = set(actual_items)
    true_positives = relevant_items & set(predicted_items)

    # precision@k: share of the considered predictions that are correct.
    # Guard against a response without any item token (or k <= 0), which would
    # otherwise divide by zero.
    considered = min(len(predicted_items), k)
    precision_at_k = len(true_positives) / considered if considered > 0 else 0.0

    # recall@k: share of the distinct relevant items found in the top-k.
    recall_at_k = len(true_positives) / len(relevant_items) if relevant_items else 0.0

    # Reciprocal rank of the first correct prediction (ranks start at 1).
    mrr = 0.0
    for rank, predicted_item in enumerate(predicted_items, 1):
        if predicted_item in relevant_items:
            mrr = 1 / rank
            break

    # Compile metrics into a dictionary
    metrics = {
        "precision@{}".format(k): precision_at_k,
        "recall@{}".format(k): recall_at_k,
        "mrr": mrr
    }

    return metrics

In [4]:
def evaluate_metrics_on_list(data_list, k=10):
    """
    Average precision@k, recall@k, and MRR over a list of records.

    Args:
        data_list (list): A list of dictionaries containing 'output' and 'model_response'.
        k (int): The number of top items to consider for precision@k, recall@k, etc.

    Returns:
        dict: Average "precision@{k}", "recall@{k}" and "mrr" over all records.
            For an empty data_list every metric is 0.0 instead of raising
            ZeroDivisionError.
    """
    precision_key = f"precision@{k}"
    recall_key = f"recall@{k}"

    num_samples = len(data_list)
    if num_samples == 0:
        # Nothing to average; report zeros rather than dividing by zero.
        return {precision_key: 0.0, recall_key: 0.0, "mrr": 0.0}

    precision_sum = 0
    recall_sum = 0
    mrr_sum = 0

    for data in data_list:
        # Calculate metrics for each entry
        metrics = evaluate_metrics(data["output"], data["model_response"], k)

        # Accumulate the metrics
        precision_sum += metrics[precision_key]
        recall_sum += metrics[recall_key]
        mrr_sum += metrics["mrr"]

    # Calculate the average metrics over all samples
    avg_metrics = {
        precision_key: precision_sum / num_samples,
        recall_key: recall_sum / num_samples,
        "mrr": mrr_sum / num_samples
    }

    return avg_metrics


In [5]:
def filter_endoftext_items(data_list):
    """
    Drop the records whose 'output' consists solely of '<|endoftext|>' tokens.

    Args:
        data_list (list): Dictionaries that each carry a comma-separated 'output' string.

    Returns:
        list: A new list holding the same record objects, in their original
            order, for which at least one comma-separated token (after
            stripping whitespace) differs from '<|endoftext|>'.
    """
    def _is_only_padding(record):
        # True when every stripped token of the output equals the end-of-text marker.
        tokens = (piece.strip() for piece in record['output'].split(','))
        return all(token == '<|endoftext|>' for token in tokens)

    return [record for record in data_list if not _is_only_padding(record)]

# Processing

In [6]:
# Open and read the JSON file.
# The encoding is pinned to UTF-8 so the load does not depend on the
# platform's default locale encoding.
with open(DATA_PATH, 'r', encoding='utf-8') as f:
    data_list = json.load(f)

In [7]:
# Keep only the records whose 'output' is not made up entirely of '<|endoftext|>' tokens.
data_list_with_atleast_one_item_output = filter_endoftext_items(data_list)

In [8]:
# Report how many records exist before and after the end-of-text filter.
print("Original test data:", len(data_list))
print("Filtered test data with at least one output item:", len(data_list_with_atleast_one_item_output))

Original test data: 627
Filtered test data with at least one output item: 230


### All test data

In [9]:
# Evaluate the metrics on every loaded record.
# Records whose 'output' has no "item_" token score 0 on all three metrics,
# so they pull these averages down.
avg_metrics = evaluate_metrics_on_list(data_list, k=10)
print("Average Metrics:", avg_metrics)

Average Metrics: {'precision@10': 0.008040302777144884, 'recall@10': 0.01967038809144072, 'mrr': 0.015151515151515152}


### Exclude records without output items from test data
Exclude records whose output does not contain any item (that is, the output consists only of `<|endoftext|>` tokens).

In [10]:
# Evaluate the metrics on the filtered records only.
# NOTE(review): this rebinds avg_metrics, so the value computed for the full
# data set is no longer available after this cell runs.
avg_metrics = evaluate_metrics_on_list(data_list_with_atleast_one_item_output, k=10)
print("Average Metrics:", avg_metrics)

Average Metrics: {'precision@10': 0.02191856452726018, 'recall@10': 0.0536231884057971, 'mrr': 0.041304347826086954}


In [11]:
# Display one filtered record for manual inspection.
# NOTE(review): the index 78 is hard-coded; this raises IndexError if the
# filtered list has fewer than 79 entries.
data_list_with_atleast_one_item_output[78]

{'input': '<|user_AFIGEAZMENNV67CZRJH5P66MNIWA|>',
 'output': '<|item_B07PJ8VDSP|>, <|endoftext|>, <|endoftext|>, <|endoftext|>, <|endoftext|>, <|endoftext|>, <|endoftext|>, <|endoftext|>, <|endoftext|>, <|endoftext|>',
 'instruction': "Given a user purchased an an item with the following details, predict the next 10 items the user would  purchase. Item id is <|item_B07TX13LTP|>. Rating of the item by user from 1 to 5 is 1.0. Text of the user review is Short life span . dont borther. Main category of the item is Appliances. Item name is DIKOO WR51X10101 Heater Harness Defrost Assembly Compatible for General Electric Refrigerators Replaces AP4355467, 1399613,EA1993872, WR51X10053. Price USD is 13.29. Item details is brand name is: DIKOO. model info is: WR51X10101DIK. item weight is: 7.8 ounces. package dimensions is: 9.96 x 3.98 x 3.43 inches. item model number is: WR51X10101DIK. part number is: WR51X10101DIK. form factor is: Compact. batteries included? is: No. batteries required? is: 