In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Generating Plots for Final Report

- Llama-3.1-70b  Recommendations vs. Content-Based Filtering
- Content-Based Filtering vs Collaborative Filtering
- Collaborative-Based Filtering vs Hybrid Filtering


For each comparison mentioned above, include charts for Hit Rate, Mean Average Precision, and Normalized Discounted Cumulative Gain charts*

In [114]:
def create_bar_chart_plotly(df, x, y, color, title, labels, category_orders=None, subplot_titles=None):
    df[x] = df[x].astype(str)

    if isinstance(y, list):
        fig = make_subplots(1, len(y), subplot_titles=subplot_titles)
        
        for idx, metric in enumerate(y):
            temp_fig = px.bar(
                df,
                x=x,
                y=metric,
                color=color,
                barmode="group",
                # title=title,
                labels=labels,
                category_orders=category_orders,
            )
            for trace in temp_fig.data:
                trace.legendgroup = trace.name
                trace.showlegend = True if idx==0 else False
                fig.append_trace(
                    trace,
                    row=1, col=idx+1,
                )
        
        fig.update_xaxes(tickvals=sorted(df[x].unique()))
        fig.update_layout(
            title_text=title,
            legend=dict(orientation='h', yanchor='bottom', xanchor='left', y=-0.3)
            # showlegend=False
        )

    else:
        fig = px.bar(
            df,
            x=x,
            y=y,
            color=color,
            barmode="group",
            title=title,
            labels=labels,
            category_orders=category_orders
        )

        fig.update_xaxes(tickvals=sorted(df[x].unique()))
        fig.update_layout(showlegend=False)
        
    fig.show()

### Vanilla LLM Recommendations vs. Content-Based Filtering

In [187]:
df = pd.read_csv("offline_eval_results.csv")
llm_df = pd.read_csv("llm_offline_eval_results.csv")

df = pd.concat([df, llm_df]).reset_index(drop=True)

df = df.sort_values(by=['k', 'name'], ascending=[True, True])

# dropping Image Summary (Tone) rows
df = df[df['name']!="Image Summary (Tone)"]

df.head(5)


Unnamed: 0,name,k,hit_rate,mean_avg_prec,ndcg
8,Image Summary (Keywords),1,0.180213,0.180213,0.0
20,Llama-3.1 Recommendations,1,0.27785,0.27785,0.0
0,Normal,1,0.165152,0.165152,0.0
12,Screenshot Summary (Keywords),1,0.185926,0.185926,0.0
16,Screenshot and Header Image Keywords,1,0.203064,0.203064,0.0


In [188]:
create_bar_chart_plotly(
    df, 
    'k', 
    ['hit_rate', 'mean_avg_prec', 'ndcg'], 
    'name', 
    title="Llama-3.1-70b vs. Content-based Recommenders", 
    labels={"k": "k", "hit_rate": "Hit Rate", "name": "Method"},
    category_orders={
        'name': [
            'Llama-3.1 Recommendations',
            'Normal',
            'Image Summary (Keywords)',
            'Screenshot Summary (Keywords)',
            'Screenshot and Header Image Keywords',
        ]
    },
    subplot_titles=['HitRate@k', 'MAP@k', 'NDCG@k']
)

### Content-Based Filtering vs Collaborative Filtering

In [189]:
collab_filtering_df = pd.read_csv("collaborative_offline_eval_results.csv")
collab_filtering_df

# combine results from collaborative filtering with results from content-based filtering
cbf_df = pd.read_csv("offline_eval_results.csv")
cbf_df = cbf_df[cbf_df['name']=='Screenshot and Header Image Keywords']

cf_cbf_df = pd.concat([collab_filtering_df, cbf_df])
cf_cbf_df

Unnamed: 0,name,k,hit_rate,mean_avg_prec,ndcg
0,Memory-Based Collaborative Filtering,1,0.641392,0.641392,0.0
1,Memory-Based Collaborative Filtering,5,0.89665,0.490548,0.767332
2,Memory-Based Collaborative Filtering,10,0.95248,0.40715,0.763806
3,Memory-Based Collaborative Filtering,20,0.977668,0.345812,0.74482
16,Screenshot and Header Image Keywords,1,0.203064,0.203064,0.0
17,Screenshot and Header Image Keywords,5,0.502207,0.155596,0.375539
18,Screenshot and Header Image Keywords,10,0.654375,0.131135,0.422058
19,Screenshot and Header Image Keywords,20,0.776681,0.106037,0.443664


In [190]:
create_bar_chart_plotly(
    cf_cbf_df, 
    'k', 
    ['hit_rate', 'mean_avg_prec', 'ndcg'], 
    'name', 
    title="Enhanced Content-Based Recommender vs. Collaborative Recommender", 
    labels={"k": "k", "hit_rate": "Hit Rate", "name": "Method"},
    category_orders={
        'name': [
            'Memory-Based Collaborative Filtering',
            'Screenshot and Header Image Keywords',
        ]
    },
    subplot_titles=['HitRate@k', 'MAP@k', 'NDCG@k']
)

### Collaborative-Based Filtering vs Hybrid Filtering

In [191]:
# hybrid_filtering_df = pd.read_csv("hybrid_hyperparameter_tuning_20250729_014354.csv")
# hybrid_filtering_df_v2 = pd.read_csv("hybrid_hyperparameter_tuning_20250729_155537.csv")
# hybrid_filtering_df = pd.concat([hybrid_filtering_df, hybrid_filtering_df_v2])

# def get_method_name(row):
#     return f"Weight={row['weight']} | Num_Train_Examples={row['num_train_examples']}"
# hybrid_filtering_df['method'] = hybrid_filtering_df[['weight', 'num_train_examples']].apply(get_method_name, axis=1)

# temp_hybrid_filtering_df = hybrid_filtering_df.rename(columns={'method': 'name'})
# temp_hybrid_filtering_df = temp_hybrid_filtering_df[['name', 'k', 'hit_rate', 'mean_avg_prec', 'ndcg']]
# temp_hybrid_filtering_df = temp_hybrid_filtering_df[temp_hybrid_filtering_df['name']=="Weight=0.999 | Num_Train_Examples=nan"]

temp_hybrid_filtering_df = pd.read_csv("switching_hybrid_model_results.csv")
temp_hybrid_filtering_df['name'] = "Switching Hyrbid Model"

combined_df = pd.concat([collab_filtering_df, temp_hybrid_filtering_df])
combined_df

Unnamed: 0,name,k,hit_rate,mean_avg_prec,ndcg,game_threshold,num_train_examples,mrr
0,Memory-Based Collaborative Filtering,1,0.641392,0.641392,0.0,,,
1,Memory-Based Collaborative Filtering,5,0.89665,0.490548,0.767332,,,
2,Memory-Based Collaborative Filtering,10,0.95248,0.40715,0.763806,,,
3,Memory-Based Collaborative Filtering,20,0.977668,0.345812,0.74482,,,
0,Switching Hyrbid Model,1,0.641392,0.641392,0.0,5.0,,0.641392
1,Switching Hyrbid Model,5,0.89665,0.490574,0.767351,5.0,,0.742708
2,Switching Hyrbid Model,10,0.95248,0.407141,0.763806,5.0,,0.750434
3,Switching Hyrbid Model,20,0.977668,0.345803,0.74482,5.0,,0.75229


In [192]:
create_bar_chart_plotly(
    combined_df, 
    'k', 
    ['hit_rate', 'mean_avg_prec', 'ndcg'], 
    'name', 
    title="Collaborative Recommender vs Hybrid Recommender", 
    labels={"k": "k", "hit_rate": "Hit Rate", "name": "Method"},
    subplot_titles=['HitRate@k', 'MAP@k', 'NDCG@k']
)

In [193]:
create_bar_chart_plotly(
    combined_df[combined_df['k']=='5'], 
    'k', 
    ['hit_rate', 'mean_avg_prec', 'ndcg'], 
    'name', 
    title="Collaborative Recommender vs Hybrid Recommender", 
    labels={"k": "k", "hit_rate": "Hit Rate", "name": "Method"},
    subplot_titles=['HitRate@k', 'MAP@k', 'NDCG@k']
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Find Cases Where Hyrbid Outperforms Model-Based

In [None]:
full_casc_results_df = pd.read_csv("switching_hybrid_model_full_results.csv")
top_5_casc_rec_df = full_casc_results_df[full_casc_results_df['k']==5]

full_collab_results_df = pd.read_csv("collaborative_offline_eval_results_full.csv")
top_5_collab_rec_df = full_collab_results_df[full_collab_results_df['k']==5]

combined_recs_df = top_5_casc_rec_df.merge(top_5_collab_rec_df, on='userid', how='inner', suffixes=["_casc", "_collab"])
temp_df = combined_recs_df[combined_recs_df['NDCG@k_casc']>combined_recs_df['NDCG@k_collab']]

user = temp_df['userid'].values[0]

casc_df = temp_df[['k_casc', 'precision@k_casc', 'recall@k_casc', 'NDCG@k_casc']]
casc_df.columns = ['k', 'precision@5', 'recall@5', 'NDCG@5']
casc_df['name'] = "Switch Hybrid Model"

collab_df = temp_df[['k_collab', 'precision@k_collab', 'recall@k_collab', 'NDCG@k_collab']]
collab_df.columns = ['k', 'precision@5', 'recall@5', 'NDCG@5']
collab_df['name'] = "Collaborative Filtering Model"

graph_df = pd.concat([collab_df, casc_df])

create_bar_chart_plotly(
    graph_df,
    'k', 
    ['precision@5', 'recall@5', 'NDCG@5'], 
    'name', 
    title=f"Recommendation Results For User: {user}", 
    labels={"k": "k", "hit_rate": "Hit Rate", "name": "Method"},
    subplot_titles=['precision@5', 'recall@5', 'NDCG@5']
)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

