In [88]:
import os 
import pandas as pd 
import json 
import sys 
import numpy as np 

In [89]:
DATA_PATH = "../../data/trace-crs-chatbot/"
ORIGINAL_EXPORTS_PATH = DATA_PATH + "original_firestore_exports/"

with open(DATA_PATH + "merged/combined_collections_filtered.json") as f:
    data = json.load(f)

In [90]:
def get_user_choice(session):
    try:
        feedback = session['conversations'][0]['feedback_answers']
        for fb in feedback: 
            if fb["q_id"] == 0:
                return fb["answer"]
    except Exception as e:
        return None
    
    return None

In [91]:
def get_recs(session, baseline=False):
    if baseline:
        key = 'baseline_recommendation'
    else: 
        key = 'context_aware_recommendation'

    rec = session['cfe_responses'][0]['context'][key]
    if rec is None:
        return {
            'rec': None,
            'explanation': None
        }
    return {
        'rec': rec['recommendation'], 
        'explanation': rec['explanation']
    }   

In [92]:
def get_shown_rec(session):
    return session['cfe_responses'][0]['recommendation_shown']

In [93]:
recs_combined = []

for session in data:
    try:
        ca_rec = get_recs(session, baseline=False)
        ba_rec = get_recs(session, baseline=True)
        result = {
            'session_id': session['session_id'],
            'user_choice': get_user_choice(session),
            'shown_rec': get_shown_rec(session),
            'context_aware_rec': ca_rec['rec'],
            'context_aware_explanation': ca_rec['explanation'],
            'baseline_rec': ba_rec['rec'],
            'baseline_explanation': ba_rec['explanation']
        }
        recs_combined.append(result)
    except Exception as e:
        print(f"Error processing session {session['session_id']}: {e}")

Error processing session 20260128_101415_869458: 'NoneType' object is not subscriptable
Error processing session 20260122_205440_968054: list index out of range
Error processing session 20260128_222315_849959: list index out of range
Error processing session 20260201_151138_000293: list index out of range
Error processing session 20260123_145540_135839: list index out of range
Error processing session 20260129_084630_223031: list index out of range
Error processing session 20260121_141722_775629: list index out of range
Error processing session 20260129_084835_707171: list index out of range
Error processing session 20260201_151040_801929: list index out of range
Error processing session 20260126_130348_513307: list index out of range
Error processing session 20260123_145454_553257: list index out of range
Error processing session 20260201_152103_549729: list index out of range
Error processing session 20260126_111518_494183: list index out of range
Error processing session 20260126_10

In [94]:
rec_df = pd.DataFrame(recs_combined)

In [95]:
rec_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   session_id                 75 non-null     object
 1   user_choice                53 non-null     object
 2   shown_rec                  75 non-null     object
 3   context_aware_rec          49 non-null     object
 4   context_aware_explanation  49 non-null     object
 5   baseline_rec               56 non-null     object
 6   baseline_explanation       56 non-null     object
dtypes: object(7)
memory usage: 4.2+ KB


In [96]:
rec_df.head()

Unnamed: 0,session_id,user_choice,shown_rec,context_aware_rec,context_aware_explanation,baseline_rec,baseline_explanation
0,20260126_131715_384614,1️⃣ Ljubljana (recommended),Ljubljana,,,,
1,20260130_172226_892099,1️⃣ Graz (recommended),Graz,Graz,"Graz, a UNESCO World Heritage site and City of...",Vienna,"Vienna is a renowned cultural capital, offerin..."
2,20260121_140506_239613,1️⃣ Ibiza (recommended),Ibiza,,,,
3,20260129_182052_049806,,Paris,Krakow,"Krakow offers a vibrant nightlife, excellent b...",Paris,Paris is recommended as a general leisure dest...
4,20260129_125935_136152,2️⃣ Málaga (alternative),Faro,Faro,Faro provides a serene yet engaging winter esc...,Málaga,Málaga offers a vibrant city experience with c...


In [110]:
rec_filter = rec_df[rec_df['user_choice'].notnull() & rec_df['shown_rec'].notnull()]

### in how many sessions is the CA rec the same as the shown rec (from cfe)?

In [104]:
rec_ca = rec_df[rec_df['context_aware_rec'].notnull()]
rec_ca["ca_shown"] = [
    c2 in c1 if pd.notna(c1) and pd.notna(c2) else False
    for c1, c2 in zip(rec_ca["shown_rec"], rec_ca["context_aware_rec"])
]

rec_ca["ca_shown"].value_counts(normalize=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_ca["ca_shown"] = [


ca_shown
True     0.755102
False    0.244898
Name: proportion, dtype: float64

### in those sessions where shown rec was NOT ca rec, how many times does the user still choose the alternative (i.e. ca rec)?

In [105]:
rec_not_ca = rec_ca[~rec_ca['ca_shown']]
rec_not_ca.head()

Unnamed: 0,session_id,user_choice,shown_rec,context_aware_rec,context_aware_explanation,baseline_rec,baseline_explanation,ca_shown
3,20260129_182052_049806,,Paris,Krakow,"Krakow offers a vibrant nightlife, excellent b...",Paris,Paris is recommended as a general leisure dest...,False
8,20260131_104028_599409,,Madrid,Lisbon,Lisbon offers a rich local culture and histori...,Madrid,"Madrid is a popular European capital, known fo...",False
9,20260129_152850_188357,2️⃣ Warsaw (alternative),Prague,Warsaw,"Warsaw offers a dynamic nightlife, a fascinati...",Prague,Prague is a highly popular and renowned Europe...,False
10,20260126_185529_961407,1️⃣ Chamonix (recommended),Chamonix,Innsbruck,Innsbruck provides excellent multi-day hiking ...,Chamonix,Chamonix is a premier destination for challeng...,False
27,20260121_141441_436190,1️⃣ Madrid (recommended),Madrid,Seville,Seville provides a rich cultural experience wi...,Madrid,Madrid is a prime choice for a luxury cultural...,False


In [106]:
rec_not_ca["ca_chosen"] = [
    c2 in c1 if pd.notna(c1) and pd.notna(c2) else False
    for c1, c2 in zip(rec_not_ca["user_choice"], rec_not_ca["context_aware_rec"])
]
rec_not_ca["ca_chosen"].value_counts(normalize=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_not_ca["ca_chosen"] = [


ca_chosen
False    0.833333
True     0.166667
Name: proportion, dtype: float64

In [109]:
rec_filter.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53 entries, 0 to 74
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   session_id                 53 non-null     object
 1   user_choice                53 non-null     object
 2   shown_rec                  53 non-null     object
 3   context_aware_rec          34 non-null     object
 4   context_aware_explanation  34 non-null     object
 5   baseline_rec               39 non-null     object
 6   baseline_explanation       39 non-null     object
 7   ca_shown                   53 non-null     bool  
dtypes: bool(1), object(7)
memory usage: 3.4+ KB


In [75]:
rec_ca_none = rec_filter[rec_filter['context_aware_rec'].isnull()]
rec_ca_none[rec_ca_none['baseline_rec'].notnull()].head()

Unnamed: 0,session_id,user_choice,shown_rec,context_aware_rec,context_aware_explanation,baseline_rec,baseline_explanation
32,20260126_183757_064706,2️⃣ Barcelona (alternative),Krakow,,,Barcelona,Barcelona is a vibrant city known for its stun...
40,20260126_185242_275306,1️⃣ Valencia (recommended),Valencia,,,Paris,"Paris is a timeless choice for any traveler, o..."
50,20260121_145705_751910,1️⃣ Copenhagen (recommended),Copenhagen,,,Paris,As no specific travel preferences were provide...


In [111]:
rec_filter["ca_chosen"] = [
    c2 in c1 if pd.notna(c1) and pd.notna(c2) else False
    for c1, c2 in zip(rec_filter["user_choice"], rec_filter["context_aware_rec"])
]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rec_filter["ca_chosen"] = [


In [112]:
rec_filter[(rec_filter['context_aware_rec'].notnull())]['ca_chosen'].value_counts(normalize=True)

ca_chosen
True     0.617647
False    0.382353
Name: proportion, dtype: float64

In [113]:
ca_chosen_sessions = rec_filter[(rec_filter['context_aware_rec'].notnull()) & (rec_filter['ca_chosen'] == True)]['session_id'].tolist()

In [114]:
len(ca_chosen_sessions)

21

## get q3 answers for ca chosen sessions

In [115]:
q3_ans_dist = []
for session in data:
    if session['session_id'] not in ca_chosen_sessions:
        continue
    try:
        feedback = session['conversations'][0]['feedback_answers']
        for fb in feedback: 
            if fb["q_id"] == 3:
                q3_ans_dist.append(fb["answer"])
    except Exception as e:
        continue

In [118]:
q3_series = pd.Series(q3_ans_dist)
q3_series.value_counts(normalize=True)

⚪⚪⚪⚪⚪ Not at all      0.266667
⚫⚪⚪⚪⚪ Slightly        0.266667
⚫⚫⚪⚪⚪ Moderately      0.266667
⚫⚫⚫⚫⚪ Strongly        0.133333
⭐️⭐️⭐️⭐️ Very well    0.066667
Name: proportion, dtype: float64

In [120]:
b_chosen_sessions = rec_filter[(rec_filter['baseline_rec'].notnull()) & (rec_filter['ca_chosen'] == False)]['session_id'].tolist()

In [121]:
q3_ans_dist_b = []
for session in data:
    if session['session_id'] not in b_chosen_sessions:
        continue
    try:
        feedback = session['conversations'][0]['feedback_answers']
        for fb in feedback: 
            if fb["q_id"] == 3:
                q3_ans_dist_b.append(fb["answer"])
    except Exception as e:
        continue

In [122]:
q3_series_b = pd.Series(q3_ans_dist_b)
q3_series_b.value_counts(normalize=True)

⚫⚫⚫⚫⚪ Strongly        0.333333
⚫⚫⚪⚪⚪ Moderately      0.200000
⚪⚪⚪⚪⚪ Not at all      0.200000
⭐️ Not at all         0.066667
⭐️⭐️⭐️ Moderately     0.066667
⚫⚪⚪⚪⚪ Slightly        0.066667
⭐️⭐️⭐️⭐️ Very well    0.066667
Name: proportion, dtype: float64

### get q4 feedback

In [82]:
q4_ans_dist = []
for session in data:
    try:
        feedback = session['conversations'][0]['feedback_answers']
        for fb in feedback: 
            if fb["q_id"] == 4:
                q4_ans_dist.append({
                    "session_id": session['session_id'],
                    "answer": fb["answer"]})
    except Exception as e:
        continue

In [83]:
q4_ans_dist

[{'session_id': '20260126_153942_188662',
  'answer': "I really liked the experience, especially the clarifying questions, which are a great way to narrow down the search window (I wouldn't mind if it asked more questions). I also really liked that it gave the user profile it works with to the user. \nWhat I noticed is that it asked how flexible I am time-wise, but didn't give a time window with the suggestion when I said I was. Also it creates a new user profile for every new query (and asks the same questions/or similar). Also it could also ask questions about what i dont like about travel/ a specific location f.e. crowds, loud nightlife"},
 {'session_id': '20260126_184758_853057', 'answer': 'skipped'},
 {'session_id': '20260127_154111_084888',
  'answer': 'maybe give some numerical data about, for example, costs for accomodation and travel, or like the mean cost of restaurants.'},
 {'session_id': '20260128_101415_869458', 'answer': 'skipped'},
 {'session_id': '20260129_095947_694631

## get system latency

In [86]:
time_taken = []
for session in data:
    try: 
        cfe_response = session['cfe_responses'][0]
        time_taken_secs = cfe_response['time_taken_seconds']
        time_taken.append({
            "session_id": session['session_id'],
            "time_taken_seconds": time_taken_secs
        })
    except Exception as e:
        print("Error processing session {}: {}".format(session['session_id'], e))
        continue

Error processing session 20260126_105915_107334: list index out of range
Error processing session 20260129_083820_369217: list index out of range
Error processing session 20260121_141722_775629: list index out of range
Error processing session 20260129_084811_785941: list index out of range
Error processing session 20260129_084835_707171: list index out of range
Error processing session 20260121_141658_025445: list index out of range
Error processing session 20260121_134503_724052: list index out of range
Error processing session 20260126_111518_494183: list index out of range
Error processing session 20260122_205440_968054: list index out of range
Error processing session 20260129_084630_223031: list index out of range
Error processing session 20260126_132350_879189: list index out of range
Error processing session 20260123_191827_085163: list index out of range
Error processing session 20260126_130348_513307: list index out of range
Error processing session 20260126_071701_332095: li

In [87]:
tts = pd.DataFrame(time_taken)
tts['time_taken_seconds'].describe()

count    52.000000
mean     22.774992
std       5.554101
min      12.565411
25%      19.112859
50%      21.350383
75%      25.332189
max      37.596892
Name: time_taken_seconds, dtype: float64