## Create the sample input file for CollabREC experiments

In [1]:
import pandas as pd 
import numpy as np 
import os 

In [2]:
path = "../data/SynthTRIPs/generated-queries/Llama3Point2Vision90B_generated_queries.json"
df = pd.read_json(path)

In [3]:
df.head()

Unnamed: 0,config_id,config,context,city,query_v,query_p0,query_p1
0,c_p_0_pop_low_easy,"{'p_id': 'p_0', 'persona': 'A top-scoring play...",Adana has low popularity. Adana has low season...,"[Adana, Adiyaman, Agri, Arad, Arkhangelsk, Bac...","""Less crowded European cities to visit in Febr...","""European cities with ice hockey facilities, l...",Which European cities have ice hockey arenas a...
1,c_p_0_pop_low_medium,"{'p_id': 'p_0', 'persona': 'A top-scoring play...",Sivas has low popularity and low budget. In Si...,"[Sivas, Rivne, Konya, Craiova, Adana]","""Low budget European city breaks with old tree...","""European cities with low budget options and o...",Which European cities have ice hockey arenas a...
2,c_p_0_pop_low_hard,"{'p_id': 'p_0', 'persona': 'A top-scoring play...",Syktyvkar has low popularity and high budget. ...,"[Syktyvkar, Malatya, Kars, Ioannina]","""Unconventional European destinations with mus...","""European cities with low popularity, high bud...",Which European cities have ice hockey arenas a...
3,c_p_0_pop_low_sustainable,"{'p_id': 'p_0', 'persona': 'A top-scoring play...","Craiova has low popularity , low budget, and g...","[Craiova, Gaziantep]","""Low budget destinations in Europe with good a...","""Low budget European cities with great air qua...",Which European cities have ice hockey arenas a...
4,c_p_0_pop_medium_easy,"{'p_id': 'p_0', 'persona': 'A top-scoring play...",Aalborg has medium popularity. Aalborg has low...,"[Aalborg, Ancona, Astrakhan, Bari, Belgorod, B...","""Medium-sized European cities to visit in Janu...","""European cities with ice hockey facilities, m...",Which European cities have ice hockey arenas a...


In [4]:
def get_interest(config):
    return config if "interests" in config["filters"].keys() else None

configs = df['config'].apply(get_interest)

In [5]:
int_df = df[df['config'].isin(configs.tolist())]

Easy queries are too broad with only one filter and are hence ignored for experimentation

In [6]:
def filter_easy(config_id):
    return None if "easy" in config_id else config_id 

non_easy_queries = int_df['config_id'].apply(filter_easy)
non_easy_df = int_df[int_df['config_id'].isin(non_easy_queries)]

In [8]:
non_easy_df.info()
final_df = non_easy_df

<class 'pandas.core.frame.DataFrame'>
Index: 1334 entries, 1 to 2301
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   config_id  1334 non-null   object
 1   config     1334 non-null   object
 2   context    1334 non-null   object
 3   city       1334 non-null   object
 4   query_v    1334 non-null   object
 5   query_p0   1334 non-null   object
 6   query_p1   1334 non-null   object
dtypes: object(7)
memory usage: 83.4+ KB


## Stratifying based on popularity and complexity -> 9 combinations

In [9]:
def find_level_pop(config_id):
    pop_level = config_id.split("_")[4:]
    return "_".join(pop_level)

final_df['pop_level'] = final_df['config_id'].apply(find_level_pop)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['pop_level'] = final_df['config_id'].apply(find_level_pop)


In [11]:
final_df['pop_level'].value_counts()

pop_level
medium_hard           200
high_hard             200
low_hard              177
medium_medium         138
high_medium           131
high_sustainable      131
low_medium            128
medium_sustainable    120
low_sustainable       109
Name: count, dtype: int64

In [12]:
final_df['pop_level'] = final_df['pop_level'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['pop_level'] = final_df['pop_level'].astype(str)


In [13]:
sampled = final_df.groupby('pop_level', group_keys=False).apply(lambda x: x.sample(100, random_state=42))

  sampled = final_df.groupby('pop_level', group_keys=False).apply(lambda x: x.sample(100, random_state=42))


In [14]:
sampled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 900 entries, 1105 to 42
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   config_id  900 non-null    object
 1   config     900 non-null    object
 2   context    900 non-null    object
 3   city       900 non-null    object
 4   query_v    900 non-null    object
 5   query_p0   900 non-null    object
 6   query_p1   900 non-null    object
 7   pop_level  900 non-null    object
dtypes: object(8)
memory usage: 63.3+ KB


In [19]:
def extract_filters_from_config(config):
    return config['filters']

sampled['filters'] = sampled['config'].apply(extract_filters_from_config)

In [20]:
query_df = sampled[['config_id', 'filters', 'query_v']]

In [22]:
query_df.rename(columns={'query_v': 'query'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  query_df.rename(columns={'query_v': 'query'}, inplace=True)


In [24]:
query_df.to_json("../data/input_queries.json", orient='records')