# Experiments

In [1]:
import os

from sereia import Sereia
from sereia.evaluation import EvaluationHandler

In [2]:
# MongoDB connection URI. It is read from the SEREIA_DATABASE_URI environment
# variable so that real credentials can stay out of the notebook; the fallback
# is the original local development URI ('%40' is a URL-encoded '@').
database_credentials = os.environ.get(
    'SEREIA_DATABASE_URI',
    "mongodb://root:root%40server@localhost:27017/admin",
)
# Dataset identifier; the cells below reuse it for the database, the query set,
# the evaluation handler and the results file name.
DATASET_NAME = 'twitter'

In [3]:
# Instantiate the engine for the configured dataset and database URI.
# The numeric limits are echoed later by sereia.print_runtime_configs().
sereia = Sereia(
    DATASET_NAME,
    database_credentials,
    config_directory='./config/',
    # QM limits: maximum size and how many top-ranked QMs are considered.
    max_qm_size=5,
    topk_qms=9999,
    # CJN limits: maximum size, top-k considered overall and per QM.
    max_cjn_size=3,
    topk_cjns=9999,
    topk_cjns_per_qm=1,
    # With this flag on, the run log reports
    # "Assuming golden CJN ... in position N" for each keyword query.
    assume_golden_standards_in_topk=True,
)

In [4]:
# Index creation is left disabled for this run; a later cell calls
# sereia.load_indexes() instead.
# NOTE(review): presumably create_indexes() has to be run once per dataset
# before load_indexes() can succeed — confirm, then uncomment when needed.
# sereia.create_indexes()

In [5]:
# Point the engine at the database identified by DATASET_NAME.
sereia.use_database(DATASET_NAME)

In [6]:
# Select the query set identified by DATASET_NAME (presumably the keyword
# queries that sereia.run_queryset() executes later — verify).
sereia.use_queryset(DATASET_NAME)

In [7]:
# Load the indexes before running any query (presumably the ones produced by
# sereia.create_indexes() — confirm).
sereia.load_indexes()

In [8]:
# Print the QM/CJN limits in effect; the values in the output match the
# keyword arguments passed to the Sereia constructor.
sereia.print_runtime_configs()

Maximum QM size: 5
Top-K QMs considered: 9999
Maximum CJN size: 3
Top-K CJNs considered: 9999
Maximum CJNs per QM: 1


In [9]:
# Run the keyword queries of the selected query set. For each query the call
# logs the top CN, the assumed golden CJN position and the generated
# aggregation pipeline (see the output). `result` is later passed to
# evaluation_handler.evaluate_results().
result = sereia.run_queryset()

Running keyword query: location brazil
top cn: TWITTER.s(user.location{location}).v(user.location{brazil})
Assuming golden CJN for location brazil (['location', 'brazil'])in position 1
top cn from gs: TWITTER.s(user.location{location}).v(user.location{brazil})
Generated query
[{'$match': {'$expr': {'$regexMatch': {'input': '$user.location',
                                       'options': 'i',
                                       'regex': 'brazil'}}}},
 {'$project': {'id': 1}}]
Running keyword query: star + wars
top cn: TWITTER.v(user.location{star,wars})
Assuming golden CJN for star + wars ([['star', 'wars']])in position 4
top cn from gs: TWITTER.v(text{star,wars})
Generated query
[{'$match': {'$expr': {'$regexMatch': {'input': '$text',
                                       'options': 'i',
                                       'regex': 'star wars'}}}},
 {'$project': {'id': 1}}]
Running keyword query: philippines 2011
top cn: TWITTER.v(user.location{philippines},created_at{2011})


In [10]:
# Build the evaluation handler for this dataset, passing it the configuration
# object of the Sereia instance.
evaluation_handler = EvaluationHandler(
    DATASET_NAME,
    sereia.config,
)

# Load the golden standards (presumably the expected answers that
# evaluate_results() scores against — confirm).
evaluation_handler.load_golden_standards()

In [11]:
# Score the query-set results. The call prints the QM and CJN ranking metrics
# and the results file name (see the cell output).
evaluated_results = evaluation_handler.evaluate_results(
    result, results_filename=f'results/{DATASET_NAME}-Assisted'
)

QM Evaluation {'mrr': 0.4636363636363637, 'p@1': 0.2727272727272727, 'p@2': 0.36363636363636365, 'p@3': 0.36363636363636365, 'p@4': 0.7272727272727273, 'p@5': 1.0, 'p@6': 1.0, 'p@7': 1.0, 'p@8': 1.0, 'p@9': 1.0, 'p@10': 1.0, 'relevant_positions': [1, 4, 1, 1, 5, 4, 4, 5, 4, 5, 2]}
CJN Evaluation {'mrr': 0.4636363636363637, 'p@1': 0.2727272727272727, 'p@2': 0.36363636363636365, 'p@3': 0.36363636363636365, 'p@4': 0.7272727272727273, 'p@5': 1.0, 'p@6': 1.0, 'p@7': 1.0, 'p@8': 1.0, 'p@9': 1.0, 'p@10': 1.0, 'relevant_positions': [1, 4, 1, 1, 5, 4, 4, 5, 4, 5, 2]}
Results filename: results/twitter-Assisted


In [16]:
# NOTE(review): the execution counter jumps from 11 to 16 at this cell;
# re-run the notebook with "Restart Kernel -> Run All" before trusting outputs.
import json


def _mean(values):
    """Return the arithmetic mean of `values`, or NaN when it is empty."""
    return sum(values) / len(values) if values else float('nan')


# Read back the results file for this dataset.
results_file = f'results/{DATASET_NAME}-Assisted.json'
# JSON is UTF-8 by specification; do not depend on the platform's locale.
with open(results_file, encoding='utf-8') as f:
    results_data = json.load(f)
quality_results = results_data['evaluation']

# One entry per keyword query, each carrying 'precision' and 'recall'.
retrieval_scores = quality_results['retrieval_score']
precision_data = [item['precision'] for item in retrieval_scores]
recall_data = [item['recall'] for item in retrieval_scores]

# Macro-averaged precision and recall over all keyword queries.
print(_mean(precision_data))
print(_mean(recall_data))

1.0
1.0


In [17]:
# Pretty-print the per-query retrieval scores (expected vs. retrieved document
# counts, precision and recall) for manual inspection.
from pprint import pprint as pp
pp(quality_results['retrieval_score'])

[{'keyword_query': 'location brazil',
  'num_documents_expected': 9638,
  'num_documents_retrieved': 9638,
  'precision': 1.0,
  'recall': 1.0},
 {'keyword_query': 'star + wars',
  'num_documents_expected': 93,
  'num_documents_retrieved': 93,
  'precision': 1.0,
  'recall': 1.0},
 {'keyword_query': 'philippines 2011',
  'num_documents_expected': 5491,
  'num_documents_retrieved': 5491,
  'precision': 1.0,
  'recall': 1.0},
 {'keyword_query': 'world + cup',
  'num_documents_expected': 62,
  'num_documents_retrieved': 62,
  'precision': 1.0,
  'recall': 1.0},
 {'keyword_query': 'percy + jackson',
  'num_documents_expected': 11,
  'num_documents_retrieved': 11,
  'precision': 1.0,
  'recall': 1.0},
 {'keyword_query': 'olympics',
  'num_documents_expected': 66,
  'num_documents_retrieved': 66,
  'precision': 1.0,
  'recall': 1.0},
 {'keyword_query': 'disney',
  'num_documents_expected': 1010,
  'num_documents_retrieved': 1010,
  'precision': 1.0,
  'recall': 1.0},
 {'keyword_query': 'hung