# Experiments

In [1]:
import os

from sereia import Sereia
from sereia.evaluation import EvaluationHandler

In [2]:
# MongoDB connection URI. Set SEREIA_DATABASE_URI in the environment to keep
# real credentials out of the notebook; when it is unset, the localhost URI
# this notebook was originally written with is used.
database_credentials = os.environ.get(
    "SEREIA_DATABASE_URI",
    "mongodb://root:root%40server@localhost:27017/admin",
)
# Passed below to Sereia, use_database(), use_queryset() and EvaluationHandler,
# and embedded in the results file name.
DATASET_NAME = 'imdb'

In [3]:
# Keyword options for the Sereia instance. The quoted labels are the ones
# print_runtime_configs() prints for each limit.
sereia_options = {
    'config_directory': './config/',
    'topk_qms': 200,          # "Top-K QMs considered"
    'max_qm_size': 5,         # "Maximum QM size"
    'max_cjn_size': 3,        # "Maximum CJN size"
    'topk_cjns': 35,          # "Top-K CJNs considered"
    'topk_cjns_per_qm': 1,    # "Maximum CJNs per QM"
    # NOTE(review): the run log prints "Assuming golden CJN for ... in position N";
    # presumably that behaviour is enabled by this flag — confirm.
    'assume_golden_standards_in_topk': True,
}

sereia = Sereia(DATASET_NAME, database_credentials, **sereia_options)

In [4]:
# Left commented out: this notebook calls sereia.load_indexes() further down.
# NOTE(review): presumably create_indexes() has to be run once before
# load_indexes() can succeed on a fresh setup — confirm and document.
# sereia.create_indexes()

In [5]:
# The database name passed here is the dataset name itself (DATASET_NAME).
sereia.use_database(DATASET_NAME)

In [6]:
# The query set is identified by the same string as the dataset (DATASET_NAME).
sereia.use_queryset(DATASET_NAME)

In [7]:
# NOTE(review): presumably loads indexes produced earlier by the (commented-out)
# sereia.create_indexes() call — confirm.
sereia.load_indexes()

In [8]:
# Print the QM/CJN size and top-k limits in effect, as a check on the values
# passed to the Sereia constructor.
sereia.print_runtime_configs()

Maximum QM size: 5
Top-K QMs considered: 200
Maximum CJN size: 3
Top-K CJNs considered: 35
Maximum CJNs per QM: 1


In [9]:
# Run the query set. The call logs each keyword query and the pipeline generated
# for it; `result` is what evaluate_results() receives later in the notebook.
result = sereia.run_queryset()

Found casablanca
Running keyword query: denzel + washington
Assuming golden CJN for denzel + washington ([['denzel', 'washington']])in position 1
Generated query
[{'$set': {'cast_dup': '$cast'}},
 {'$unwind': '$cast_dup'},
 {'$match': {'$expr': {'$regexMatch': {'input': '$cast_dup.name',
                                       'options': 'i',
                                       'regex': 'denzel washington'}}}},
 {'$group': {'_id': '$_id',
             'cast': {'$first': '$cast'},
             'crew': {'$first': '$crew'},
             'id': {'$first': '$id'}}},
 {'$project': {'id': 1}}]
Running keyword query: clint + eastwood
Assuming golden CJN for clint + eastwood ([['clint', 'eastwood']])in position 3
Generated query
[{'$set': {'cast_dup': '$cast'}},
 {'$unwind': '$cast_dup'},
 {'$match': {'$expr': {'$regexMatch': {'input': '$cast_dup.name',
                                       'options': 'i',
                                       'regex': 'clint eastwood'}}}},
 {'$group': {'_id

In [10]:
# Disabled duplicate of the run_queryset() call in the previous cell.
# result = sereia.run_queryset()

In [11]:
# Build the evaluator from the dataset name and the configuration held by
# `sereia`, then load the golden standards before any evaluation is requested.
evaluation_handler = EvaluationHandler(DATASET_NAME, sereia.config)
evaluation_handler.load_golden_standards()

In [12]:
# Evaluate the run. The file name is given without an extension; the loading
# section of this notebook reads the same stem with '.json' appended.
evaluated_results = evaluation_handler.evaluate_results(
    result, results_filename=f'results/{DATASET_NAME}-Assisted'
)

QM Evaluation {'mrr': 0.4342993028476899, 'p@1': 0.11538461538461539, 'p@2': 0.5384615384615384, 'p@3': 0.7692307692307693, 'p@4': 0.8076923076923077, 'p@5': 0.8461538461538461, 'p@6': 0.8461538461538461, 'p@7': 0.8846153846153846, 'p@8': 0.9230769230769231, 'p@9': 0.9230769230769231, 'p@10': 0.9230769230769231, 'relevant_positions': [1, 3, 3, 2, 2, 2, 2, 2, 2, 3, 31, 2, 3, 1, 1, 24, 2, 2, 5, 2, 4, 2, 7, 8, 3, 3]}
CJN Evaluation {'mrr': 0.4332267091882476, 'p@1': 0.11538461538461539, 'p@2': 0.5384615384615384, 'p@3': 0.7692307692307693, 'p@4': 0.8076923076923077, 'p@5': 0.8461538461538461, 'p@6': 0.8461538461538461, 'p@7': 0.8846153846153846, 'p@8': 0.8846153846153846, 'p@9': 0.8846153846153846, 'p@10': 0.8846153846153846, 'relevant_positions': [1, 3, 3, 2, 2, 2, 2, 2, 2, 3, 26, 2, 3, 1, 1, 24, 2, 2, 5, 2, 4, 2, 7, 11, 3, 3]}
Results filename: results/imdb-Assisted


In [14]:
# Disabled debugging snippets (nothing in this cell executes):
#   - inspecting the QMs/CJNs of the first result and one value-index entry;
#   - running one generated pipeline by hand through
#     sereia.execute_mongo_query('credits', ...).
# # # # # result['results'][0].qms()
# # # # # result['results'][0].cjns()
# # # # # result['results'][0].qms()
# # # # # sereia.index_handler.value_index['casablanca']

# sereia.execute_mongo_query('credits', 
# [{'$set': {'cast_dup': '$cast'}},
#  {'$unwind': '$cast_dup'},
#  {'$match': {'$expr': {'$regexMatch': {'input': '$cast_dup.name',
#                                        'options': 'i',
#                                        'regex': 'will smith'}}}},
#  {'$group': {'_id': '$_id',
#              'cast': {'$first': '$cast'},
#              'crew': {'$first': '$crew'},
#              'id': {'$first': '$id'}}},
#  {'$project': {'id': 1}}])

# Loading data from results file

In [15]:
import json

# evaluate_results() was given the stem 'results/<dataset>-Assisted'; the path
# read here is that stem plus '.json'.
# NOTE(review): presumably the evaluation handler appends the extension when
# it saves the file — confirm.
results_file = f'results/{DATASET_NAME}-Assisted.json'

In [16]:
# Parse the saved evaluation results. The encoding is given explicitly because
# open() otherwise decodes with the platform's locale encoding, which is not
# guaranteed to be UTF-8 (the encoding JSON files are expected to use).
with open(results_file, encoding='utf-8') as f:
    results_data = json.load(f)

In [17]:
# Values stored under 'num_query_matches' and 'num_candidate_networks' in the
# 'evaluation' section of the results file.
num_qms, num_cns = (
    results_data['evaluation'][count_key]
    for count_key in ('num_query_matches', 'num_candidate_networks')
)

In [18]:
# 'retrieval_score' is a list with one dict per keyword query; the entries shown
# in the output carry 'keyword_query', 'precision', 'recall',
# 'num_documents_retrieved' and 'num_documents_expected'.
quality_results = results_data['evaluation']
quality_results['retrieval_score']

[{'keyword_query': 'denzel + washington',
  'precision': 1.0,
  'recall': 1.0,
  'num_documents_retrieved': 48,
  'num_documents_expected': 48},
 {'keyword_query': 'clint + eastwood',
  'precision': 1.0,
  'recall': 1.0,
  'num_documents_retrieved': 63,
  'num_documents_expected': 63},
 {'keyword_query': 'will + smith',
  'precision': 1.0,
  'recall': 1.0,
  'num_documents_retrieved': 30,
  'num_documents_expected': 30},
 {'keyword_query': 'harrison + ford',
  'precision': 1.0,
  'recall': 1.0,
  'num_documents_retrieved': 58,
  'num_documents_expected': 58},
 {'keyword_query': 'tom + hanks',
  'precision': 1.0,
  'recall': 1.0,
  'num_documents_retrieved': 71,
  'num_documents_expected': 71},
 {'keyword_query': 'johnny + depp',
  'precision': 1.0,
  'recall': 1.0,
  'num_documents_retrieved': 69,
  'num_documents_expected': 69},
 {'keyword_query': 'angelina + jolie',
  'precision': 1.0,
  'recall': 1.0,
  'num_documents_retrieved': 40,
  'num_documents_expected': 40},
 {'keyword_query

In [19]:
# Per-query precision and recall, in the order the queries appear in
# quality_results['retrieval_score'].
retrieval_scores = quality_results['retrieval_score']
precision_data = [score['precision'] for score in retrieval_scores]
recall_data = [score['recall'] for score in retrieval_scores]

In [20]:
# Unweighted mean of the per-query precision values (every query counts equally).
# Raises ZeroDivisionError if precision_data is empty.
sum(precision_data)/len(precision_data)

0.8125

In [21]:
# Unweighted mean of the per-query recall values (every query counts equally).
# Raises ZeroDivisionError if recall_data is empty.
sum(recall_data)/len(recall_data)

0.9166666666666666