# Experiments

In [19]:
import json
import os

from sereia import Sereia
from sereia.evaluation import EvaluationHandler

In [20]:
# Connection string for the MongoDB instance used by Sereia. Set the
# SEREIA_MONGODB_URI environment variable to supply real credentials without
# committing them to the notebook; the fallback below is the original
# local-development URI. Reserved characters in the password must be
# percent-encoded (here '@' is written as '%40').
database_credentials = os.environ.get(
    'SEREIA_MONGODB_URI',
    "mongodb://root:root%40server@localhost:27017/admin",
)

# Single name reused below for the database, the query set, the golden
# standards and the results file.
DATASET_NAME = 'imdb'

In [21]:
# Build the Sereia engine for the chosen dataset. The keyword options are the
# runtime limits that print_runtime_configs() echoes further down.
sereia = Sereia(
    DATASET_NAME,
    database_credentials,
    config_directory='./config/',
    # Query-match (QM) limits.
    max_qm_size=5,
    topk_qms=200,
    # Candidate-network (CJN) limits.
    max_cjn_size=3,
    topk_cjns=200,
    topk_cjns_per_qm=1,
    # assume_golden_standards_in_topk=True,
)

In [22]:
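# Left disabled on purpose: presumably only needed when the indexes have to be
# (re)built before load_indexes() can be used — confirm before uncommenting.
# sereia.create_indexes()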

In [23]:
# Select the database to work on; the dataset name doubles as the database name.
sereia.use_database(DATASET_NAME)

In [24]:
# Select the query set to run, again keyed by the dataset name.
sereia.use_queryset(DATASET_NAME)

In [25]:
# Load the indexes for the selected database (presumably the ones built
# earlier by create_indexes() — confirm).
sereia.load_indexes()

In [26]:
# Echo the effective runtime limits (QM/CJN sizes and top-k cut-offs) so the
# run's settings are recorded next to its results.
sereia.print_runtime_configs()

Maximum QM size: 5
Top-K QMs considered: 200
Maximum CJN size: 3
Top-K CJNs considered: 200
Maximum CJNs per QM: 1


In [27]:
# Run every keyword query of the selected query set. Each query and the
# MongoDB aggregation pipeline generated for it are printed as the run proceeds.
result = sereia.run_queryset()

Found casablanca
Running keyword query: denzel + washington
Generated query
[{'$set': {'cast_dup': '$cast'}},
 {'$unwind': '$cast_dup'},
 {'$match': {'$expr': {'$regexMatch': {'input': '$cast_dup.name',
                                       'options': 'i',
                                       'regex': 'denzel washington'}}}},
 {'$group': {'_id': '$_id',
             'cast': {'$first': '$cast'},
             'crew': {'$first': '$crew'},
             'id': {'$first': '$id'}}},
 {'$project': {'id': 1}}]
Running keyword query: clint + eastwood
Generated query
[{'$match': {'$expr': {'$regexMatch': {'input': '$tagline',
                                       'options': 'i',
                                       'regex': 'clint eastwood'}}}},
 {'$project': {'id': 1}}]
Running keyword query: will + smith
Generated query
[{'$set': {'crew_dup': '$crew'}},
 {'$unwind': '$crew_dup'},
 {'$match': {'$expr': {'$regexMatch': {'input': '$crew_dup.name',
                                       'optio

In [28]:
# Create the evaluation handler for the same dataset, reusing Sereia's
# configuration object.
evaluation_handler = EvaluationHandler(DATASET_NAME, sereia.config)

# Load the golden standards up front (presumably what evaluate_results()
# scores the run against — confirm).
evaluation_handler.load_golden_standards()

In [29]:
# Score the run. The handler prints the QM and CJN ranking metrics followed by
# the results filename; the notebook later reads the data back from
# results/<dataset>.json.
evaluated_results = evaluation_handler.evaluate_results(
    result, results_filename='results/' + DATASET_NAME
)

QM Evaluation {'mrr': 0.4342993028476899, 'p@1': 0.11538461538461539, 'p@2': 0.5384615384615384, 'p@3': 0.7692307692307693, 'p@4': 0.8076923076923077, 'p@5': 0.8461538461538461, 'p@6': 0.8461538461538461, 'p@7': 0.8846153846153846, 'p@8': 0.9230769230769231, 'p@9': 0.9230769230769231, 'p@10': 0.9230769230769231, 'relevant_positions': [1, 3, 3, 2, 2, 2, 2, 2, 2, 3, 31, 2, 3, 1, 1, 24, 2, 2, 5, 2, 4, 2, 7, 8, 3, 3]}
CJN Evaluation {'mrr': 0.4332267091882476, 'p@1': 0.11538461538461539, 'p@2': 0.5384615384615384, 'p@3': 0.7692307692307693, 'p@4': 0.8076923076923077, 'p@5': 0.8461538461538461, 'p@6': 0.8461538461538461, 'p@7': 0.8846153846153846, 'p@8': 0.8846153846153846, 'p@9': 0.8846153846153846, 'p@10': 0.8846153846153846, 'relevant_positions': [1, 3, 3, 2, 2, 2, 2, 2, 2, 3, 26, 2, 3, 1, 1, 24, 2, 2, 5, 2, 4, 2, 7, 11, 3, 3]}
Results filename: results/imdb


In [30]:
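# Relevant positions noted from what appears to be an earlier run, kept for
# comparison with the evaluation output above (-1 presumably marks a query
# whose relevant answer was not found — confirm):
# QM = [1, 3, 3, 2, 2, 2, 2, 2, 1, 2, -1, 2, 3, 1, 1, 24, 1, 5, 4, 2, 4, 2, 6, 5, 1, 3]
# CN = [1, 3, 3, 2, 2, 2, 2, 2, 1, 2, -1, 2, 3, 1, 1, 24, 1, 3, 4, 2, 4, 2, 6, 8, 1, 3]
# result['results'][10].kms()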

In [31]:
# result['results'][24].qms()
# sereia.index_handler.value_index['casablanca']

# Loading data from results file

In [37]:
# Path of the JSON results file (presumably written by evaluate_results()
# above): the results_filename prefix passed there plus a '.json' suffix.
# `json` is imported in the notebook's top import cell.
results_file = f'results/{DATASET_NAME}.json'

In [38]:
# Read the whole results file and parse it into plain Python objects.
with open(results_file, 'r') as results_fp:
    results_data = json.loads(results_fp.read())

In [39]:
# Query-match and candidate-network counts stored by the evaluation
# (presumably one entry per keyword query — confirm).
num_qms, num_cns = (
    results_data['evaluation']['num_query_matches'],
    results_data['evaluation']['num_candidate_networks'],
)

In [40]:
# Keep a handle on the 'evaluation' section of the results file for the cells below.
quality_results = results_data['evaluation']

In [41]:
# Per-query retrieval quality: precision, recall and the retrieved/expected
# document counts for each keyword query.
quality_results['retrieval_score']

[{'keyword_query': 'denzel + washington',
  'precision': 1.0,
  'recall': 1.0,
  'num_documents_retrieved': 48,
  'num_documents_expected': 48},
 {'keyword_query': 'clint + eastwood',
  'precision': 1.0,
  'recall': 0.047619047619047616,
  'num_documents_retrieved': 3,
  'num_documents_expected': 63},
 {'keyword_query': 'will + smith',
  'precision': 0.4,
  'recall': 0.2,
  'num_documents_retrieved': 15,
  'num_documents_expected': 30},
 {'keyword_query': 'harrison + ford',
  'precision': 1.0,
  'recall': 0.017241379310344827,
  'num_documents_retrieved': 1,
  'num_documents_expected': 58},
 {'keyword_query': 'tom + hanks',
  'precision': 0.45454545454545453,
  'recall': 0.14084507042253522,
  'num_documents_retrieved': 22,
  'num_documents_expected': 71},
 {'keyword_query': 'johnny + depp',
  'precision': 0.8333333333333334,
  'recall': 0.07246376811594203,
  'num_documents_retrieved': 6,
  'num_documents_expected': 69},
 {'keyword_query': 'angelina + jolie',
  'precision': 0.33333333

In [42]:
def _mean(values):
    """Return the arithmetic mean of ``values``, or NaN when ``values`` is empty.

    Guarding the empty case keeps the cell from raising ZeroDivisionError when
    the results file contains no retrieval scores.
    """
    return sum(values) / len(values) if values else float('nan')


# Collect the per-query precision and recall reported in the results file.
retrieval_scores = quality_results['retrieval_score']
precision_data = [item['precision'] for item in retrieval_scores]
recall_data = [item['recall'] for item in retrieval_scores]

# Unweighted averages over all keyword queries: every query counts equally,
# regardless of how many documents it retrieved.
print(_mean(precision_data))
print(_mean(recall_data))

0.5264199134199133
0.30655200333806515


In [18]:
# # result['results'][16].cjns()

# sereia.execute_mongo_query(
#     'credits',
#     [
#             {
#                 "$set": {
#                     "cast_dup": "$cast"
#                 }
#             },
#             {
#                 "$unwind": "$cast_dup"
#             },
#             {
#                 "$match": {
#                     "cast_dup.name": "Denzel Washington"
#                 }
#             },
#             {
#                 "$group": {
#                     "_id": "$_id",
#                     "cast": {
#                         "$first": "$cast"
#                     },
#                     "crew": {
#                         "$first": "$crew"
#                     },
#                     "id": {
#                         "$first": "$id"
#                     }
#                 }
#             },
#             {
#                 "$project": {
#                     "id": 1
#                 }
#             }
#         ])

# # for num, item in enumerate(zip(num_qms, num_cns)):
# #     print(f'Query #{num + 1}: {item}')