In [1]:
from itertools import islice
from modules.compression import Compress
from modules.dataset import Dataset
from modules.index import Indexer
from modules.query import Query

import ast

## Italian Lyrics Retrieval System

#### Load Dataset and Remove Duplicates (simhash)
First, unpack the compressed documents in the data folders. Then load the dataset and remove duplicate documents from the same author (see the example in the report).

In [None]:
# Folder the documents are read from (relative to this notebook).
documents_dir = '../data/documents/'

# Load the corpus, then remove duplicates; `dataset` is reused by the
# index-creation cell below.
dataset = Dataset()
dataset.load_dataset(documents_dir)
dataset.remove_duplicates()

#### Index Creation

In [None]:
# Build the indexes from the `dataset` object prepared in the previous cell.
indexer = Indexer()

# The positional index is created first, then create_indexes() runs on the same
# dataset. NOTE(review): create_indexes presumably produces the other index
# files loaded later in this notebook; confirm in modules/index.py, and keep
# this call order unless the two steps are known to be independent.
indexer.create_positional_index(dataset)
indexer.create_indexes(dataset)

#### Compression

In [None]:
# Read the lite index back from disk so this cell can run without the
# index-creation cell having been executed in the same session.
lite_index_path = '../index/lite_index.txt'
lite_index = Indexer.load_lite_index(lite_index_path)

# block_size is forwarded to Compress unchanged; see modules/compression.py
# for how it is interpreted.
compress = Compress(block_size=4)
compress.compress_index(lite_index)

#### Query

Example:
- Author: Raffaella Carrà, Raffaella, Carrà
- Title: Tanti Auguri
- Lyrics (phrase query, wrapped in double quotes): "Com'è bello far l'amore da Trieste in giù"
- Lyrics (free-text query, ranked with Okapi BM25): Trieste

In [2]:
# Announce the slow step up front so the user knows the cell has not hung.
print("Loading the indexes... (this may take a while)")

# load_indexes() returns three index objects, unpacked in this fixed order.
(author_trie, title_trie, positional_trie) = Indexer.load_indexes()

# documents_info.txt holds a Python literal; literal_eval parses it without
# executing arbitrary code.
with open('../index/documents_info.txt', 'r') as info_file:
	raw_document_info = info_file.read()
document_info = ast.literal_eval(raw_document_info)

# avglen.txt holds a single number, passed to the BM25 scorer further down
# (presumably the average document length; confirm in modules/query.py).
with open('../index/avglen.txt', 'r') as avglen_file:
	avglen = float(avglen_file.read())

# One query engine wraps the three indexes for every query type.
query_engine = Query(author_trie, title_trie, positional_trie)

# Interactive query loop.
#
# User input is validated by re-prompting rather than with `assert` (which
# aborts the cell on a typo and is stripped under `python -O`) or `exit()`
# (which shuts down the notebook kernel and throws away the loaded indexes).

QUERY_TYPE_PROMPT = "Which type of query do you want to perform? (a = author, t = title, l = lyrics)"
MAX_RESULTS = 5  # number of lyric hits printed per query


def _prompt_choice(prompt, valid_answers, error_message):
	"""Ask `prompt` until the user types one of `valid_answers`.

	The answer is stripped and lower-cased before comparison, so "Y " is
	accepted as "y". `error_message` is printed after every invalid answer.

	Returns the normalised answer (always a member of `valid_answers`).
	"""
	while True:
		print(prompt)
		answer = input().strip().lower()
		if answer in valid_answers:
			return answer
		print(error_message)


def _is_phrase_query(text):
	"""Return True when `text` starts and ends with a double quote.

	At least two characters are required so that a lone '"' is not mistaken
	for a quoted phrase.
	"""
	return len(text) >= 2 and text[0] == '"' and text[-1] == '"'


query_type = _prompt_choice(QUERY_TYPE_PROMPT, ('a', 't', 'l'), "Invalid query type.")

while True:
	print(f"Enter your [{query_type}] query: (for phrase query on lyrics, use double quotes)")
	# Surrounding whitespace from the prompt carries no meaning.
	query = input().strip()

	results = {}

	if not query:
		# Guard against indexing into an empty string below.
		print("Empty query.")
	elif query_type == 'a':
		print(query_engine.perform_author_query(query))
	elif query_type == 't':
		print(query_engine.perform_title_query(query))
	else:
		# query_type == 'l': quoted text is a phrase query, anything else is
		# ranked with Okapi BM25.
		if _is_phrase_query(query):
			results = query_engine.perform_phrase_query(query)
		else:
			results = query_engine.perform_okapi_bm25(query, document_info, avglen)

		if results:
			# Only the first MAX_RESULTS entries, in the order the engine returned them.
			for key, value in islice(results.items(), MAX_RESULTS):
				print(f'{key} -> {value}')
		else:
			print("No results found.")

	choice = _prompt_choice("Do you want to perform another query? (y/n)", ('y', 'n'), "Invalid choice.")
	if choice == 'n':
		break

	query_type = _prompt_choice(QUERY_TYPE_PROMPT, ('a', 't', 'l'), "Invalid query type.")

Loading the indexes... (this may take a while)


Loading lite index: 100%|██████████| 460/460 [00:00<00:00, 12394.59it/s]
Loading lite index: 100%|██████████| 11726/11726 [00:00<00:00, 140983.19it/s]
Loading positional index: 100%|██████████| 54443/54443 [00:11<00:00, 4691.03it/s]


Which type of query do you want to perform? (a = author, t = title, l = lyrics)
Enter your [l] query: (for phrase query on lyrics, use double quotes)
Preprocessoring query: Fare l'amore da Trieste
Query tokens: ['far', 'amor', 'triest']
23369 -> 18.006218285564955
3801 -> 14.473569045371736
25983 -> 11.73124986192073
24961 -> 10.501738014507374
60 -> 10.122022779230187
Do you want to perform another query? (y/n)
