From 6550828ccef29074eacfe5fe6ff29618661da1d5 Mon Sep 17 00:00:00 2001 From: Boon Low Date: Wed, 21 Nov 2018 17:21:13 +0000 Subject: [PATCH] provide a `sort` boolean option to enable/disable ranking --- README.md | 19 ++++++++++++++++++- lib/information_retrieval.ex | 22 ++++++++++++++-------- test/search_test.exs | 8 ++++++++ 3 files changed, 40 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 63fa440..715f901 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ a corpus and index of 1000 (max) docs from the CSV dataset if pre-created index / corpus are not supplied. The query will be issued on this small index. -Ranked docs (ids) with scores are currently being returned. +Doc ids and scores are currently being returned as results. ```elixir # quick search test with up to 1000 max docs @@ -77,6 +77,23 @@ Ranked docs (ids) with scores are currently being returned. iex> IR.q "van eyck", index: index, corpus: corpus, op: :and +``` + +Ranking and sorting of results can be toggled with the `:sort` option. + +```elixir + iex> IR.q "christopher columbus carlo eyck galileo galilei", sort: false + Indexing.. + Found 5 results. + [{1, 1.25276}, {4, 0.55962}, {5, 0.55962}, {6, 0.55962}, {7, 5.01105}] + + # ranking with relevancy + iex(6)> IR.q "christopher columbus carlo eyck galileo galilei", sort: true + Indexing.. + Found 5 results. 
+ [{7, 5.01105}, {1, 1.25276}, {4, 0.55962}, {5, 0.55962}, {6, 0.55962}] + + ``` ## CSV data diff --git a/lib/information_retrieval.ex b/lib/information_retrieval.ex index f3adae5..153b7f3 100644 --- a/lib/information_retrieval.ex +++ b/lib/information_retrieval.ex @@ -167,6 +167,7 @@ defmodule IR do - `:op` - default `:or`, match ALL (`:and`) or ANY (`:or`) terms in the query - `:corpus`, parsed data required for results display and ranking purposes - `:index`, pre-created search data for querying and ranking purposes + - `:sort` - rank results by relevancy (highest score first) only when `true`; `true` is the default when no options are given, but if other options are passed without `:sort` the results are returned unranked ### Example @@ -199,7 +200,7 @@ defmodule IR do ``` """ @spec q(binary, keyword) :: list[binary] - def q(query, opts \\ [index: nil, corpus: nil, op: :or]) + def q(query, opts \\ [index: nil, corpus: nil, op: :or, sort: true]) def q(query, opts) do op = if opts[:op], do: opts[:op], else: :or @@ -207,14 +208,14 @@ defmodule IR do # if no index / corpus are provided if is_nil(opts[:index]) or is_nil(opts[:corpus]) do {:ok, index, corpus} = indexing(1000, corpus: true) - q(query, index, corpus, op) + q(query, index, corpus, op, opts[:sort]) else - q(query, opts[:index], opts[:corpus], op) + q(query, opts[:index], opts[:corpus], op, opts[:sort]) end end @doc false - def q(query, index, corpus, op) do + def q(query, index, corpus, op, sort) do terms = query |> analyse posting_sets = terms |> Enum.map(&(index[&1])) @@ -244,11 +245,16 @@ defmodule IR do # # next: could derive scores by computing cosine similarity # between doc vectors and search term vector - ranked_ids_with_scores = term_doc_matrix - |> Enum.map(fn {doc_id, vector} -> {doc_id, Enum.sum(vector) |> Float.round(5)} end) - |> Enum.sort_by(&(elem(&1,1)), &>=/2) + unranked_ids_with_scores = term_doc_matrix + |> Enum.map(fn {doc_id, vector} -> {doc_id, Enum.sum(vector) |> Float.round(5)} end) + + case sort do + true -> + unranked_ids_with_scores |> Enum.sort_by(&(elem(&1,1)), &>=/2) + false -> unranked_ids_with_scores + _ -> 
unranked_ids_with_scores + end - ranked_ids_with_scores end # single keyword postings diff --git a/test/search_test.exs b/test/search_test.exs index 0f1f712..79b6498 100644 --- a/test/search_test.exs +++ b/test/search_test.exs @@ -46,4 +46,12 @@ defmodule IRSearchTest do assert doc_ids == [] end + test "ranking keywords search results", %{index: index, corpus: corpus} do + doc_ids = IR.q "christopher columbus carlo eyck galileo galilei", index: index, corpus: corpus, sort: false + assert doc_ids |> Enum.map(&(elem(&1,0))) == [1, 4, 5, 6, 7] + + doc_ids = IR.q "christopher columbus carlo eyck galileo galilei", index: index, corpus: corpus, sort: true + assert doc_ids |> Enum.map(&(elem(&1,0))) == [7, 1, 4, 5, 6] + end + end