Skip to content

Commit

Permalink
provide a sort boolean option to enable/disable ranking
Browse files Browse the repository at this point in the history
  • Loading branch information
boonious committed Nov 21, 2018
1 parent 67761b2 commit 6550828
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 9 deletions.
19 changes: 18 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ a corpus and index of 1000 (max) docs from the
CSV dataset if pre-created index / corpus are not supplied.
The query will be issued on this small index.

Ranked docs (ids) with scores are currently being returned.
Doc ids and scores are currently being returned as results.

```elixir
# quick search test with up to 1000 max docs
Expand All @@ -77,6 +77,23 @@ Ranked docs (ids) with scores are currently being returned.
iex> IR.q "van eyck", index: index, corpus: corpus, op: :and


```

Ranking and sorting of results can be toggled with the `:sort` option.

```elixir
iex> IR.q "christopher columbus carlo eyck galileo galilei", sort: false
Indexing..
Found 5 results.
[{1, 1.25276}, {4, 0.55962}, {5, 0.55962}, {6, 0.55962}, {7, 5.01105}]

# ranking with relevancy
iex> IR.q "christopher columbus carlo eyck galileo galilei", sort: true
Indexing..
Found 5 results.
[{7, 5.01105}, {1, 1.25276}, {4, 0.55962}, {5, 0.55962}, {6, 0.55962}]


```

## CSV data
Expand Down
22 changes: 14 additions & 8 deletions lib/information_retrieval.ex
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ defmodule IR do
- `:op` - default `:or`, match ALL (`:and`) or ANY (`:or`) terms in the query
- `:corpus`, parsed data required for results display and ranking purposes
- `:index`, pre-created search data for querying and ranking purposes
- `:sort` - default `true`, rank results by relevancy score (highest first); `false` returns the results unsorted
### Example
Expand Down Expand Up @@ -199,22 +200,22 @@ defmodule IR do
```
"""
@spec q(binary, keyword) :: [{term, float}]
def q(query, opts \\ [index: nil, corpus: nil, op: :or, sort: true])

# Option-based entry point: resolves the `:op`, `:sort`, `:index` and `:corpus`
# options and delegates to `q/5`.
def q(query, opts) do
  op = opts[:op] || :or

  # The default keyword list in the function head is only used when `opts` is
  # omitted altogether. When a caller supplies some options but not `:sort`,
  # `opts[:sort]` is nil, so apply the documented default (`true`) here instead
  # of silently returning unranked results.
  sort = if is_nil(opts[:sort]), do: true, else: opts[:sort]

  # index and build a corpus for 1000 documents from the dataset
  # if no index / corpus are provided
  {index, corpus} =
    if is_nil(opts[:index]) or is_nil(opts[:corpus]) do
      {:ok, index, corpus} = indexing(1000, corpus: true)
      {index, corpus}
    else
      {opts[:index], opts[:corpus]}
    end

  q(query, index, corpus, op, sort)
end

@doc false
def q(query, index, corpus, op) do
def q(query, index, corpus, op, sort) do
terms = query |> analyse
posting_sets = terms |> Enum.map(&(index[&1]))

Expand Down Expand Up @@ -244,11 +245,16 @@ defmodule IR do
#
# next: could derive scores by computing cosine similarity
# between doc vectors and search term vector
ranked_ids_with_scores = term_doc_matrix
|> Enum.map(fn {doc_id, vector} -> {doc_id, Enum.sum(vector) |> Float.round(5)} end)
|> Enum.sort_by(&(elem(&1,1)), &>=/2)
unranked_ids_with_scores = term_doc_matrix
|> Enum.map(fn {doc_id, vector} -> {doc_id, Enum.sum(vector) |> Float.round(5)} end)

case sort do
true ->
unranked_ids_with_scores |> Enum.sort_by(&(elem(&1,1)), &>=/2)
false -> unranked_ids_with_scores
_ -> unranked_ids_with_scores
end

ranked_ids_with_scores
end

# single keyword postings
Expand Down
8 changes: 8 additions & 0 deletions test/search_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,12 @@ defmodule IRSearchTest do
assert doc_ids == []
end

test "ranking keywords search results", %{index: index, corpus: corpus} do
  query = "christopher columbus carlo eyck galileo galilei"

  # sorting disabled: ids come back in the order the search produced them
  unsorted_hits = IR.q(query, index: index, corpus: corpus, sort: false)
  assert Enum.map(unsorted_hits, fn hit -> elem(hit, 0) end) == [1, 4, 5, 6, 7]

  # sorting enabled: ids are ordered by descending score
  sorted_hits = IR.q(query, index: index, corpus: corpus, sort: true)
  assert Enum.map(sorted_hits, fn hit -> elem(hit, 0) end) == [7, 1, 4, 5, 6]
end

end

0 comments on commit 6550828

Please sign in to comment.