Skip to content
This repository has been archived by the owner on Mar 21, 2018. It is now read-only.

Commit

Permalink
feat(weight): Add more weight values in default weight
Browse files Browse the repository at this point in the history
  • Loading branch information
pixelastic committed Nov 3, 2015
1 parent 2aea461 commit f6c43c4
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 13 deletions.
28 changes: 22 additions & 6 deletions lib/record_extractor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,9 @@ def node_css_selector(node)
node.css_path.gsub('html > body > ', '')
end

# Returns a custom numeric value representing how relevant to its hierarchy
# this record is. This value can be used in the custom ranking to display more
# relevant records first.
def weight(data)
# The more words are in common between this node and its parent heading, the
# higher the score
def weight_heading_relevance(data)
# Get list of unique words in headings
title_words = %i(title h1 h2 h3 h4 h5 h6)
.select { |title| data.key?(title) }
Expand All @@ -152,21 +151,38 @@ def weight(data)
(title_words & text_words).size
end

# Returns a weight derived from the tag name stored under item[:tag_name].
# Headings score from 100 (h1) down to 50 (h6) in steps of 10; any other
# tag name scores 0.
def weight_tag_name(item)
  headings = %w(h1 h2 h3 h4 h5 h6)
  # Zero-based heading depth: 0 for h1, ..., 5 for h6, nil for anything else
  depth = headings.index(item[:tag_name])
  # Not a heading, no weight
  return 0 if depth.nil?
  # Each additional heading level costs 10 points
  100 - depth * 10
end

# Builds the hash of every weight computed for a record:
# - :tag_name           => result of weight_tag_name(item)
# - :heading_relevance  => result of weight_heading_relevance(item)
# - :position           => the index passed in, unchanged
def weight(item, index)
  weights = {}
  weights[:tag_name] = weight_tag_name(item)
  weights[:heading_relevance] = weight_heading_relevance(item)
  weights[:position] = index
  weights
end

def extract
items = []
html_nodes.each_with_index do |node, index|
next unless node.text.size > 0

item = metadata.clone
item[:objectID] = "#{item[:slug]}_#{index}"
item.merge!(node_hierarchy(node))
item[:tag_name] = node.name
item[:raw_html] = node_raw_html(node)
item[:text] = node_text(node)
item[:unique_hierarchy] = unique_hierarchy(item)
item[:css_selector] = node_css_selector(node)
item[:css_selector_parent] = node_css_selector(node_heading_parent(node))
item[:weight] = weight(item)
item[:weight] = weight(item, index)

# We pass item through the user defined custom hook
item = custom_hook_each(item, node)
Expand Down
72 changes: 65 additions & 7 deletions spec/record_extractor_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@
end
end

describe 'weight' do
describe 'weight_heading_relevance' do
it 'gets the number of words in text also in the title' do
# Given
data = {
Expand All @@ -325,7 +325,7 @@
}

# When
actual = page_file.weight(data)
actual = page_file.weight_heading_relevance(data)

# Then
expect(actual).to eq 2
Expand All @@ -341,7 +341,7 @@
}

# When
actual = page_file.weight(data)
actual = page_file.weight_heading_relevance(data)

# Then
expect(actual).to eq 3
Expand All @@ -357,7 +357,7 @@
}

# When
actual = page_file.weight(data)
actual = page_file.weight_heading_relevance(data)

# Then
expect(actual).to eq 2
Expand All @@ -372,7 +372,7 @@
}

# When
actual = page_file.weight(data)
actual = page_file.weight_heading_relevance(data)

# Then
expect(actual).to eq 2
Expand All @@ -386,7 +386,7 @@
}

# When
actual = page_file.weight(data)
actual = page_file.weight_heading_relevance(data)

# Then
expect(actual).to eq 1
Expand All @@ -407,13 +407,71 @@
}

# When
actual = page_file.weight(data)
actual = page_file.weight_heading_relevance(data)

# Then
expect(actual).to eq 2
end
end

# Specs for weight_tag_name: headings are scored from 100 (h1) down to 50 (h6),
# every other tag gets 0.
describe 'weight_tag_name' do
  it 'gives a score of 0 to non-headings' do
    # Given
    data = {
      tag_name: 'p'
    }

    # When
    actual = page_file.weight_tag_name(data)

    # Then
    expect(actual).to eq 0
  end

  it 'gives a score of 100 to h1' do
    # Given
    data = {
      tag_name: 'h1'
    }

    # When
    actual = page_file.weight_tag_name(data)

    # Then
    expect(actual).to eq 100
  end

  # The description previously announced 40 while the assertion expects 50
  it 'gives a score of 50 to h6' do
    # Given
    data = {
      tag_name: 'h6'
    }

    # When
    actual = page_file.weight_tag_name(data)

    # Then
    expect(actual).to eq 50
  end
end

# Specs for weight: the individual weight helpers are stubbed so only the
# shape of the aggregated hash is checked.
describe 'weight' do
  it 'returns an object with all weights' do
    # Given
    record = { tag_name: 'p' }
    allow(page_file).to receive(:weight_tag_name).and_return(10)
    allow(page_file).to receive(:weight_heading_relevance).and_return(20)

    # When
    weights = page_file.weight(record, 42)

    # Then
    expect(weights).to include(
      tag_name: 10,
      heading_relevance: 20,
      position: 42
    )
  end
end

describe 'custom_hook_each' do
it 'let the user call a custom hook to modify a record' do
# Given
Expand Down

0 comments on commit f6c43c4

Please sign in to comment.