This repository has been archived by the owner on Mar 21, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 13
/
record_extractor.rb
213 lines (174 loc) · 5.73 KB
/
record_extractor.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
require 'algoliasearch'
require 'nokogiri'
require 'json'
# Given an HTML file as input, will return an array of records to index.
#
# The file object is expected to respond to `site.config`, `data`, `url` and
# `path`; `date`, `slug` and `tags` are used when the file responds to them.
# Each record is a Hash made of the file metadata, the heading hierarchy of the
# matched node, its text/HTML, CSS selectors and ranking weights.
class AlgoliaSearchRecordExtractor
  # Tag names treated as headings, strongest first
  HEADING_TAGS = %w(h1 h2 h3 h4 h5 h6).freeze
  # Record keys that make up the hierarchy, from the page title down to h6
  HIERARCHY_KEYS = ([:title] + HEADING_TAGS.map(&:to_sym)).freeze
  # Settings used when the site config does not override them
  DEFAULT_CONFIG = { 'record_css_selector' => 'p' }.freeze

  attr_reader :file

  # file - the file to extract records from. Its `site.config['algolia']`
  #        hash (if any) is merged over DEFAULT_CONFIG.
  def initialize(file)
    @file = file
    # The 'algolia' key may be absent from the site config; fall back to an
    # empty hash so we still get the defaults instead of a TypeError.
    user_config = file.site.config['algolia'] || {}
    @config = DEFAULT_CONFIG.merge(user_config)
  end

  # Hook to modify a record after extracting.
  # Return nil to exclude the record from the results.
  def custom_hook_each(item, _node)
    item
  end

  # Hook to modify all records after extracting
  def custom_hook_all(items)
    items
  end

  # Returns metadata from the current file as a Hash with symbol keys:
  # every entry of `@file.data`, plus :type, :url, :slug, :tags and, when the
  # file responds to `date`, :posted_at (a Unix timestamp).
  def metadata
    metadata = {}
    @file.data.each { |key, value| metadata[key.to_sym] = value }

    # The type is the second segment of the class name (Foo::Page => "page").
    # Fall back to the first segment for classes without a namespace.
    name_parts = @file.class.name.to_s.split('::')
    metadata[:type] = (name_parts[1] || name_parts[0]).to_s.downcase

    metadata[:url] = @file.url
    metadata[:slug] = slug
    metadata[:posted_at] = @file.date.to_time.to_i if @file.respond_to?(:date)
    metadata[:tags] = tags
    metadata
  end

  # Returns the slug of the document
  def slug
    # Jekyll v3 has it in data
    return @file.data['slug'] if @file.data.key?('slug')

    # Old Jekyll v2 has it at the root
    return @file.slug if @file.respond_to?(:slug)

    # Otherwise, we guess it from the filename
    basename = File.basename(@file.path)
    File.basename(basename, File.extname(basename))
  end

  # Extract a list of tags.
  # Returns an Array of Strings, or nil when the file has no tags.
  def tags
    # Jekyll v3 has it in data, while v2 has it at the root
    tags = if @file.data.key?('tags')
             @file.data['tags']
           elsif @file.respond_to?(:tags)
             @file.tags
           end
    return nil if tags.nil?

    # A single scalar tag is wrapped in an array. We also force cast every tag
    # to a string, as some plugins extend the tags to full featured objects.
    Array(tags).map(&:to_s)
  end

  # Get the list of all HTML nodes to index, i.e. those matching the
  # configured 'record_css_selector'
  def html_nodes
    document = Nokogiri::HTML(@file.content)
    document.css(@config['record_css_selector'])
  end

  # Check if node is a heading (h1 to h6)
  def node_heading?(node)
    HEADING_TAGS.include?(node.name)
  end

  # Get the closest heading parent: the nearest heading found by walking
  # backwards through previous siblings, then up through the ancestors.
  #
  # node  - the node to start from
  # level - only headings whose tag name sorts before this one are accepted
  #
  # Returns the heading node, or nil if none is found before reaching the body
  # (or running out of ancestors).
  def node_heading_parent(node, level = 'h7')
    current = node
    # Iterative on purpose: one step per sibling/ancestor would otherwise need
    # one stack frame each, which overflows on very long documents.
    loop do
      # When starting from (or climbing into) a heading while no limit has
      # been set yet, we only accept stronger headings
      level = current.name if level == 'h7' && node_heading?(current)

      previous = current.previous_element
      if previous
        # This is a heading, we return it
        return previous if node_heading?(previous) && previous.name < level

        current = previous
      else
        # No previous element, we go up to the parent
        parent = current.parent
        # No more parent, then no heading found
        return nil if parent.nil? || parent.name == 'body'

        current = parent
      end
    end
  end

  # Get all the parent headings of the specified node.
  # If the node itself is a heading, we include it.
  #
  # Returns a Hash mapping heading tag names (as symbols) to their text.
  def node_hierarchy(node, state = { level: 7 })
    tag_name = node.name
    level = tag_name.delete('h').to_i

    if node_heading?(node) && level < state[:level]
      state[tag_name.to_sym] = node_text(node)
      state[:level] = level
    end

    heading = node_heading_parent(node)

    # No previous heading, we can stop the recursion
    unless heading
      # :level is internal bookkeeping, it must not leak into the record
      state.delete(:level)
      return state
    end

    node_hierarchy(heading, state)
  end

  # Return the raw HTML of the element to index
  def node_raw_html(node)
    node.to_s
  end

  # Return the text of the element, sanitized to be displayed: angle brackets
  # are replaced with their HTML entities so the text cannot open a tag.
  def node_text(node)
    node.content.gsub('<', '&lt;').gsub('>', '&gt;')
  end

  # Returns a unique string of hierarchy from title to h6, used for distinct
  def unique_hierarchy(data)
    HIERARCHY_KEYS.map { |key| data[key] }.compact.join(' > ')
  end

  # Returns a CSS selector (String) targeting the node, or nil if node is nil.
  # Used both for the node itself and for its closest heading parent.
  def node_css_selector(node)
    return nil if node.nil?

    # Use the CSS id if one is set
    return "##{node['id']}" if node['id']

    # Default Nokogiri selector
    node.css_path.gsub('html > body > ', '')
  end

  # The more words are in common between this node and its parent headings,
  # the higher the score
  def weight_heading_relevance(data)
    # Get list of unique words in headings. Splitting on \W+ produces an empty
    # first token when the string starts with punctuation; drop those so they
    # are never counted as a word in common.
    title_words = HIERARCHY_KEYS
                  .select { |key| data.key?(key) }
                  .flat_map { |key| data[key].to_s.downcase.split(/\W+/) }
                  .reject(&:empty?)
                  .uniq

    # Intersect words in headings with words in text
    text_words = data[:text].to_s.downcase.split(/\W+/)
    (title_words & text_words).size
  end

  # Returns a weight based on the tag_name
  def weight_tag_name(item)
    tag_name = item[:tag_name]
    # Not a heading, no weight
    return 0 unless HEADING_TAGS.include?(tag_name)

    # h1: 100, h2: 90, ..., h6: 50
    100 - (tag_name.delete('h').to_i - 1) * 10
  end

  # Returns an object of all weights
  def weight(item, index)
    {
      tag_name: weight_tag_name(item),
      heading_relevance: weight_heading_relevance(item),
      position: index
    }
  end

  # Builds the list of records for the file: one per non-empty matching node,
  # each passed through custom_hook_each (nil results are dropped), and the
  # whole list through custom_hook_all.
  def extract
    items = []
    html_nodes.each_with_index do |node, index|
      next if node.text.empty?

      item = metadata.clone
      item.merge!(node_hierarchy(node))
      item[:tag_name] = node.name
      item[:raw_html] = node_raw_html(node)
      item[:text] = node_text(node)
      item[:unique_hierarchy] = unique_hierarchy(item)
      item[:css_selector] = node_css_selector(node)
      item[:css_selector_parent] = node_css_selector(node_heading_parent(node))
      # position is the index among all matched nodes, skipped ones included
      item[:weight] = weight(item, index)

      # We pass item through the user defined custom hook
      item = custom_hook_each(item, node)
      next if item.nil?

      items << item
    end
    custom_hook_all(items)
  end
end