Skip to content

Commit

Permalink
added rule and page_worker parsing functions.
Browse files Browse the repository at this point in the history
  • Loading branch information
c2h2 committed May 8, 2012
1 parent 9434349 commit 963408e
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 4 deletions.
10 changes: 10 additions & 0 deletions app/models/page.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,16 @@ def get_port
p.updated_at = self.updated_at
p
end

# Collects the href attribute of every <a> element in this page's content.
#
# The parsed document is kept in @noko_doc and the extracted hrefs in
# @found_links. An anchor without an href contributes an empty string
# (nil.to_s). The number of links found is reported through Util.log.
#
# Returns the Array of href strings, or nil when content is nil.
def parse_page
  return nil if self.content.nil?

  @noko_doc = Nokogiri::HTML(self.content)
  anchors = @noko_doc.css('a')
  @found_links = anchors.map { |anchor| anchor['href'].to_s }
  Util.log "Found #{@found_links.count} links"
  @found_links
end
end

class PagePort
Expand Down
40 changes: 40 additions & 0 deletions app/models/rule.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,44 @@ class Rule
field :domain, :type => String
field :detail, :type => String
field :allow, :type => Boolean


# Decides whether +url+ may be crawled according to the stored rules.
#
# Looks up every Rule whose domain equals the host of +url+ and walks them
# in the order the query returns them. The first rule whose +detail+
# pattern matches the url decides the outcome: its +allow+ flag is returned
# (allow => whitelist hit, !allow => blacklist hit).
#
# Returns false when there are no rules for the domain, and also when rules
# exist but none of them matches. Previously the latter case leaked the
# rules collection (truthy) out of the method, so unmatched urls were
# treated as allowed.
#
# Regexp.new raises RegexpError if a rule's +detail+ is not a valid pattern.
def self.ok? url
  the_domain = self.domain url

  Rule.where(:domain => the_domain).each do |rule|
    pattern = Regexp.new(rule.detail)
    # First matching rule wins, whether it is a white or a black filter.
    return rule.allow unless (url =~ pattern).nil?
  end

  # No rules for this domain, or no rule matched: deny by default.
  false
end

# Returns the host component of +url+ as reported by URI.parse.
#
# Best effort: any StandardError raised while parsing (for example an
# invalid URI) is swallowed and nil is returned instead. nil is also
# returned when the parsed URI simply has no host.
def self.domain url
  URI.parse(url).host
rescue
  nil
end
end
21 changes: 17 additions & 4 deletions lib/medusa/page_worker.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,27 @@ def process_one_page
end
end

def save_url
#determine if valid
# Checks that +url+ (converted with to_s) is an absolute http/https URL with
# a dotted host name, an optional port and an optional path.
#
# The pattern is anchored with \A and \z so the WHOLE string must be a URL.
# The previous ^ / $ anchors match per line, which let a multi-line string
# through as long as any single line looked like a URL.
#
# Returns 0 (truthy) when the url is valid, nil otherwise.
def valid_url? url
  (url.to_s =~ /\A(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?\z/ix)
end


# Stores a new Link for +url+ if it passes every filter.
#
# The url is dropped (nil is returned) when it is not a valid URL, when
# Rule.ok? does not accept it, or when Link.exists? reports it as already
# known. Otherwise a fresh Link is saved and the result of #save returned.
def save_url url
  # Is it a well-formed URL at all?
  return unless valid_url?(url)

  # Are we interested in it? (domain, regex, and similarity)
  return unless Rule.ok?(url)

  # Have we already recorded it?
  return if Link.exists? url

  # NOTE(review): the url is never assigned to the new Link before saving -
  # presumably an attribute should be set here; confirm against Link's fields.
  link = Link.new
  link.save
end

Expand Down

0 comments on commit 963408e

Please sign in to comment.