
Added basic processing of wikitext, instructing the program to ignore links already wikified.
1 parent 7a54d93 commit ca0d3ffbf3db8dfc3cb63ddbb6974a712092975d @agrimm committed May 31, 2008
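
In practice: text that already contains a [[wikilink]] to an article should not have that same article suggested again. A minimal sketch of the intended behaviour, using the fixture data added below; the return values mirror what the new unit tests assert, not verified console output:

    repository = Repository.find_by_abbreviation("af-uncyclopedia")

    # Plain text: the phrase matches an article, so a link is suggested.
    Article.parse_text_document("Maria Theresa of Austria", repository, "auto-detect")
    #=> [["Maria Theresa of Austria", <the matching Article>]]

    # The same phrase already wikified: no suggestion is made.
    Article.parse_text_document("[[Maria Theresa of Austria]]", repository, "auto-detect")
    #=> []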
7 app/controllers/read_controller.rb
@@ -2,6 +2,7 @@ class ReadController < ApplicationController
def read
@repository_choices = Repository.find(:all).map {|rc| [rc.short_description, rc.id]}
+ @markup_choices = [ ["Auto-detect (default)", "auto-detect"], ["MediaWiki formatting", "mediawiki"], ["Plain text", "plain"] ]
if request.post?
@errors = []
if params[:document_text].blank?
@@ -13,9 +14,13 @@ def read
unless repository
@errors << "Can't find repository"
end
+ markup = params[:markup]
+ unless @markup_choices.map{|pair| pair.last}.include?(markup)
+ @errors << "Invalid markup choice"
+ end
if @errors.empty?
begin
- @parse_results = Article.parse_text_document(document_text, repository)
+ @parse_results = Article.parse_text_document(document_text, repository, markup)
rescue ArgumentError => error
if error.message == "Document has too many words"
@errors << "Please submit a text fewer than #{Article.maximum_allowed_document_size} words long"
118 app/models/article.rb
@@ -23,51 +23,68 @@ def self.break_up_phrase(phrase)
#Determine if a phrase is boring
#That is, it has one or zero non-boring words
- def self.phrase_is_boring?(phrase)
+ # or the wiki text already links to it (if applicable)
+ def self.phrase_is_boring?(phrase, existing_article_titles)
+ #if existing_article_titles.any?{|existing_article_title| existing_article_title.chars.downcase.to_s.include?(phrase.chars.downcase)} #Unicode safe, too slow? :(
+ if existing_article_titles.any?{|existing_article_title| existing_article_title.downcase.include?(phrase.downcase)} #Not unicode safe?
+ return true
+ end
words = break_up_phrase(phrase)
#count how many words are non-boring
boring_words = %w{a and also are be been for get has in is just me of on only see than this the there was january february march april may june july august september october november december}
number_non_boring_words = 0
words.each do |word|
- number_non_boring_words += 1 unless boring_words.include?(word.chars.downcase)
+ number_non_boring_words += 1 unless boring_words.include?(word.downcase) #Not unicode safe?
+ #number_non_boring_words += 1 unless boring_words.include?(word.chars.downcase) #Unicode safe
end
return true unless number_non_boring_words > 1
end
#Return all articles that match the requested phrase
#Probably should only return one article, but return an array just in case
- def self.find_matching_articles(phrase, repository)
- return [] if phrase_is_boring?(phrase)
+ def self.find_matching_articles(phrase, repository, existing_article_titles)
+ return [] if phrase_is_boring?(phrase, existing_article_titles)
articles = find(:all, :conditions => ["title = ? and repository_id = ?", phrase, repository], :limit => 1)
articles
end
#Informs the caller if they should try a longer phrase than the current one in order to get a match
- def self.try_longer_phrase?(phrase, repository)
- if phrase_is_boring?(phrase)
+ def self.try_longer_phrase?(phrase, repository, existing_article_titles)
+ if phrase_is_boring?(phrase, existing_article_titles)
return true #Otherwise it chews up too much server time
end
potentially_matching_articles = find(:all, :conditions => ["title like ? and repository_id = ?", phrase + "%", repository], :limit=>1)
return !potentially_matching_articles.empty?
end
+ #The main method called from the controller
#Read in a document, and return an array of phrases and their matching articles
#Strategy: split into words, then iterate through the words
- def self.parse_text_document(document_text, repository)
+ def self.parse_text_document(document_text, repository, markup)
parse_results = []
words = break_up_phrase(document_text)
raise(ArgumentError, "Document has too many words") if words.size > maximum_allowed_document_size
+ if (markup == "auto-detect")
+ markup = self.markup_autodetect(document_text)
+ end
+ if (markup == "mediawiki")
+ wiki_text = document_text.dup
+ parsed_wiki_text = self.parse_wiki_text(wiki_text)
+ existing_article_titles = self.parse_existing_wiki_links(parsed_wiki_text)
+ else
+ existing_article_titles = []
+ end
i = 0
while(true)
j = 0
phrase = words[i + j]
while(true)
- matching_articles = find_matching_articles(phrase, repository)
+ matching_articles = find_matching_articles(phrase, repository, existing_article_titles)
matching_articles.each do |matching_article|
parse_results << [phrase, matching_article]
end
- break unless (try_longer_phrase?(phrase, repository) and i + j + 1 < words.size)
+ break unless (try_longer_phrase?(phrase, repository, existing_article_titles) and i + j + 1 < words.size)
j = j + 1
phrase += " "
phrase += words[i + j]
@@ -83,7 +100,7 @@ def self.parse_text_document(document_text, repository)
#a method to get rid of the duplicate results
def self.clean_results(parse_results)
- parse_results.delete_if {|x| !(x[0].include?(" ") )}
+ parse_results.delete_if {|x| !(x[0].include?(" ") )} #This line may be redundant
#Get rid of results with a phrase shorter than another phrase in parse_results
#Get rid of results with a phrase already included in cleaned_results
cleaned_results = []
@@ -110,4 +127,85 @@ def self.clean_results(parse_results)
cleaned_results
end
+ #Remove from MediaWiki text anything that is surrounded by <nowiki>
+ def self.parse_nowiki(wiki_text)
+ loop do
+ #Delete anything paired by nowiki, non-greedily
+ #Assumes that there aren't nested nowikis
+ substitution_made = wiki_text.gsub!(%r{<nowiki>(.*?)</nowiki>}im,"")
+ break unless substitution_made
+ end
+ wiki_text
+ end
+
+ #Remove from MediaWiki text anything within a template
+ def self.parse_templates(wiki_text)
+ loop do
+ #Delete anything with paired {{ and }}, so long as no opening braces are inside
+ #Should closing braces inside be forbidden as well?
+ substitution_made = wiki_text.gsub!(%r{\{\{([^\{]*?)\}\}}im,"")
+ break unless substitution_made
+ end
+ wiki_text
+ end
+
+ #Remove from MediaWiki text anything in an external link
+ #This will remove the description of the link as well - for now
+ def self.parse_external_links(wiki_text)
+ #Delete everything starting with an opening square bracket, continuing with non-bracket characters until a colon, then any characters until it reaches a closing square bracket
+ wiki_text.gsub!(%r{\[[^\[]+?:[^\[]*?\]}im, "")
+ wiki_text
+ end
+
+ #Remove paired XHTML-style syntax
+ def self.parse_paired_tags(wiki_text)
+ #Remove paired tags
+ wiki_text.gsub!(%r{<([a-zA-Z]*)>(.*?)</\1>}im, '\2')
+ wiki_text
+ end
+
+ #Remove non-paired XHTML-style syntax
+ def self.parse_unpaired_tags(wiki_text)
+ wiki_text.gsub!(%r{<[a-zA-Z]*/>}im, "")
+ wiki_text
+ end
+
+ #Remove links to other namespaces (eg [[Wikipedia:Manual of Style]]), to media (eg [[Image:Wiki.png]]) and to other wikis (eg [[es:Plancton]])
+ def self.parse_non_direct_links(wiki_text)
+ wiki_text.gsub!(%r{\[\[[^\[\]]*?:([^\[]|\[\[[^\[]*\]\])*?\]\]}im, "")
+ wiki_text
+ end
+
+ #Remove from wiki_text anything that could confuse the program
+ def self.parse_wiki_text(wiki_text)
+ wiki_text = self.parse_nowiki(wiki_text)
+ wiki_text = self.parse_templates(wiki_text)
+ wiki_text = self.parse_paired_tags(wiki_text)
+ wiki_text = self.parse_unpaired_tags(wiki_text)
+ wiki_text = self.parse_non_direct_links(wiki_text)
+ wiki_text = self.parse_external_links(wiki_text) #Has to come after parse_non_direct_links for now
+ wiki_text
+ end
+
+ #Look for existing wikilinks in a piece of text
+ def self.parse_existing_wiki_links(wiki_text)
+ unparsed_match_arrays = wiki_text.scan(%r{\[\[([^\]\#\|]*)([^\]]*?)\]\]}im)
+ parsed_wiki_article_titles = []
+ unparsed_match_arrays.each do |unparsed_match_array|
+ unparsed_title = unparsed_match_array.first
+ parsed_title = unparsed_title.gsub(/_+/, " ")
+ parsed_wiki_article_titles << parsed_title
+ end
+ parsed_wiki_article_titles.uniq
+ end
+
+ #Determine if the text is in some sort of markup
+ def self.markup_autodetect(document_text)
+ markup = "plain"
+ if document_text =~ %r{\[\[[^\[\]]+\]\]}im
+ markup = "mediawiki"
+ end
+ markup
+ end
+
end
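
Taken together, parse_wiki_text strips out the syntax that could confuse the phrase matcher, and parse_existing_wiki_links collects the titles that are already linked so phrase_is_boring? can skip them. Two illustrative calls, mirroring the unit tests further down this page:

    # Namespace/media links are removed entirely, description and all.
    Article.parse_non_direct_links("start [[Image:wiki.png]]finish")
    #=> "start finish"

    # Titles are harvested without section anchors or pipe labels.
    Article.parse_existing_wiki_links("The rain in [[London]] is quite [[London#climate|wet]]")
    #=> ["London"]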
4 app/views/read/read.rhtml
@@ -14,9 +14,11 @@
<% form_tag(:action => :read) do %>
<table>
-<tr><td colspan=2> <%= text_area_tag(:document_text, params[:document_text], :cols=> 80, :rows=> 10) %></td></tr>
+<tr><td colspan=2> <%= text_area_tag(:document_text, h(params[:document_text]), :cols=> 80, :rows=> 10) %></td></tr>
<tr><td>Web site:</td>
<td><%= select_tag "repository_id", options_for_select(@repository_choices) %></td></tr>
+<tr><td>Markup (if any)</td>
+<td><%= select_tag "markup", options_for_select(@markup_choices) %></td></tr>
<tr><td> <%= submit_tag("Submit text") %> </td><td></td></tr>
</table>
<% end %>
32 config/deploy.example.rb
@@ -0,0 +1,32 @@
+require 'mongrel_cluster/recipes'
+
+set :application, "weatherinlondon"
+set :repository, "https://theweatherinlondon.googlecode.com/svn/trunk/"
+
+set :user, "sample_user_name"
+
+# If you aren't deploying to /u/apps/#{application} on the target
+# servers (which is the default), you can specify the actual location
+# via the :deploy_to variable:
+set :deploy_to, "/home/#{user}/#{application}"
+
+set :deploy_via, :export
+
+# If you aren't using Subversion to manage your source code, specify
+# your SCM below:
+# set :scm, :subversion
+
+#No mongrel cluster available
+#set :mongrel_conf, "#{current_path}/config/mongrel_cluster.yml"
+
+set :symlink_commands, "ln -nfs #{deploy_to}/#{shared_dir}/config/database.yml #{release_path}/config/database.yml"
+
+role :app, "theweatherinlondon.com"
+role :web, "theweatherinlondon.com"
+role :db, "theweatherinlondon.com", :primary => true
+
+#Courtesy of paulhammond.org and also pragmatic deployment, how to deal with database.yml and friends
+desc "link in production database credentials, and other similar files"
+task :after_update_code do
+ run "#{symlink_commands}"
+end
12 test/fixtures/articles.yml
@@ -1,5 +1,13 @@
# Read about fixtures at http://ar.rubyonrails.org/classes/Fixtures.html
-one:
+
+Maria Theresa of Austria:
id: 1
-two:
+ uri: http://af.uncyclopedia.wikia.com/wiki/Maria_Theresa_of_Austria
+ title: Maria Theresa of Austria
+ repository_id: 1
+
+Maria Theresa:
id: 2
+ uri: http://af.uncyclopedia.wikia.com/wiki/Maria_Theresa
+ title: Maria Theresa
+ repository_id: 1
10 test/fixtures/repositories.yml
@@ -1,7 +1,7 @@
# Read about fixtures at http://ar.rubyonrails.org/classes/Fixtures.html
-# one:
-# column: value
-#
-# two:
-# column: value
+#An uncyclopedia language that does not yet exist
+Afrikaans_uncyclopedia:
+ id: 1
+ abbreviation: af-uncyclopedia
+ short_description: "Afrikaans Uncyclopedia"
127 test/unit/article_test.rb
@@ -2,15 +2,17 @@
class ArticleTest < Test::Unit::TestCase
fixtures :articles
+ fixtures :repositories
# Replace this with your real tests.
def test_truth
assert true
end
def test_clean_results
- article1 = Article.create!(:title=> "a", :uri=>"http://www.example.com/1")
- article2 = Article.create!(:title=> "b", :uri=>"http://www.exampel.com/2")
+ repository = Repository.find(:first)
+ article1 = Article.create!(:title=> "a", :uri=>"http://www.example.com/1", :repository=>repository)
+ article2 = Article.create!(:title=> "b", :uri=>"http://www.example.com/2", :repository=>repository)
identical_results = [ ["Winter Olympic", article1] , ["Winter Olympic", article2] ]
cleaned_results = Article.clean_results(identical_results)
assert identical_results.size == 2, "Wrong number of original items"
@@ -20,4 +22,125 @@ def test_clean_results
assert containing_results.size == 2, "Wrong number of original items"
assert cleaned_results.size == 1, "Wrong number of final items"
end
+
+ def test_parse_nowiki
+ generalized_syntax_parsing_testing(:parse_nowiki, "<nowiki>", "</nowiki>", true)
+ generalized_syntax_parsing_testing(:parse_wiki_text, "<nowiki>", "</nowiki>", true)
+ end
+
+ def test_parse_templates
+ generalized_syntax_parsing_testing(:parse_templates, "{{", "}}", true)
+ generalized_syntax_parsing_testing(:parse_wiki_text, "{{", "}}", true)
+ end
+
+ def test_parse_external_links
+ generalized_syntax_parsing_testing(:parse_external_links, "[http://www.example.com", "]", true)
+ generalized_syntax_parsing_testing(:parse_wiki_text, "[http://www.example.com", "]", true)
+ end
+
+ def test_parse_paired_tags
+ generalized_syntax_parsing_testing(:parse_paired_tags, "<ref>", "</ref>", false)
+ generalized_syntax_parsing_testing(:parse_wiki_text, "<ref>", "</ref>", false)
+ end
+
+ def test_parse_unpaired_tags
+ generalized_syntax_parsing_testing(:parse_unpaired_tags, "<references/>", nil, false)
+ generalized_syntax_parsing_testing(:parse_wiki_text, "<references/>", nil, false)
+ end
+
+ def test_parse_non_direct_links
+ generalized_syntax_parsing_testing(:parse_non_direct_links, "[[fr:", "]]", true)
+ generalized_syntax_parsing_testing(:parse_wiki_text, "[[fr:", "]]", true)
+ end
+
+ #More generalized testing of syntax parsing
+ #Assumptions: text is of a form
+ # pre_syntax_text SYNTAX_START inside_syntax_text SYNTAX_FINISH post_syntax_text
+ #and if parsing_removes_inner_section is true, it'll end up as
+ # pre_syntax_text post_syntax_text
+ #else
+ # pre_syntax_text inside_syntax_text post_syntax_text
+ def generalized_syntax_parsing_testing(method_symbol, syntax_start, syntax_finish, parsing_removes_inner_section)
+ pre_syntax_options = ["Internationalization\nLocalization\n", " Internationalization ", "Iñtërnâtiônàlizætiøn", " Iñtërnâtiônàlizætiøn ", " This is Iñtërnâtiônàlizætiøn (ie ǧø ĉȑȧẓẙ with the umlauts!?!). ","Hello: ", "[[Innocent bystander]]"]
+ syntax_options = [ [syntax_start, syntax_finish], ["",""] ]
+ inside_syntax_options = ["http://www.example.com", "Multi\nLine\nExample\n"]
+ post_syntax_options = ["Iñtërnâtiônàlizætiøn", " Iñtërnâtiônàlizætiøn ", " This is Iñtërnâtiônàlizætiøn (ie ǧø ĉȑȧẓẙ with the umlauts!?!). ", "Hello: ", "[[Innocent bystander]]"]
+ syntax_test_pairs = []
+ pre_syntax_options.each do |pre_syntax_option|
+ syntax_options.each do |syntax_option|
+ inside_syntax_options.each do |inside_syntax_option|
+ post_syntax_options.each do |post_syntax_option|
+ syntax_start_option = syntax_option[0] || "" #May be syntax_start, or may be ""
+ syntax_finish_option = syntax_option[1] || "" #May be syntax_finish, or may be ""
+ unparsed_text = pre_syntax_option + syntax_start_option + inside_syntax_option + syntax_finish_option + post_syntax_option
+ if (not (parsing_removes_inner_section) or (syntax_start_option.blank? and syntax_finish_option.blank?) )
+ #Don't remove the inside text
+ parsed_text = pre_syntax_option + inside_syntax_option + post_syntax_option
+ else
+ #Remove the inside text
+ parsed_text = pre_syntax_option + post_syntax_option
+ end
+ syntax_test_pairs << [unparsed_text, parsed_text]
+ end
+ end
+ end
+ end
+ syntax_test_pairs_duplicate = syntax_test_pairs.dup
+ syntax_test_pairs_duplicate.each do |first_pair|
+ syntax_test_pairs_duplicate.each do |second_pair|
+ syntax_test_pairs << [first_pair[0] + second_pair[0], first_pair[1] + second_pair[1] ]
+ end
+ end
+ syntax_test_pairs.each do |syntax_test_pair|
+ unparsed_text = syntax_test_pair[0]
+ parsed_text = syntax_test_pair[1]
+ assert_equal parsed_text, Article.send(method_symbol, unparsed_text)
+ end
+ end
+
+ def test_parse_existing_wiki_links
+ wiki_text = "The rain in [[London]] is quite [[London#climate|wet]]"
+ assert_equal ["London"], Article.parse_existing_wiki_links(wiki_text)
+ end
+
+ def test_nested_templates
+ wiki_text = "abc {{def {{ghi}} jkl}} mno"
+ assert_equal "abc mno", Article.parse_templates(wiki_text)
+ end
+
+ def test_trickier_non_direct_links
+ wiki_texts = ["start [[Image:wiki.png]]finish", "start[[Image:wiki.png|The logo of this [[wiki]]]] finish", "start[[:Image:wiki.png|The logo of this [[wiki]], which is the English Wikipedia]] finish"]
+ wiki_texts.each do |wiki_text|
+ assert_equal "start finish", Article.parse_non_direct_links(wiki_text)
+ assert_equal "start finish", Article.parse_wiki_text(wiki_text)
+ end
+ end
+
+ def test_no_side_effects_on_document_text
+ document_text = "[[en:Wikipedia]]"
+ original_document_text = document_text.dup
+ repository = Repository.find_by_abbreviation("af-uncyclopedia")
+ markup = "auto-detect"
+ Article.parse_text_document(document_text, repository, markup)
+ assert_equal document_text, original_document_text
+ end
+
+ #Test that shortened versions of a title are still skipped when the full title is already wikified
+ def test_handle_shortened_versions_of_wikified_titles
+ repository = Repository.find_by_abbreviation("af-uncyclopedia")
+ markup = "auto-detect"
+ long_article = Article.find_by_title_and_repository_id("Maria Theresa of Austria", repository)
+ short_article = Article.find_by_title_and_repository_id("Maria Theresa", repository)
+ document_text_results_pairs = []
+ document_text_results_pairs << ["#{long_article.title}", [ [long_article.title, long_article ] ] ]
+ document_text_results_pairs << ["[[#{long_article.title}]]", [ ] ]
+ document_text_results_pairs << ["#{long_article.title} : #{short_article.title} was born in", [ [long_article.title, long_article ] ] ]
+ document_text_results_pairs << ["[[#{long_article.title}]] : #{short_article.title} was born in", [ ] ]
+ document_text_results_pairs.each do |document_text_results_pair|
+ document_text = document_text_results_pair[0]
+ expected_results = document_text_results_pair[1]
+ results = Article.parse_text_document(document_text, repository, markup)
+ assert_equal expected_results, results
+ end
+ end
end
