Added basic processing of wikitext, instructing the program to ignore links already wikified.
agrimm · May 31, 2008 · ca0d3ff · ca0d3ff
1 parent 7a54d93
commit ca0d3ff
Show file tree

Hide file tree

Showing 7 changed files with 289 additions and 21 deletions.
diff --git a/app/controllers/read_controller.rb b/app/controllers/read_controller.rb
@@ -2,6 +2,7 @@ class ReadController < ApplicationController
 
   def read
     @repository_choices = Repository.find(:all).map {|rc| [rc.short_description, rc.id]}
+    @markup_choices = [ ["Auto-detect (default)", "auto-detect"], ["MediaWiki formatting", "mediawiki"], ["Plain text", "plain"] ]
     if request.post?
       @errors = []
       if params[:document_text].blank?
@@ -13,9 +14,13 @@ def read
       unless repository
         @errors << "Can't find repository"
       end
+      markup = params[:markup]
+      unless @markup_choices.map{|pair| pair.last}.include?(markup)
+        @errors << "Invalid markup choice"
+      end
       if @errors.empty?
         begin
-          @parse_results = Article.parse_text_document(document_text, repository)
+          @parse_results = Article.parse_text_document(document_text, repository, markup)
         rescue ArgumentError => error
           if error.message == "Document has too many words"
             @errors << "Please submit a text fewer than #{Article.maximum_allowed_document_size} words long" 

diff --git a/app/models/article.rb b/app/models/article.rb
@@ -23,51 +23,68 @@ def self.break_up_phrase(phrase)
 
   #Determine if a phrase is boring
   #That is, it has one or zero non-boring words
-  def self.phrase_is_boring?(phrase)
+  #  or the wiki text already links to an article whose title contains it (if applicable)
+  def self.phrase_is_boring?(phrase, existing_article_titles)
+    #if existing_article_titles.any?{|existing_article_title| existing_article_title.chars.downcase.to_s.include?(phrase.chars.downcase)} #Unicode safe, too slow? :(
+    if existing_article_titles.any?{|existing_article_title| existing_article_title.downcase.include?(phrase.downcase)} #Not unicode safe?
+      return true
+    end
     words = break_up_phrase(phrase)
     #count how many words are non-boring
     boring_words = %w{a and also are be been for get has in is just me of on only see than this the there was january february march april may june july august september october november december}
     number_non_boring_words = 0
     words.each do |word|
-      number_non_boring_words += 1 unless boring_words.include?(word.chars.downcase)
+      number_non_boring_words += 1 unless boring_words.include?(word.downcase) #Not unicode safe?
+      #number_non_boring_words += 1 unless boring_words.include?(word.chars.downcase) #Unicode safe
     end
     return true unless number_non_boring_words > 1
   end
 
   #Return all articles that match the requested phrase
   #Probably should only return one article, but return an array just in case
-  def self.find_matching_articles(phrase, repository)
-    return [] if phrase_is_boring?(phrase)
+  def self.find_matching_articles(phrase, repository, existing_article_titles)
+    return [] if phrase_is_boring?(phrase, existing_article_titles)
     articles = find(:all, :conditions => ["title = ? and repository_id = ?", phrase, repository], :limit => 1)
     articles
   end
 
   #Informs the caller if they should try a longer phrase than the current one in order to get a match
-  def self.try_longer_phrase?(phrase, repository)
-    if phrase_is_boring?(phrase)
+  def self.try_longer_phrase?(phrase, repository, existing_article_titles)
+    if phrase_is_boring?(phrase, existing_article_titles)
       return true #Otherwise it chews up too much server time
     end
     potentially_matching_articles = find(:all, :conditions => ["title like ? and repository_id = ?", phrase + "%", repository], :limit=>1)
     return !potentially_matching_articles.empty?
   end
 
+  #The main method called from the controller
   #Read in a document, and return an array of phrases and their matching articles
   #Strategy: split into words, then iterate through the words
-  def self.parse_text_document(document_text, repository)
+  def self.parse_text_document(document_text, repository, markup)
     parse_results = []
     words = break_up_phrase(document_text)
     raise(ArgumentError, "Document has too many words") if words.size > maximum_allowed_document_size
+    if (markup == "auto-detect")
+      markup = self.markup_autodetect(document_text)
+    end
+    if (markup == "mediawiki")
+      wiki_text = document_text.dup
+      parsed_wiki_text = self.parse_wiki_text(wiki_text)
+      existing_article_titles = self.parse_existing_wiki_links(parsed_wiki_text)
+    else
+      existing_article_titles = []
+    end
     i = 0
     while(true)
       j = 0
       phrase = words[i + j]
       while(true)
-        matching_articles = find_matching_articles(phrase, repository)
+        matching_articles = find_matching_articles(phrase, repository, existing_article_titles)
         matching_articles.each do |matching_article|
           parse_results << [phrase, matching_article]
         end
 
-        break unless (try_longer_phrase?(phrase, repository) and i + j + 1 < words.size)
+        break unless (try_longer_phrase?(phrase, repository, existing_article_titles) and i + j + 1 < words.size)
         j = j + 1
         phrase += " "
         phrase += words[i + j]
@@ -83,7 +100,7 @@ def self.parse_text_document(document_text, repository)
 
   #a method to get rid of the duplicate results
   def self.clean_results(parse_results)
-    parse_results.delete_if {|x| !(x[0].include?(" ") )}
+    parse_results.delete_if {|x| !(x[0].include?(" ") )} #This line may be redundant
     #Get rid of results with a phrase shorter than another phrase in parse_results
     #Get rid of results with a phrase already included in cleaned_results
     cleaned_results = []
@@ -110,4 +127,85 @@ def self.clean_results(parse_results)
     cleaned_results
   end
 
+  #Strip every <nowiki>...</nowiki> span (the tags and the text between them) from MediaWiki text; edits wiki_text in place and returns it
+  def self.parse_nowiki(wiki_text)
+    loop do #Keep making passes until one of them substitutes nothing
+      #Delete each nowiki pair, non-greedily; the i flag ignores tag case and the m flag lets . span newlines
+      #Assumes that there aren't nested nowikis
+      substitution_made = wiki_text.gsub!(%r{<nowiki>(.*?)</nowiki>}im,"") #gsub! returns nil when nothing matched
+      break unless substitution_made
+    end
+    wiki_text
+  end
+
+  #Strip template calls (double-brace pairs and their contents) from MediaWiki text; edits wiki_text in place and returns it
+  def self.parse_templates(wiki_text)
+    loop do #Repeated passes peel nested templates from the inside out
+      #Delete anything with paired {{ and }}, so long as no opening braces are inside (so only innermost templates match on a given pass)
+      #Should closing braces inside be forbidden as well?
+      substitution_made = wiki_text.gsub!(%r{\{\{([^\{]*?)\}\}}im,"") #gsub! returns nil when nothing matched
+      break unless substitution_made
+    end
+    wiki_text
+  end
+
+  #Remove single-bracket external links from MediaWiki text; edits wiki_text in place and returns it
+  #This will remove the description of the link as well - for now
+  def self.parse_external_links(wiki_text)
+    #Delete everything starting with an opening square bracket, continuing with characters other than an opening square bracket until a colon, then characters other than an opening square bracket until the nearest closing square bracket
+    wiki_text.gsub!(%r{\[[^\[]+?:[^\[]*?\]}im, "") #Return value (nil when nothing matched) is deliberately ignored
+    wiki_text
+  end
+
+  #Unwrap paired XHTML-style tags, keeping the text between them; edits wiki_text in place and returns it
+  def self.parse_paired_tags(wiki_text)
+    #Only tags made of letters alone match (no attributes); the closing tag must repeat the opening name; the enclosed text is kept
+    wiki_text.gsub!(%r{<([a-zA-Z]*)>(.*?)</\1>}im, '\2') #NOTE(review): single pass, so a tag nested inside a same-named tag looks like it leaves one pair behind - confirm
+    wiki_text
+  end
+
+  #Delete self-closing XHTML-style tags from MediaWiki text; edits wiki_text in place and returns it
+  def self.parse_unpaired_tags(wiki_text)
+    wiki_text.gsub!(%r{<[a-zA-Z]*/>}im, "") #Letters only before the slash: a tag with a space before the slash or with attributes is not matched - NOTE(review): confirm that is intended
+    wiki_text
+  end
+
+  #Remove links to other namespaces (eg [[Wikipedia:Manual of Style]]) , to media (eg [[Image:Wiki.png]]) and to other wikis (eg [[es:Plancton]]); edits wiki_text in place and returns it
+  def self.parse_non_direct_links(wiki_text)
+    wiki_text.gsub!(%r{\[\[[^\[\]]*?:([^\[]|\[\[[^\[]*\]\])*?\]\]}im, "") #Matches a double-bracket link with a colon before any bracket; the alternation tolerates double-bracket links nested one level inside it
+    wiki_text
+  end
+
+  #Run every clean-up pass over wiki_text in a fixed order and return the result; each pass edits the string in place, so the caller's string is changed too
+  def self.parse_wiki_text(wiki_text)
+    wiki_text = self.parse_nowiki(wiki_text) #nowiki spans
+    wiki_text = self.parse_templates(wiki_text) #double-brace templates
+    wiki_text = self.parse_paired_tags(wiki_text) #paired tags are unwrapped, their text kept
+    wiki_text = self.parse_unpaired_tags(wiki_text) #self-closing tags
+    wiki_text = self.parse_non_direct_links(wiki_text) #namespace, media and interwiki links
+    wiki_text = self.parse_external_links(wiki_text) #Has to come after parse_non_direct_links for now (presumably because its single-bracket pattern would also match inside a double-bracket namespace link - confirm)
+    wiki_text
+  end
+
+  #Collect the target titles of the double-bracket wikilinks found in wiki_text; returns an array of unique title strings and does not modify wiki_text
+  def self.parse_existing_wiki_links(wiki_text)
+    unparsed_match_arrays = wiki_text.scan(%r{\[\[([^\]\#\|]*)([^\]]*?)\]\]}im) #One [title, rest] pair per link; the title stops at the first closing bracket, hash or pipe
+    parsed_wiki_article_titles = []
+    unparsed_match_arrays.each do |unparsed_match_array|
+      unparsed_title = unparsed_match_array.first #NOTE(review): can be an empty string, eg for a link that starts with a hash - confirm callers cope
+      parsed_title = unparsed_title.gsub(/_+/, " ") #Each run of underscores becomes a single space
+      parsed_wiki_article_titles << parsed_title
+    end
+    parsed_wiki_article_titles.uniq
+  end
+
+  #Guess the markup of document_text: returns "mediawiki" if it contains at least one double-bracket link, otherwise "plain"
+  def self.markup_autodetect(document_text)
+    markup = "plain" #Default when no wikilink is found
+    if document_text =~ %r{\[\[[^\[\]]+\]\]}im #A double-bracket pair enclosing one or more non-bracket characters
+      markup = "mediawiki"
+    end
+    markup
+  end
+
 end
diff --git a/app/views/read/read.rhtml b/app/views/read/read.rhtml
@@ -14,9 +14,11 @@
 
 <% form_tag(:action => :read) do %>
 <table>
-<tr><td colspan=2> <%= text_area_tag(:document_text, params[:document_text], :cols=> 80, :rows=> 10) %></td></tr>
+<tr><td colspan=2> <%= text_area_tag(:document_text, h(params[:document_text]), :cols=> 80, :rows=> 10) %></td></tr>
 <tr><td>Web site:</td>
 <td><%= select_tag "repository_id", options_for_select(@repository_choices) %></td></tr>
+<tr><td>Markup (if any)</td>
+<td><%= select_tag "markup", options_for_select(@markup_choices) %></td></tr>
 <tr><td>  <%= submit_tag("Submit text") %> </td><td></td></tr>
 </table>
 <% end %>

diff --git a/config/deploy.example.rb b/config/deploy.example.rb
@@ -0,0 +1,32 @@
+require 'mongrel_cluster/recipes' #NOTE(review): still loaded although the mongrel_conf setting below is commented out - confirm it is needed
+
+set :application, "weatherinlondon"
+set :repository,  "https://theweatherinlondon.googlecode.com/svn/trunk/"
+
+set :user, "sample_user_name" #Presumably a placeholder to replace with a real account; it is used in the deploy_to path below
+
+# If you aren't deploying to /u/apps/#{application} on the target
+# servers (which is the default), you can specify the actual location
+# via the :deploy_to variable:
+set :deploy_to, "/home/#{user}/#{application}"
+
+set :deploy_via, :export
+
+# If you aren't using Subversion to manage your source code, specify
+# your SCM below:
+# set :scm, :subversion
+
+#No mongrel cluster available
+#set :mongrel_conf, "#{current_path}/config/mongrel_cluster.yml"
+
+set :symlink_commands, "ln -nfs #{deploy_to}/#{shared_dir}/config/database.yml #{release_path}/config/database.yml" #Shell command that links the shared database.yml into the release's config directory
+
+role :app, "theweatherinlondon.com"
+role :web, "theweatherinlondon.com"
+role :db,  "theweatherinlondon.com", :primary => true
+
+#Courtesy of paulhammond.org and also pragmatic deployment, how to deal with database.yml and friends
+desc "link in production database credentials, and other similar files" 
+task :after_update_code do
+  run "#{symlink_commands}" #Executes the ln command defined in :symlink_commands above
+end
diff --git a/test/fixtures/articles.yml b/test/fixtures/articles.yml
@@ -1,5 +1,13 @@
 # Read about fixtures at http://ar.rubyonrails.org/classes/Fixtures.html
-one:
+
+Maria Theresa of Austria:
   id: 1
-two:
+  uri: http://af.uncyclopedia.wikia.com/wiki/Maria_Theresa_of_Austria
+  title: Maria Theresa of Austria
+  repository_id: 1
+
+Maria Theresa:
   id: 2
+  uri: http://af.uncyclopedia.wikia.com/wiki/Maria_Theresa
+  title: Maria Theresa
+  repository_id: 1
diff --git a/test/fixtures/repositories.yml b/test/fixtures/repositories.yml
@@ -1,7 +1,7 @@
 # Read about fixtures at http://ar.rubyonrails.org/classes/Fixtures.html
 
-# one:
-#   column: value
-#
-# two:
-#   column: value
+#An uncyclopedia language that does not yet exist
+Afrikaans_uncyclopedia:
+  id: 1
+  abbreviation: af-uncyclopedia
+  short_description: "Afrikaans Uncyclopedia"