Skip to content

Commit

Permalink
Added basic processing of wikitext, instructing the program to ignore…
Browse files Browse the repository at this point in the history
… links already wikified.
  • Loading branch information
agrimm committed May 31, 2008
1 parent 7a54d93 commit ca0d3ff
Show file tree
Hide file tree
Showing 7 changed files with 289 additions and 21 deletions.
7 changes: 6 additions & 1 deletion app/controllers/read_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ class ReadController < ApplicationController

def read
@repository_choices = Repository.find(:all).map {|rc| [rc.short_description, rc.id]}
@markup_choices = [ ["Auto-detect (default)", "auto-detect"], ["MediaWiki formatting", "mediawiki"], ["Plain text", "plain"] ]
if request.post?
@errors = []
if params[:document_text].blank?
Expand All @@ -13,9 +14,13 @@ def read
unless repository
@errors << "Can't find repository"
end
markup = params[:markup]
unless @markup_choices.map{|pair| pair.last}.include?(markup)
@errors << "Invalid markup choice"
end
if @errors.empty?
begin
@parse_results = Article.parse_text_document(document_text, repository)
@parse_results = Article.parse_text_document(document_text, repository, markup)
rescue ArgumentError => error
if error.message == "Document has too many words"
@errors << "Please submit a text fewer than #{Article.maximum_allowed_document_size} words long"
Expand Down
118 changes: 108 additions & 10 deletions app/models/article.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,51 +23,68 @@ def self.break_up_phrase(phrase)

#Determine if a phrase is boring
#That is, it has one or zero non-boring words
def self.phrase_is_boring?(phrase)
# and that the wiki text doesn't already link to it (if applicable)
def self.phrase_is_boring?(phrase, existing_article_titles)
#NOTE(review): a one-argument and a two-argument signature both appear in this span, and two
#counting lines appear inside the loop below - looks like a rendered diff without +/- markers; confirm which lines are live
#if existing_article_titles.any?{|existing_article_title| existing_article_title.chars.downcase.to_s.include?(phrase.chars.downcase)} #Unicode safe, too slow? :(
#A phrase counts as boring straight away when its lowercased form is a substring of any lowercased existing title
if existing_article_titles.any?{|existing_article_title| existing_article_title.downcase.include?(phrase.downcase)} #Not unicode safe?
return true
end
words = break_up_phrase(phrase)
#count how many words are non-boring
boring_words = %w{a and also are be been for get has in is just me of on only see than this the there was january february march april may june july august september october november december}
number_non_boring_words = 0
words.each do |word|
number_non_boring_words += 1 unless boring_words.include?(word.chars.downcase)
number_non_boring_words += 1 unless boring_words.include?(word.downcase) #Not unicode safe?
#number_non_boring_words += 1 unless boring_words.include?(word.chars.downcase) #Unicode safe
end
#Returns true for zero or one non-boring word; with two or more the modifier evaluates to nil, so the method returns nil (falsy)
return true unless number_non_boring_words > 1
end

#Return all articles that match the requested phrase
#Probably should only return one article, but return an array just in case
def self.find_matching_articles(phrase, repository)
return [] if phrase_is_boring?(phrase)
def self.find_matching_articles(phrase, repository, existing_article_titles)
#NOTE(review): two signatures and two guard lines appear in this span - looks like a rendered diff without +/- markers; confirm which are live
#Boring phrases are never looked up; the caller gets an empty array
return [] if phrase_is_boring?(phrase, existing_article_titles)
#Exact title match within the given repository; :limit => 1 is passed, so at most one article is requested
articles = find(:all, :conditions => ["title = ? and repository_id = ?", phrase, repository], :limit => 1)
articles
end

#Informs the caller if they should try a longer phrase than the current one in order to get a match
def self.try_longer_phrase?(phrase, repository)
if phrase_is_boring?(phrase)
def self.try_longer_phrase?(phrase, repository, existing_article_titles)
#NOTE(review): two signatures and two opening conditions appear in this span - looks like a rendered diff without +/- markers; confirm which are live
#A boring phrase answers true without touching the database
if phrase_is_boring?(phrase, existing_article_titles)
return true #Otherwise it chews up too much server time
end
#Prefix search: asks for one article in this repository whose title starts with the phrase
potentially_matching_articles = find(:all, :conditions => ["title like ? and repository_id = ?", phrase + "%", repository], :limit=>1)
#true when such an article exists, false otherwise
return !potentially_matching_articles.empty?
end

#The main method called from the controller
#Read in a document, and return an array of phrases and their matching articles
#Strategy: split into words, then iterate through the words
def self.parse_text_document(document_text, repository)
def self.parse_text_document(document_text, repository, markup)
parse_results = []
words = break_up_phrase(document_text)
raise(ArgumentError, "Document has too many words") if words.size > maximum_allowed_document_size
if (markup == "auto-detect")
markup = self.markup_autodetect(document_text)
end
if (markup == "mediawiki")
wiki_text = document_text.dup
parsed_wiki_text = self.parse_wiki_text(wiki_text)
existing_article_titles = self.parse_existing_wiki_links(parsed_wiki_text)
else
existing_article_titles = []
end
i = 0
while(true)
j = 0
phrase = words[i + j]
while(true)
matching_articles = find_matching_articles(phrase, repository)
matching_articles = find_matching_articles(phrase, repository, existing_article_titles)
matching_articles.each do |matching_article|
parse_results << [phrase, matching_article]
end

break unless (try_longer_phrase?(phrase, repository) and i + j + 1 < words.size)
break unless (try_longer_phrase?(phrase, repository, existing_article_titles) and i + j + 1 < words.size)
j = j + 1
phrase += " "
phrase += words[i + j]
Expand All @@ -83,7 +100,7 @@ def self.parse_text_document(document_text, repository)

#a method to get rid of the duplicate results
def self.clean_results(parse_results)
parse_results.delete_if {|x| !(x[0].include?(" ") )}
parse_results.delete_if {|x| !(x[0].include?(" ") )} #This line may be redundant
#Get rid of results with a phrase shorter than another phrase in parse_results
#Get rid of results with a phrase already included in cleaned_results
cleaned_results = []
Expand All @@ -110,4 +127,85 @@ def self.clean_results(parse_results)
cleaned_results
end

#Remove from MediaWiki text anything that is surrounded by <nowiki>
def self.parse_nowiki(wiki_text)
  #Deletes every <nowiki>...</nowiki> span from wiki_text, modifying the string in place
  #Matching is case-insensitive, may cross line breaks and is non-greedy
  #Nested nowikis are assumed not to occur
  #Returns the same string object that was passed in
  nowiki_span = %r{<nowiki>(.*?)</nowiki>}im
  while wiki_text.gsub!(nowiki_span, "")
    #gsub! gives nil once nothing was deleted, which ends the loop
  end
  wiki_text
end

#Remove from MediaWiki text anything within a template
def self.parse_templates(wiki_text)
  #Deletes {{...}} templates from wiki_text, modifying the string in place
  #Only a pair with no opening brace inside it is deleted per pass, so nested
  #templates disappear from the inside out over repeated passes
  #Should closing braces inside be forbidden as well?
  #Returns the same string object that was passed in
  innermost_template = %r{\{\{([^\{]*?)\}\}}im
  while wiki_text.gsub!(innermost_template, "")
    #gsub! gives nil once no template is left to delete, which ends the loop
  end
  wiki_text
end

#Remove from MediaWiki text anything in an external link
#This will remove the description of the link as well - for now
def self.parse_external_links(wiki_text)
  #Deletes external links from wiki_text in place, link description included
  #A link is: an opening square bracket, one or more non-"[" characters up to a colon,
  #then any non-"[" characters up to the first closing square bracket
  #Returns the same string object, whether or not anything was deleted
  external_link = %r{\[[^\[]+?:[^\[]*?\]}im
  wiki_text.gsub!(external_link, "")
  wiki_text
end

#Remove paired XHTML-style syntax
def self.parse_paired_tags(wiki_text)
  #Strips paired XHTML-style tags (<b>...</b>) from wiki_text in place, keeping the text between them
  #Tag names are letters only (no attributes); matching is case-insensitive and may cross line breaks
  #Returns the same string object that was passed in
  paired_tag = %r{<([a-zA-Z]*)>(.*?)</\1>}im
  #One gsub! pass never rescans the text it has just put back, so a pair nested inside
  #another pair (<b><i>x</i></b>) would survive; repeat until a pass changes nothing
  #Every substitution shortens the string, so the loop always terminates
  while wiki_text.gsub!(paired_tag, '\2')
  end
  wiki_text
end

#Remove non-paired XHTML-style syntax
def self.parse_unpaired_tags(wiki_text)
  #Deletes self-closing XHTML-style tags from wiki_text in place
  #Tag names are letters only (no attributes)
  #Whitespace before the closing slash is allowed, so the common "<br />" spelling
  #is removed as well as the compact "<br/>"
  #Returns the same string object, whether or not anything was deleted
  self_closing_tag = %r{<[a-zA-Z]*\s*/>}im
  wiki_text.gsub!(self_closing_tag, "")
  wiki_text
end

#Remove links to other namespaces (eg [[Wikipedia:Manual of Style]]) , to media (eg [[Image:Wiki.png]]) and to other wikis (eg [[es:Plancton]])
def self.parse_non_direct_links(wiki_text)
  #Deletes from wiki_text, in place, every [[prefix:...]] link - that is, a double-bracket link
  #with a colon after a bracket-free prefix (other namespaces, media, other wikis)
  #The part after the colon may itself contain ordinary [[...]] links, eg in an image caption
  #Returns the same string object, whether or not anything was deleted
  prefixed_link = %r{\[\[[^\[\]]*?:([^\[]|\[\[[^\[]*\]\])*?\]\]}im
  wiki_text.gsub!(prefixed_link, "")
  wiki_text
end

#Remove from wiki_text anything that could confuse the program
def self.parse_wiki_text(wiki_text)
  #Runs every clean-up pass over wiki_text, feeding the result of one pass into the next
  #Order matters: parse_external_links has to come after parse_non_direct_links for now
  cleanup_passes = [
    :parse_nowiki,
    :parse_templates,
    :parse_paired_tags,
    :parse_unpaired_tags,
    :parse_non_direct_links,
    :parse_external_links
  ]
  cleanup_passes.inject(wiki_text) { |text, pass| self.send(pass, text) }
end

#Look for existing wikilinks in a piece of text
def self.parse_existing_wiki_links(wiki_text)
  #Collects the targets of all [[...]] links found in wiki_text
  #A target is the text after "[[" up to the first "]", "#" or "|"; whatever follows
  #(section anchor, display text) is matched but thrown away
  #Runs of underscores in a target become a single space
  #Returns an array of targets without duplicates, in order of first appearance
  wiki_link = %r{\[\[([^\]\#\|]*)([^\]]*?)\]\]}im
  titles = wiki_text.scan(wiki_link).map do |target, _remainder|
    target.gsub(/_+/, " ")
  end
  titles.uniq
end

#Determine if the text is in some sort of markup
def self.markup_autodetect(document_text)
  #Guesses the markup of document_text
  #Anything containing at least one [[...]] link with non-empty, bracket-free content
  #is reported as "mediawiki"; everything else is "plain"
  contains_wiki_link = document_text =~ %r{\[\[[^\[\]]+\]\]}im
  contains_wiki_link ? "mediawiki" : "plain"
end

end
4 changes: 3 additions & 1 deletion app/views/read/read.rhtml
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@

<% form_tag(:action => :read) do %>
<table>
<tr><td colspan=2> <%= text_area_tag(:document_text, params[:document_text], :cols=> 80, :rows=> 10) %></td></tr>
<tr><td colspan=2> <%= text_area_tag(:document_text, h(params[:document_text]), :cols=> 80, :rows=> 10) %></td></tr>
<tr><td>Web site:</td>
<td><%= select_tag "repository_id", options_for_select(@repository_choices) %></td></tr>
<tr><td>Markup (if any)</td>
<td><%= select_tag "markup", options_for_select(@markup_choices) %></td></tr>
<tr><td> <%= submit_tag("Submit text") %> </td><td></td></tr>
</table>
<% end %>
Expand Down
32 changes: 32 additions & 0 deletions config/deploy.example.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#Example deployment recipe: application name, repository URL, user and host names below are sample values
#NOTE(review): the set/role/task/run calls look like the Capistrano DSL - confirm the Capistrano version this targets
require 'mongrel_cluster/recipes'

set :application, "weatherinlondon"
set :repository, "https://theweatherinlondon.googlecode.com/svn/trunk/"

#Placeholder account name; it is interpolated into :deploy_to below
set :user, "sample_user_name"

# If you aren't deploying to /u/apps/#{application} on the target
# servers (which is the default), you can specify the actual location
# via the :deploy_to variable:
set :deploy_to, "/home/#{user}/#{application}"

set :deploy_via, :export

# If you aren't using Subversion to manage your source code, specify
# your SCM below:
# set :scm, :subversion

#No mongrel cluster available
#set :mongrel_conf, "#{current_path}/config/mongrel_cluster.yml"

#Shell command that symlinks the shared config/database.yml into the release's config directory
#Built from deploy_to, shared_dir and release_path, so it has to stay below the :deploy_to setting
set :symlink_commands, "ln -nfs #{deploy_to}/#{shared_dir}/config/database.yml #{release_path}/config/database.yml"

#The same host is used for the app, web and db roles; db is marked as primary
role :app, "theweatherinlondon.com"
role :web, "theweatherinlondon.com"
role :db, "theweatherinlondon.com", :primary => true

#Courtesy of paulhammond.org and also pragmatic deployment, how to deal with database.yml and friends
desc "link in production database credentials, and other similar files"
#Runs the symlink command defined in :symlink_commands
#NOTE(review): presumably picked up as a hook that fires after the update_code step - confirm
task :after_update_code do
run "#{symlink_commands}"
end
12 changes: 10 additions & 2 deletions test/fixtures/articles.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# Read about fixtures at http://ar.rubyonrails.org/classes/Fixtures.html
one:

Maria Theresa of Austria:
id: 1
two:
uri: http://af.uncyclopedia.wikia.com/wiki/Maria_Theresa_of_Austria
title: Maria Theresa of Austria
repository_id: 1

Maria Theresa:
id: 2
uri: http://af.uncyclopedia.wikia.com/wiki/Maria_Theresa
title: Maria Theresa
repository_id: 1
10 changes: 5 additions & 5 deletions test/fixtures/repositories.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Read about fixtures at http://ar.rubyonrails.org/classes/Fixtures.html

# one:
# column: value
#
# two:
# column: value
#An uncyclopedia language that does not yet exist
Afrikaans_uncyclopedia:
id: 1
abbreviation: af-uncyclopedia
short_description: "Afrikaans Uncyclopedia"
Loading

0 comments on commit ca0d3ff

Please sign in to comment.