New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Script: Replace hardcoded names with missing tags #1052
Comments
Batch 1: Crematogaster, Lasius and Polyrhachis subgenera
Script
# Edit summary passed to every activity this script creates (per item and the
# final script activity).
EDIT_SUMMARY = 'Replace hardcoded names with `missing` tags [batch 1], see %github1052'
# User that the per-item activities and PaperTrail versions are attributed to.
ANTCATBOT = User.find_by!(name: 'AntCatBot')
# For activities and PaperTrail.
# A fresh UUID is stored as the current request UUID; presumably this groups all
# changes made by this run under one request - TODO confirm.
RequestStore.store[:current_request_uuid] = SecureRandom.uuid
PaperTrail.request.whodunnit = ANTCATBOT.id
# Wraps every occurrence of +replacement+ in a `{missing ...}` tag in the taxt of
# each TaxonHistoryItem whose taxt matches "<replacement>:".
#
# An item is only saved when removing the missing/unmissing tags from the old and
# from the new taxt yields the same text, i.e. when the edit added nothing but
# the tag itself. Saved items also get an :update activity attributed to ANTCATBOT.
#
# @param replacement [String] hardcoded name to tag, e.g. "Lasius (Acanthomyops)"
def replace_with_missing_tags! replacement
  # Escape parentheses for the MySQL regex. Non-bang `gsub` is used on purpose:
  # `gsub!` returns nil when nothing was substituted, so chaining it crashed with
  # NoMethodError for names that lack "(" or ")".
  escaped_replacement = replacement.gsub('(', '\\(').gsub(')', '\\)')
  items = TaxonHistoryItem.where("taxt REGEXP ?", '.*' + escaped_replacement + ':' + '.*')
  puts "#{replacement}: #{items.count}"

  missing_tag = "{missing #{replacement}}"
  # Replaces each missing/unmissing tag with the hardcoded name it wraps.
  strip_tags = lambda do |taxt|
    taxt.gsub(Taxt::MISSING_OR_UNMISSING_TAG) { Regexp.last_match(:hardcoded_name) }
  end

  items.find_each do |item|
    old_taxt = item.taxt.dup

    if old_taxt.include?(missing_tag)
      puts "nope".red
      # NOTE(review): `return` leaves the whole method, so the remaining items for
      # this replacement are skipped too - confirm this is intended (vs. `next`).
      return
    end

    new_taxt = old_taxt.gsub(replacement, missing_tag)
    if old_taxt == new_taxt
      puts "nothing was changed: #{item.id} - #{replacement}".red
      return # NOTE(review): also aborts the remaining items, see above.
    end

    # Safety check: the tagged text must read the same as before once tags are removed.
    if strip_tags.call(old_taxt) == strip_tags.call(new_taxt)
      item.update!(taxt: new_taxt)
      item.create_activity :update, ANTCATBOT, edit_summary: EDIT_SUMMARY
      puts "OK".green
    else
      puts "does not detax in the same way: #{item.id} - #{replacement}".red
    end
  end
end
# ---
# Collect candidate names. The trailing `; nil` presumably only keeps the console
# from echoing the (large) results - TODO confirm.
existing_subgenus_names = Subgenus.joins(:name).pluck(:name); nil
# "Genus (Subgenus)" strings found in history items that contain such a string
# followed by a colon. The MySQL character class is `[A-Z]` (it was the typo
# `[A-z]`, which also spans the punctuation between "Z" and "a"), matching the
# Ruby regex used for the scan.
missing_subgenus_names = TaxonHistoryItem.where("taxt REGEXP ?", ".*[A-Z][a-z]+ \\([A-Z][a-z]+\\):.*").
  pluck(:taxt).join(' ').scan(/[A-Z][a-z]+ \([A-Z][a-z]+\)/).uniq.sort; nil
all_replacements = (existing_subgenus_names + missing_subgenus_names).uniq

# This batch only handles names that contain one of these strings.
subgenera_to_match = %w[
  Crematogaster
  Lasius
  Polyrhachis
]
replacements = all_replacements.select do |replacement|
  subgenera_to_match.any? { |subgenus| replacement.include?(subgenus) }
end

# Tag the italicized form first, then the plain form.
replacements.each do |replacement|
  replacement_with_italics = "<i>#{replacement}</i>"
  replace_with_missing_tags! replacement_with_italics
  replace_with_missing_tags! replacement
end; nil
# ---
# Record a single script activity for the whole run.
Activity.execute_script_activity User.find_by!(name: 'Fredrik Palmkron'), EDIT_SUMMARY
Batch 2: 500 more subgenera
Script
# Batch number, interpolated into the edit summary below.
BATCH = 2
# Edit summary passed to every activity this script creates.
EDIT_SUMMARY = "Replace hardcoded names with `missing` tags [batch #{BATCH}], see %github1052"
# User that the per-item activities and PaperTrail versions are attributed to.
ANTCATBOT = User.find_by!(name: 'AntCatBot')
# For activities and PaperTrail.
RequestStore.store[:current_request_uuid] = SecureRandom.uuid
PaperTrail.request.whodunnit = ANTCATBOT.id
# The driver loop stops picking up new names once more than this many items
# have been replaced.
MAX_REPLACE_IN_BATCH = 500
# Running count of replaced items; incremented by `replace_with_missing_tags!`.
$replaced_in_batch = 0
# Wraps every occurrence of +replacement+ in a `{missing ...}` tag in the taxt of
# each TaxonHistoryItem whose taxt matches "<replacement>:".
#
# An item is only saved when removing the missing/unmissing tags from the old and
# from the new taxt yields the same text. Each saved item gets an :update activity
# attributed to ANTCATBOT and increments the global `$replaced_in_batch`.
#
# @param replacement [String] hardcoded name to tag, e.g. "Lasius (Acanthomyops)"
def replace_with_missing_tags! replacement
  # Escape parentheses for the MySQL regex. Non-bang `gsub` is used on purpose:
  # `gsub!` returns nil when nothing was substituted, so chaining it crashed with
  # NoMethodError for names that lack "(" or ")".
  escaped_replacement = replacement.gsub('(', '\\(').gsub(')', '\\)')
  items = TaxonHistoryItem.where("taxt REGEXP ?", '.*' + escaped_replacement + ':' + '.*')
  # puts "#{replacement}: #{items.count}"

  missing_tag = "{missing #{replacement}}"
  # Replaces each missing/unmissing tag with the hardcoded name it wraps.
  strip_tags = lambda do |taxt|
    taxt.gsub(Taxt::MISSING_OR_UNMISSING_TAG) { Regexp.last_match(:hardcoded_name) }
  end

  items.find_each do |item|
    old_taxt = item.taxt.dup

    if old_taxt.include?(missing_tag)
      puts "nope".red
      # NOTE(review): `return` leaves the whole method, so the remaining items for
      # this replacement are skipped too - confirm this is intended (vs. `next`).
      return
    end

    new_taxt = old_taxt.gsub(replacement, missing_tag)
    if old_taxt == new_taxt
      puts "nothing was changed: #{item.id} - #{replacement}".red
      return # NOTE(review): also aborts the remaining items, see above.
    end

    # Safety check: the tagged text must read the same as before once tags are removed.
    if strip_tags.call(old_taxt) == strip_tags.call(new_taxt)
      item.update!(taxt: new_taxt)
      item.create_activity :update, ANTCATBOT, edit_summary: EDIT_SUMMARY
      puts "OK".green
      $replaced_in_batch += 1
    else
      puts "does not detax in the same way: #{item.id} - #{replacement}".red
    end
  end
end
# ---
# Collect candidate names. The trailing `; nil` presumably only keeps the console
# from echoing the (large) results - TODO confirm.
existing_subgenus_names = Subgenus.joins(:name).pluck(:name); nil
# "Genus (Subgenus)" strings found in history items that contain such a string
# followed by a colon. The MySQL character class is `[A-Z]` (it was the typo
# `[A-z]`, which also spans the punctuation between "Z" and "a"), matching the
# Ruby regex used for the scan.
missing_subgenus_names = TaxonHistoryItem.where("taxt REGEXP ?", ".*[A-Z][a-z]+ \\([A-Z][a-z]+\\):.*").
  pluck(:taxt).join(' ').scan(/[A-Z][a-z]+ \([A-Z][a-z]+\)/).uniq.sort; nil
all_replacements = (existing_subgenus_names + missing_subgenus_names).uniq
replacements = all_replacements

replacements.each do |replacement|
  # The cap is only checked between names, so a single name with many items can
  # push the count past MAX_REPLACE_IN_BATCH.
  break if $replaced_in_batch > MAX_REPLACE_IN_BATCH

  # Tag the italicized form first, then the plain form.
  replacement_with_italics = "<i>#{replacement}</i>"
  replace_with_missing_tags! replacement_with_italics
  replace_with_missing_tags! replacement
end; nil
# ---
Activity.execute_script_activity User.find_by!(name: 'Fredrik Palmkron'), EDIT_SUMMARY
Batch 3: Remaining simple subgenus cases
Script
# Batch number, interpolated into the edit summary below.
BATCH = 3
# Edit summary passed to every activity this script creates.
EDIT_SUMMARY = "Replace hardcoded names with `missing` tags [batch #{BATCH}], see %github1052"
# User that the per-item activities and PaperTrail versions are attributed to.
ANTCATBOT = User.find_by!(name: 'AntCatBot')
# For activities and PaperTrail.
RequestStore.store[:current_request_uuid] = SecureRandom.uuid
PaperTrail.request.whodunnit = ANTCATBOT.id
# The driver loop stops picking up new names once more than this many items
# have been replaced.
MAX_REPLACE_IN_BATCH = 9999
# Running count of replaced items; incremented by `replace_with_missing_tags!`.
$replaced_in_batch = 0
# Wraps every occurrence of +replacement+ in a `{missing ...}` tag in the taxt of
# each TaxonHistoryItem whose taxt matches "<replacement>:".
#
# An item is only saved when removing the missing/unmissing tags from the old and
# from the new taxt yields the same text. Each saved item gets an :update activity
# attributed to ANTCATBOT and increments the global `$replaced_in_batch`.
#
# @param replacement [String] hardcoded name to tag, e.g. "Lasius (Acanthomyops)"
def replace_with_missing_tags! replacement
  # Escape parentheses for the MySQL regex. Non-bang `gsub` is used on purpose:
  # `gsub!` returns nil when nothing was substituted, so chaining it crashed with
  # NoMethodError for names that lack "(" or ")".
  escaped_replacement = replacement.gsub('(', '\\(').gsub(')', '\\)')
  items = TaxonHistoryItem.where("taxt REGEXP ?", '.*' + escaped_replacement + ':' + '.*')
  # puts "#{replacement}: #{items.count}"

  missing_tag = "{missing #{replacement}}"
  # Replaces each missing/unmissing tag with the hardcoded name it wraps.
  strip_tags = lambda do |taxt|
    taxt.gsub(Taxt::MISSING_OR_UNMISSING_TAG) { Regexp.last_match(:hardcoded_name) }
  end

  items.find_each do |item|
    old_taxt = item.taxt.dup

    if old_taxt.include?(missing_tag)
      puts "nope".red
      # NOTE(review): `return` leaves the whole method, so the remaining items for
      # this replacement are skipped too - confirm this is intended (vs. `next`).
      return
    end

    new_taxt = old_taxt.gsub(replacement, missing_tag)
    if old_taxt == new_taxt
      puts "nothing was changed: #{item.id} - #{replacement}".red
      return # NOTE(review): also aborts the remaining items, see above.
    end

    # Safety check: the tagged text must read the same as before once tags are removed.
    if strip_tags.call(old_taxt) == strip_tags.call(new_taxt)
      item.update!(taxt: new_taxt)
      item.create_activity :update, ANTCATBOT, edit_summary: EDIT_SUMMARY
      puts "OK".green
      $replaced_in_batch += 1
    else
      puts "does not detax in the same way: #{item.id} - #{replacement}".red
    end
  end
end
# ---
# Collect candidate names. The trailing `; nil` presumably only keeps the console
# from echoing the (large) results - TODO confirm.
existing_subgenus_names = Subgenus.joins(:name).pluck(:name); nil
# "Genus (Subgenus)" strings found in history items that contain such a string
# followed by a colon. The MySQL character class is `[A-Z]` (it was the typo
# `[A-z]`, which also spans the punctuation between "Z" and "a"), matching the
# Ruby regex used for the scan.
missing_subgenus_names = TaxonHistoryItem.where("taxt REGEXP ?", ".*[A-Z][a-z]+ \\([A-Z][a-z]+\\):.*").
  pluck(:taxt).join(' ').scan(/[A-Z][a-z]+ \([A-Z][a-z]+\)/).uniq.sort; nil
all_replacements = (existing_subgenus_names + missing_subgenus_names).uniq
replacements = all_replacements

replacements.each do |replacement|
  # The cap is only checked between names, so a single name with many items can
  # push the count past MAX_REPLACE_IN_BATCH.
  break if $replaced_in_batch > MAX_REPLACE_IN_BATCH

  # Tag the italicized form first, then the plain form.
  replacement_with_italics = "<i>#{replacement}</i>"
  replace_with_missing_tags! replacement_with_italics
  replace_with_missing_tags! replacement
end; nil
# ---
# Record a single script activity for the whole run.
Activity.execute_script_activity User.find_by!(name: 'Fredrik Palmkron'), EDIT_SUMMARY
Batch 4: Subgenera with double italics tags -
|
Batch 5: More subgenera with single italics tags -
|
|
More genus-group name batches
ReplaceTaxt
# Replaces hardcoded names in TaxonHistoryItem taxts with `{missing ...}` tags.
#
# +taxt_regexes+ is a list of `[mysql_regex, ruby_regex]` pairs: the MySQL regex
# selects candidate items, and the Ruby regex - which must define a named group
# `hardcoded_name` - locates the name to tag inside each taxt. Italics tags
# inside the captured name are dropped from the generated tag.
#
# An item is only saved when old and new taxt read the same once the
# missing/unmissing tags and italics tags are removed.
class ReplaceTaxt
  ITALICS_TAG_REGEX = %r{</?i>}

  attr_accessor :replaced_in_batch, :debug

  # @param taxt_regexes [Array<Array(String, Regexp)>] `[mysql_regex, ruby_regex]` pairs
  # @param edit_summary [String] edit summary used for all created activities
  # @param max_replace_in_batch [Integer] abort once this many items have been replaced
  # @param debug [Boolean] when true, items are counted but not saved
  def initialize taxt_regexes, edit_summary, max_replace_in_batch: 99, debug: false
    @taxt_regexes = taxt_regexes
    @edit_summary = edit_summary
    @max_replace_in_batch = max_replace_in_batch
    @debug = debug
    @replaced_in_batch = 0
  end

  # Same as #call, but switches debug mode on first.
  def calld
    self.debug = true
    call
  end

  # Processes every regex pair, then records a script activity.
  #
  # NOTE(review): the script activity is recorded in debug mode as well - confirm
  # this is wanted for dry runs.
  #
  # @raise [RuntimeError] "reached max" once `max_replace_in_batch` items were replaced
  def call
    puts "DEBUG".green if debug
    setup_tracking

    taxt_regexes.each do |(mysql_regex, ruby_regex)|
      replace_with_missing_tags! mysql_regex, ruby_regex
    end

    create_script_activity
  end

  private

    attr_reader :taxt_regexes, :edit_summary, :max_replace_in_batch

    # Sets the request UUID and the PaperTrail whodunnit for this run.
    def setup_tracking
      RequestStore.store[:current_request_uuid] = SecureRandom.uuid
      PaperTrail.request.whodunnit = antcat_bot.id
    end

    def antcat_bot
      @_antcat_bot ||= User.find_by!(name: 'AntCatBot')
    end

    def create_script_activity
      puts "replaced_in_batch: #{replaced_in_batch}".blue
      Activity.execute_script_activity User.find_by!(name: 'Fredrik Palmkron'), edit_summary
    end

    def replace_with_missing_tags! mysql_regex, ruby_regex
      items = TaxonHistoryItem.where("BINARY taxt REGEXP ?", mysql_regex)
      puts mysql_regex.to_s.blue + " --- " + items.count.to_s.green

      items.find_each do |item|
        if replaced_in_batch >= max_replace_in_batch
          create_script_activity
          raise "reached max"
        end

        old_taxt = item.taxt.dup
        puts "old_taxt: #{old_taxt}".blue
        new_taxt = tag_hardcoded_names old_taxt, ruby_regex
        puts "new_taxt: #{new_taxt}".yellow

        if old_taxt == new_taxt
          puts "nothing was changed: #{item.id} - #{mysql_regex}".red
          # NOTE(review): `return` leaves the whole method, so the remaining items
          # for this regex are skipped too - confirm this is intended (vs. `next`).
          return
        end

        # Safety check: the tagged text must read the same as before.
        unless detax(old_taxt) == detax(new_taxt)
          puts "does not detax in the same way: #{item.id} - #{mysql_regex}".red
          next
        end

        self.replaced_in_batch += 1
        puts "OK".green
        next if debug

        item.update!(taxt: new_taxt)
        item.create_activity :update, antcat_bot, edit_summary: edit_summary
      end
    end

    # Returns a copy of +taxt+ in which the span captured as `hardcoded_name` by
    # each match of +ruby_regex+ is replaced with a `{missing ...}` tag.
    #
    # The captured span is replaced by offset. Substituting by text would tag the
    # first textual occurrence of the name inside the match, which is the wrong
    # one for e.g. "Combination in Formica; in Formica:".
    def tag_hardcoded_names taxt, ruby_regex
      taxt.gsub(ruby_regex) do |matched|
        match_data = Regexp.last_match
        hardcoded_name = match_data[:hardcoded_name]
        without_italics = hardcoded_name.gsub(ITALICS_TAG_REGEX, '')
        offset_in_match = match_data.begin(:hardcoded_name) - match_data.begin(0)

        tagged = matched.dup
        tagged[offset_in_match, hardcoded_name.length] = "{missing #{without_italics}}"
        tagged
      end
    end

    # Returns +taxt+ with missing/unmissing tags replaced by the name they wrap
    # and with italics tags removed.
    def detax taxt
      without_tags = taxt.gsub(Taxt::MISSING_OR_UNMISSING_TAG_REGEX) do
        Regexp.last_match(:hardcoded_name)
      end
      without_tags.gsub(ITALICS_TAG_REGEX, '')
    end
end
# Genus-group names and above.
# Each entry pairs the MySQL regex that selects items with the Ruby regex whose
# `hardcoded_name` group marks the name to tag.
batch = 6
edit_summary = "Replace hardcoded names with `missing` tags [batch #{batch}], see %github1052"
taxt_regexes = [
  ['^{(pro|missing|tax) [0-9]+} as junior synonym of ([A-Z][a-z]+):', /^{(pro|missing|tax) [0-9]+} as junior synonym of (?<hardcoded_name>[A-Z][a-z]+):/],
]
ReplaceTaxt.new(taxt_regexes, edit_summary).call

# Genus-group names and above.
batch = 7
edit_summary = "Replace hardcoded names with `missing` tags [batch #{batch}], see %github1052"
taxt_regexes = [
  ['^Combination in (<i>)?([A-Z][a-z]+)(</i>)?:', /^Combination in (?<hardcoded_name>(<i>)?[A-Z][a-z]+(<\/i>)?):/],
]
ReplaceTaxt.new(taxt_regexes, edit_summary).call

# Genus-group names and above.
batch = 8
edit_summary = "Replace hardcoded names with `missing` tags [batch #{batch}], see %github1052"
taxt_regexes = [
  ['^Combination in .+; in (<i>)?([A-Z][a-z]+)(</i>)?:', /^Combination in .+; in (?<hardcoded_name>(<i>)?[A-Z][a-z]+(<\/i>)?):/],
  ['^Combination in .+; in (<i>)?([A-Z][a-z]+ \\([A-Z][a-z]+\\))(</i>)?:', /^Combination in .+; in (?<hardcoded_name>(<i>)?[A-Z][a-z]+ \([A-Z][a-z]+\)(<\/i>)?):/],
]
# The stray table-cell "|" that followed `.call` was removed; it turned this
# statement into a dangling `|` expression.
ReplaceTaxt.new(taxt_regexes, edit_summary).call
Future batches
# Species-group names.
# Wait, due to epithets?
# Placeholder batch number: this script is listed under "Future batches".
batch = 999
edit_summary = "Replace hardcoded names with `missing` tags [batch #{batch}], see %github1052"
# Unlike the test batches, these patterns do not require a colon after the name.
taxt_regexes = [
  ['^Junior synonym of ([A-Z][a-z]+ [a-z]+)', /^Junior synonym of (?<hardcoded_name>[A-Z][a-z]+ [a-z]+)/],
  ['^Senior synonym of ([A-Z][a-z]+ [a-z]+)', /^Senior synonym of (?<hardcoded_name>[A-Z][a-z]+ [a-z]+)/]
]
# Run twice; each run replaces at most 400 items.
# The stray table-cell "|" after the second `.call` was removed.
ReplaceTaxt.new(taxt_regexes, edit_summary, max_replace_in_batch: 400).call
ReplaceTaxt.new(taxt_regexes, edit_summary, max_replace_in_batch: 400).call
|
Junior/senior synonym of - test batches
# Species-group names.
# Test batch: at most 25 replacements, only names directly followed by a colon.
batch = 9
edit_summary = "Replace hardcoded names with `missing` tags [batch #{batch}], see %github1052"
taxt_regexes = [
  ['^Senior synonym of ([A-Z][a-z]+ [a-z]+):', /^Senior synonym of (?<hardcoded_name>[A-Z][a-z]+ [a-z]+):/]
]
ReplaceTaxt.new(taxt_regexes, edit_summary, max_replace_in_batch: 25).call

# Species-group names.
# Test batch: at most 25 replacements, only names directly followed by a colon.
batch = 10
edit_summary = "Replace hardcoded names with `missing` tags [batch #{batch}], see %github1052"
taxt_regexes = [
  ['^Junior synonym of ([A-Z][a-z]+ [a-z]+):', /^Junior synonym of (?<hardcoded_name>[A-Z][a-z]+ [a-z]+):/]
]
# The stray table-cell "|" that followed `.call` was removed.
ReplaceTaxt.new(taxt_regexes, edit_summary, max_replace_in_batch: 25).call
Replacement names
# Species-group names.
batch = 11
edit_summary = "Replace hardcoded names with `missing` tags [batch #{batch}], see %github1052"
# One [mysql_regex, ruby_regex] pair per "replacement name for" phrasing.
# In the single-quoted MySQL patterns, "\[" and "\(" keep their backslash, so
# MySQL receives the escaped bracket/parenthesis.
taxt_regexes = [
  ['^Replacement name for ([A-Z][a-z]+ [a-z]+)', /^Replacement name for (?<hardcoded_name>[A-Z][a-z]+ [a-z]+)/],
  ['^Unnecessary replacement name for ([A-Z][a-z]+ [a-z]+)', /^Unnecessary replacement name for (?<hardcoded_name>[A-Z][a-z]+ [a-z]+)/],
  ['^\[Unnecessary replacement name for ([A-Z][a-z]+ [a-z]+)', /^\[Unnecessary replacement name for (?<hardcoded_name>[A-Z][a-z]+ [a-z]+)/],
  ['^Unnecessary \(second\) replacement name for ([A-Z][a-z]+ [a-z]+)', /^Unnecessary \(second\) replacement name for (?<hardcoded_name>[A-Z][a-z]+ [a-z]+)/],
  ['^\[Incorrectly proposed replacement name for ([A-Z][a-z]+ [a-z]+)', /^\[Incorrectly proposed replacement name for (?<hardcoded_name>[A-Z][a-z]+ [a-z]+)/],
  ['^First available replacement name for ([A-Z][a-z]+ [a-z]+)', /^First available replacement name for (?<hardcoded_name>[A-Z][a-z]+ [a-z]+)/],
  [
    '^\[Incorrectly proposed and unnecessary replacement name for ([A-Z][a-z]+ [a-z]+)',
    /^\[Incorrectly proposed and unnecessary replacement name for (?<hardcoded_name>[A-Z][a-z]+ [a-z]+)/
  ]
]
# The stray table-cell "|" that followed `.call` was removed.
ReplaceTaxt.new(taxt_regexes, edit_summary, max_replace_in_batch: 200).call
"homonym of"
# Species-group names
batch = 14
edit_summary = "Replace hardcoded names with `missing` tags [batch #{batch}], see %github1052"
# Two-word and three-word names that are followed by " {", i.e. by the start of a tag.
# These patterns are not anchored to the start of the taxt.
taxt_regexes = [
  ['homonym of ([A-Z][a-z]+ [a-z]+) {', /homonym of (?<hardcoded_name>[A-Z][a-z]+ [a-z]+) {/],
  ['homonym of ([A-Z][a-z]+ [a-z]+ [a-z]+) {', /homonym of (?<hardcoded_name>[A-Z][a-z]+ [a-z]+ [a-z]+) {/]
]
# The stray table-cell "|" that followed `.call` was removed.
ReplaceTaxt.new(taxt_regexes, edit_summary, max_replace_in_batch: 200).call
Closing for now. We must run some of these again after splitting items (see https://github.com/calacademy-research/antcat-issues/issues/52).
For #35
The text was updated successfully, but these errors were encountered: