/
txt2mongo.rb
114 lines (104 loc) · 3.13 KB
/
txt2mongo.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# encoding: UTF-8
#
# txt2mongo
#
# Author:: burningTyger (https://github.com/burningTyger)
# Home:: https://github.com/ckh/farhang
# Copyright:: Copyright (c) 2011 burningTyger
# License:: MIT License
#
# With txt2mongo you can check the integrity of the lexicon text files used
# in farhang. Pass a file name or a wildcard as the first argument to check
# the entries of the matching files. If you also pass a database name as the
# second argument, txt2mongo stores the entries in your MongoDB.
# If you don't get any output, your files are OK. If you get file names and
# line numbers, those lines need to be corrected. Usually a ";" has been
# forgotten.
# To check all files, put '*.txt' in quotation marks; otherwise the shell
# expands the wildcard and the second file name is taken as the database name.
#
# run like this
# ruby txt2mongo.rb file.txt|'*.txt' [database]
# example
# ruby txt2mongo.rb k.txt test_db
# or
# ruby txt2mongo.rb '*.txt'
if ARGV[1]
# Load rubygems explicitly, but only when running on Ruby 1.8.
require 'rubygems' if RUBY_VERSION[0,3] == '1.8'
require 'mongo_mapper'
# The second command line argument names the database the entries go into.
MongoMapper.database = ARGV[1]
# A dictionary headword stored as a MongoMapper document.
#
# The import loop in this script only fills in :lemma, :language and the
# embedded translations; :lemma_vowelized and :rtl are declared here but are
# not set anywhere in this script.
class Lemma
include MongoMapper::Document
# the headword itself; declared unique and required
key :lemma, String, :unique => true, :required => true
key :lemma_vowelized, String
key :language, String
key :rtl, Boolean
# embedded Translation documents belonging to this headword
many :translations
# NOTE(review): presumably adds created/updated timestamp keys - confirm
# against the MongoMapper version in use
timestamps!
end
# One source/target pair, embedded in a Lemma through its translations
# association.
#
# The import loop in this script sets :source, :target and :language; :fix is
# declared here but is not set anywhere in this script.
class Translation
include MongoMapper::EmbeddedDocument
key :source, String
key :target, String
key :language, String
key :fix, Boolean
# NOTE(review): presumably adds created/updated timestamp keys - confirm
# against the MongoMapper version in use
timestamps!
end
# NOTE(review): called without a selector this presumably deletes every stored
# lemma, so each import starts from an empty collection - confirm against the
# mongo driver version in use.
Lemma.collection.remove
# NOTE(review): presumably creates an index on the :lemma key if missing.
Lemma.ensure_index(:lemma)
# Replaces every alef wasla (U+0671) in +str+ with alef madda (U+0622).
#
# The string is changed in place. A notice showing the corrected string is
# printed only when at least one character was replaced.
#
# str:: the String to correct (mutated)
# Always returns nil.
def fix_typos(str)
  replaced = str.gsub!("\u0671", "\u0622")
  return unless replaced

  puts "Wasla to Madda in #{str} fixed"
end
# Returns a copy of +str+ without the characters U+064B through U+0655.
# That range includes fatha, damma and kasra, along with the other marks
# located between those two code points.
#
# str:: the String to strip the marks from (left unchanged)
def devowelize(str)
  vowel_marks = "\u064B-\u0655"
  str.delete(vowel_marks)
end
# Trims surrounding whitespace from +str+ and turns every escaped semicolon
# ("\;") into a plain ";".
#
# str:: the raw field text; a nil argument raises NoMethodError
# Returns a new String.
def strip_replace(str)
  trimmed = str.strip
  trimmed.gsub('\;', ';')
end
# Import mode: reads every file matching the first command line argument and
# stores its entries.
#
# A line whose source does not start with "- " opens a new Lemma; a line that
# does start with "- " adds one more Translation to the most recent Lemma.
# Lines with a missing field are reported as "file - line number: text".
lemma = nil
Dir.glob("#{ARGV[0]}").each do |ff|
  File.open(ff, 'r') do |f|
    f.each_line do |l|
      # split only on unescaped semicolons (i.e. not on "\;")
      # this way you can have translations that include semicolons
      source, target = l.split(/(?<!\\)[;]/)
      reported = false
      begin
        source = strip_replace(source)
        target = strip_replace(target)
      rescue
        # strip_replace raises on a missing (nil) field: report the line
        puts "#{ff.to_s} - #{f.lineno}: #{l}"
        reported = true
      end
      # a line that splits into no fields at all (e.g. a lone ";") has been
      # reported above and has nothing to store
      next if source.nil?

      if source.start_with?('- ')
        # a sub-entry needs a lemma to attach to; without one, report the line
        # and skip it instead of calling translations on nil
        if lemma.nil?
          puts "#{ff.to_s} - #{f.lineno}: #{l}" unless reported
          next
        end
        source.sub!('- ', '')
        trans = Translation.new( :source => source, :target => target, :language => "de" )
        lemma.translations << trans
      else
        lemma = Lemma.new( :lemma => source )
        lemma.language = "de"
        # a lemma without a target is stored without any translation
        unless target.nil? || target.empty?
          trans = Translation.new( :source => source, :target => target, :language => "de" )
          lemma.translations << trans
        end
      end
      lemma.save
    end
  end
end
else
# Tells whether a lexicon line needs manual correction.
#
# A line is considered malformed when
# * it has no source or no target field (typically a forgotten ";"),
# * its source starts with "- " once surrounding whitespace is stripped, or
# * it contains more than one unescaped ";". Escaped semicolons ("\;") are
#   not counted, because the import mode of this script accepts them inside
#   a field.
#
# NOTE(review): the import mode treats "- " lines as valid sub-entries while
# this check reports them - confirm that this is intended.
#
# line:: one raw line of a lexicon file
# Returns true or false.
def malformed_entry?(line)
  separator = /(?<!\\);/
  source, target = line.split(separator)
  return true if source.nil? || target.nil?

  source.strip.start_with?('- ') || line.scan(separator).size > 1
end

# Check-only mode: prints "file - line number: text" once for every malformed
# line of every file matching the first command line argument.
Dir.glob("#{ARGV[0]}").each do |ff|
  File.open(ff, 'r') do |f|
    f.each_line do |l|
      puts "#{ff.to_s} - #{f.lineno}: #{l}" if malformed_entry?(l)
    end
  end
end
end