# NOTE: the upstream repository was archived by its owner on Dec 14, 2023 and is now read-only.
# File: lt.pm (upstream snapshot: 151 lines, 127 loc, 5.83 KB)
package MediaWords::Languages::lt;
use Moose;
with 'MediaWords::Languages::Language';
#
# Lithuanian
#
use strict;
use warnings;
use utf8;
use Modern::Perl "2013";
use MediaWords::CommonLibs;
use Lingua::Stem::Snowball::Lt;
# Cached Lingua::Stem::Snowball::Lt object used by stem(). The attribute starts
# out as 0 (no stemmer created yet); stem() stores a real instance here the
# first time it is called.
has 'lt_stemmer' => (
    is      => 'rw',
    default => 0,
);
# Return the language code handled by this module.
#
# Returns: the string 'lt' (Lithuanian).
sub get_language_code
{
    my $language_code = 'lt';

    return $language_code;
}
# Load the "tiny" stop word list.
#
# Reads the single Lithuanian stop list resource file via
# _get_stop_words_from_file() and returns that helper's result unchanged.
sub fetch_and_return_tiny_stop_words
{
    my ( $self ) = @_;

    my $stoplist_path = 'lib/MediaWords/Languages/resources/lt_stoplist.txt';

    return $self->_get_stop_words_from_file( $stoplist_path );
}
# Load the "short" stop word list.
#
# Reads the single Lithuanian stop list resource file via
# _get_stop_words_from_file() and returns that helper's result unchanged.
sub fetch_and_return_short_stop_words
{
    my ( $self ) = @_;

    my $stoplist_path = 'lib/MediaWords/Languages/resources/lt_stoplist.txt';

    return $self->_get_stop_words_from_file( $stoplist_path );
}
# Load the "long" stop word list.
#
# Reads the single Lithuanian stop list resource file via
# _get_stop_words_from_file() and returns that helper's result unchanged.
sub fetch_and_return_long_stop_words
{
    my ( $self ) = @_;

    my $stoplist_path = 'lib/MediaWords/Languages/resources/lt_stoplist.txt';

    return $self->_get_stop_words_from_file( $stoplist_path );
}
# Stem a list of words with the Lithuanian Snowball stemmer.
#
# Arguments: the words to stem, passed as a plain list after the invocant.
# Returns: reference to a new array holding the list returned by the stemmer's
# stem() method for those words.
sub stem
{
    my $self = shift;

    # Create the stemmer on first use and cache it in the attribute. A plain
    # truthiness test is used instead of a numeric "== 0" comparison so that an
    # object reference is never compared as a number and an undefined attribute
    # does not trigger an "uninitialized value" warning.
    unless ( $self->lt_stemmer )
    {
        $self->lt_stemmer( Lingua::Stem::Snowball::Lt->new() );
    }

    # Remaining @_ holds only the words; the stemmer takes them as an array reference.
    my @stems = $self->lt_stemmer->stem( \@_ );

    return \@stems;
}
# Return the maximum word length (in letters) for Lithuanian.
#
# The value follows the length of the longest Lithuanian words usually quoted:
#
# * "septyniasdešimtseptyniastraipsniuose" (37 letters) -- the plural locative
#   case of the adjective "septyniasdešimtseptyniastraipsnis", meaning
#   "(object) with seventy-seven articles";
# * the participle "nebeprisikiškiakopūsteliaudavusiuose", "in those that were
#   repeatedly unable to pick enough of small wood-sorrels in the past" -- the
#   plural locative case of the past iterative active participle of the verb
#   "kiškiakopūsteliauti", "to pick wood-sorrels" (an edible forest plant with
#   a sour taste; literally "rabbit cabbage"). The word is commonly attributed
#   to the Lithuanian language teacher Jonas Kvederaitis, who reportedly used
#   the first person plural of the past iterative tense,
#   "nebeprisikiškiakopūstaudavome".
#
# Returns: 37.
sub get_word_length_limit
{
    my ( $self ) = @_;

    my $longest_word_length = 37;

    return $longest_word_length;
}
# Hand story text over to the role's Lingua::Sentence-based helper.
#
# Arguments: $story_text -- the text to process.
# Returns: whatever _tokenize_text_with_lingua_sentence() returns when given the
# language code 'lt', the Lithuanian non-breaking prefix file and the text
# (presumably the sentences of the text -- see that helper).
sub get_sentences
{
    my $self       = shift;
    my $story_text = shift;

    my $language_code            = 'lt';
    my $nonbreaking_prefix_file = 'lib/MediaWords/Languages/resources/lt_nonbreaking_prefixes.txt';

    return $self->_tokenize_text_with_lingua_sentence( $language_code, $nonbreaking_prefix_file, $story_text );
}
# Tokenize a sentence by delegating to the role's _tokenize_with_spaces() helper.
#
# Arguments: $sentence -- the sentence to tokenize.
# Returns: the helper's result, unchanged.
sub tokenize
{
    my $self     = shift;
    my $sentence = shift;

    return $self->_tokenize_with_spaces( $sentence );
}
# Return the list of Lithuanian "noise" strings.
#
# The entries are single words plus two outlet names (BNS, DELFI); they look
# like vocabulary of republication / copyright notices -- presumably callers
# use them to recognise such boilerplate (confirm against the caller).
#
# Returns: reference to a newly built array of 25 strings.
sub get_noise_strings
{
    my ( $self ) = @_;

    return [
        qw(
          naujienų BNS skelbti cituoti atgaminti kopijuoti
          dauginti platinti informavimo raštiško raštišką sutikimo
          sutikimą sutikimas neleidžiama draudžiama taisyklės teisės
          saugomos griežtai DELFI žiniasklaidos nurodyti šaltinį
          šaltinis
          )
    ];
}
# Return the strings that mark a copyright notice (English and Lithuanian).
#
# Returns: reference to a newly built array of six strings.
sub get_copyright_strings
{
    my ( $self ) = @_;

    return [
        'copyright',
        'copying',
        '©',
        'all rights reserved',
        'teisės saugomos',
        'visos teisės saugomos',
    ];
}
# Obtain the locale / country object for Lithuanian.
#
# Delegates to the role's _get_locale_country_multilingual_object() helper with
# the language code 'lt' and returns its result unchanged.
sub get_locale_codes_api_object
{
    my ( $self ) = @_;

    my $language_code = 'lt';

    return $self->_get_locale_country_multilingual_object( $language_code );
}
# Return the remapping table for Lithuanian country names.
#
# Keys are lower-case Lithuanian country names; each value is either a
# replacement name or -1. NOTE(review): -1 presumably tells the caller that the
# country has no usable short name and should be skipped -- confirm against the
# code that consumes this table. Keys are kept exactly as in the original table
# (they presumably have to match the locale data they are looked up against).
#
# Returns: reference to a newly built hash with 24 entries.
sub get_country_name_remapping
{
    my ( $self ) = @_;

    # Countries that get a different name
    my %remapping = (
        'antigva ir barbuda'           => 'antigva',            # 'antigua and barbuda'
        'bosnija ir hercegovina'       => 'bosnija',            # 'bosnia and herzegovina'
        'centrinės afrikos respublika' => 'centrinė afrika',    # 'central african republic'
        'jungtiniai arabų emyratai'    => 'arabų emyratai',     # 'united arab emirates'
        'kinijos s.a.r.honkongas'      => 'honkongas',          # 'hong kong'
        'kongo demokratinė respublika' => 'kongas-kinšasa',     # 'congo, the democratic republic of the'
        'papua naujoji gvinėja'        => 'papua gvinėja',      # 'papua new guinea'
        'trinidadas ir tobagas'        => 'trinidadas',         # 'trinidad and tobago'
        'wallisas ir futuna'           => 'futuna',             # 'wallis and futuna'
    );

    # Countries mapped to -1
    my @countries_mapped_to_minus_one = (
        'didžiosios britanijos mergelių salos',            # 'virgin islands, british'
        'dramblio kaulo krantas',                          # 'cote d\'ivoire'
        'heardo ir mcdonaldo salų sritis',                 # 'heard island and mcdonald islands'
        'indijos vandenyno britų sritis',                  # 'british indian ocean territory'
        'jungtinių valstijų mažosios aplinkinės salos',    # 'united states minor outlying islands'
        'marianos šiaurinės salos',                        # 'northern mariana islands'
        'mergelių salos (jav)',                            # 'virgin islands, u.s.'
        'prancūzijos pietų sritys',                        # 'french southern territories'
        'rytų džordžija ir rytų sandwich salos',           # 'south georgia and the south sandwich islands'
        'san tomė ir principė',                            # 'sao tome and principe'
        'sen pjeras ir mikelonas',                         # 'saint pierre and miquelon'
        'sent kitsas ir nevis',                            # 'saint kitts and nevis'
        'svalbardo ir jan majen salos',                    # 'svalbard and jan mayen'
        'turkso ir caicoso salos',                         # 'turks and caicos islands'
        'šventasis vincentas ir grenadinai',               # 'saint vincent and the grenadines'
    );

    for my $country_name ( @countries_mapped_to_minus_one )
    {
        $remapping{ $country_name } = -1;
    }

    return \%remapping;
}
1;