Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Rearrangement of contrib/Analyzers + Arabic Analyzer for 2.9.2 tag and trunk
git-svn-id: https://svn.apache.org/repos/asf/incubator/lucene.net/tags/Lucene.Net_2_9_2@1069573 13f79535-47bb-0310-9956-ffa450edef68
- Loading branch information
Isik Yigit
committed
Feb 10, 2011
1 parent
c01ded5
commit 33dad5a
Showing
20 changed files
with
1,456 additions
and
41 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
202 changes: 202 additions & 0 deletions
202
contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicAnalyzer.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,202 @@ | ||
/** | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
using System.IO; | ||
using System.Collections; | ||
|
||
using Lucene.Net.Analysis; | ||
using Lucene.Net.Util; | ||
|
||
namespace Lucene.Net.Analysis.AR | ||
{ | ||
/** | ||
* {@link Analyzer} for Arabic. | ||
* <p> | ||
* This analyzer implements light-stemming as specified by: | ||
* <i> | ||
* Light Stemming for Arabic Information Retrieval | ||
* </i> | ||
* http://www.mtholyoke.edu/~lballest/Pubs/arab_stem05.pdf | ||
* <p> | ||
* The analysis package contains three primary components: | ||
* <ul> | ||
* <li>{@link ArabicNormalizationFilter}: Arabic orthographic normalization. | ||
* <li>{@link ArabicStemFilter}: Arabic light stemming | ||
* <li>Arabic stop words file: a set of default Arabic stop words. | ||
* </ul> | ||
* | ||
*/ | ||
public class ArabicAnalyzer : Analyzer
{
    /**
     * File containing default Arabic stopwords.
     *
     * Default stopword list is from http://members.unine.ch/jacques.savoy/clef/index.html
     * The stopword list is BSD-Licensed.
     */
    public static string DEFAULT_STOPWORD_FILE = "ArabicStopWords.txt";

    /**
     * Contains the stopwords used with the StopFilter.
     */
    private Hashtable stoptable = new Hashtable();

    /**
     * The comment character in the stopwords file. All lines prefixed with this will be ignored
     */
    public static string STOPWORDS_COMMENT = "#";

    // Lucene compatibility version; controls StopFilter's position-increment default.
    private Version matchVersion;

    /**
     * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}.
     *
     * @deprecated Use {@link #ArabicAnalyzer(Version)} instead
     */
    public ArabicAnalyzer() : this(Version.LUCENE_24)
    {
    }

    /**
     * Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE},
     * read from an embedded resource of this assembly.
     */
    public ArabicAnalyzer(Version matchVersion)
    {
        this.matchVersion = matchVersion;

        using (StreamReader reader = new StreamReader(System.Reflection.Assembly.GetAssembly(this.GetType()).GetManifestResourceStream("Lucene.Net.Analyzers.AR." + DEFAULT_STOPWORD_FILE)))
        {
            while (!reader.EndOfStream)
            {
                string word = reader.ReadLine();
                // BUGFIX: honor STOPWORDS_COMMENT (documented above but previously not
                // applied here) and skip blank lines. Use the indexer instead of Add()
                // so a duplicated stopword in the file cannot throw ArgumentException.
                if (string.IsNullOrEmpty(word) || word.StartsWith(STOPWORDS_COMMENT))
                    continue;
                stoptable[word] = word;
            }
        }
    }

    /**
     * Builds an analyzer with the given stop words.
     *
     * @deprecated Use {@link #ArabicAnalyzer(Version, String[])} instead
     */
    public ArabicAnalyzer(string[] stopwords) : this(Version.LUCENE_24, stopwords)
    {
    }

    /**
     * Builds an analyzer with the given stop words.
     */
    public ArabicAnalyzer(Version matchVersion, string[] stopwords)
    {
        stoptable = StopFilter.MakeStopSet(stopwords);
        this.matchVersion = matchVersion;
    }

    /**
     * Builds an analyzer with the given stop words.
     *
     * @deprecated Use {@link #ArabicAnalyzer(Version, Hashtable)} instead
     */
    public ArabicAnalyzer(Hashtable stopwords) : this(Version.LUCENE_24, stopwords)
    {
    }

    /**
     * Builds an analyzer with the given stop words.
     */
    public ArabicAnalyzer(Version matchVersion, Hashtable stopwords)
    {
        // Defensive copy: later mutation of the caller's table must not affect us.
        stoptable = new Hashtable(stopwords);
        this.matchVersion = matchVersion;
    }

    //DIGY
    ///**
    // * Builds an analyzer with the given stop words. Lines can be commented out using {@link #STOPWORDS_COMMENT}
    // *
    // * @deprecated Use {@link #ArabicAnalyzer(Version, File)} instead
    // */
    //public ArabicAnalyzer(File stopwords)
    //{
    //    this(Version.LUCENE_24, stopwords);
    //}

    ///**
    // * Builds an analyzer with the given stop words. Lines can be commented out using {@link #STOPWORDS_COMMENT}
    // */
    //public ArabicAnalyzer(Version matchVersion, File stopwords)
    //{
    //    stoptable = WordlistLoader.getWordSet(stopwords, STOPWORDS_COMMENT);
    //    this.matchVersion = matchVersion;
    //}

    /**
     * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
     *
     * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
     *         {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
     *         and {@link ArabicStemFilter}.
     */
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        TokenStream result = new ArabicLetterTokenizer(reader);
        result = new LowerCaseFilter(result);
        result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stoptable);
        result = new ArabicNormalizationFilter(result);
        result = new ArabicStemFilter(result);

        return result;
    }

    // Per-thread cache of the tokenizer/filter chain for ReusableTokenStream.
    private class SavedStreams
    {
        internal Tokenizer Source;
        internal TokenStream Result;
    };

    /**
     * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text
     * in the provided {@link Reader}.
     *
     * @return A {@link TokenStream} built from an {@link ArabicLetterTokenizer} filtered with
     *         {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter}
     *         and {@link ArabicStemFilter}.
     */
    public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
    {
        SavedStreams streams = (SavedStreams)GetPreviousTokenStream();
        if (streams == null)
        {
            // First call on this thread: build and cache the full chain.
            streams = new SavedStreams();
            streams.Source = new ArabicLetterTokenizer(reader);
            streams.Result = new LowerCaseFilter(streams.Source);
            streams.Result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                            streams.Result, stoptable);
            streams.Result = new ArabicNormalizationFilter(streams.Result);
            streams.Result = new ArabicStemFilter(streams.Result);
            SetPreviousTokenStream(streams);
        }
        else
        {
            // Subsequent calls: re-point the cached tokenizer at the new reader.
            streams.Source.Reset(reader);
        }
        return streams.Result;
    }
}
} |
63 changes: 63 additions & 0 deletions
63
contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicLetterTokenizer.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
/** | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
using System.IO; | ||
using System.Collections; | ||
|
||
using Lucene.Net.Analysis; | ||
using Lucene.Net.Util; | ||
|
||
namespace Lucene.Net.Analysis.AR | ||
{ | ||
|
||
/** | ||
* Tokenizer that breaks text into runs of letters and diacritics. | ||
* <p> | ||
* The problem with the standard Letter tokenizer is that it fails on diacritics. | ||
* Handling similar to this is necessary for Indic Scripts, Hebrew, Thaana, etc. | ||
* </p> | ||
* | ||
*/ | ||
public class ArabicLetterTokenizer : LetterTokenizer
{
    /** Builds a tokenizer reading from the given reader. */
    public ArabicLetterTokenizer(TextReader @in) : base(@in)
    {
    }

    /** Builds a tokenizer sharing the given attribute source. */
    public ArabicLetterTokenizer(AttributeSource source, TextReader @in) : base(source, @in)
    {
    }

    /** Builds a tokenizer using the given attribute factory. */
    public ArabicLetterTokenizer(AttributeFactory factory, TextReader @in) : base(factory, @in)
    {
    }

    /**
     * Accepts the Letter categories (via the base class) plus the NonSpacingMark
     * category, so combining diacritics stay attached to their token.
     * @see org.apache.lucene.analysis.LetterTokenizer#isTokenChar(char)
     */
    protected override bool IsTokenChar(char c)
    {
        if (base.IsTokenChar(c))
        {
            return true;
        }
        return char.GetUnicodeCategory(c) == System.Globalization.UnicodeCategory.NonSpacingMark;
    }
}
} |
61 changes: 61 additions & 0 deletions
61
contrib/Analyzers/Lucene.Net.Analyzers/AR/ArabicNormalizationFilter.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
/** | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
using System.IO; | ||
using System.Collections; | ||
|
||
using Lucene.Net.Analysis; | ||
using Lucene.Net.Analysis.Tokenattributes; | ||
using Lucene.Net.Util; | ||
|
||
|
||
namespace Lucene.Net.Analysis.AR | ||
{ | ||
|
||
/** | ||
* A {@link TokenFilter} that applies {@link ArabicNormalizer} to normalize the orthography. | ||
* | ||
*/ | ||
|
||
public class ArabicNormalizationFilter : TokenFilter
{
    // Stateless normalizer applied in place to each token's term buffer.
    protected ArabicNormalizer normalizer = null;
    private TermAttribute termAtt;

    /** Wraps the given stream and normalizes each token's orthography. */
    public ArabicNormalizationFilter(TokenStream input) : base(input)
    {
        normalizer = new ArabicNormalizer();
        termAtt = (TermAttribute)AddAttribute(typeof(TermAttribute));
    }

    /**
     * Advances to the next token, rewriting its term buffer in place and
     * shrinking the term length to whatever the normalizer reports.
     */
    public override bool IncrementToken()
    {
        if (!input.IncrementToken())
        {
            return false;
        }
        int normalizedLength = normalizer.Normalize(termAtt.TermBuffer(), termAtt.TermLength());
        termAtt.SetTermLength(normalizedLength);
        return true;
    }
}
} |
Oops, something went wrong.