Permalink
Browse files

Remove stop words, protwords, synonyms, and other index/query modific…

…ations from solr schema
  • Loading branch information...
1 parent 466b556 commit 6f19fa0629acf86d0ae94bc745d5872e22b92fff @mbklein mbklein committed Mar 22, 2013
Showing with 3 additions and 60 deletions.
  1. +3 −60 solr_conf/conf/schema.xml
View
@@ -201,7 +201,6 @@
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.ICUFoldingFilterFactory" />
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.SnowballPorterFilterFactory" language="English" />
</analyzer>
</fieldType>
@@ -222,65 +221,36 @@
<!-- A general text field that has reasonable, generic
cross-language defaults: it tokenizes with StandardTokenizer,
- removes stop words from case-insensitive "stopwords.txt"
- (empty by default), and down cases. At query time only, it
+ and down cases. At query time only, it
also applies synonyms. -->
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
- <!-- in this example, we will only use synonyms at query time
- <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
- -->
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<!-- A text field with defaults appropriate for English: it
- tokenizes with StandardTokenizer, removes English stop words
- (stopwords_en.txt), down cases, protects words from protwords.txt, and
- finally applies Porter's stemming. The query time analyzer
- also applies synonyms from synonyms.txt. -->
+ tokenizes with StandardTokenizer, down cases, and
+ finally applies Porter's stemming. -->
<fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
- <!-- in this example, we will only use synonyms at query time
- <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
- -->
- <!-- Case insensitive stop word removal.
- add enablePositionIncrements=true in both the index and query
- analyzers to leave a 'gap' for more accurate phrase queries.
- -->
- <filter class="solr.StopFilterFactory"
- ignoreCase="true"
- words="stopwords_en.txt"
- enablePositionIncrements="true"
- />
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPossessiveFilterFactory"/>
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
<filter class="solr.EnglishMinimalStemFilterFactory"/>
-->
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
- <filter class="solr.StopFilterFactory"
- ignoreCase="true"
- words="stopwords_en.txt"
- enablePositionIncrements="true"
- />
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPossessiveFilterFactory"/>
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<!-- Optionally you may want to use this less aggressive stemmer instead of PorterStemFilterFactory:
<filter class="solr.EnglishMinimalStemFilterFactory"/>
-->
@@ -303,34 +273,14 @@
<fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <!-- in this example, we will only use synonyms at query time
- <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
- -->
- <!-- Case insensitive stop word removal.
- add enablePositionIncrements=true in both the index and query
- analyzers to leave a 'gap' for more accurate phrase queries.
- -->
- <filter class="solr.StopFilterFactory"
- ignoreCase="true"
- words="stopwords_en.txt"
- enablePositionIncrements="true"
- />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
- <filter class="solr.StopFilterFactory"
- ignoreCase="true"
- words="stopwords_en.txt"
- enablePositionIncrements="true"
- />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldType>
@@ -340,11 +290,8 @@
<fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_en.txt"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
- <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.EnglishMinimalStemFilterFactory"/>
<!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
possible with WordDelimiterFilter in conjuncton with stemming. -->
@@ -357,23 +304,19 @@
<fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="textSpell" class="solr.TextField" positionIncrementGap="100" >
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
<filter class="solr.StandardFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>

0 comments on commit 6f19fa0

Please sign in to comment.