Skip to content

Commit

Permalink
remove words from tokenizing
Browse files: browse the repository at this point in the history
  • Loading branch information
Daniel Pfefferkorn committed Mar 2, 2021
1 parent 2204df3 commit 7f612db
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 35 deletions.
3 changes: 2 additions & 1 deletion csv-manager.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@ common common-options
, aeson
, bytestring
, cassava
, containers
, full-text-search
, relude
, vector
, text
, tokenize
, tokenize >= 0.1

ghc-options: -Wall
-Wcompat
Expand Down
64 changes: 32 additions & 32 deletions src/Row.hs
Original file line number Diff line number Diff line change
Expand Up @@ -13,42 +13,42 @@ import Data.Aeson (FromJSON, ToJSON)
import Data.Csv (FromRecord)

-- | One record of the CSV file. Every column is kept as plain 'Text'.
--
-- NOTE(review): this listing comes from a rendered diff that shows no +/-
-- markers. The @item@-prefixed names look like the pre-commit field names and
-- the unprefixed names like their replacements -- confirm against the
-- repository before relying on either spelling.
data CsvRow = CsvRow
{ itemNumber :: Text,
itemCardboardBox :: Text,
itemCount :: Text,
itemImage :: Text,
itemGender :: Text,
itemClothe :: Text,
itemTitleEtsy :: Text,
itemDesc :: Text,
itemBrand :: Text,
itemFlaws :: Text,
itemColor :: Text,
itemSize :: Text,
itemTags :: Text,
itemMaterial :: Text,
itemPrice :: Text,
itemStyle :: Text
-- NOTE(review): presumably the renamed (post-commit) field list starts here.
{ number :: Text,
cardboardBox :: Text,
count :: Text,
image :: Text,
gender :: Text,
clothe :: Text,
titleEtsy :: Text,
desc :: Text,
brand :: Text,
flaws :: Text,
color :: Text,
size :: Text,
tags :: Text,
material :: Text,
price :: Text,
style :: Text
}
-- Compiler-provided instances use the @stock@ strategy; the JSON and CSV
-- record instances are requested with the @anyclass@ strategy.
deriving stock (Generic, Eq, Show, Ord)
deriving anyclass (FromJSON, ToJSON, FromRecord)

-- | Gather the text columns of a row into a list.
--
-- The count and price columns are commented out of the list, so they are not
-- part of the result.
--
-- NOTE(review): this listing comes from a rendered diff that shows no +/-
-- markers. The @item@-prefixed entries look like the pre-commit list and the
-- unprefixed entries like its replacement -- confirm against the repository.
indexableFields :: CsvRow -> [Text]
-- The @{..}@ wildcard pattern brings every field of the row into scope by name.
indexableFields CsvRow {..} =
[ itemNumber,
itemCardboardBox,
-- itemCount,
itemImage,
itemGender,
itemClothe,
itemTitleEtsy,
itemDesc,
itemBrand,
itemFlaws,
itemColor,
itemSize,
itemTags,
itemMaterial,
-- itemPrice,
itemStyle
-- NOTE(review): presumably the renamed (post-commit) list starts here.
[ number,
cardboardBox,
-- count,
image,
gender,
clothe,
titleEtsy,
desc,
brand,
flaws,
color,
size,
tags,
material,
-- price,
style
]
10 changes: 8 additions & 2 deletions src/SearchEngine.hs
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}

module SearchEngine (fromCsvRows, search) where

import Data.Char (isPunctuation)
import Data.Ix (Ix)
import Data.SearchEngine (NoFeatures, SearchConfig (..), SearchRankParameters (..))
import Data.SearchEngine (NoFeatures, SearchConfig (..), SearchRankParameters (..), Term)
import qualified Data.SearchEngine as SE
import qualified Data.Set as Set
import qualified Data.Text as Text
import qualified NLP.Tokenize as NLP
import Row (CsvRow, indexableFields)
Expand Down Expand Up @@ -50,13 +52,17 @@ searchConfig =

-- | Produce the search tokens of a row for the 'NameField' search field.
--
-- The composition reads bottom-up: collect the row's 'indexableFields',
-- tokenize each field with 'NLP.tokenize' (after unpacking it to 'String'),
-- discard tokens for which 'ignoreTok' holds, expand each remaining token
-- with @splitTok@ (defined outside this excerpt), pack and case-fold the
-- results, and finally drop every token that is a member of 'stopWords'.
extractTokens :: CsvRow -> SearchField -> [Text]
extractTokens row NameField =
-- NOTE(review): this excerpt is a rendered diff without +/- markers. The next
-- line appears to be the pre-commit head of the pipeline, superseded by the
-- two lines that follow it -- confirm against the repository.
map (Text.toCaseFold . Text.pack)
filter (`Set.notMember` stopWords)
. map (Text.toCaseFold . Text.pack)
. concatMap splitTok
. filter (not . ignoreTok)
. concatMap (NLP.tokenize . Text.unpack)
. indexableFields
$ row

-- | Terms that token extraction filters out of its result.
-- The set currently holds exactly one entry: the empty term.
stopWords :: Set Term
stopWords = Set.singleton ""

-- | Tell whether a token consists solely of punctuation characters.
-- A token with no characters at all also yields 'True'.
ignoreTok :: String -> Bool
ignoreTok tok = not (any (not . isPunctuation) tok)

Expand Down

0 comments on commit 7f612db

Please sign in to comment.