# [TLDR] grep tokens in each product's `product_descriptor`

# [LONGER VERSION]
- token matching is the most direct way to grab what the customer is looking for

- upon being given a `search_string`, simple regex magic is done
  - to partition the `search_string` into `token`s
  - I then grep these `token`s in each product's `product_descriptor`; matching can then be either
    - exact matching (the token must appear as a whole word, i.e. delimited by word boundaries such as leading and trailing spaces) or
    - simply on presence (a substring hit is enough; this works well for poorly partitioned `tags`)
  - matching can also be
    - case sensitive or
    - case insensitive
  
- `product_descriptor` is built by combining all string-like info deemed to be useful, namely
    - `title` (~ French description)
    - `title_orig` (~ English translation; as the translation could be wrong, there is no harm in doubling up)
    - `tags` (~ non-standard category labelling of the product)
      - is sometimes partitioned with spaces and `;`
      - and at other times appears as a continuous string (non-exact matching is better for this)
    - `product_color` (self explanatory)

# IMPORTS

In [1]:
%run ipynb_setup.ipynb

In [2]:
%run class_Dataset.ipynb

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# IMPLEMENT TEXT SEARCH

In [3]:
import re
from typing import List

# FUNCTIONS TO BUILD UP QUERY

In [4]:
# Type aliases used purely as self-documenting annotations in the search code.
# Each one names the role a plain `str` / `List[str]` / `pd.Series` value plays.
REGEX_PATTERN      = str         # a regular expression in string form
COLNAME            = str         # name of a dataframe column
PRODUCT_DESCRIPTOR = pd.Series   # the Series *type* (was an empty Series instance, which is not a valid annotation)
SEARCH_STRING      = str         # raw query text
TOKENS             = str         # a single token (used as List[TOKENS])
SEARCH_TOKENS      = List[str]   # tokens extracted from a search string

class TokenSearch():
    '''
    Token based text search over the rows of `dataset.df`.

    On construction one `product_descriptor` string is built per row; a query
    is then turned into a regex that requires every token to be present in
    that descriptor, and `tokens_found()` returns the matching rows.

    Annotations that name `Dataset` or the module level aliases are quoted so
    that they are not evaluated when the class itself is defined.
    '''

    # columns merged into `product_descriptor` when the caller names none
    DEFAULT_DESCRIPTOR_COLNAMES = ('title', 'title_orig', 'tags', 'product_color')

    def __init__(
        self,
        dataset : 'Dataset',
        ) -> None :
        '''
        Keep a reference to `dataset` and pre-compute the descriptor of each row.
        '''
        self.dataset            = dataset
        self.product_descriptor = self.make_product_descriptor()

    def make_product_descriptor(
        self,
        descriptor_colnames : 'List[COLNAME]' = None, # cols of interest, None -> DEFAULT_DESCRIPTOR_COLNAMES
        ) -> 'PRODUCT_DESCRIPTOR' :
        '''
        Build one searchable string per row of `self.dataset.df`.

        The values of `descriptor_colnames` are converted with `str()`, joined
        with a single space, and every non-word character is then replaced by
        a space.

        Returns a Series of strings indexed like `self.dataset.df`.
        '''
        if descriptor_colnames is None:
            # resolved here rather than in the signature to avoid a shared mutable default
            descriptor_colnames = list(self.DEFAULT_DESCRIPTOR_COLNAMES)
        descriptors = self.dataset.df[descriptor_colnames] # extract columns
        descriptors = descriptors.apply(
            lambda row: ' '.join(str(value) for value in row.values),
            axis=1,
        ) # merge with a single space as sep
        descriptors = descriptors.str.replace(r'\W', ' ', regex=True) # strip special chars
        return descriptors

    # search string tokenizer
    def tokenize_search_string(
        self,
        search_string : 'SEARCH_STRING',
        exact_match   : bool = False,
        ) -> 'SEARCH_TOKENS' :
        '''
        Split `search_string` into its unique tokens.

        The string is partitioned on runs of non-word characters. Empty pieces
        (produced by leading/trailing separators or an empty string) are
        dropped and duplicates are removed while keeping first-seen order, so
        the result is deterministic.

        With `exact_match` each token is wrapped in `\\b...\\b` so that it only
        matches as a whole word.
        '''
        pieces = re.split(r'\W+', search_string) # partition string on non-alphanumeric chars
        tokens = list(dict.fromkeys(piece for piece in pieces if piece)) # ordered uniques, no empties
        if exact_match:
            return [r'\b' + token + r'\b' for token in tokens] # make tokens exact matches
        return tokens

    def build_regex_from_tokens(
        self,
        search_tokens : 'SEARCH_TOKENS',
        ) -> 'REGEX_PATTERN' :
        '''
        Build a regex that matches only when ALL `search_tokens` are present.

        Each token becomes a `(?=.*token)` lookahead anchored at the start of
        the string, e.g. ['RED', 'banana'] -> '^(?=.*RED)(?=.*banana)'.
        Tokens are inserted verbatim, i.e. they are treated as regex fragments
        and are not escaped.
        '''
        lookaheads = ''.join('(?=.*{})'.format(token) for token in search_tokens)
        return r'^{}'.format(lookaheads)

    def tokens_found(
        self,
        search_tokens      : 'List[TOKENS]'  = None, # can give either search_string or search_tokens
        search_string      : 'SEARCH_STRING' = None, # can give either search_string or search_tokens
        exact_match        : bool            = False,
        case_sensitive     : bool            = False,
        verbose            : int             = 0, # show workings
        ) -> 'pd.DataFrame' :
        '''
        Return the rows whose `product_descriptor` contains every search token.

        search_tokens  : ready made tokens; when given, `search_string` and
                         `exact_match` are not used.
        search_string  : raw query, tokenized with `tokenize_search_string()`.
        exact_match    : forwarded to the tokenizer (whole word matching).
        case_sensitive : when False the regex is applied with re.IGNORECASE.
        verbose        : >0 prepends the `product_descriptor` column and keeps
                         only the first 20 matches; >1 also prints workings.

        Returns the matching rows of `self.dataset.df` as a DataFrame.
        Raises TypeError when neither `search_tokens` nor `search_string` is given.
        '''
        ###########################################################
        # build regex for pd.Series.str.contains()
        ###########################################################
        # get search tokens if not already given
        if search_tokens is None:
            if search_string is None:
                # fail with a clear message instead of an opaque error from re.split
                raise TypeError('tokens_found() needs either search_tokens or search_string')
            search_tokens = self.tokenize_search_string(
                search_string = search_string,
                exact_match   = exact_match,
            )
        if verbose>1: print(f'search_tokens = {search_tokens}')

        # compile regex
        regex_str = self.build_regex_from_tokens(search_tokens)
        if verbose>1: print(f'regex_str = {regex_str}')

        ###########################################################
        # apply regex to product_descriptor
        ###########################################################
        # boolean Series, True where the descriptor holds every token
        if verbose>1: print(self.product_descriptor.shape)
        regex_flags  = 0 if case_sensitive else re.IGNORECASE
        found_series = self.product_descriptor.str.contains(regex_str, flags=regex_flags)

        if verbose>1: print(found_series.shape)

        ###########################################################
        # make pretty return df
        ###########################################################
        if verbose > 0:
            # show the descriptor next to the data so the match can be inspected
            filter_me = pd.concat(
                [
                    self.product_descriptor.to_frame('product_descriptor'),
                    self.dataset.df,
                ],
                axis=1
            )
        else:
            filter_me = self.dataset.df

        # apply filter
        res = filter_me[found_series]

        ###########################################################
        # return
        ###########################################################
        if verbose>0:
            return res.head(20)
        return res