# [TLDR] match search tokens to some `product_descriptor` for each product

- token matching seems most direct in grabbing home truths re what the customer is looking for

- upon being given a `search_string`, simple regex magic
  - is done to partition it into `token`s
  - `token` matching to the `product_descriptor` can then be done either by
    - exact matching (the whole token, delimited by word boundaries) or 
    - simply on presence (is good to catch poorly partitioned `tags`)
  - matching can also be
    - case sensitive or
    - case insensitive
  
- the `product_descriptor`
  - is built from all string-like info that is deemed to be interesting, namely
    - `title` (~ French description)
    - `title_orig` (~ English translation though as the translation could be wrong there is no harm in doubling up)
    - `tags` (~ non-standard category labelling of the product)
      - is sometimes partitioned with space and ;
      - and others appears as a continuous string (non-exact matching is better for this)
    - `product_color` (self explanatory)

# IMPORTS

In [None]:
%run ipynb_setup.ipynb

In [None]:
%run class_Dataset.ipynb

# IMPLEMENT TEXT SEARCH

In [None]:
import re
from typing import List

# FUNCTIONS TO BUILD UP QUERY

In [None]:
# Type aliases: they exist only to make the annotations below self-describing.
REGEX_PATTERN      = str
COLNAME            = str
PRODUCT_DESCRIPTOR = pd.Series # one merged descriptor string per product row
SEARCH_STRING      = str
TOKENS             = str
SEARCH_TOKENS      = List[str]

class TokenSearch():
    '''
    Find products whose descriptor contains ALL tokens of a search.

    The descriptor of a product is a single string merged from several text
    columns of `dataset.df` (see `make_product_descriptor`). A search is a list
    of tokens; it is turned into one regex made of lookaheads, so a product is
    a hit only when every token is found somewhere in its descriptor.

    `dataset` only needs to expose a pandas DataFrame as `dataset.df`.
    '''

    # columns merged into the descriptor when the caller does not pick any
    DEFAULT_DESCRIPTOR_COLNAMES = ('title','title_orig','tags','product_color')

    def __init__(
        self,
        dataset : 'Dataset', # quoted: the Dataset class is not defined in this cell
        ) -> None :
        '''
        Store the dataset and precompute the descriptor of every product.
        '''
        self.dataset            = dataset
        self.product_descriptor = self.make_product_descriptor()

    def make_product_descriptor(
        self,
        descriptor_colnames : List[COLNAME] = None, # cols of interest, None -> DEFAULT_DESCRIPTOR_COLNAMES
        ) -> PRODUCT_DESCRIPTOR :
        '''
        TokenSearch.make_product_descriptor()

        Merge the chosen columns of `self.dataset.df` into one string per row.

        Values are joined with a single space, missing values are skipped (so
        they do not show up as the literal text 'nan'), and every non-word
        character is then replaced by a space.

        Returns a Series of strings sharing the index of `self.dataset.df`.
        '''
        if descriptor_colnames is None:
            descriptor_colnames = self.DEFAULT_DESCRIPTOR_COLNAMES
        # list(): a tuple key would be read as ONE column label, not several
        descriptors = self.dataset.df[list(descriptor_colnames)] # extract columns
        descriptors = descriptors.apply(
            lambda row: ' '.join(str(value) for value in row.values if pd.notna(value)),
            axis=1,
        ) # merge with a space as sep
        descriptors = descriptors.str.replace(r'\W',' ',regex=True) # strip special chars
        return descriptors

    # search string tokenizer
    def tokenize_search_string(
        self,
        search_string : SEARCH_STRING,
        exact_match   : bool = False,
        ) -> SEARCH_TOKENS :
        '''
        Split `search_string` into unique tokens, keeping first-seen order.

        The string is split on runs of non-word characters; empty fragments
        (from leading/trailing separators or an empty string) are dropped.
        Case is left untouched here because case handling is decided at match
        time. With `exact_match=True` every token is wrapped in `\\b...\\b` so
        that it only matches as a whole word.
        '''
        tokens = [token for token in re.split(r'\W+',search_string) if token] # partition string on non-alphanumeric chars
        tokens = list(dict.fromkeys(tokens)) # uniques, in a deterministic order
        if exact_match:
            return [r'\b'+token+r'\b' for token in tokens] # make tokens exact matches
        return tokens

    def build_regex_from_tokens(
        self,
        search_tokens : SEARCH_TOKENS,
        ) -> REGEX_PATTERN :
        '''
        Build one regex that matches only if every token occurs in the string.

        Each token becomes a lookahead `(?=.*token)` anchored at the start, so
        the order of the tokens in the searched text does not matter. Tokens
        are inserted verbatim, i.e. they are treated as regex fragments.
        '''
        lookaheads = ''.join('(?=.*{})'.format(token) for token in search_tokens)
        return '^' + lookaheads

    def tokens_found_count(
        self,
        search_tokens      : List[TOKENS]  = None, # can give either search_string or search_tokens
        search_string      : SEARCH_STRING = None, # can give either search_string or search_tokens
        exact_match        : bool          = False,
        case_sensitive     : bool          = False,
        verbose            : int           = 0, # show workings
        ) -> pd.DataFrame :
        '''
        Return the products whose descriptor contains every search token.

        search_tokens  : ready-made tokens; takes precedence over search_string
        search_string  : free text, tokenized with `tokenize_search_string`
        exact_match    : match tokens as whole words (applies to both inputs)
        case_sensitive : match case exactly; default is to ignore case
        verbose        : >0 returns only the first 20 hits, >1 also prints the tokens

        Returns `self.dataset.df` restricted to the matching rows, with an
        extra leading `product_descriptor` column.

        Raises TypeError when neither search_tokens nor search_string is given.
        '''
        ###########################################################
        # build regex for pd.Series.str.contains()
        ###########################################################
        # get search tokens if not already given
        if search_tokens is None:
            if search_string is None:
                raise TypeError('tokens_found_count() needs either search_tokens or search_string')
            search_tokens = self.tokenize_search_string(
                search_string = search_string,
                exact_match   = exact_match,
            )
        elif exact_match:
            # ready-made tokens must honour exact_match as well
            search_tokens = [r'\b'+token+r'\b' for token in search_tokens]
        if verbose>1: print(f'search_tokens = {search_tokens}')

        # compile regex
        regex_str = self.build_regex_from_tokens(search_tokens)

        ###########################################################
        # apply regex to product_descriptor
        ###########################################################
        # boolean Series: True where ALL tokens are found in the descriptor
        flags        = 0 if case_sensitive else re.IGNORECASE
        found_series = self.product_descriptor.str.contains(regex_str,flags=flags)

        ###########################################################
        # make pretty return df
        ###########################################################
        res = pd.concat(
            [
                self.product_descriptor.to_frame('product_descriptor'),
                self.dataset.df,
            ],
            axis=1
        )

        # snip results table to the rows that matched
        res = res[found_series]

        ###########################################################
        # return
        ###########################################################
        if verbose>0:
            return res.head(20)
        return res

In [None]:
# Scratch cell: the triple-quoted string below is a plain string literal, so
# none of the calls inside it are executed. It keeps example invocations of
# TokenSearch.tokens_found_count() (token lists vs. search strings, exact
# matching, case sensitivity) around for manual copy/paste.
# The trailing `None` is the last expression of the cell -- presumably so the
# notebook does not echo the string as cell output (TODO confirm).
'''
d=Dataset()
ts=TokenSearch(dataset=d)
d.show_top_n(ts.tokens_found_count(search_tokens=['summer','short'],verbose=1)) # basic logic
d.show_top_n(ts.tokens_found_count(search_tokens=['harajuku'],verbose=1)) # test rare token logic
d.show_top_n(ts.tokens_found_count(search_tokens=['harajuku','sexy'],verbose=1)) # test rare token logic
d.show_top_n(ts.tokens_found_count(search_tokens=['harajuku','goth','sexy'],verbose=1)) # test rare token logic
d.show_top_n(ts.tokens_found_count(search_tokens=['kids'],verbose=1)) # test
d.show_top_n(ts.tokens_found_count(search_tokens=['kids','ball'],verbose=1)) # test
d.show_top_n(ts.tokens_found_count(search_tokens=['balloon'],verbose=1)) # test show_top_n
d.show_top_n(ts.tokens_found_count(search_tokens=['balloon'],verbose=1)) # test results snipping
d.show_top_n(ts.tokens_found_count(search_tokens=['banana'],verbose=1)) # test results chat#
d.show_top_n(ts.tokens_found_count(search_tokens=['sandal'],verbose=1)) # test results chat
d.show_top_n(ts.tokens_found_count(search_tokens=['sandal','red'],verbose=1)) # test results chat
d.show_top_n(ts.tokens_found_count(search_string='sandal red',verbose=1)) # test results chat
d.show_top_n(ts.tokens_found_count(search_tokens=['top','skinny','red'],verbose=1)) # test results chat
d.show_top_n(ts.tokens_found_count(search_tokens=['top','skinny','red'],exact_match=True,verbose=1)) # test results chat
d.show_top_n(ts.tokens_found_count(search_string='top red',verbose=1)) # test results chat
d.show_top_n(ts.tokens_found_count(search_string='top red',exact_match=True,verbose=1)) # test exact search
d.show_top_n(ts.tokens_found_count(search_string='exy',verbose=1)) # test exact search
d.show_top_n(ts.tokens_found_count(search_string='exy',exact_match=True,verbose=1)) # test exact search
d.show_top_n(ts.tokens_found_count(search_string='top red',exact_match=True,verbose=1)) # test exact search
d.show_top_n(ts.tokens_found_count(search_string='hot',exact_match=True,verbose=1)) # test exact search
d.show_top_n(ts.tokens_found_count(search_string='hot skirt',exact_match=True,verbose=1)) # test non existent search
d.show_top_n(ts.tokens_found_count(search_string='hot skin',exact_match=True,verbose=1)) # test exact search
d.show_top_n(ts.tokens_found_count(search_string='hot skinny',exact_match=True,verbose=1)) # test exact search
d.show_top_n(ts.tokens_found_count(search_string='hot skinny',exact_match=False,verbose=1)) # test exact search
d.show_top_n(ts.tokens_found_count(search_string='skirt slim',exact_match=False,verbose=0)) # test exact search
d.show_top_n(ts.tokens_found_count(search_string='skirt slim',exact_match=False,verbose=0)) # no plotting
d.show_top_n(ts.tokens_found_count(search_string='hot',case_sensitive=True,verbose=1)) # test case sensitive
d.show_top_n(ts.tokens_found_count(search_string='VANGULL',case_sensitive=True,verbose=1)) # test case sensitive
d.show_top_n(ts.tokens_found_count(search_string='VANGULL banana',case_sensitive=True,verbose=1)) # test case sensitive
d.show_top_n(ts.tokens_found_count(search_string='VANGULl',case_sensitive=True,verbose=1)) # test case sensitive
d.show_top_n(ts.tokens_found_count(search_string='VANGUL',case_sensitive=True,verbose=1)) # test case sensitive
d.show_top_n(ts.tokens_found_count(search_string='VANGUL',case_sensitive=True,exact_match=True,verbose=1)) # test case sensitive
d.show_top_n(ts.tokens_found_count(search_string='hot',case_sensitive=True,verbose=1)) # test case sensitive
'''
None