# [TLDR] PROGRESSIVELY BUILT UP A TEXT SEARCH
- main points include

1. input `search_string` broken up (tokenized) on `\W` special chars

2. can look for tokens in "product descriptor" either by (a) exact matching tokens OR (b) simple matching of tokens
- exact matching
  - pros
    - avoids matching the token `red` in strings like `prediction`, i.e. avoids spurious matches
  - cons
    - tags are not all space-separated, so a token fused into a longer tag is never matched, e.g. cannot find `white` in `whitesandalsbeach`
- simple matching
  - pros and cons are the exact reverse of those for exact matching; hard to say which is better
  
3. "product descriptor" is a merge of `title`, `title_orig`, `tags` and `product_color`
- all seem useful
- `title` and `title_orig` should be similar: one is French, the other is its English translation; it is unclear which is better, so use both
- `tags` seem very useful but format isn't standardized, some are space separated, others `;` separated, others NOT separated
- `product_color` marginally useful

4. given n tokens extracted from the `search_string`, trying to find matches for each token
- can rank products on the number of `n` token hits
- can also do a weighted token hit (based on the rarity of a token), i.e. rare tokens are stronger hits than common tokens, e.g. "goth" vs "summer"

5. search tokens that receive 0 hits on the product list are useless, can give hints to the user to try different search strings

6. top n text hits can be returned along with their pictures for debugging / logic checks

# IMPORTS

In [28]:
%run ipynb_setup.ipynb

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
%run class_Dataset.ipynb

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# IMPLEMENT TEXT SEARCH

In [4]:
import re
from typing import List

# FUNCTIONS TO BUILD UP QUERY

In [92]:
COLNAME            = str
PRODUCT_DESCRIPTOR = pd.Series  # one merged, cleaned-up text string per product
SEARCH_STRING      = str
TOKENS             = str
SEARCH_TOKENS      = List[str]


class TokenSearch():
    '''
    Token based text search over the string columns of a product dataset.

    On construction the descriptor columns of `dataset.df` are merged into a
    single "product descriptor" string per product. `tokens_found_count()`
    then counts, for every product, how many search tokens occur in that
    descriptor and ranks the products on that count.
    '''

    # columns of `dataset.df` merged into the product descriptor by default
    DEFAULT_DESCRIPTOR_COLNAMES = ('title', 'title_orig', 'tags', 'product_color')

    def __init__(
        self,
        dataset : 'Dataset',  # quoted: lets the class be defined before Dataset is loaded
        ) -> None :
        '''
        Keep a reference to `dataset` and pre-compute its product descriptors.

        dataset : object exposing a DataFrame as `.df`; `get_product_pictures(locs=...)`
                  is additionally called on it when plotting is requested
        '''
        self.dataset            = dataset
        self.product_descriptor = self.make_product_descriptor()

    def make_product_descriptor(
        self,
        descriptor_colnames : List[COLNAME] = None,  # None -> DEFAULT_DESCRIPTOR_COLNAMES
        ) -> PRODUCT_DESCRIPTOR :
        '''
        Merge the descriptor columns into one searchable string per product.

        descriptor_colnames : columns of `self.dataset.df` to merge,
                              defaults to DEFAULT_DESCRIPTOR_COLNAMES

        Returns a Series (same index as `self.dataset.df`) holding the column
        values joined with single spaces, with every non-word character
        replaced by a space. Missing values contribute an empty string, so
        the literal text "nan" never leaks into the descriptor.
        '''
        if descriptor_colnames is None:
            descriptor_colnames = self.DEFAULT_DESCRIPTOR_COLNAMES
        columns = self.dataset.df[list(descriptor_colnames)]  # list(): a tuple would be read as a single key
        columns = columns.astype(object).where(columns.notna(), '')  # blank out missing cells
        descriptors = columns.astype(str).apply(' '.join, axis=1)  # merge with ' ' as sep
        descriptors = descriptors.str.replace(r'\W', ' ', regex=True)  # strip special chars
        return descriptors

    # search string tokenizer
    def tokenize_search_string(
        self,
        search_string : SEARCH_STRING,
        ) -> SEARCH_TOKENS :
        '''
        Split `search_string` into unique, lower-cased word tokens.

        Tokens are the maximal runs of word characters; leading / trailing /
        repeated separators never produce empty tokens. Duplicates are
        dropped and first-seen order is kept, so the result is deterministic.
        '''
        tokens = [token.lower() for token in re.findall(r'\w+', search_string)]
        return list(dict.fromkeys(tokens))  # ordered uniques

    # token exact search augmenter for later regex usage
    def exactify_search_tokens(
        self,
        search_tokens : SEARCH_TOKENS,
        ) -> SEARCH_TOKENS :
        '''
        Turn plain tokens into regex patterns that only match whole words.

        Each token is escaped (so regex metacharacters match literally) and
        wrapped in word boundaries, e.g. 'red' -> '\\bred\\b', which matches
        'red' but not the 'red' inside 'altered'.
        '''
        return [r'\b' + re.escape(token) + r'\b' for token in search_tokens]

    @staticmethod
    def _results_chat(
        n_hits : int,
        ) -> str :
        '''
        Return the chatty one-line summary shown for `n_hits` matching products.
        '''
        if n_hits==0:
            return 'results: I got nothing! T⌓T' # unhappy
        if n_hits==1:
            return 'results: only 1 hit ￣ω￣, I hope it\'s what you wanted!' # unsure
        if n_hits<=5:
            return f'results: I only got {n_hits} results, see anything you like?' # tight search
        if n_hits<=10:
            return f'results: {n_hits} results found' # normal
        return f'results: {n_hits} items found, I can do better if you can be more specific' # should do better

    # looking at all string product identifiers (title, title_orig, tags, product_color)
    # count how many tokens found in each product_descriptor + weighted token search
    def tokens_found_count(
        self,
        search_tokens      : List[TOKENS]  = None,
        search_string      : SEARCH_STRING = None,
        exact_search       : bool          = False,
        plot_top_n         : int           = None,
        verbose            : int           = 0, # show workings
        ) -> pd.DataFrame :
        '''
        Rank products on how many search tokens occur in their descriptor.

        search_tokens : tokens to look for; empty and duplicate tokens are dropped.
                        Tokens are matched literally and case-insensitively.
        search_string : tokenized with tokenize_search_string() when
                        `search_tokens` is not given
        exact_search  : True  -> a token only matches as a whole word
                        False -> a token matches anywhere, also inside other words
        plot_top_n    : when given (and there are hits) the pictures of the top n
                        hits are shown via `self.dataset.get_product_pictures`
        verbose       : >0 displays token rarity, >1 / >2 print intermediate results

        Returns a DataFrame restricted to products with at least one hit, sorted
        (descending) on 'tokens_matched' then 'discounted(tokens_matched)', with columns
          - 'tokens_matched'             : number of tokens found
          - one boolean column per token (named '\\b<token>\\b' for exact search)
          - 'discounted(tokens_matched)' : rarity weighted number of tokens found
          - 'discounted(<token column>)' : rarity weighted hit per token
          - 'product_descriptor'         : the text that was searched
        Only the first 20 rows are returned if verbose>0, otherwise the first
        `plot_top_n` rows (all rows when `plot_top_n` is None). When no usable
        token is left, an empty frame with just the three fixed columns is returned.

        Raises TypeError when neither `search_tokens` nor `search_string` is given.

        Messages for tokens without any hit and a one-line result summary are printed.
        '''
        ###########################################################
        # get search tokens
        ###########################################################
        if search_tokens is None:
            if search_string is None:
                raise TypeError('tokens_found_count() needs either search_tokens or search_string')
            search_tokens = self.tokenize_search_string(search_string=search_string)
        # an empty token matches everything and duplicates yield clashing columns
        search_tokens = list(dict.fromkeys(token for token in search_tokens if token))
        if verbose>1: print(f'search_tokens = {search_tokens}')

        # nothing to search for -> nothing found
        if not search_tokens:
            print(self._results_chat(0))
            return pd.DataFrame(columns=['tokens_matched','discounted(tokens_matched)','product_descriptor'])

        ###########################################################
        # build regex patterns (exactified on request) + column labels
        ###########################################################
        if exact_search:
            patterns = self.exactify_search_tokens(search_tokens)
            labels   = ['\\b'+token+'\\b' for token in search_tokens]
        else:
            patterns = [re.escape(token) for token in search_tokens] # literal match
            labels   = list(search_tokens)
        label_to_token = dict(zip(labels,search_tokens)) # for user feedback
        if verbose>1: print(f'search_tokens = {labels}')

        ###########################################################
        # figure out which tokens are found in product_descriptor
        ###########################################################
        # one boolean Series per token, True where the token is found
        found_count_list = []
        for pattern in patterns:
            regex = re.compile(pattern,flags=re.IGNORECASE) # compile once per token
            found_count_list.append(
                self.product_descriptor.apply(lambda text:bool(regex.search(text)))
            )
        if verbose>1: print(f'found_count_list = {[x.sum() for x in found_count_list]}')

        # df of Trues/Falses
        found_count_df = pd.concat(found_count_list,axis=1)
        found_count_df.columns = labels
        if verbose>1: print(f'found_count_df = {found_count_df}')

        # number of tokens found per product
        found_count_series = found_count_df.sum(axis=1)
        found_count_series.name = 'tokens_found_count'
        if verbose>1: print(f'found_count_series = {found_count_series}')

        ###########################################################
        # token rarity
        ###########################################################
        # rare tokens are valued more:
        # found in 1 product (value ~= 1), in 50% (value = 0.5), in 100% (value = 0)
        token_found_count           = found_count_df.sum()
        if verbose>1: print('token_found_count');display(token_found_count)
        token_rarity                = 1 - token_found_count / len(found_count_df)
        if verbose>0: print('token_rarity');display(token_rarity)
        discounted_token_df         = found_count_df * token_rarity # weigh every hit with the value of its token
        if verbose>2: print(discounted_token_df)
        if verbose>2: print(discounted_token_df.max())
        discounted_token_series     = discounted_token_df.sum(axis=1)
        # rename df columns so result can exist in same df
        discounted_token_df.columns = ['discounted('+x+')' for x in discounted_token_df.columns]

        ###########################################################
        # give feedback if token not found
        ###########################################################
        unmatched_tokens = token_found_count[token_found_count==0]
        if verbose>1: print(f'unmatched_tokens = {unmatched_tokens}')

        # only give feedback if there are useless tokens
        if len(unmatched_tokens)>0:
            useless_tokens = ['\'' + label_to_token[label] + '\'' for label in unmatched_tokens.index]
            if verbose>1: display(useless_tokens)
            if len(useless_tokens)==1:
                helpful_string = useless_tokens[0]
            else:
                helpful_string = ', '.join(useless_tokens[:-1]) + ' and ' + useless_tokens[-1]

            # show helping string
            if exact_search:
                print(f'searchbot: no exact matches for {helpful_string}, try searching for something else')
            else:
                print(f'searchbot: no matches for {helpful_string}, try searching for something else')
            print()

        ###########################################################
        # make pretty return df
        ###########################################################
        # [ found count ] + [ which tokens found ] + [ weighted versions ] + [ searched text ]
        found_count_summary = pd.concat(
            [
                found_count_series.to_frame('tokens_matched'),
                found_count_df,
                discounted_token_series.to_frame('discounted(tokens_matched)'),
                discounted_token_df,
                self.product_descriptor.to_frame('product_descriptor'),
            ],
            axis=1
        )

        # snip results table to stuff which has at least SOME match
        found_count_summary = found_count_summary[found_count_summary['tokens_matched']>0]
        # best hits first, rarity weighted count breaks ties
        found_count_summary = found_count_summary.sort_values(
            ['tokens_matched','discounted(tokens_matched)'],
            ascending=False,
        )

        ###########################################################
        # results chat
        ###########################################################
        print(self._results_chat(len(found_count_summary)))

        ###########################################################
        # plot_top_n results
        ###########################################################
        if (plot_top_n is not None) and len(found_count_summary)>0:
            self.dataset.get_product_pictures(locs=found_count_summary.index[:plot_top_n])
            plt.show()

        ###########################################################
        # return
        ###########################################################
        if verbose>0:
            return found_count_summary.head(20)
        if plot_top_n is None:
            return found_count_summary # no limit requested
        return found_count_summary.head(plot_top_n)

In [91]:
# Scratch cell: manual smoke-test calls for TokenSearch.
# Everything sits inside a string literal, so nothing in this cell is executed;
# un-quote a call to run it by hand.
# NOTE(review): the trailing `None` presumably stops the notebook from echoing the string - confirm.
'''
ts=TokenSearch(dataset=Dataset())
#ts.tokens_found_count(search_tokens=['summer','short'],verbose=1) # basic logic
#ts.tokens_found_count(search_tokens=['harajuku','goth','sexy'],verbose=1) # test rare token logic
#ts.tokens_found_count(search_tokens=['kids','top','banana','bobby','henry'],verbose=0) # test bad search tokens
#ts.tokens_found_count(search_tokens=['kids','sandals'],verbose=1) # test
#ts.tokens_found_count(search_tokens=['balloon'],plot_top_n=5,verbose=1) # test plot_top_n
#ts.tokens_found_count(search_tokens=['balloon'],plot_top_n=5,verbose=1) # test results snipping
#ts.tokens_found_count(search_tokens=['banana'],plot_top_n=5,verbose=1) # test results chat#
#ts.tokens_found_count(search_tokens=['sandal'],plot_top_n=5,verbose=1) # test results chat
#ts.tokens_found_count(search_tokens=['sandal','red'],plot_top_n=5,verbose=1) # test results chat
#ts.tokens_found_count(search_string='sandal red',plot_top_n=5,verbose=1) # test results chat
#ts.tokens_found_count(search_tokens=['top','skinny','red'],plot_top_n=5,verbose=1) # test results chat
#ts.tokens_found_count(search_tokens=['top','skinny','red'],plot_top_n=5,exact_search=True,verbose=1) # test results chat
#ts.tokens_found_count(search_string='top red',plot_top_n=5,verbose=1) # test results chat
#ts.tokens_found_count(search_string='top red',plot_top_n=5,exact_search=True,verbose=1) # test exact search
#ts.tokens_found_count(search_string='exy',plot_top_n=5,verbose=1) # test exact search
#ts.tokens_found_count(search_string='exy',plot_top_n=5,exact_search=True,verbose=1) # test exact search
#ts.tokens_found_count(search_string='top red',plot_top_n=5,exact_search=True,verbose=1) # test exact search
#found_count_df,token_rarity=
#ts.tokens_found_count(search_string='hot banana skirt',plot_top_n=5,exact_search=True,verbose=1) # test exact search
#ts.tokens_found_count(search_string='skirt slim',plot_top_n=5,exact_search=False,verbose=0) # test exact search
ts.tokens_found_count(search_string='skirt slim',exact_search=False,verbose=0) # test exact search
'''
None