# [TLDR] PROGRESSIVELY BUILT UP A TEXT SEARCH
- main points include

1. input `search_string` broken up (tokenized) on `\W` special chars

2. can look for tokens in "product descriptor" either by (a) exact matching tokens OR (b) simple matching of tokens
- exact matching
  - pros
    - avoids matching the token `red` in strings like `prediction`, i.e. avoids spurious matches
  - cons
    - tags are not all space separated, so a token embedded in a run-together tag is never matched, i.e. cannot find `white` in `whitesandalsbeach`
- simple matching
  - pros and cons are the exact reverse of those for exact matching; hard to say which is better
  
3. "product descriptor" is a merge of `title`, `title_orig`, `tags` and `product_color`
- all seem useful
- `title` and `title_orig` should be similar, one is French the other is English translation, who knows which is better, i.e. use both
- `tags` seem very useful but format isn't standardized, some are space separated, others `;` separated, others NOT separated
- `product_color` marginally useful

4. given n tokens extracted from the `search_string`, trying to find matches for each token
- can rank products on the number of `n` token hits
- can also do a weighted token hit (based on the rarity of a token), i.e. rare tokens are stronger hits than common tokens, "goth" vs "summer"

5. search tokens that receive 0 hits on the product list are useless, can give hints to the user to try different search strings

6. top n text hits can be returned along with their pictures for debugging / logic checks

# IMPORTS

In [None]:
%run ipynb_setup.ipynb

In [None]:
%run class_def.ipynb

# IMPLEMENT TEXT SEARCH

In [None]:
import re
from typing import List

In [None]:
# build the dataset object used by every cell below
# (Dataset is presumably defined by the class_def.ipynb run above -- confirm)
d=Dataset()

In [None]:
# peek at the first record of d.raw, transposed so each field lands on its own row
d.raw.head(1).transpose()

In [None]:
# first two rows of d.df, the frame the search cells below query
d.df.head(2)

### build up regex

In [None]:
# case-insensitive search, take 1: lower-case the titles, then look for either word
# ('summer|hot' is a regex alternation)
d.df['title'].str.lower().str.contains('summer|hot')

In [None]:
# case-insensitive search, take 2: leave the titles alone and let the regex ignore case
# NOTE: this is substring matching, so 'me' also hits inside longer words such as 'summer'
d.df['title'].str.contains('womens|me|dress|banana',flags=re.IGNORECASE,regex=True)

In [None]:
# eyeball check: title next to a match flag for 'sans', first 20 rows only
# (+0 is there to show the boolean flag as a number)
pd.concat(
    [
        d.df['title'],
        d.df['title'].str.contains('sans',flags=re.IGNORECASE,regex=True)+0
    ],
    axis=1,
)[:20]

### combine description / tag columns to do single regex on

In [None]:
# first attempt at a single searchable string per product: glue the three text columns together
# NOTE: no separator is inserted, so the last word of one column fuses with the first word of the next
d.df[['title','title_orig','tags']].apply(lambda x:x['title']+x['title_orig']+x['tags'],axis=1)

# FUNCTIONS TO BUILD UP QUERY

### search string tokenizer

In [None]:
SEARCH_STRING = str
SEARCH_TOKENS = List[str]
def tokenize_search_string(
    search_string : SEARCH_STRING,
    ) -> SEARCH_TOKENS :
    """Split a free-text search string into unique, lower-cased tokens.

    The string is partitioned on runs of non-word characters (anything
    outside letters, digits and underscore). Tokens are returned in order
    of first appearance, so the result is reproducible between runs.

    Args:
        search_string: raw user input; ``None`` or ``''`` yields ``[]``.

    Returns:
        List of distinct, non-empty, lower-case tokens.
    """
    if not search_string:
        return [] # nothing to search for

    pieces = re.split(r'\W+',search_string) # partition string on non-alphanumeric chars
    lowered = [piece.lower() for piece in pieces] # set tokens to lower case

    # leading / trailing separators make re.split emit '' -- as a regex that
    # would match every product, so drop it; dict.fromkeys keeps first-seen order
    return [token for token in dict.fromkeys(lowered) if token]

def exactify_search_tokens(
    search_tokens : SEARCH_TOKENS,
    ) -> SEARCH_TOKENS :
    """Wrap every token in regex word boundaries so only whole words match.

    Without the boundaries 'red' would also hit inside 'altered'.
    Tokens are inserted as-is (no regex escaping).
    """
    boundary = '\\b'
    exact_tokens = []
    for plain_token in search_tokens:
        exact_tokens.append(boundary+plain_token+boundary)
    return exact_tokens

#bool(re.search('\\bsets\\b',make_product_descriptor()[0],flags=re.IGNORECASE))

# smoke tests: messy separators (tab, '#', ',', ';', runs of spaces) ...
display(tokenize_search_string('absc#dd	ddd  ,#,      asd;f    asdf asdf asdf asdf asdf asdfsd000_22220'))
# ... repeated words collapse to one token each ...
display(tokenize_search_string('womens banana    dress dress me me me'))
# ... and the same tokens wrapped in \b word boundaries for exact matching
display(exactify_search_tokens(tokenize_search_string('womens banana    dress dress me me me')))


### build up regex pattern from search string tokens

In [None]:
def search_tokens_to_re_pattern(search_tokens : SEARCH_TOKENS) -> str :
    """Fold the tokens into a single regex alternation, e.g. 'womens|dress|me'.

    Tokens are used verbatim; word boundaries, if wanted, must already have
    been added (see exactify_search_tokens).
    """
    alternation = '|'
    pattern = alternation.join(search_tokens)
    return pattern

# alternation pattern built from plain tokens (substring matching) ...
display(search_tokens_to_re_pattern(tokenize_search_string('womens banana    dress dress me me me')))
# ... and from \b-wrapped tokens (whole-word matching)
display(search_tokens_to_re_pattern(exactify_search_tokens(tokenize_search_string('womens banana    dress dress me me me'))))

### simple way to get description + tags to regex over

In [None]:
# the four text columns that get merged into the searchable product descriptor below
d.df[['title','title_orig','tags','product_color']]

In [None]:
COLNAME = str

def make_product_descriptor(
    descriptor_colnames : List[COLNAME] = None, # cols of interest, None -> title, title_orig, tags, product_color
    df                  : pd.DataFrame  = None, # frame to read from, None -> notebook-level d.df
    ) -> pd.Series :
    """Build one searchable text string per product.

    The chosen columns are stringified and joined with a single space, then
    every non-word character (punctuation, ';' tag separators, apostrophes,
    ...) is replaced by a space so tokens can be matched on word boundaries.

    Args:
        descriptor_colnames: columns to merge; defaults to
            ['title','title_orig','tags','product_color'].
        df: source DataFrame; defaults to the global ``d.df`` so existing
            zero-argument calls keep working.

    Returns:
        Series of descriptor strings, indexed like the source frame.
    """
    if descriptor_colnames is None:
        # built per call instead of using a shared mutable default list
        descriptor_colnames = ['title','title_orig','tags','product_color']
    if df is None:
        df = d.df # fall back to the notebook-level dataset

    descriptors = df[descriptor_colnames] # extract columns

    # a row-wise apply on an empty frame returns a DataFrame (no .str accessor),
    # so short-circuit to an empty Series
    if len(descriptors.index)==0:
        return pd.Series([],index=descriptors.index,dtype=object)

    # merge with a single space as separator; str() so NaN / numeric cells do not break the join
    descriptors = descriptors.apply(lambda row:' '.join(str(value) for value in row.values),axis=1)
    descriptors = descriptors.str.replace(r'\W',' ',regex=True) # strip special chars
    return descriptors
# show the merged descriptors built with the default columns
make_product_descriptor()

### count how many tokens found in each product description / tags

In [None]:
SEARCH_STRING = str
TOKENS        = str

# weighted token search
# looking at all string product identifiers (title, title_orig, tags)
# return weighted value of tokens found (e.g. 3x 2x 1x)
def tokens_found_count(
    product_descriptor : pd.Series,
    search_tokens      : List[TOKENS]  = None,
    search_string      : SEARCH_STRING = None,
    exact_search       : bool          = False,
    show_top_n         : int           = 5,
    verbose            : int           = 0, # show workings
    ) -> pd.DataFrame :
    """Rank products by how many search tokens occur in their descriptor.

    Each token is run as a case-insensitive regex against every descriptor.
    Besides the plain hit count, a rarity-discounted score is computed in
    which a token found in few products weighs more than a common one.
    Chatty feedback is printed for tokens with no hits and for the size of
    the result set.

    Args:
        product_descriptor: one searchable string per product.
        search_tokens: tokens to look for; takes precedence over
            ``search_string``. Empty and duplicate tokens are ignored.
        search_string: raw query, tokenized with ``tokenize_search_string``
            when ``search_tokens`` is not given.
        exact_search: wrap tokens in ``\\b`` so only whole words match.
        show_top_n: number of top hits whose pictures are shown via
            ``d.get_product_pictures``, and the number of rows returned when
            ``verbose`` is 0; a falsy value skips the pictures.
        verbose: 0 returns ``show_top_n`` rows, >0 returns up to 20 rows,
            >1 additionally displays the per-token hit counts.

    Returns:
        DataFrame of matching products sorted by tokens matched (descending)
        with columns: discounted score, per-token discounted hits, tokens
        matched, per-token hit flags and the product descriptor.

    Raises:
        TypeError: if neither ``search_tokens`` nor ``search_string`` is given.
    """
    ###########################################################
    # get search tokens
    ###########################################################
    if search_tokens is None:
        if search_string is None:
            raise TypeError('tokens_found_count needs either search_tokens or search_string')
        search_tokens = tokenize_search_string(search_string=search_string) # strip them from search_string if search_tokens NOT provided

    # drop empty tokens (an empty regex matches every product) and duplicates
    # (duplicate column labels make the sort_values call below fail)
    search_tokens = [token for token in dict.fromkeys(search_tokens) if token]

    nothing_found_message = 'results: I got nothing! T⌓T'
    if not search_tokens:
        # nothing usable to search for: pd.concat would raise on an empty list
        print(nothing_found_message) # unhappy
        return pd.DataFrame(columns=['discounted(tokens matched)','tokens matched','product_descriptor'])

    ###########################################################
    # amend tokens to exactify search on request
    ###########################################################
    if exact_search:
        search_tokens = exactify_search_tokens(search_tokens)

    ###########################################################
    # figure out which tokens are found in product_descriptor
    ###########################################################
    # builds a List[pd.Series] of bools, each entry denoting if the nth token is found (exact or simple matching)
    found_count_list = []
    for token in search_tokens:
        pattern = re.compile(token,flags=re.IGNORECASE) # compile once per token, not once per product
        found_count_list.append(
            product_descriptor.apply(lambda text:bool(pattern.search(text)))
        )

    # df of Trues/Falses
    found_count_df = pd.concat(found_count_list,axis=1)
    found_count_df.columns = search_tokens

    # number of distinct tokens found per product
    found_count_series = found_count_df.sum(axis=1)
    found_count_series.name = 'tokens_found_count'

    ###########################################################
    # token rarity
    ###########################################################
    # count rarity of token - rare tokens should be valued more
    token_found_count = found_count_df.sum()
    token_rarity = 1 - token_found_count / len(found_count_df) # rare tokens are valued more, only 1 instance (value ~= 1), 50% of products (value = 50%), 100% of products (value ~= 0)
    discounted_token_df = found_count_df * token_rarity # impact token found bool with value of token (between 0 and 1)
    discounted_token_series = discounted_token_df.sum(axis=1)
    discounted_token_df.columns = ['discounted('+x+')' for x in discounted_token_df.columns] # rename df columns so result can exist in same df

    ###########################################################
    # give feedback if token not found
    ###########################################################
    if verbose>1:
        print('token_found_count')
        display(token_found_count)
    unmatched_tokens = token_found_count[token_found_count==0]

    # only give feedback if there are useless tokens
    if len(unmatched_tokens)>0:
        useless_tokens = ['\'' + s + '\'' for s in unmatched_tokens.index]
        if verbose>1: display(useless_tokens)
        if len(useless_tokens)==1:
            helpful_string = useless_tokens[0]
        else:
            helpful_string = ', '.join(useless_tokens[:-1]) + ' and ' + useless_tokens[-1]

        # show helping string
        if exact_search:
            helpful_string = helpful_string.replace('\\b','') # hide the regex word-boundary markers from the user
            print(f'searchbot: no exact matches for {helpful_string}, try searching for something else')
        else:
            print(f'searchbot: no matches for {helpful_string}, try searching for something else')
        print()

    ###########################################################
    # make pretty return df
    ###########################################################
    # [ found count ] + [ which tokens found ] + [search string]
    found_count_summary = pd.concat(
        [
            discounted_token_series.to_frame('discounted(tokens matched)'),
            discounted_token_df,
            found_count_series.to_frame('tokens matched'),
            found_count_df,
            product_descriptor.to_frame('product_descriptor')
        ],
        axis=1
    )

    # snip results table to stuff which has at least SOME match
    found_count_summary = found_count_summary[found_count_summary['tokens matched']>0]
    # sort on hit count first, then on the individual token flags in the order given
    found_count_summary = found_count_summary.sort_values(['tokens matched']+search_tokens,ascending=False)

    ###########################################################
    # results chat
    ###########################################################
    n_results = len(found_count_summary)
    if n_results==0:
        print(nothing_found_message) # unhappy
    elif n_results==1:
        print('results: only 1 hit ￣ω￣, I hope it\'s what you wanted!') # unsure
    elif n_results<=5:
        print(f'results: I only got {n_results} results, see anything you like?') # tight search
    elif n_results<=10:
        print(f'results: {n_results} results found') # normal
    else:
        print(f'results: {n_results} items found, I can do better if you can be more specific') # should do better

    ###########################################################
    # show_top_n results
    ###########################################################
    if show_top_n and n_results>0:
        d.get_product_pictures(locs=found_count_summary.index[:show_top_n])
        plt.show()

    ###########################################################
    # return
    ###########################################################
    if verbose>0:
        return found_count_summary.head(20)
    else:
        return found_count_summary.head(show_top_n)

#tokens_found_count(product_descriptor=make_product_descriptor(),search_tokens=['summer','short'],verbose=1) # basic logic
#tokens_found_count(product_descriptor=make_product_descriptor(),search_tokens=['harajuku','goth','sexy'],verbose=1) # test rare token logic
#tokens_found_count(product_descriptor=make_product_descriptor(),search_tokens=['kids','top','banana','bobby','henry'],verbose=0) # test bad search tokens
#tokens_found_count(product_descriptor=make_product_descriptor(),search_tokens=['kids','sandals'],verbose=1) # test
#tokens_found_count(product_descriptor=make_product_descriptor(),search_tokens=['balloon'],show_top_n=5,verbose=1) # test show_top_n
#tokens_found_count(product_descriptor=make_product_descriptor(),search_tokens=['balloon'],show_top_n=5,verbose=1) # test results snipping
#tokens_found_count(product_descriptor=make_product_descriptor(),search_tokens=['banana'],show_top_n=5,verbose=1) # test results chat#
#tokens_found_count(product_descriptor=make_product_descriptor(),search_tokens=['sandal'],show_top_n=5,verbose=1) # test results chat
#tokens_found_count(product_descriptor=make_product_descriptor(),search_tokens=['sandal','red'],show_top_n=5,verbose=1) # test results chat
#tokens_found_count(product_descriptor=make_product_descriptor(),search_string='sandal red',show_top_n=5,verbose=1) # test results chat
#tokens_found_count(product_descriptor=make_product_descriptor(),search_tokens=['top','skinny','red'],show_top_n=5,verbose=1) # test results chat
#tokens_found_count(product_descriptor=make_product_descriptor(),search_tokens=['top','skinny','red'],show_top_n=5,exact_search=True,verbose=1) # test results chat
#tokens_found_count(product_descriptor=make_product_descriptor(),search_string='top red',show_top_n=5,verbose=1) # test results chat
#tokens_found_count(product_descriptor=make_product_descriptor(),search_string='top red',show_top_n=5,exact_search=True,verbose=1) # test exact search
#tokens_found_count(product_descriptor=make_product_descriptor(),search_string='exy',show_top_n=5,verbose=1) # test exact search
#tokens_found_count(product_descriptor=make_product_descriptor(),search_string='exy',show_top_n=5,exact_search=True,verbose=1) # test exact search
# live run: whole-word search for 'top' and 'red', pictures for the top 5 hits, up to 20 rows returned (verbose=1)
tokens_found_count(product_descriptor=make_product_descriptor(),search_string='top red',show_top_n=5,exact_search=True,verbose=1) # test exact search