# IMPORTS

In [None]:
%run ipynb_setup.ipynb

In [None]:
%run class_def.ipynb

# IMPLEMENT TEXT SEARCH

In [None]:
import re
from typing import List

In [None]:
# Dataset is not defined in the visible cells; presumably it comes from the %run of class_def.ipynb above — confirm
d=Dataset()

In [None]:
# first row of d.raw, transposed so each column is listed as a row
d.raw.head(1).transpose()

In [None]:
# first two rows of d.df
d.df.head(2)

### build up regex

In [None]:
# case-insensitive match attempt 1: lower-case the titles, then test the alternation 'summer|hot'
d.df['title'].str.lower().str.contains('summer|hot')

In [None]:
# case-insensitive match attempt 2: leave the titles untouched and pass re.IGNORECASE instead
d.df['title'].str.contains('womens|me|dress|banana',flags=re.IGNORECASE,regex=True)

In [None]:
# side-by-side check: each title next to a 0/1 flag (bool + 0) telling whether it contains 'sans'; first 20 rows only
pd.concat(
    [
        d.df['title'],
        d.df['title'].str.contains('sans',flags=re.IGNORECASE,regex=True)+0
    ],
    axis=1,
)[:20]

### combine description / tag columns to do single regex on

In [None]:
# row-wise string concatenation of the three text columns
# NOTE: there is no separator between the fields, so the last word of one column fuses with the first word of the next
d.df[['title','title_orig','tags']].apply(lambda x:x['title']+x['title_orig']+x['tags'],axis=1)

# FUNCTIONS TO BUILD UP QUERY

### search string tokenizer

In [None]:
SEARCH_STRING = str
SEARCH_TOKENS = List[str]
def tokenize_search_string(
    search_string : SEARCH_STRING,
    exact_search  : bool = True,
    ) -> SEARCH_TOKENS :
    """Split a free-text search string into unique, lower-case regex tokens.

    The string is partitioned on runs of non-word characters, every piece is
    lower-cased, empty pieces are dropped and duplicates are removed while
    keeping the order of first appearance.

    Args:
        search_string: raw text typed by the user.
        exact_search: when True each token is wrapped in ``\\b`` word-boundary
            anchors so that e.g. 'red' does not match inside 'altered'.

    Returns:
        List of regex fragments, one per distinct word. The list is empty when
        the search string contains no word characters at all.
    """
    # partition string on non-alphanumeric chars (raw string: '\W' is not a valid str escape)
    pieces = re.split(r'\W+', search_string)
    # lower-case and drop the empty pieces re.split leaves for leading/trailing
    # separators - an empty token would match every product
    words = [piece.lower() for piece in pieces if piece]
    # uniques, in order of first appearance (a set would make the order arbitrary)
    words = list(dict.fromkeys(words))
    if exact_search:
        # force exact match, dont want to match 'red' from 'altered'
        return [r'\b' + word + r'\b' for word in words]
    return words

#bool(re.search('\\bsets\\b',merged_product_identifier_text()[0],flags=re.IGNORECASE))

# demo 1: messy input with punctuation, a tab, repeated words and digits/underscore
display(tokenize_search_string('absc#dd	ddd  ,#,      asd;f    asdf asdf asdf asdf asdf asdfsd000_22220'))
# demo 2: repeated words, default exact_search=True
display(tokenize_search_string('womens banana    dress dress me me me'))
# demo 3: same input without the word-boundary wrapping
display(tokenize_search_string('womens banana    dress dress me me me',exact_search=False))

### build up regex pattern from search string tokens

In [None]:
def search_tokens_to_re_pattern(search_tokens : SEARCH_TOKENS) -> str :
    """Combine search tokens into a single regex alternation.

    The tokens are joined with ``|`` exactly as given (no anchoring or
    escaping is added here), so the pattern matches text containing any one
    of them.
    """
    alternation_separator = '|'
    combined_pattern = alternation_separator.join(search_tokens)
    return combined_pattern

# resulting alternation pattern with the default exact_search=True tokens
display(search_tokens_to_re_pattern(tokenize_search_string('womens banana    dress dress me me me')))
# same search string with exact_search=False
display(search_tokens_to_re_pattern(tokenize_search_string('womens banana    dress dress me me me',exact_search=False)))

### simple way to get description + tags to regex over

In [None]:
# preview the four text columns that are merged into one searchable string below
d.df[['title','title_orig','tags','product_color']]

In [None]:
def merged_product_identifier_text() -> pd.Series :
    """Build one searchable text string per product.

    Reads the notebook-level dataset ``d`` and, for every row of ``d.df``,
    joins the title, title_orig, tags and product_color values with a single
    space, then replaces every non-word character with a space.

    Returns:
        Series of strings sharing the index of ``d.df``.
    """
    descriptor_colnames = ['title','title_orig','tags','product_color'] # cols of interest
    df_descriptors      = d.df[descriptor_colnames] # extract columns

    def _join_row(row) -> str:
        # missing values become '' rather than the literal text 'nan',
        # which would otherwise end up in the searchable string
        return ' '.join('' if pd.isna(value) else str(value) for value in row.values)

    merged_text = df_descriptors.apply(_join_row,axis=1) # merge with a space as sep
    merged_text = merged_text.str.replace(r'\W',' ',regex=True) # strip special chars
    return merged_text
# preview the merged text series
merged_product_identifier_text()

### count how many tokens found in each product description / tags

In [None]:
# weighted token search
# looking at all string product identifiers (title, title_orig, tags)
# return weighted value of tokens found (e.g. 3x 2x 1x)
def tokens_found_count(
    pd_series     : pd.Series,
    search_string : str,
    top_n         : int  = 5,
    exact_search  : bool = False,
    verbose       : int  = 0, # show workings
    ) -> pd.DataFrame :
    """Rank the texts in ``pd_series`` by how many search tokens they contain.

    The search string is tokenized with ``tokenize_search_string``; every
    token is regex-searched (case-insensitively) in every text. Per row the
    function reports the number of matched tokens and a rarity-discounted
    score in which a token found in few rows weighs close to 1 and a token
    found in every row weighs 0.

    Side effects: prints feedback about tokens that matched nothing and about
    small result sets, and plots the pictures of the ``top_n`` best rows via
    ``d.get_product_pictures`` followed by ``plt.show()``.

    Args:
        pd_series: one text string per product.
        search_string: free-text query.
        top_n: number of results to plot, and to return when ``verbose`` is 0.
        exact_search: forwarded to the tokenizer (whole-word matching).
        verbose: 0 returns only the ``top_n`` rows; >0 returns every matching
            row; >1 additionally displays intermediate values.

    Returns:
        DataFrame of the rows with at least one matched token, sorted by the
        number of matched tokens and then by the per-token match columns.
        Columns: discounted score, per-token discounted values, match count,
        per-token match flags and the searched text. When the query holds no
        usable token an empty frame (summary columns only) is returned.
    """
    ###########################################################
    # identify which tokens matched
    ###########################################################
    # strip tokens from search string
    search_tokens = tokenize_search_string(search_string=search_string,exact_search=exact_search)

    # plain word behind each token (the regex '\b' anchors removed); used for
    # user feedback and to spot degenerate tokens
    plain_words = {token: token.replace('\\b','') for token in search_tokens}
    # a token without any word in it ('' or '\b\b') would match every text, drop it
    search_tokens = [token for token in search_tokens if plain_words[token]]

    # nothing to search for: pd.concat cannot take an empty list, so bail out early
    if not search_tokens:
        print('results: I got nothing! T⌓T')
        return pd.DataFrame(
            columns=['discounted(tokens matched)','tokens matched',*pd_series.to_frame().columns]
        )

    # figure out which matched / get List[pd.Series(dtype=bool)]
    found_count_list = []
    for token in search_tokens:
        pattern = re.compile(token,flags=re.IGNORECASE) # compile once per token, not once per row
        found_count_list.append(
            pd_series.apply(lambda text,pattern=pattern:bool(pattern.search(text)))
        )

    # df of Trues/Falses
    found_count_df = pd.concat(found_count_list,axis=1)
    found_count_df.columns = search_tokens

    # series with the number of tokens found per row
    found_count_series = found_count_df.sum(axis=1)
    found_count_series.name = 'tokens_found_count'

    ###########################################################
    # token rarity
    ###########################################################
    # count rarity of token - rare tokens should be valued more
    token_found_count = found_count_df.sum()
    token_rarity = 1 - token_found_count / len(found_count_df) # rare tokens are valued more, only 1 instance (value ~= 1), 50% of products (value = 50%), 100% of products (value ~= 0)
    discounted_token_df = found_count_df * token_rarity # impact token found bool with value of token (between 0 and 1)
    discounted_token_series = discounted_token_df.sum(axis=1)
    discounted_token_df.columns = ['discounted('+x+')' for x in discounted_token_df.columns] # rename df columns so result can exist in same df

    ###########################################################
    # give feedback if token not found
    ###########################################################
    if verbose>1: print('token_found_count');display(token_found_count)
    unmatched_tokens = token_found_count[token_found_count==0]

    # only give feedback if there are useless tokens
    if len(unmatched_tokens)>0:
        # quote the plain words, the user should not see regex anchors
        useless_tokens = ['\'' + plain_words[token] + '\'' for token in unmatched_tokens.index]
        if verbose>1: display(useless_tokens)
        if len(useless_tokens)==1:
            helpful_string = useless_tokens[0]
        else:
            helpful_string = ', '.join(useless_tokens[:-1]) + ' and ' + useless_tokens[-1]

        # show helping string
        print(f'searchbot: no matches for {helpful_string}, try looking for something else')
        print()

    ###########################################################
    # make pretty return df
    ###########################################################
    # [ found count ] + [ which tokens found ] + [search string]
    found_count_summary = pd.concat(
        [
            discounted_token_series.to_frame('discounted(tokens matched)'),
            discounted_token_df,
            found_count_series.to_frame('tokens matched'),
            found_count_df,
            pd_series.to_frame()
        ],
        axis=1
    )

    # snip results table to stuff which has at least SOME match
    found_count_summary = found_count_summary[found_count_summary['tokens matched']>0]
    # sort on match count first, then on the individual token flags
    found_count_summary = found_count_summary.sort_values(['tokens matched']+search_tokens,ascending=False)

    ###########################################################
    # results chat
    ###########################################################
    if len(found_count_summary)==0:
        print('results: I got nothing! T⌓T')
    elif len(found_count_summary)==1:
        print('results: only 1 hit ￣ω￣, I hope it\'s what you wanted!')
    elif len(found_count_summary)<10:
        print(f'results: {len(found_count_summary)} items found, I can do better if you can be more specific')

    ###########################################################
    # plot top_n results
    ###########################################################
    if len(found_count_summary)>0:
        d.get_product_pictures(locs=found_count_summary.index[:top_n])
        plt.show()

    ###########################################################
    # return
    ###########################################################
    if verbose>0:
        return found_count_summary
    else:
        return found_count_summary.head(top_n)

#tokens_found_count(merged_product_identifier_text(),['summer','short'],verbose=1).head(10) # basic logic
#tokens_found_count(merged_product_identifier_text(),['harajuku','goth','sexy'],verbose=1).head(10) # test rare token logic
#tokens_found_count(merged_product_identifier_text(),['kids','top','banana','bobby','henry'],verbose=0).head(10) # test bad search tokens
#tokens_found_count(merged_product_identifier_text(),['kids','sandals'],verbose=0).head(10) # test
#tokens_found_count(merged_product_identifier_text(),['balloon'],top_n=5,verbose=0) # test top_n
#tokens_found_count(merged_product_identifier_text(),['balloon'],top_n=10,verbose=1).head(20) # test results snipping
#tokens_found_count(merged_product_identifier_text(),['banana'],top_n=10,verbose=1).head(20) # test results chat
#tokens_found_count(merged_product_identifier_text(),['sandal'],top_n=10,verbose=1).head(20) # test results chat
#tokens_found_count(merged_product_identifier_text(),['sandal','red'],top_n=10,verbose=1).head(20) # test results chat
#tokens_found_count(merged_product_identifier_text(),'sandal red',top_n=10,verbose=1).head(20) # test results chat
#tokens_found_count(merged_product_identifier_text(),['top','skinny','red'],top_n=10,verbose=1).head(20) # test results chat
#tokens_found_count(merged_product_identifier_text(),'top red',top_n=5,verbose=1).head(20) # test results chat
#tokens_found_count(merged_product_identifier_text(),'top red',top_n=5,exact_search=True,verbose=1).head(20) # test exact search
#tokens_found_count(merged_product_identifier_text(),'exy',top_n=5,verbose=1).head(20) # test exact search
#tokens_found_count(merged_product_identifier_text(),'exy',top_n=5,exact_search=True,verbose=1).head(20) # test exact search
# live smoke test: whole-word search for 'top' and 'red'; verbose=1 returns every matching row, so .head(20) caps the preview
tokens_found_count(merged_product_identifier_text(),'top red',top_n=5,exact_search=True,verbose=1).head(20) # test exact search