# [TLDR] use `token_search`, `nearby_search` and `image_search` together to return products to the user

# [LONGER VERSION]
- the approach taken is as follows
  1. [token_results] look for products by matching the user-provided `search_string` tokens to the `product_descriptor`
    - `product_descriptor` is made by concatenating `title`, `title_orig`, `tags` and `product_color`
    - [token_results] are ordered by their "highest PROPORTION of 4 and 5 star ratings"; doing so avoids the following problems
      - results could be dominated by old products with lots of reviews, both good and bad (i.e. highest COUNT of 4 and 5 star ratings would skew to returning veteran products)
      - results could be dominated by old products with lots of `units_sold` (i.e. lots sold could simply mean the product has been sold for a long time)
    - considering "highest PROPORTION of 4 and 5 star ratings" is good as
      - it allows new products that trend to get to the top of the list

  2a. if there are at least "min results count" [token_results] we're done (e.g. 25 [token_results] were found and we only need to show 20 results)
  
  2b. otherwise we grab new products considering [nearby_results]
    - [nearby_results] are new products that are similar to [token_results] in numerical space
    - they are found by grabbing `kNeighbors` from our [token_results]
    - new results don't care about search string tokens
    - we can keep adding to [nearby_results] by increasing the `n_nearest` considered from [token_results] until the "min results count" is reached
    - [nearby_results] are ordered by taking each [nearby_results] shortest distance to a [token_results]
    
  3a. if the `spotlight` parameter is NOT specified in the `query` we simply return "min results count" results by stacking the ordered [token_results] on top of the ordered [nearby_results]
  
  3b. otherwise [image_search] is used to order [nearby_results] by their `psnr` measured against the `spotlight` product
    - the `spotlight` product must be specified by the user
    - it only considers the distance of [nearby_results] from the `spotlight` product and NOT [token_results]
    - more [nearby_results] are found so as to give more options for the [image_search] to act on


# IMPORTS

In [None]:
%run ipynb_setup.ipynb

In [None]:
%run class_Dataset.ipynb

In [None]:
%run class_TokenSearch.ipynb

In [None]:
%run class_NeighbourSearch.ipynb

In [None]:
%run class_ImageSearch.ipynb

# CLASS DEF

In [None]:
class ProductSearch():
    """Product search combining token matching, nearest-neighbour padding and image ordering.

    `query` first looks for products matching the search string tokens. When
    too few are found, the list is padded with numerically nearby products,
    which are ordered either by their distance to the token matches or, when a
    `spotlight` product is given, by image similarity (PSNR) to that product.
    """

    def __init__(
        self,
        ) -> None :
        """Build the dataset and the three searchers that share it."""
        self.dataset         = Dataset() # initialize Wish dataset
        self.token_search    = TokenSearch(dataset=self.dataset) # prep token searcher
        self.neighbor_search = NeighbourSearch(dataset=self.dataset) # prep nearest neighbor searcher
        self.image_search    = ImageSearch(dataset=self.dataset) # prep image searcher

    # this gives top rated products precedence over units sold, allowing good products to trend rather than old products staying at the top
    def reorder_on_top_ratings(
        self,
        res : pd.DataFrame,
        ) -> pd.DataFrame :
        """Return `res` with rows sorted by descending share of 4 and 5 star ratings.

        The share is `(rating_five_count + rating_four_count) / rating_count`.
        Rows whose share is NaN (e.g. 0 ratings out of 0) are placed last by
        `sort_values`.
        """
        top_rating_pctage = (res['rating_five_count']+res['rating_four_count'])/res['rating_count']
        # stable sort so that products with an equal share keep their incoming order (deterministic output)
        top_rating_pctage = top_rating_pctage.sort_values(ascending=False, kind='mergesort')
        return res.loc[top_rating_pctage.index,:]

    def chatbot(
        self,
        res : pd.DataFrame,
        ) -> None:
        """Print a short message describing how many token results were found."""
        n_hits = len(res)
        if n_hits==0:    print('[token results]: I got nothing! T⌓T') # unhappy
        elif n_hits==1:  print('[token results]: only 1 hit ￣ω￣, I hope it\'s what you wanted!') # unsure
        elif n_hits<=5:  print(f'[token results]: {n_hits} results, see anything you like?') # tight search
        else: print(f'[token results]: {n_hits} results found') # normal

    # find products that match purely on tokens + sort them on "top ratings" (allow new top rated products to trend rather than old products with many units_sold dominating results)
    def query(
        self,
        search_string         : str,
        exact_match           : bool  = False,
        case_sensitive        : bool  = False,
        required_results      : int   = 20,  # number of results to return, token matching results come first
        spotlight             : int   = None, # for ImageSearch
        spotlight_extra_ratio : float = 2,    # ratio vs `required_results` of additional results gathered for the image search to refine
        show_results          : bool  = True,
        chatbot               : bool  = True,
        verbose               : int   = 0,
        ) -> pd.DataFrame :
        """Search for products and return up to `required_results` of them.

        Parameters
        ----------
        search_string : text passed to the token searcher.
        exact_match, case_sensitive : forwarded unchanged to the token searcher.
        required_results : number of results wanted; capped at the dataset size.
        spotlight : optional product label (must be in `self.dataset.df.index`);
            when given and padding is needed, padded results are ordered by
            PSNR against this product. An unknown label is reported and ignored.
        spotlight_extra_ratio : multiplier (clamped to at least 1) applied to
            `required_results` when asking for nearby results for the image search.
        show_results : when true, hand the final results to `self.dataset.show_top_n`.
        chatbot : when true, print a message about the token result count.
        verbose : values above 2 display intermediate results.

        Returns
        -------
        The selected rows, token results first (ordered by top ratings) followed
        by padded nearby results; `None` when no token result is found.
        """
        # validate input
        if spotlight is not None and spotlight not in self.dataset.df.index:
            print(f'spotlighted product {spotlight} not recognized, please try again with a known product ID')
            spotlight = None

        # protect input: cannot return more products than the dataset holds
        required_results = min(required_results, len(self.dataset.df))

        ###########################################################
        # step 1 = get token_matching_results
        ###########################################################
        token_results = self.token_search.tokens_found(
            search_tokens  = None,
            search_string  = search_string,
            exact_match    = exact_match,
            case_sensitive = case_sensitive,
            verbose        = verbose,
        )
        if verbose>2: print('token_results');display(token_results)

        ###########################################################
        # chatbot
        ###########################################################
        if chatbot: self.chatbot(token_results)

        if len(token_results) == 0:
            print('search too specific')
            return None

        ###########################################################
        # step 2 = ensure we have enough results
        ###########################################################
        if len(token_results) >= required_results:
            # enough token results already: snip, then sort the kept rows on "high ratings"
            # NOTE(review): snipping happens BEFORE the rating sort, so which rows are kept depends on the
            # order returned by the token searcher - confirm this is intended rather than sort-then-snip
            final_results = self.reorder_on_top_ratings(token_results[:required_results])
        else:
            use_image_order = spotlight is not None

            # with a spotlight product, gather more candidates than asked for so the image search has options
            if use_image_order:
                min_results = int(required_results * max(spotlight_extra_ratio, 1))
            else:
                min_results = int(required_results)

            # use nearest neighbour in numerical space to get more results regardless of token matching
            padded_results_distances = self.neighbor_search.nearby_results(
                locs        = token_results.index, # token_results are source to search from
                min_results = min_results,
            )
            if verbose>2: print('padded_results_distances');display(padded_results_distances)

            # (a) for token_results, use "high ratings"
            token_results_order = list(self.reorder_on_top_ratings(token_results).index)

            # (b) for nearby results, only non 0 distance entries count (0 distance = the token_results themselves)
            if use_image_order:
                nearby_distances = padded_results_distances
            else:
                nearby_distances = padded_results_distances[:required_results] # trim to required number
            nearby_results_order = list(nearby_distances[nearby_distances!=0].index)
            print(f'[nearby results]: {len(nearby_results_order)} results added')

            if use_image_order and nearby_results_order:
                psnr = self.image_search.img_similarity_tgt_locs(
                    plot_src  = False,
                    grayscale = True,
                    blur      = True,
                    src_loc   = spotlight,
                    tgt_locs  = nearby_results_order,
                )
                psnr = psnr['psnr']
                psnr = psnr[np.isfinite(psnr)] # remove comparisons with self, i.e. inf's
                psnr = psnr.sort_values(ascending=False)

                # order nearby results by image similarity to the spotlight (most similar first)
                # NOTE(review): assumes the psnr result is indexed by product label (the `tgt_locs`) - confirm against ImageSearch
                candidates = set(nearby_results_order)
                nearby_results_order = [loc for loc in psnr.index if loc in candidates]

            # (c) token_results first then nearby_results after
            padded_results_final_order = token_results_order + nearby_results_order
            padded_results_final_order = padded_results_final_order[:required_results] # trim to required number
            if use_image_order:
                print(f'[image ordering]: {len(padded_results_final_order)} final results')

            # extract df rows in the final order
            final_results = self.dataset.df.loc[padded_results_final_order]

        ###########################################################
        # step 3 = show_top_n results
        ###########################################################
        if show_results: self.dataset.show_top_n(final_results,n=required_results)

        return final_results

In [None]:
#ps=ProductSearch()
#ps.query('harajuku',required_results=20)
#res=ps.query('harajuku',spotlight_extra_ratio=0.1)
#ps.query('harajuku beach')
#ps.query('harajuku pop')
#ps.query('dress')
#ps.query('dress beach')
#ps.query('dress beach flower')
#ps.query('dress beach flower sleeve')
#ps.query('dress beach flower blue')
#ps.query('dress beach flower blue',spotlight=1225) # good spotlight
#ps.query('dress beach flower blue',spotlight='asdf') # bad spotlight