# [TLDR] given several `home planets`, we simultaneously look outwards from all home worlds to find the `nearest` new `neighbouring planets` to our civilization

- `home planets`
  - they denote products that match our search tokens
  - as they match on search tokens I consider them pretty good starting locations to look for new products from
  - these new products don't need to have token matches, just to be "similar"
- `neighbouring planets`
  - they denote products that didn't satisfy our original search token request
  - being "close" in numerical space is probably not a bad place to look for additional products
- `nearest`
  - this is nearness in a numerical sense, i.e. it makes use of the numerical attributes in our dataset
  - numerical attributes deemed to be useful (in some sense) are cleaned up and normalized so that a numeric metric for nearness can be applied to them
    - NaNs are imputed (with the population mean, to avoid throwing away products)
    - log-transformations are done to make the data more normalized
    - means are stripped and variances normalized to put all attributes into a similar scale (as the numeric metric is sensitive to scale)

# IMPORTS

In [None]:
%run ipynb_setup.ipynb

In [None]:
%run class_Dataset.ipynb

In [None]:
from inspect import currentframe 
from typing import List
from sklearn.neighbors import NearestNeighbors

# CLASS DEF

In [None]:
class NeighbourSearch():
    """Nearest-neighbour lookup over the numeric attributes of a dataset.

    A ``NearestNeighbors`` model is fitted once on ``dataset.df_num``. The
    methods then query that model starting from a set of rows (``locs``) and
    merge the per-row answers into a single distance-sorted ``pd.Series``.
    """

    def __init__(
        self,
        dataset   : "Dataset",
        verbose   : int = 0,
        ) -> None :
        """Store the dataset and fit the neighbour model on ``dataset.df_num``.

        Args:
            dataset: object exposing ``df`` and ``df_num``.
            verbose: values greater than 0 enable debug printing.
        """
        self.dataset   = dataset
        self.model     = NearestNeighbors().fit(self.dataset.df_num)
        self.verbose   = verbose


    # given list of locs, return a larger list of size `n` x len(locs) of locs that are closest to original locs
    def get_n_nearest_from_locs(
        self,
        n_nearest : int,
        locs      : List[int],
        ) -> pd.Series :
        """Return the ``n_nearest`` neighbours of every row in ``locs``.

        Args:
            n_nearest: number of neighbours requested per starting row.
            locs: labels of the starting rows, looked up with ``df_num.loc``.

        Returns:
            A Series of distances sorted ascending, with
            ``n_nearest * len(locs)`` entries. Its index holds the neighbour
            indices returned by the model; the same index may appear more
            than once. An empty Series is returned when ``locs`` is empty.
        """
        if self.verbose > 0:
            fn_name = currentframe().f_code.co_name
            print(f'[{fn_name}] n_nearest = {n_nearest}')
            print(f'[{fn_name}] locs = {locs}')
            print(f'[{fn_name}] df_num = {self.dataset.df_num}, type = {type(self.dataset.df_num)}')

        # nothing to search from: avoid querying the model with zero rows
        if len(locs) == 0:
            return pd.Series(dtype=float)

        # get nearest_n neighbors from each element of locs
        # NOTE(review): `locs` are used as labels here, while the indices coming
        # back from `kneighbors` are presumably row positions of the fitted data;
        # the two only agree if `df_num` has a default 0..n-1 index -- TODO confirm.
        distances, indices = self.model.kneighbors(
            self.dataset.df_num.loc[locs],
            n_neighbors = n_nearest,
        )

        # merge results into single vector, ordered from nearest to furthest,
        # because we want the globally nearest to the origin family
        nearest_results = pd.Series(
            np.ravel(distances),
            index = np.ravel(indices),
        ).sort_values(ascending=True)

        # return
        return nearest_results

    # incrementally grab more nearest neighbours from `locs` until we get at least as many as we need
    def nearby_results(
        self,
        locs        : List[int],
        min_results : int,
        ) -> pd.Series :
        """Widen the search around ``locs`` until ``min_results`` rows are found.

        The number of neighbours requested per starting row grows by one per
        iteration until the number of distinct neighbours reaches
        ``min_results`` (clamped to the size of the dataset).

        Args:
            locs: labels of the starting rows.
            min_results: minimum number of distinct results wanted.

        Returns:
            A Series of distances sorted ascending, one entry per distinct
            neighbour, keeping the smallest distance to any starting row.
            The starting rows themselves are included. An empty Series is
            returned when ``locs`` is empty.
        """
        # no starting points means there is nothing to expand from
        if len(locs) == 0:
            return pd.Series(dtype=float)

        # protect input: we can never return more rows than exist, nor ask the
        # model for more neighbours than it was fitted on
        n_available = min(len(self.dataset.df), len(self.dataset.df_num))
        min_results = min(min_results, n_available)

        # look outwards and find nearest results (including self) until we get
        # at least what we need; always query once so that a Series is returned
        # even when `locs` alone already satisfies `min_results`
        n_nearest = 1
        while True:
            # get next nearest result from each starting loc
            res = self.get_n_nearest_from_locs(
                n_nearest = n_nearest,
                locs      = locs,
            )

            # the same neighbour can be found from several starting locs:
            # keep it once, with its smallest distance
            res_unique = res.groupby(res.index).min()

            if self.verbose>0: print(f'[{currentframe().f_code.co_name}] n_nearest = {n_nearest}, len(res) = {len(res)}, len(res_unique) = {len(res_unique)}, min_results = {min_results}')

            if len(res_unique) >= min_results:
                break

            # prep for next loop
            n_nearest = n_nearest + 1

        # return results sorted by distance away from original locs
        return res_unique.sort_values()

In [None]:
#'''
# Smoke test: build a Dataset, fit the neighbour search on it and run a few queries.
d=Dataset()
neighbour_search=NeighbourSearch(dataset=d,verbose=0)
# single neighbour per starting row
neighbour_search.get_n_nearest_from_locs(n_nearest=1,locs=[172, 375, 1307, 1362, 1483, 1485])
# growing numbers of starting rows and neighbours per row
neighbour_search.get_n_nearest_from_locs(n_nearest=5,locs=[0,1,2])
neighbour_search.get_n_nearest_from_locs(n_nearest=5,locs=[0,1,2,3,4,5,6])
neighbour_search.get_n_nearest_from_locs(n_nearest=100,locs=[0,1,2,3,4,5,6])
# large min_results: presumably exceeds the dataset size and so exercises the clamp -- TODO confirm
neighbour_search.nearby_results(locs=[0,2,4,6],min_results=10000) # ensure while loop finishes
# small min_results; as the last expression in the cell, its value is what the notebook displays
neighbour_search.nearby_results(locs=[0,2,4,6],min_results=10) # ensure while loop finishes
#'''
#None

In [None]:
#neighbour_search.get_n_nearest_from_locs(n_nearest=2,locs=d.df.index[:4])
#neighbour_search.nearby_results(locs=d.df.index[:4],min_results=20) # ensure while loop finishes