# [TLDR] simple class to gather all data related actions in one place

# [LONGER VERSION]
- avoid placing data manipulations everywhere
- data exploration is done mainly in `exploration.ipynb` (and partly in other notebooks), but production logic is gathered here
  - `preselected_columns` of useful data is specified here
  - numerical data stripping, cleaning and normalizing is done here
     1. duplicate listings (rows) removed (identified in `img_features` after seeing duplicate images)
     2. missing data imputed with the population mean (identified in `exploration.ipynb` after sklearn `fit_transform` calls failed)
     3. log-transformation to make the data more normally distributed is done here (reasoning in `exploration.ipynb` after looking at Q-Q plots)
     4. `StandardScaler` is done here (to bring data into a useful space for `PCA` and `kneighbors`, among other techniques)
     5. `PCA` extraction is done here (to reduce data dimensionality, ameliorate outliers and to speed up processing)
- `product_picture` tools are gathered here
  1. scraping of images from the web
  2. building of a local cache to avoid repeatedly hitting www.wish.com
  3. `plt.imshow` plotting of `product_picture`

# IMPORTS

In [None]:
%run ipynb_setup.ipynb

In [None]:
import os
from pathlib import Path
from typing import Tuple,Dict,List

import cv2
from urllib.request import urlretrieve
from joblib import Parallel, delayed

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

# CLASS DEF

In [None]:
IMG                   = np.ndarray      # cv2.imread return type

class Dataset():
    '''
    Single place for all data related actions on the sales csv.

    On construction the csv named by `csv` is read from the current working
    directory and three views are stored on the instance:

    - `raw`    : the csv exactly as read
    - `df`     : `raw` restricted to `preselected_columns`, duplicate rows dropped
    - `df_num` : numeric columns of `df` after imputing / log / scaling / PCA
                 (see `df_numeric` for the individual steps)

    The class also downloads, caches (under `<cwd>/cache`) and plots the
    images referenced by the `product_picture` column.
    '''

    csv = 'Sales Of Summer Cloths.csv' # put csv into jupyter notebook root dir
    
    preselected_columns = [
        'title',
        'title_orig',
        'price',
        'retail_price',
        'units_sold',
        'rating',
        'rating_count',
        'rating_five_count',
        'rating_four_count',
        'rating_three_count',
        'rating_two_count',
        'rating_one_count',
        'tags',
        'product_color',
        'merchant_rating_count',
        'merchant_rating',
        'product_picture',
    ]
    
    def __init__(
        self,
        ) -> None :
        '''Read the csv from the current working directory and build `raw`, `df` and `df_num`.'''
        
        # setup directories
        self.cwd      = Path(os.getcwd())  # directory the csv is read from
        self.cachedir = self.cwd / 'cache' # cache dir path

        # read data
        self.raw     = self.read_csv()   # raw dataset
        self.df      = self.clean_df()   # cleaned up dataset
        self.df_num  = self.df_numeric() # transformed / scaled / pca'ed dataset
        
    # simple read
    def read_csv(self) -> pd.DataFrame :
        '''Return the csv found at `<cwd>/<csv>` as a DataFrame.'''
        # `self.csv` (rather than `Dataset.csv`) so a subclass can point at another file
        return pd.read_csv(self.cwd / self.csv) # read raw df
    
    # remove columns deemed to be useless + duplicate listings
    def clean_df(
        self,
        ) -> pd.DataFrame :
        '''
        Return `raw` restricted to `preselected_columns` with duplicate rows dropped.
        The original index labels are kept, so `loc` values stay valid against `raw`.
        '''
        # selecting with a list of columns already yields a new frame, `raw` is left untouched
        df = self.raw.loc[:,self.preselected_columns] # subset to columns deemed useful
        return df.drop_duplicates() # remove duplicate listings

    # extract df of numeric features + populate nans
    def df_numeric(
        self,
        should_impute_nans      : bool  = True,  # rather than trashing entry, populate missing ratings with population mean
        should_log_transform    : bool  = True,  # make data distribution more normal like
        should_standard_scale   : bool  = True,  # N(0,1) the data
        should_minmax_scale     : bool  = False,
        should_minmax_scale_abs : float = 2,
        should_pca              : bool  = True,  # reduce dimensionality to speed up analysis
        should_pca_components   : float = 6,    # elbow / eyeballed optimal principal components to use
        ) -> pd.DataFrame :
        '''
        Build the numeric feature frame from the int64 / float64 columns of `df`.

        Steps are applied in this fixed order, each one only if its flag is set:
        1. impute NaNs with the column mean (`SimpleImputer`)
        2. `log(x + 1.1)` transform
        3. `StandardScaler`
        4. `MinMaxScaler` to the range [-should_minmax_scale_abs, +should_minmax_scale_abs]
        5. `PCA` with `should_pca_components` components; columns become 'ev0', 'ev1', ...

        The index of `df` is preserved through every step.
        '''
        # strip
        df_num = self.df.select_dtypes(include=['int64','float64']).copy()

        # every sklearn transformer returns a bare ndarray, so re-wrap with the original labels
        def rewrap(values) -> pd.DataFrame:
            return pd.DataFrame(values,index=df_num.index,columns=df_num.columns)
        
        # impute nans
        if should_impute_nans:
            imp = SimpleImputer(missing_values=np.nan, strategy='mean')
            df_num = rewrap(imp.fit_transform(df_num))
    
        # apply log transform
        if should_log_transform:
            df_num = np.log(df_num+1.1) # offset keeps zero-valued counts finite
            
        # apply standard scaler
        if should_standard_scale:
            df_num = rewrap(StandardScaler().fit_transform(df_num))
            
        # apply min max scaler
        if should_minmax_scale:
            mm_scaler = MinMaxScaler(feature_range=(-should_minmax_scale_abs,should_minmax_scale_abs))
            df_num = rewrap(mm_scaler.fit_transform(df_num))
        
        # apply pca
        if should_pca:
            pca = PCA(n_components=should_pca_components)
            reduced = pca.fit_transform(df_num)
            df_num = pd.DataFrame(reduced,index=df_num.index)
            df_num.columns = ['ev'+str(x) for x in df_num.columns] # rename columns

        # return
        return df_num

    def url_to_cache_filepath(
        self,
        url : str, # = 'https://contestimg.wish.com/api/webimage/5e9ae51d43d6a96e303acdb0-medium.jpg',
        ) -> Path :
        '''Return the cache location for `url`: `cachedir` joined with the last path component of the url.'''
        return self.cachedir / Path(url).name

    # make sure the jpg behind url exists in the local cache
    def _ensure_cached(
        self,
        url     : str,     # image url to download if not cached yet
        verbose : int = 0, # give details
        ) -> Path :
        '''
        Download `url` into the cache directory unless it is already there and
        return the local file path. Raises ValueError if `url` is not a non-empty string
        (e.g. a NaN coming from a missing `product_picture`).
        '''
        if not isinstance(url,str) or not url:
            raise ValueError(f'expected a non-empty url string, got {url!r}')

        local_filepath = self.url_to_cache_filepath(url) # target location
        if local_filepath.exists():
            if verbose>0: print(f'read {local_filepath}')
        else:
            # exist_ok avoids FileExistsError when several workers create the dir at once
            self.cachedir.mkdir(parents=True,exist_ok=True)
            urlretrieve(url,local_filepath) # populate local cache each time url / jpg is requested
            if verbose>0: print(f'cache {url}')
        return local_filepath

    # return jpg associated with url in product_picture
    def get_product_picture(
        self,
        url        : str            = None,  # 'https://contestimg.wish.com/api/webimage/5e9ae51d43d6a96e303acdb0-medium.jpg'
        loc        : int            = None,  # 0 / index name to read
        plot       : bool           = True,  # should plot final result
        grayscale  : bool           = False, # to avoid finding same item in different colour
        blur       : bool           = False, # to avoid finding same item with minor difference in listing
        blur_ksize : Tuple[int,int] = (5,5), # the larger the stronger the smoothing
        verbose    : int            = 0,     # give details
        ) -> IMG :
        '''
        Return the image for `url`, or for the `product_picture` of row `loc`
        (when both are given `loc` wins). The image is downloaded into the local
        cache on first use and read from there afterwards.

        The result always has 3 channels: RGB by default, or the gray channel
        repeated 3 times when `grayscale` is set. `blur` applies a box blur with
        kernel `blur_ksize`. With `plot` the image is also shown via matplotlib.

        Raises ValueError if neither `url` nor `loc` is given and OSError if the
        cached file cannot be decoded as an image.

        d=Dataset() # instantiate
        d.get_product_picture(url='https://contestimg.wish.com/api/webimage/5e9ae51d43d6a96e303acdb0-medium.jpg')
        d.get_product_picture(loc=3)
        d.get_product_picture(loc=3,blur=False,grayscale=True)
        '''
        
        ####################################
        # ensure url is populated
        ####################################
        if loc is not None:
            url = self.df['product_picture'].loc[loc]
        if url is None:
            raise ValueError('either `url` or `loc` must be provided')
            
        ####################################
        # ensure local cache populated
        ####################################
        local_filepath = self._ensure_cached(url,verbose=verbose)
        
        ####################################
        # read img from cache
        ####################################
        # read raw file
        img = cv2.imread(str(local_filepath))
        if img is None:
            # imread signals failure by returning None rather than raising
            raise OSError(f'could not decode image {local_filepath} (delete it to force a fresh download)')
        
        # should apply grayscale?
        if grayscale:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # yes
            img = np.stack([img,img,img],axis=2)        # back to 3 channels so shape matches the colour case
        else:
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # no, then just do default RGB
        
        # should apply blur?
        if blur:
            img = cv2.blur(img, blur_ksize) 

        # should plot?
        if plot:
            fig,ax = plt.subplots(1,1)
            ax.imshow(img)
            ax.set_axis_off()
            ax.set_title(loc)
            plt.show()     

        # return
        return img

    # force populate cache with all imgs from urls in df['product_picture']
    def populate_cache(self):
        '''
        Download every distinct, non-missing `product_picture` url into the local cache.
        Nothing is decoded, plotted or returned; files already cached are skipped.

        d=Dataset() # instantiate
        d.populate_cache() # force read on all product_picture urls
        '''
        urls = self.df['product_picture'].dropna().unique() # each url only once, skip missing ones
        # downloads are I/O bound, so threads suffice and avoid pickling the whole dataset per task
        Parallel(n_jobs=-1,prefer='threads')(delayed(self._ensure_cached)(url) for url in urls) # batch download images

    # plot multiple jpgs associated with locs
    def get_product_pictures(
        self,
        locs       : List[int]      = None,  # 0 / index name to read
        max_cols   : int            = 5,     # max number of pictures per row
        plot_width : float          = 5,     # size of one grid cell (inches), used for both width and height
        ) -> None :
        '''
        Draw the pictures for `locs` on a grid with at most `max_cols` pictures per row,
        each titled with its loc. Nothing is returned and `plt.show()` is left to the caller.
        An empty (or missing) `locs` is a no-op. Raises ValueError if `max_cols` < 1.

        d.get_product_pictures(locs=[1307,758,1183,1516])
        '''
        # list() so any iterable works and emptiness can be tested
        # (truth-testing a pandas Index directly would raise)
        locs = [] if locs is None else list(locs)
        if not locs:
            return None
        if max_cols < 1:
            raise ValueError(f'max_cols must be >= 1, got {max_cols}')

        # gather imgs
        imgs = [self.get_product_picture(loc=loc,plot=False) for loc in locs]

        # prep plotting device
        plot_cols = min(max_cols,len(locs))
        plot_rows = -(len(locs) // -plot_cols) # ceiling division without needing any imports
        # squeeze=False always yields a 2d array of axes, so no special casing of 1 row / 1 plot
        fig,axes = plt.subplots(
            plot_rows,
            plot_cols,
            figsize=(plot_width * plot_cols,plot_width * plot_rows),
            squeeze=False,
        )
        flat_axes = axes.ravel() # row-major order, i.e. left to right then top to bottom

        # plot imgs
        for ax,loc,img in zip(flat_axes,locs,imgs):
            ax.imshow(img)
            ax.set_axis_off()
            ax.set_title(loc)

        # blank out grid cells left over on the last row
        for ax in flat_axes[len(imgs):]:
            ax.set_axis_off()

        return None

    # plot top n index from df
    def show_top_n(
        self,
        res : pd.DataFrame,
        n   : int = 5
        ) -> pd.DataFrame:
        '''Plot the pictures of the first `n` index labels of `res` (if any) and return `res` unchanged.'''
        if len(res)>0:
            self.get_product_pictures(locs=res.index[:n])
            plt.show()
        return res

In [None]:
# Disabled usage example: the calls below sit inside a string literal, so nothing here is executed.
# Paste the contents into a cell to instantiate `Dataset`, print the shapes of its three frames,
# plot one and several product pictures, and re-run `df_numeric` with every option spelled out.
# The trailing `None` is the cell's last expression - presumably there so the notebook does not
# echo the string as cell output.
'''
d=Dataset()
print(d.raw.shape)
print(d.df.shape)
print(d.df_num.shape)
d.get_product_picture(loc=20);
d.get_product_pictures(locs=[20,21,22]);
d.df_numeric(
        should_impute_nans       = True,  # rather than trashing entry, populate missing ratings with population mean
        should_log_transform     = True,  # make data distribution more normal like
        should_standard_scale    = True,  # N(0,1) the data
        should_minmax_scale      = False,
        should_minmax_scale_abs  = 2,
        should_pca               = True,  # reduce dimensionality to speed up analysis
        should_pca_components     = 6,    # elbow / eyeballed optimal principal components to use
)
'''
None