# Image dataset import
1. Create an empty list X (it is converted to a NumPy array at the end)
1. Create an empty list y
1. Go through every wind turbine folder
    1. Go through every category folder whose category (1, 2 or 3) and pixel size match the user's selection
    1. Go through every crop folder (except `0_combined-preview`)
    1. Inside sensordata/R10m stack the selected band images (by default B02_10m.jp2, B03_10m.jp2, B04_10m.jp2 and B08_10m.jp2) into one array
    1. Append the image array to X and append 1 to y (the label is passed as a function parameter)
1. Go through every random crop folder
    1. Go through every category folder whose category (1 and/or 2) and pixel size match the user's selection
    1. Go through every crop folder (except `0_combined-preview`)
    1. Inside sensordata/R10m stack the selected band images (by default B02_10m.jp2, B03_10m.jp2, B04_10m.jp2 and B08_10m.jp2) into one array
    1. Append the image array to X and append 0 to y (the label is passed as a function parameter)


In [23]:
import warnings
from pathlib import Path

import numpy as np
import rasterio

In [33]:
class ImagesDataset():
    """Load image crops from disk into a feature array and a matching label list.

    Crops are read from "selection" folders that follow the project folder
    convention: ``<selection>/<category folder>/<crop folder>/sensordata/R10m``.
    Crops from the wind turbine selections are labelled 1, crops from the
    no-wind-turbine selections are labelled 0.
    """

    def __init__(self, selection_windturbine_paths=None, selection_no_windturbine_paths=None,
                 categories_windturbine_crops=None, categories_no_windturbine_crops=None,
                 pixel="40p", image_bands=None):
        """
        Initialize all parameters for the data preparation.

        Parameters
        ----------
        selection_windturbine_paths: list of pathlib.Path or str
            Paths to selection_windturbine folders following the folder convention.
            Default is [""]
        selection_no_windturbine_paths: list of pathlib.Path or str
            Paths to selection_no_windturbine folders following the folder convention.
            Default is [""]
        categories_windturbine_crops: list, [1,2,3]
            Set one or more categories of selection for windturbine selection.
            Default is [3]
        categories_no_windturbine_crops: list, [1,2]
            Set one or more categories of selection for random crop selection.
            Default is [2]
        pixel: str, ("10p", "20p", "30p", "40p" or "50p")
            Set one pixel value for the image.
            Default is "40p"
        image_bands: list, ["B02", "B03", "B04", "B08"]
            Set the preferred image bands for the image. The order of this list
            is the channel order of the returned images.
            Default is ["B02", "B03", "B04", "B08"]
        """
        # Defaults are created per instance: list literals in the signature
        # would be shared (and mutable) across every ImagesDataset object.
        self.categories_windturbine_crops = (
            [3] if categories_windturbine_crops is None else categories_windturbine_crops)
        self.categories_no_windturbine_crops = (
            [2] if categories_no_windturbine_crops is None else categories_no_windturbine_crops)
        self.pixel = pixel
        self.selection_windturbine_paths = (
            [""] if selection_windturbine_paths is None else selection_windturbine_paths)
        self.selection_no_windturbine_paths = (
            [""] if selection_no_windturbine_paths is None else selection_no_windturbine_paths)
        self.image_bands = (
            ["B02", "B03", "B04", "B08"] if image_bands is None else image_bands)
        # Crop identifiers (text before the first "_" of each crop folder name),
        # in the order the crops were loaded.
        self.indices = []

    def _read_crop(self, crop):
        """Read the selected bands of one crop folder; return None if the crop is incomplete.

        Bands are stacked along the last axis in the order of ``self.image_bands``.
        With a single selected band the 2D band array is returned unchanged.
        """
        image_path = crop / "sensordata" / "R10m"

        # Map band name -> file. Sorting makes the choice deterministic should
        # more than one file match the same band.
        band_files = {}
        for element in sorted(image_path.glob("*_*_B*_10m.jp2")):
            band = element.name.split("_")[2]
            if band in self.image_bands:
                band_files.setdefault(band, element)

        missing = [band for band in self.image_bands if band not in band_files]
        if missing or not band_files:
            # An incomplete crop would give an image with a different number of
            # channels and make the final array ragged, so it is left out.
            warnings.warn(
                "Skipping crop {}: missing image bands {}".format(crop, missing),
                RuntimeWarning, stacklevel=2)
            return None

        # Read in the user's band order, not in directory-listing order, so
        # that channel k means the same band in every image.
        layers = []
        for band in self.image_bands:
            with rasterio.open(str(band_files[band])) as f:
                layers.append(f.read(indexes=1))

        if len(layers) == 1:
            return layers[0]
        return np.dstack(layers)

    def get_images_from_path(self, windturbines, categories, path=""):
        """Collect the images and labels of every selected crop below one selection folder.

        Only category folders whose name has the form ``<a>_<category>_<b>_<pixel>``
        with ``category`` in `categories` and ``pixel`` equal to ``self.pixel`` are
        read. Crop folders named "0_combined-preview" and crops that lack one of
        the selected bands are skipped. The identifier of every loaded crop is
        appended to ``self.indices``.

        Parameters
        ----------
        windturbines: int, (1 or 0)
            Label stored for every crop found: 1 for data with windturbines
            and 0 without windturbines.
            There is no default!
        categories: list, [1,2,3] or [1,2]
            Set one or more categories of the selection.
            There is no default!
        path: pathlib.Path or str
            Path to one selection folder following the folder convention.
            Default is ""

        Returns
        ----------
        X_images: list
            One image array per loaded crop (rows x columns x bands when more
            than one band is selected).
        y_images: list
            The value of `windturbines`, repeated once per loaded crop.
        """
        root = Path(path)  # also accept plain strings such as the "" default

        X_images = []
        y_images = []

        # sorted(): glob yields entries in arbitrary order; sorting makes the
        # sample order reproducible between runs and machines.
        for category in sorted(root.glob("*")):
            parts = category.name.split("_")
            if len(parts) != 4:
                continue
            try:
                category_number = int(parts[1])
            except ValueError:
                # not a category folder, just a name with three underscores
                continue
            if category_number not in categories or parts[3] != self.pixel:
                continue

            for crop in sorted(category.glob("*")):
                if not crop.is_dir() or crop.name == "0_combined-preview":
                    continue

                image = self._read_crop(crop)
                if image is None:
                    continue

                X_images.append(image)
                y_images.append(windturbines)
                self.indices.append(crop.name.split("_")[0])

        return X_images, y_images

    def create_wt_identification_data(self):
        """Load every configured selection folder and build the data set.

        Reads ``self.selection_windturbine_paths`` (label 1) first and
        ``self.selection_no_windturbine_paths`` (label 0) second, using the
        categories, pixel size and bands given to the constructor.
        ``self.indices`` is rebuilt on every call so that it always lines up
        with the returned ``X`` and ``y``.

        Returns
        ----------
        X: numpy.ndarray
            All loaded images stacked along a new first axis.
        y: list
            1 (windturbine) or 0 (no windturbine) for every image in X.
        """
        # Start clean: without this a second call would append to the indices
        # of the first call and they would no longer match X and y.
        self.indices = []

        X = []
        y = []

        labelled_sources = (
            (1, self.categories_windturbine_crops, self.selection_windturbine_paths),
            (0, self.categories_no_windturbine_crops, self.selection_no_windturbine_paths),
        )
        for label, categories, paths in labelled_sources:
            for path in paths:
                X_images, y_images = self.get_images_from_path(
                    windturbines=label, categories=categories, path=path)
                X.extend(X_images)
                y.extend(y_images)

        X = np.array(X)

        return X, y

In [25]:
from pathlib import Path
# Configure the loader: category-3 wind turbine crops versus category-2 random
# crops, both at the "30p" pixel size, using the four listed bands.
dataset = ImagesDataset(
    pixel="30p",
    image_bands=["B02", "B03", "B04", "B08"],
    categories_windturbine_crops=[3],
    categories_no_windturbine_crops=[2],
    selection_windturbine_paths=[
        Path("/data/projects/windturbine-identification-sentinel/croppedTiles/us-uswtdb_selection_windturbines"),
    ],
    selection_no_windturbine_paths=[
        Path("/data/projects/windturbine-identification-sentinel/croppedTiles/selection_no-windturbines"),
    ],
)

In [26]:
# Load every configured crop: X is the stacked image array, y the list of 0/1 labels.
X, y = dataset.create_wt_identification_data()

  s.start()


In [27]:
# Dimensions of the image array (the first axis is the sample axis).
X.shape

(9419, 30, 30, 4)

In [28]:
# Display the raw pixel values for a quick visual check.
X

array([[[[2304,   64,  192,  320],
         [2816,  192,  192,  320],
         [2816,   64,  192,  320],
         ...,
         [2816,  192,  576,  576],
         [3328,  192,  192,  448],
         [3328,   64,  192,  320]],

        [[2816,   64,  192,  320],
         [3328,  192,  192,  320],
         [2816,   64,  192,  320],
         ...,
         [2816,  320,  448,  704],
         [2816,  192,  192,  320],
         [3328,  192,  192,  448]],

        [[3328,  192,  192,  320],
         [3328,  192,  192,  320],
         [2816,  192,  192,  320],
         ...,
         [2816,  448,  448,  576],
         [3328,  192,  320,  448],
         [3328,  192,  192,  448]],

        ...,

        [[2304,  192,  192,  320],
         [2304,  192,  320,  448],
         [2304,  192,  192,  320],
         ...,
         [2816,  192,  192,  320],
         [2304,   64,  192,  192],
         [2304,   64,  192,  192]],

        [[2304,  192,  192,  320],
         [2304,   64,  320,  320],
         [12

In [29]:
# Number of samples labelled 1 (windturbine).
y.count(1)

5157

In [30]:
# Number of samples labelled 0 (no windturbine).
y.count(0)

4262

In [31]:
# Total number of labels; should equal the first dimension of X.
len(y)

9419