# *ml-1J-create-premade-train-val.ipynb*

# Create datasets for every 0.001 sample we're using

Datasets for different folds of year-wise cross-validation for 12 combinations of region, crop, compositing scheme, and in-season date

### This notebook only needs to be run once.

After running this notebook, we can run ml-1H and later notebooks, where fit_predict_report() runs really fast.


In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split

In [2]:
def create_X_y_single_year(tile,year,scheme_name,crop_of_interest_id,
                          in_season=None):
    """Build the feature matrix X and label vector y for one tile and year.

    X is the first ``nrf`` reflectance columns followed by one boolean
    column per previous year (``year-4`` .. ``year-1``) telling whether the
    crop array equals ``crop_of_interest_id``.  y is the same comparison
    for ``year`` itself.

    Parameters
    ----------
    tile, year, scheme_name
        Used to build the file names
        ``../data/composited_interpolated/Refl_{tile}_{year}_{scheme_name}.npy``
        and ``../data/processed_crop/Crop_{tile}_{y}.npy``.
    scheme_name
        Must end in a 3-character suffix preceded by an integer period
        length (e.g. ``'14day'`` -> 14, ``'5day'`` -> 5).
    crop_of_interest_id
        Value the crop arrays are compared against.
    in_season
        ``None`` keeps every reflectance column; 160 or 230 keeps only the
        first ``((in_season - 90) // period) * 6 + 6`` columns.
        NOTE(review): presumably 90 is the first composite's day of year
        and 6 the number of bands per composite -- confirm.

    Returns
    -------
    (X, y) : tuple of numpy arrays

    Raises
    ------
    ValueError
        If ``in_season`` is not 160, 230, or None.  (Previously the function
        printed a message and returned None, which made every caller fail
        with an unrelated unpacking error.)
    """
    # Validate before touching the disk so a bad argument fails fast.
    if in_season not in (160, 230, None):
        raise ValueError(
            f'Please change in_season to 160, 230, or None. Got {in_season!r}.')

    refl = np.load(f'../data/composited_interpolated/Refl_{tile}_{year}_{scheme_name}.npy')

    # Determine nrf (number of reflectance features)
    if in_season is None:
        nrf = refl.shape[1]
    else:
        period_days = int(scheme_name[:-3])  # strip the trailing 'day'
        nrf = ((in_season - 90) // period_days) * 6 + 6

    # One boolean mask per year, oldest first; the last one is the target year.
    crop_masks = [
        np.load(f'../data/processed_crop/Crop_{tile}_{yr}.npy') == crop_of_interest_id
        for yr in range(year - 4, year + 1)
    ]

    X = np.column_stack([refl[:, :nrf]] + crop_masks[:-1])
    y = crop_masks[-1]

    return X, y

In [3]:
def create_X_y_multiyear(tile,
                         years,
                         scheme_name,
                         crop_of_interest_id,
                         in_season=None):
    """Concatenate the single-year (X, y) datasets of every year in `years`.

    Each year is built with create_X_y_single_year using the same tile,
    scheme, crop id and in_season setting; the per-year X arrays and y
    arrays are then joined along the first axis, in the order of `years`.
    """
    feature_blocks = []
    label_blocks = []

    for yr in years:
        X_yr, y_yr = create_X_y_single_year(tile, yr, scheme_name,
                                            crop_of_interest_id,
                                            in_season)
        feature_blocks.append(X_yr)
        label_blocks.append(y_yr)

    return np.concatenate(feature_blocks), np.concatenate(label_blocks)

In [4]:
def create_X_y(tile,
              years,
              scheme_name,
              crop_of_interest_id,
              in_season=None
              ):
    """Build (X, y) for a single year or for several years.

    `years` may be one integer year, in which case
    create_X_y_single_year is used, or an iterable of years, in which case
    create_X_y_multiyear concatenates the per-year datasets.  All other
    arguments are forwarded unchanged.
    """
    # isinstance (rather than type(...) == int) also recognises NumPy
    # integer scalars, e.g. a year taken from np.arange or an array.
    if isinstance(years, (int, np.integer)):
        return create_X_y_single_year(tile,
                                      years,
                                      scheme_name,
                                      crop_of_interest_id,
                                      in_season)

    return create_X_y_multiyear(tile,
                              years,
                              scheme_name,
                              crop_of_interest_id,
                              in_season)

In [8]:
def premake_Xy_trainval(training_sample_size,
                      validation_sample_size,
                      tile,
                      years,
                      scheme_name,
                      crop_of_interest_id,
                      in_season,
                      all_years=(2018, 2019, 2020, 2021, 2022)
                      ):
    """Create and save sampled train/validation datasets, one fold per year.

    For every `val_year` in `years`, four files are written to
    ``../data/premade_{training_sample_size}_{validation_sample_size}``:
    ``{tile}_{val_year}_{scheme_name}_{crop_of_interest_id}_{in_season}_``
    followed by ``X_train.npy``, ``X_val.npy``, ``y_train.npy`` and
    ``y_val.npy``.  A fold whose four files all exist already is skipped,
    so the function can be re-run safely after an interruption.

    Parameters
    ----------
    training_sample_size, validation_sample_size
        Passed to sklearn's train_test_split as ``train_size`` (a float is
        a fraction of the rows, an int an absolute count).
    tile, scheme_name, crop_of_interest_id, in_season
        Forwarded to create_X_y and used in the output file names.
    years
        Validation years to produce folds for.
    all_years
        Pool of years; the training set of a fold is every year in this
        pool except the fold's validation year.  Defaults to 2018-2022.

    Returns
    -------
    str
        ``'Find results in <output directory>'``.
    """
    # Bound before the loop so the final return also works for empty `years`.
    loc = f'../data/premade_{training_sample_size}_{validation_sample_size}'
    Xy_trainval = ('X_train', 'X_val', 'y_train', 'y_val')

    for val_year in years:

        most_of_name = '_'.join(
            f'{arg}'
            for arg in (tile, val_year, scheme_name, crop_of_interest_id, in_season)
        )
        paths = {spec: f'{loc}/{most_of_name}_{spec}.npy' for spec in Xy_trainval}

        # Skip folds completed by a previous run.  Checking each path
        # directly also works when the output directory does not exist yet.
        if all(os.path.exists(path) for path in paths.values()):
            continue

        print(f'-- Assembling 4 datasets for {most_of_name}--')

        # np.save does not create missing directories.
        os.makedirs(loc, exist_ok=True)

        train_years = [yr for yr in all_years if yr != val_year]

        # Get complete datasets
        X_train0, y_train0 = create_X_y(tile, train_years,
                                        scheme_name, crop_of_interest_id,
                                        in_season)
        X_val0, y_val0 = create_X_y(tile, val_year,
                                    scheme_name, crop_of_interest_id,
                                    in_season)

        # Trim datasets down to sample size; the "test" halves returned by
        # train_test_split are the unused surplus and are discarded.
        X_train, _, y_train, _ = train_test_split(X_train0,
                                                  y_train0,
                                                  train_size=training_sample_size,
                                                  random_state=19)

        X_val, _, y_val, _ = train_test_split(X_val0,
                                              y_val0,
                                              train_size=validation_sample_size,
                                              random_state=19)

        # Save down the 4 datasets we care about
        datasets = {'X_train': X_train, 'X_val': X_val,
                    'y_train': y_train, 'y_val': y_val}
        for spec in Xy_trainval:
            np.save(paths[spec], datasets[spec], allow_pickle=False)

        print(f'Done. Find results in {loc}')

    return f'Find results in {loc}'

In [None]:
# Single-combination run: tile 10SFH, crop id 75, 5-day scheme, in_season 160.
premake_Xy_trainval(
    training_sample_size=0.001,
    validation_sample_size=0.001,
    tile='10SFH',
    years=[2018, 2019, 2020, 2021, 2022],
    scheme_name='5day',
    crop_of_interest_id=75,
    in_season=160,
)

## Create all 0.001_0.001 datasets

Keep re-running this cell until all the 0.001_0.001 datasets have been made; folds whose four files already exist are skipped.


In [9]:
# Sample sizes and the validation years to produce folds for
training_sample_size = 0.001
validation_sample_size = 0.001
years = [2018,2019,2020,2021,2022]

# 2 (tile, crop id) pairs x 2 compositing schemes x 3 in-season settings
# = 12 combinations; each call handles every validation year.
tile_crop_pairs = [('10SFH',75),('15TVG',1)]

for tile, crop_id in tile_crop_pairs:
    for scheme_name in ['14day','5day']:
        for in_season in [160, 230, None]:
            premake_Xy_trainval(
                training_sample_size=training_sample_size,
                validation_sample_size=validation_sample_size,
                tile=tile,
                years=years,
                scheme_name=scheme_name,
                crop_of_interest_id=crop_id,
                in_season=in_season,
            )

-- Assembling 4 datasets for 10SFH_2020_14day_75_None--
Done. Find results in ../data/premade_0.001_0.001
-- Assembling 4 datasets for 10SFH_2021_14day_75_None--
Done. Find results in ../data/premade_0.001_0.001
-- Assembling 4 datasets for 10SFH_2022_14day_75_None--
Done. Find results in ../data/premade_0.001_0.001
