# *ml-1E-dask-distributed-model-runs.ipynb*

This notebook (1E) features the most successful approach I've found so far for running the models with Dask.

# Running models in parallel

This is a work-in-progress notebook. Currently, it allows us to: specify a models bank of 160 models (80 hyperparameter combinations, each used for both a Random Forest and an Extra Trees classifier); assemble the datasets for the different folds of year-wise cross-validation; run the models for 12 combinations of region, crop, compositing scheme, and in-season date; and record the results (accuracies and uncertainties) in CSV files.

Function definitions at the top of the notebook; then sequential model runs; then parallel model runs.

New in version 1D: parallel model runs using Dask.

New in version 1C: can draw from bank of specified models, and run through them in a for loop.

New in version 1B: can now specify `in_season`, which is `None` (use the whole year), 230 (through DOY 230, which is mid-August), or 160 (through DOY 160, which is early June).

In [1]:
import numpy as np
import pandas as pd
import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
def create_models_bank():
    """Build the bank of tree-ensemble hyperparameter sets.

    Every combination of n_estimators, max_features, min_samples_split,
    bootstrap and class_weight is enumerated (2 * 5 * 2 * 2 * 2 = 80 entries),
    with the right-most option varying fastest. Entries are numbered from 1
    and keyed by the zero-padded three-digit number ('001' ... '080').

    Returns
    -------
    dict
        Maps each three-digit key to a dictionary of keyword arguments that
        can be unpacked into an estimator constructor.
    """
    # Flatten the grid first so that the numbering is a plain enumerate().
    combos = [
        (n_trees, feat_frac, split_min, use_bootstrap, weighting)
        for n_trees in (200, 500)
        for feat_frac in (0.05, 0.1, 0.2, 0.4, 1.0)
        for split_min in (2, 4)
        for use_bootstrap in (False, True)
        for weighting in (None, 'balanced')
    ]

    bank = {}
    for number, (n_trees, feat_frac, split_min,
                 use_bootstrap, weighting) in enumerate(combos, start=1):
        bank[f'{number:03d}'] = {
            'n_estimators': n_trees,
            'max_features': feat_frac,
            'bootstrap': use_bootstrap,
            'min_samples_split': split_min,
            'class_weight': weighting,
            'n_jobs': -1,
        }

    return bank

In [3]:
def create_X_y_single_year(tile,year,scheme_name,crop_of_interest_id,
                          in_season=None):
    """Assemble the feature matrix and label vector for one tile and one year.

    The features are the reflectance columns (optionally truncated to an
    in-season window) followed by four indicator columns, one for each of the
    four preceding years (year-4 ... year-1), marking where the crop array
    equals ``crop_of_interest_id``. The label is the same indicator for
    ``year`` itself.

    Parameters
    ----------
    tile : str
        Tile identifier used in the input file names.
    year : int
        Year whose reflectance is loaded and whose crop indicator is the label.
    scheme_name : str
        Compositing scheme used in the reflectance file name. When
        ``in_season`` is given, everything before the last three characters
        must parse as an integer period length (e.g. '14day' -> 14).
    crop_of_interest_id : int
        Crop code that is compared against the crop arrays.
    in_season : {None, 160, 230}, optional
        None keeps every reflectance column; 160 or 230 keeps only the
        leading columns computed from that day of year.

    Returns
    -------
    X : numpy.ndarray
        Reflectance columns followed by the four prior-year indicators.
    y : numpy.ndarray
        Boolean indicator of the crop of interest in ``year``.

    Raises
    ------
    ValueError
        If ``in_season`` is not 160, 230 or None. The check runs before any
        file is read, so a bad argument fails fast instead of returning None
        (which callers that unpack ``X, y`` could not handle).
    """
    if in_season not in (160, 230, None):
        raise ValueError(
            f'in_season must be 160, 230, or None; got {in_season!r}.'
        )

    refl = np.load(f'../data/composited_interpolated/Refl_{tile}_{year}_{scheme_name}.npy')

    # Determine the number of reflectance features (columns) to keep.
    if in_season is None:
        n_refl_features = refl.shape[1]
    else:
        # Scheme names look like '14day' / '5day': drop the trailing 'day'.
        period_days = int(scheme_name[:-3])
        # NOTE(review): presumably 6 columns per compositing period with the
        # first period starting at DOY 90 -- confirm against the compositing code.
        n_refl_features = ((in_season - 90) // period_days) * 6 + 6

    # One boolean indicator array per year, oldest first; the last is `year`.
    is_crop_by_year = [
        np.load(f'../data/processed_crop/Crop_{tile}_{yr}.npy') == crop_of_interest_id
        for yr in range(year - 4, year + 1)
    ]

    X = np.column_stack([refl[:, :n_refl_features]] + is_crop_by_year[:-1])
    y = is_crop_by_year[-1]

    return X, y

In [4]:
def create_X_y_multiyear(tile,
                      years,
                      scheme_name,
                      crop_of_interest_id,
                        in_season=None):
    """Stack the single-year datasets of several years into one dataset.

    ``create_X_y_single_year`` is called once per entry of ``years`` (in the
    given order) and the resulting feature matrices and label vectors are
    concatenated along the first axis.

    Returns
    -------
    X, y : numpy.ndarray
        Concatenated features and labels for all requested years.
    """
    per_year = [
        create_X_y_single_year(tile, single_year, scheme_name,
                               crop_of_interest_id, in_season)
        for single_year in years
    ]

    feature_blocks = [pair[0] for pair in per_year]
    label_blocks = [pair[1] for pair in per_year]

    return np.concatenate(feature_blocks), np.concatenate(label_blocks)

In [5]:
def create_X_y(tile,
              years,
              scheme_name,
              crop_of_interest_id,
              in_season=None
              ):
    """Build a dataset for either a single year or a collection of years.

    Parameters
    ----------
    tile, scheme_name, crop_of_interest_id, in_season
        Forwarded unchanged to the single-year or multi-year builder.
    years : int or iterable of int
        A single integer year (Python ``int`` or NumPy integer) dispatches to
        ``create_X_y_single_year``; anything else is treated as an iterable of
        years and dispatches to ``create_X_y_multiyear``.

    Returns
    -------
    X, y
        Whatever the selected builder returns.
    """
    # isinstance (rather than type(...) == int) also accepts NumPy integer
    # scalars, e.g. a year pulled out of an array, which would otherwise be
    # sent down the multi-year path and fail when iterated.
    if isinstance(years, (int, np.integer)):
        return create_X_y_single_year(tile,
                                      years,
                                      scheme_name,
                                      crop_of_interest_id,
                                      in_season)

    return create_X_y_multiyear(tile,
                              years,
                              scheme_name,
                              crop_of_interest_id,
                              in_season)

In [6]:
def fit_predict_report(model_name,
                      model,
                      training_sample_size,
                      validation_sample_size,
                      tile,
                      years,
                      scheme_name,
                      crop_of_interest_id,
                      in_season
                      ):
    """Run year-wise cross-validation for one model and write the results to CSV.

    For every validation year in ``years`` the model is fitted on the other
    years of 2018-2022 and used to predict the validation year. The binary
    confusion counts of each fold, plus precision, recall and F1, are written
    to ``../data/results/<csv_name>`` preceded by ``#`` comment lines that
    record the run parameters. If a file of that name already exists in the
    results directory nothing is computed.

    Parameters
    ----------
    model_name : str
        Label used as the first component of the CSV file name.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is refitted for
        every fold.
    training_sample_size, validation_sample_size : float, int or None
        Passed as ``train_size`` to ``train_test_split`` (random_state=19) to
        subsample the training / validation data; None uses all rows.
    tile, scheme_name, crop_of_interest_id, in_season
        Forwarded to ``create_X_y`` and included in the file name.
    years : sequence of int
        Validation years, one fold per entry.

    Returns
    -------
    str
        A message naming the results file, or a notice that the run was
        skipped because the file already existed.

    Notes
    -----
    * The 'Mean' and 'StdE' rows carry -1 placeholders in the four count
      columns; only their Precision / Recall / F1 cells are meaningful.
    * NOTE(review): the 'StdE' row is ``np.std`` of the per-fold values
      (population standard deviation), not a standard error, despite the label.
    """
    results_dir = '../data/results/'

    # The file name encodes every parameter except `years` and the model object.
    csv_name = '_'.join([f'{model_name}',
                         f'{training_sample_size}',
                         f'{validation_sample_size}',
                         f'{tile}',
                         f'{scheme_name}',
                         f'{crop_of_interest_id}',
                         f'{in_season}']) + '.csv'

    # Skip the whole run when its results file is already present.
    if csv_name in os.listdir(results_dir):
        return 'If you see this, the specified model was previously run.'

    print(f'-- Process for {csv_name} --')

    conf = []  # one [00, 01, 10, 11] row of actual/predicted counts per fold

    for val_year in years:
        print('Starting a fold...')
        print('> Assembling the datasets')
        # Train on every year of the fixed 2018-2022 window except the
        # validation year.
        train_years = [yr for yr in range(2018,2023) if yr!=val_year]

        X_train, y_train = create_X_y(tile,train_years,
                                      scheme_name,crop_of_interest_id,
                                      in_season)
        X_val, y_val = create_X_y(tile,val_year,
                                  scheme_name,crop_of_interest_id,
                                  in_season)

        # train_test_split returns [X_kept, X_rest, y_kept, y_rest]; keeping
        # only elements 0 and 2 and rebinding the names lets the full arrays
        # and the unused remainder be freed before fitting.
        if training_sample_size is not None:
            X_train, y_train = train_test_split(X_train,
                                                y_train,
                                                train_size=training_sample_size,
                                                random_state=19)[::2]

        if validation_sample_size is not None:
            X_val, y_val = train_test_split(X_val,
                                            y_val,
                                            train_size=validation_sample_size,
                                            random_state=19)[::2]

        print('> Fitting the model on the training set')
        model.fit(X_train, y_train)
        print('> Predicting on the validation set')
        pred = model.predict(X_val)

        print('> Recording performance metrics')
        act = y_val
        # np.count_nonzero counts in compiled code; the builtin sum() would
        # iterate over the boolean array element by element.
        conf.append([
            int(np.count_nonzero((act==0) & (pred==0))),
            int(np.count_nonzero((act==0) & (pred==1))),
            int(np.count_nonzero((act==1) & (pred==0))),
            int(np.count_nonzero((act==1) & (pred==1))),
        ])
        print('Finished a fold.')

    fold_labels = [f'ValYear{yr}' for yr in years]
    n_folds = len(fold_labels)

    # Append two placeholder rows (all -1) that will hold the summary stats.
    carr = np.vstack([np.array(conf), np.full((2,4),-1)])

    cdf = pd.DataFrame(data = carr,
                      index = fold_labels + ['Mean','StdE'],
                      columns = ['ActPred_00', 'ActPred_01',
                                 'ActPred_10', 'ActPred_11']
                      )

    cdf['Precision'] = cdf.ActPred_11 / (cdf.ActPred_01 + cdf.ActPred_11)
    cdf['Recall'] = cdf.ActPred_11 / (cdf.ActPred_10 + cdf.ActPred_11)
    cdf['F1'] = 2*cdf.Precision*cdf.Recall / (cdf.Precision + cdf.Recall)

    # Overwrite the metric cells of the two summary rows with statistics over
    # the fold rows only. The fold rows are selected by position so that this
    # works for any `years`, not just lists that end in 2022.
    for col in ['Precision','Recall','F1']:
        fold_values = cdf[col].iloc[:n_folds]
        cdf.at['Mean',col] = np.mean(fold_values)
        cdf.at['StdE',col] = np.std(fold_values)

    param_strings = [f'# model_name: {model_name}',
                     f'# model: {model}',
                     f'# training_sample_size: {training_sample_size}',
                     f'# validation_sample_size: {validation_sample_size}',
                     f'# tile: {tile}',
                     f'# scheme_name: {scheme_name}',
                     f'# crop_of_interest_id: {crop_of_interest_id}',
                     f'# in_season: {in_season}']
    comment = '\n'.join(param_strings) + '\n'
    with open(f'{results_dir}{csv_name}', 'a') as f:
        f.write(comment)
        cdf.to_csv(f)

    print(f'Find results in ../data/results/{csv_name}')

    return f'Find results in ../data/results/{csv_name}'

## Dask version

Still troubleshooting memory issues on MGHPCC. Given enough memory resources this should run.

In [7]:
import dask
from dask.distributed import print
from dask.distributed import Client

In [8]:
# Start a local Dask cluster and connect a client to it.
# threads_per_worker=1 gives every worker process a single thread; the number
# of workers is left to Dask's default (the cluster summary displayed by the
# next cell reports 56 workers).
# NOTE(review): memory_limit applies to each worker, so the cluster advertises
# 56 x 128 GiB = 7.00 TiB in total. If that exceeds the node's physical RAM the
# limit cannot guard against out-of-memory failures -- confirm against the
# machine's actual memory.
# NOTE(review): the models bank sets n_jobs=-1, so each single-threaded worker
# may still fan out over all cores inside scikit-learn -- confirm whether that
# oversubscription is intended.
client = Client(memory_limit='128GiB',
                threads_per_worker=1,
               )

distributed.diskutils - INFO - Found stale lock file and directory '/work/sds-lab/august/crops/notebooks/dask-worker-space/worker-0lae_gks', purging
distributed.diskutils - INFO - Found stale lock file and directory '/work/sds-lab/august/crops/notebooks/dask-worker-space/worker-_jkc0fpy', purging
distributed.diskutils - INFO - Found stale lock file and directory '/work/sds-lab/august/crops/notebooks/dask-worker-space/worker-c1of6rnf', purging
distributed.diskutils - INFO - Found stale lock file and directory '/work/sds-lab/august/crops/notebooks/dask-worker-space/worker-io_5eoo_', purging
distributed.diskutils - INFO - Found stale lock file and directory '/work/sds-lab/august/crops/notebooks/dask-worker-space/worker-iulsl_ol', purging
distributed.diskutils - INFO - Found stale lock file and directory '/work/sds-lab/august/crops/notebooks/dask-worker-space/worker-sd5pa2jz', purging
distributed.diskutils - INFO - Found stale lock file and directory '/work/sds-lab/august/crops/notebooks/d

In [9]:
# Display the client summary (dashboard link, worker count, threads, memory).
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 56
Total threads: 56,Total memory: 7.00 TiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:44757,Workers: 56
Dashboard: http://127.0.0.1:8787/status,Total threads: 56
Started: Just now,Total memory: 7.00 TiB

0,1
Comm: tcp://127.0.0.1:36440,Total threads: 1
Dashboard: http://127.0.0.1:44509/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:39406,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-6_hmbr22,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-6_hmbr22

0,1
Comm: tcp://127.0.0.1:41465,Total threads: 1
Dashboard: http://127.0.0.1:42402/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:39934,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-nahu93zw,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-nahu93zw

0,1
Comm: tcp://127.0.0.1:43102,Total threads: 1
Dashboard: http://127.0.0.1:34685/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:35418,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-lr_13dul,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-lr_13dul

0,1
Comm: tcp://127.0.0.1:35807,Total threads: 1
Dashboard: http://127.0.0.1:36441/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:34951,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-qq0ffecc,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-qq0ffecc

0,1
Comm: tcp://127.0.0.1:35919,Total threads: 1
Dashboard: http://127.0.0.1:43755/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:45092,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-4r1x1_is,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-4r1x1_is

0,1
Comm: tcp://127.0.0.1:32911,Total threads: 1
Dashboard: http://127.0.0.1:37986/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:33194,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-dhi034s_,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-dhi034s_

0,1
Comm: tcp://127.0.0.1:41183,Total threads: 1
Dashboard: http://127.0.0.1:44952/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:43899,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-dq472zrv,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-dq472zrv

0,1
Comm: tcp://127.0.0.1:38831,Total threads: 1
Dashboard: http://127.0.0.1:39599/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:36986,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-_dndl1se,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-_dndl1se

0,1
Comm: tcp://127.0.0.1:43817,Total threads: 1
Dashboard: http://127.0.0.1:37028/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:33078,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-f4kfhm48,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-f4kfhm48

0,1
Comm: tcp://127.0.0.1:33780,Total threads: 1
Dashboard: http://127.0.0.1:44732/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:34678,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-tcyr6rwy,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-tcyr6rwy

0,1
Comm: tcp://127.0.0.1:43478,Total threads: 1
Dashboard: http://127.0.0.1:34535/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:36236,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-m3kjn9u1,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-m3kjn9u1

0,1
Comm: tcp://127.0.0.1:46747,Total threads: 1
Dashboard: http://127.0.0.1:34896/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:36854,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-9a5d7q3u,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-9a5d7q3u

0,1
Comm: tcp://127.0.0.1:41547,Total threads: 1
Dashboard: http://127.0.0.1:35428/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:46787,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-irro2o95,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-irro2o95

0,1
Comm: tcp://127.0.0.1:46231,Total threads: 1
Dashboard: http://127.0.0.1:45826/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:46574,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-p13fo1cm,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-p13fo1cm

0,1
Comm: tcp://127.0.0.1:33091,Total threads: 1
Dashboard: http://127.0.0.1:35415/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:36227,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-91lmgus7,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-91lmgus7

0,1
Comm: tcp://127.0.0.1:36892,Total threads: 1
Dashboard: http://127.0.0.1:37897/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:36085,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-ul5140v7,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-ul5140v7

0,1
Comm: tcp://127.0.0.1:38366,Total threads: 1
Dashboard: http://127.0.0.1:36834/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:42484,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-mstew_ao,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-mstew_ao

0,1
Comm: tcp://127.0.0.1:38352,Total threads: 1
Dashboard: http://127.0.0.1:35455/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:36774,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-zi8ir9i1,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-zi8ir9i1

0,1
Comm: tcp://127.0.0.1:42021,Total threads: 1
Dashboard: http://127.0.0.1:43552/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:46163,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-ovv50nsk,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-ovv50nsk

0,1
Comm: tcp://127.0.0.1:44887,Total threads: 1
Dashboard: http://127.0.0.1:34602/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:44504,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-cve3xu4l,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-cve3xu4l

0,1
Comm: tcp://127.0.0.1:40417,Total threads: 1
Dashboard: http://127.0.0.1:40377/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:33521,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-5fhgnh1r,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-5fhgnh1r

0,1
Comm: tcp://127.0.0.1:36635,Total threads: 1
Dashboard: http://127.0.0.1:38688/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:33180,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-y9aeb6_r,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-y9aeb6_r

0,1
Comm: tcp://127.0.0.1:41195,Total threads: 1
Dashboard: http://127.0.0.1:39695/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:41182,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-uv3uc0ql,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-uv3uc0ql

0,1
Comm: tcp://127.0.0.1:39411,Total threads: 1
Dashboard: http://127.0.0.1:36398/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:42437,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-j4lrvrn3,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-j4lrvrn3

0,1
Comm: tcp://127.0.0.1:40637,Total threads: 1
Dashboard: http://127.0.0.1:34505/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:36490,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-vy6qjo1z,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-vy6qjo1z

0,1
Comm: tcp://127.0.0.1:45309,Total threads: 1
Dashboard: http://127.0.0.1:45882/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:41953,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-g56sy3rr,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-g56sy3rr

0,1
Comm: tcp://127.0.0.1:45161,Total threads: 1
Dashboard: http://127.0.0.1:37047/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:46307,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-dyxwl7de,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-dyxwl7de

0,1
Comm: tcp://127.0.0.1:42083,Total threads: 1
Dashboard: http://127.0.0.1:45701/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:43381,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-5mpqxr5_,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-5mpqxr5_

0,1
Comm: tcp://127.0.0.1:39419,Total threads: 1
Dashboard: http://127.0.0.1:34506/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:42942,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-bu0xwblm,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-bu0xwblm

0,1
Comm: tcp://127.0.0.1:38601,Total threads: 1
Dashboard: http://127.0.0.1:34636/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:39307,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-0zn94d1d,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-0zn94d1d

0,1
Comm: tcp://127.0.0.1:40554,Total threads: 1
Dashboard: http://127.0.0.1:38030/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:34233,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-e8mx07gw,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-e8mx07gw

0,1
Comm: tcp://127.0.0.1:44493,Total threads: 1
Dashboard: http://127.0.0.1:43219/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:39325,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-8xipm3q4,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-8xipm3q4

0,1
Comm: tcp://127.0.0.1:38465,Total threads: 1
Dashboard: http://127.0.0.1:46602/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:32777,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-qvss0uo2,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-qvss0uo2

0,1
Comm: tcp://127.0.0.1:36977,Total threads: 1
Dashboard: http://127.0.0.1:32938/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:40919,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-z5ddvr3g,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-z5ddvr3g

0,1
Comm: tcp://127.0.0.1:41462,Total threads: 1
Dashboard: http://127.0.0.1:39987/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:37629,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-u29ot19l,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-u29ot19l

0,1
Comm: tcp://127.0.0.1:43082,Total threads: 1
Dashboard: http://127.0.0.1:34745/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:40409,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-r06prmpw,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-r06prmpw

0,1
Comm: tcp://127.0.0.1:42969,Total threads: 1
Dashboard: http://127.0.0.1:45116/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:41691,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-k0x0m698,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-k0x0m698

0,1
Comm: tcp://127.0.0.1:44744,Total threads: 1
Dashboard: http://127.0.0.1:32812/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:43797,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-gffu40c_,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-gffu40c_

0,1
Comm: tcp://127.0.0.1:43572,Total threads: 1
Dashboard: http://127.0.0.1:35204/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:45907,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-zyshe6h3,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-zyshe6h3

0,1
Comm: tcp://127.0.0.1:41974,Total threads: 1
Dashboard: http://127.0.0.1:41553/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:35527,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-uh11uoam,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-uh11uoam

0,1
Comm: tcp://127.0.0.1:38312,Total threads: 1
Dashboard: http://127.0.0.1:39171/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:33669,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-9qyjqxck,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-9qyjqxck

0,1
Comm: tcp://127.0.0.1:43880,Total threads: 1
Dashboard: http://127.0.0.1:35520/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:42668,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-r3yad1we,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-r3yad1we

0,1
Comm: tcp://127.0.0.1:36463,Total threads: 1
Dashboard: http://127.0.0.1:43358/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:46247,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-s65zyte7,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-s65zyte7

0,1
Comm: tcp://127.0.0.1:37106,Total threads: 1
Dashboard: http://127.0.0.1:42318/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:39410,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-_571g8l8,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-_571g8l8

0,1
Comm: tcp://127.0.0.1:42665,Total threads: 1
Dashboard: http://127.0.0.1:35857/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:36912,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-juf7sd47,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-juf7sd47

0,1
Comm: tcp://127.0.0.1:41239,Total threads: 1
Dashboard: http://127.0.0.1:33586/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:43360,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-b2cazeqs,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-b2cazeqs

0,1
Comm: tcp://127.0.0.1:42415,Total threads: 1
Dashboard: http://127.0.0.1:34494/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:41444,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-qj0_hyl1,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-qj0_hyl1

0,1
Comm: tcp://127.0.0.1:44628,Total threads: 1
Dashboard: http://127.0.0.1:41097/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:37440,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-lxebo1yv,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-lxebo1yv

0,1
Comm: tcp://127.0.0.1:37969,Total threads: 1
Dashboard: http://127.0.0.1:44429/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:43500,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-ea1xd1ob,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-ea1xd1ob

0,1
Comm: tcp://127.0.0.1:39670,Total threads: 1
Dashboard: http://127.0.0.1:37840/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:35074,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-kzgyjrt6,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-kzgyjrt6

0,1
Comm: tcp://127.0.0.1:42657,Total threads: 1
Dashboard: http://127.0.0.1:46000/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:41747,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-ze2xhfnq,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-ze2xhfnq

0,1
Comm: tcp://127.0.0.1:42139,Total threads: 1
Dashboard: http://127.0.0.1:33484/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:35318,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-i18et_ho,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-i18et_ho

0,1
Comm: tcp://127.0.0.1:41392,Total threads: 1
Dashboard: http://127.0.0.1:38637/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:39858,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-2yfeggit,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-2yfeggit

0,1
Comm: tcp://127.0.0.1:39658,Total threads: 1
Dashboard: http://127.0.0.1:43701/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:33622,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-r27osc3r,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-r27osc3r

0,1
Comm: tcp://127.0.0.1:36224,Total threads: 1
Dashboard: http://127.0.0.1:43962/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:34338,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-ym8ctuws,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-ym8ctuws

0,1
Comm: tcp://127.0.0.1:40616,Total threads: 1
Dashboard: http://127.0.0.1:46539/status,Memory: 128.00 GiB
Nanny: tcp://127.0.0.1:35479,
Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-2fhacs8_,Local directory: /work/sds-lab/august/crops/notebooks/dask-worker-space/worker-2fhacs8_


In [None]:
# Left commented out on purpose: uncomment and run to shut down the Dask
# client once all model runs are finished.
# NOTE(review): `client` is presumably the dask.distributed Client created in
# an earlier cell -- confirm before relying on this.
#client.close()

In [10]:
# Rebind fit_predict_report to its dask.delayed wrapper. From this cell on,
# calling fit_predict_report(...) does not run a model; it returns a lazy task
# that only executes when handed to dask.compute(). The plain, eager function
# is no longer reachable under this name after this cell has run.
fit_predict_report = dask.delayed(fit_predict_report)

## Regular parallel version:

In [11]:
# Build one lazy task per (tile/crop, compositing scheme, in-season cutoff,
# hyperparameter set, model family) combination, then run them all with Dask.
# fit_predict_report must already be wrapped in dask.delayed at this point, so
# each call below only records a task instead of fitting a model.
models_bank = create_models_bank()
training_sample_size = 0.001
validation_sample_size = 0.001

# Years handed to every run through the 'years' parameter
# (presumably the folds of the year-wise cross-validation -- confirm in
# fit_predict_report).
CV_YEARS = (2018, 2019, 2020, 2021, 2022)

# (model-name prefix, estimator class) pairs. Both families are instantiated
# from the same hyperparameter sets in models_bank, so they share one loop
# body instead of two copy-pasted parameter dictionaries.
MODEL_FAMILIES = (
    ('RF', RandomForestClassifier),
    ('ET', ExtraTreesClassifier),
)

r = []
for tile_coiid in [('10SFH', 75), ('15TVG', 1)]:  # (tile, crop_of_interest_id)
    for scheme_name in ['14day', '5day']:
        for in_season in [160, 230, None]:
            for three_digit in models_bank:
                # Order matters only for task submission order: RF first,
                # then ET, for each hyperparameter set.
                for prefix, estimator_cls in MODEL_FAMILIES:

                    ## Parameters dictionary p
                    p = {

                        ## SPECIFY MODEL ##
                        'model_name': prefix + three_digit,
                        # A fresh, unfitted estimator per task.
                        'model': estimator_cls(**models_bank[three_digit]),
                        'training_sample_size': training_sample_size,
                        'validation_sample_size': validation_sample_size,

                        ## SPECIFY TILE AND SCHEME ##
                        'tile': tile_coiid[0],
                        # New list per task so no two tasks share one object.
                        'years': list(CV_YEARS),
                        'scheme_name': scheme_name,
                        'crop_of_interest_id': tile_coiid[1],
                        'in_season': in_season,
                    }

                    r.append(fit_predict_report(**p))

# Execute every queued task; the returned tuple of results is the cell output.
dask.compute(*r)

-- Process for RF001_0.001_0.001_15TVG_14day_1_160.csv --
Starting a fold...
> Assembling the datasets
-- Process for ET074_0.001_0.001_15TVG_5day_1_160.csv --
Starting a fold...
> Assembling the datasets
-- Process for ET029_0.001_0.001_10SFH_5day_75_None.csv --
Starting a fold...
> Assembling the datasets
-- Process for ET058_0.001_0.001_15TVG_14day_1_230.csv --
Starting a fold...
> Assembling the datasets
-- Process for ET074_0.001_0.001_15TVG_5day_1_230.csv --
Starting a fold...
> Assembling the datasets
-- Process for RF035_0.001_0.001_10SFH_5day_75_160.csv --
Starting a fold...
> Assembling the datasets
-- Process for ET042_0.001_0.001_15TVG_5day_1_None.csv --
Starting a fold...
> Assembling the datasets
-- Process for RF015_0.001_0.001_10SFH_5day_75_230.csv --
Starting a fold...
> Assembling the datasets
-- Process for RF067_0.001_0.001_10SFH_14day_75_None.csv --
Starting a fold...
> Assembling the datasets
-- Process for RF011_0.001_0.001_10SFH_5day_75_230.csv --
Starting a fol

Function:  execute_task
args:      ((<function apply at 0x2b33e84d9820>, <function fit_predict_report at 0x2b33fb08e0d0>, [], (<class 'dict'>, [['model_name', 'RF001'], ['model', RandomForestClassifier(bootstrap=False, max_features=0.05, n_estimators=200,
                       n_jobs=-1)], ['training_sample_size', 0.001], ['validation_sample_size', 0.001], ['tile', '15TVG'], ['years', [2018, 2019, 2020, 2021, 2022]], ['scheme_name', '14day'], ['crop_of_interest_id', 1], ['in_season', 160]])))
kwargs:    {}
Exception: "MemoryError((13395600, 40), dtype('int16'))"



MemoryError: Unable to allocate 1022. MiB for an array with shape (13395600, 40) and data type int16



-- Process for RF067_0.001_0.001_10SFH_14day_75_None.csv --
Starting a fold...
> Assembling the datasets


Function:  execute_task
args:      ((<function apply at 0x2ac10cb0d820>, <function fit_predict_report at 0x2ac11f668160>, [], (<class 'dict'>, [['model_name', 'RF049'], ['model', RandomForestClassifier(bootstrap=False, max_features=0.1, n_estimators=500,
                       n_jobs=-1)], ['training_sample_size', 0.001], ['validation_sample_size', 0.001], ['tile', '10SFH'], ['years', [2018, 2019, 2020, 2021, 2022]], ['scheme_name', '14day'], ['crop_of_interest_id', 75], ['in_season', None]])))
kwargs:    {}
Exception: "MemoryError((53582400, 106), dtype('int16'))"

  next(self.gen)
Function:  execute_task
args:      ((<function apply at 0x2af4aba7c820>, <function fit_predict_report at 0x2af4ce59f3a0>, [], (<class 'dict'>, [['model_name', 'RF076'], ['model', RandomForestClassifier(class_weight='balanced', max_features=1.0,
                       n_estimators=500, n_jobs=-1)], ['training_sample_size', 0.001], ['validation_sample_size', 0.001], ['tile', '10SFH'], ['years', [2018, 2019, 2

> Fitting the model on the training set


  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)


> Predicting on the validation set


distributed.diskutils - INFO - Found stale lock file and directory '/work/sds-lab/august/crops/notebooks/dask-worker-space/worker-3kvwtbkf', purging


> Recording performance metrics
Finished a fold.
Starting a fold...
> Assembling the datasets


Function:  execute_task
args:      ((<function apply at 0x2acb2c0a6820>, <function fit_predict_report at 0x2acb46c97430>, [], (<class 'dict'>, [['model_name', 'ET049'], ['model', ExtraTreesClassifier(max_features=0.1, n_estimators=500, n_jobs=-1)], ['training_sample_size', 0.001], ['validation_sample_size', 0.001], ['tile', '10SFH'], ['years', [2018, 2019, 2020, 2021, 2022]], ['scheme_name', '14day'], ['crop_of_interest_id', 75], ['in_season', None]])))
kwargs:    {}
Exception: "MemoryError((53582400, 106), dtype('int16'))"

  next(self.gen)
Function:  execute_task
args:      ((<function apply at 0x2b4573740820>, <function fit_predict_report at 0x2b4596349d30>, [], (<class 'dict'>, [['model_name', 'ET004'], ['model', ExtraTreesClassifier(bootstrap=True, class_weight='balanced', max_features=0.05,
                     n_estimators=200, n_jobs=-1)], ['training_sample_size', 0.001], ['validation_sample_size', 0.001], ['tile', '15TVG'], ['years', [2018, 2019, 2020, 2021, 2022]], ['scheme_n

> Fitting the model on the training set


Function:  execute_task
args:      ((<function apply at 0x2ac34094a820>, <function fit_predict_report at 0x2ac353559c10>, [], (<class 'dict'>, [['model_name', 'RF048'], ['model', RandomForestClassifier(class_weight='balanced', max_features=0.05,
                       min_samples_split=4, n_estimators=500, n_jobs=-1)], ['training_sample_size', 0.001], ['validation_sample_size', 0.001], ['tile', '15TVG'], ['years', [2018, 2019, 2020, 2021, 2022]], ['scheme_name', '5day'], ['crop_of_interest_id', 1], ['in_season', 230]])))
kwargs:    {}
Exception: "MemoryError((3536438400,), dtype('int16'))"

  next(self.gen)
Function:  execute_task
args:      ((<function apply at 0x2b693f909820>, <function fit_predict_report at 0x2b6962446040>, [], (<class 'dict'>, [['model_name', 'RF077'], ['model', RandomForestClassifier(bootstrap=False, max_features=1.0, min_samples_split=4,
                       n_estimators=500, n_jobs=-1)], ['training_sample_size', 0.001], ['validation_sample_size', 0.001], ['til

> Fitting the model on the training set


Function:  execute_task
args:      ((<function apply at 0x2ae277f0f820>, <function fit_predict_report at 0x2ae296a7a4c0>, [], (<class 'dict'>, [['model_name', 'RF037'], ['model', RandomForestClassifier(bootstrap=False, max_features=1.0, min_samples_split=4,
                       n_estimators=200, n_jobs=-1)], ['training_sample_size', 0.001], ['validation_sample_size', 0.001], ['tile', '10SFH'], ['years', [2018, 2019, 2020, 2021, 2022]], ['scheme_name', '5day'], ['crop_of_interest_id', 75], ['in_season', 160]])))
kwargs:    {}
Exception: "MemoryError((13395600, 94), dtype('int16'))"

  next(self.gen)
Function:  execute_task
args:      ((<function apply at 0x2b72db150820>, <function fit_predict_report at 0x2b72fdcad940>, [], (<class 'dict'>, [['model_name', 'RF052'], ['model', RandomForestClassifier(class_weight='balanced', max_features=0.1,
                       n_estimators=500, n_jobs=-1)], ['training_sample_size', 0.001], ['validation_sample_size', 0.001], ['tile', '15TVG'], ['year

> Fitting the model on the training set




> Predicting on the validation set


  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)


> Recording performance metrics
Finished a fold.
Starting a fold...
> Assembling the datasets
> Fitting the model on the training set
> Predicting on the validation set


  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)


> Recording performance metrics
Finished a fold.
Starting a fold...
> Assembling the datasets


Function:  execute_task
args:      ((<function apply at 0x2ad969035820>, <function fit_predict_report at 0x2ad97bbc2430>, [], (<class 'dict'>, [['model_name', 'ET056'], ['model', ExtraTreesClassifier(bootstrap=True, class_weight='balanced', max_features=0.1,
                     min_samples_split=4, n_estimators=500, n_jobs=-1)], ['training_sample_size', 0.001], ['validation_sample_size', 0.001], ['tile', '10SFH'], ['years', [2018, 2019, 2020, 2021, 2022]], ['scheme_name', '5day'], ['crop_of_interest_id', 75], ['in_season', 160]])))
kwargs:    {}
Exception: "MemoryError((13395600, 94), dtype('int16'))"

Function:  execute_task
args:      ((<function apply at 0x2afc89beb820>, <function fit_predict_report at 0x2afcac728670>, [], (<class 'dict'>, [['model_name', 'ET079'], ['model', ExtraTreesClassifier(bootstrap=True, max_features=1.0, min_samples_split=4,
                     n_estimators=500, n_jobs=-1)], ['training_sample_size', 0.001], ['validation_sample_size', 0.001], ['tile', '10SF

> Fitting the model on the training set
> Fitting the model on the training set
> Predicting on the validation set


  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)


> Recording performance metrics
Finished a fold.
Starting a fold...
> Assembling the datasets
> Predicting on the validation set


  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)


> Recording performance metrics
Finished a fold.
Starting a fold...
> Assembling the datasets


Function:  execute_task
args:      ((<function apply at 0x2b363d409820>, <function fit_predict_report at 0x2b36640193a0>, [], (<class 'dict'>, [['model_name', 'RF015'], ['model', RandomForestClassifier(max_features=0.1, min_samples_split=4, n_estimators=200,
                       n_jobs=-1)], ['training_sample_size', 0.001], ['validation_sample_size', 0.001], ['tile', '10SFH'], ['years', [2018, 2019, 2020, 2021, 2022]], ['scheme_name', '5day'], ['crop_of_interest_id', 75], ['in_season', 230]])))
kwargs:    {}
Exception: "MemoryError((13382205, 178), dtype('int16'))"



> Fitting the model on the training set


  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)


> Predicting on the validation set
> Recording performance metrics
Finished a fold.
Starting a fold...
> Assembling the datasets




> Fitting the model on the training set
> Fitting the model on the training set
> Fitting the model on the training set
> Predicting on the validation set


  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)


> Recording performance metrics
Finished a fold.
Starting a fold...
> Assembling the datasets
> Predicting on the validation set


  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)


> Recording performance metrics
Finished a fold.
Starting a fold...
> Assembling the datasets
> Fitting the model on the training set


Function:  execute_task
args:      ((<function apply at 0x2aee30e10820>, <function fit_predict_report at 0x2aee439c0a60>, [], (<class 'dict'>, [['model_name', 'RF027'], ['model', RandomForestClassifier(max_features=0.4, n_estimators=200, n_jobs=-1)], ['training_sample_size', 0.001], ['validation_sample_size', 0.001], ['tile', '10SFH'], ['years', [2018, 2019, 2020, 2021, 2022]], ['scheme_name', '5day'], ['crop_of_interest_id', 75], ['in_season', 160]])))
kwargs:    {}
Exception: "MemoryError((3536438400,), dtype('int16'))"

  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)


> Predicting on the validation set
> Recording performance metrics
Finished a fold.
Starting a fold...
> Assembling the datasets
> Predicting on the validation set


  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)


> Recording performance metrics
Finished a fold.
Starting a fold...
> Assembling the datasets
> Fitting the model on the training set
> Predicting on the validation set


  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)


> Recording performance metrics
Finished a fold.
Starting a fold...
> Assembling the datasets


Function:  execute_task
args:      ((<function apply at 0x2b387ff68820>, <function fit_predict_report at 0x2b38ab271700>, [], (<class 'dict'>, [['model_name', 'ET054'], ['model', ExtraTreesClassifier(class_weight='balanced', max_features=0.1,
                     min_samples_split=4, n_estimators=500, n_jobs=-1)], ['training_sample_size', 0.001], ['validation_sample_size', 0.001], ['tile', '10SFH'], ['years', [2018, 2019, 2020, 2021, 2022]], ['scheme_name', '5day'], ['crop_of_interest_id', 75], ['in_season', 160]])))
kwargs:    {}
Exception: "MemoryError((3536438400,), dtype('int16'))"

Function:  execute_task
args:      ((<function apply at 0x2b3aa6c82820>, <function fit_predict_report at 0x2b3ac97edb80>, [], (<class 'dict'>, [['model_name', 'ET017'], ['model', ExtraTreesClassifier(max_features=0.2, n_estimators=200, n_jobs=-1)], ['training_sample_size', 0.001], ['validation_sample_size', 0.001], ['tile', '15TVG'], ['years', [2018, 2019, 2020, 2021, 2022]], ['scheme_name', '5day'], ['

> Fitting the model on the training set
> Fitting the model on the training set
> Predicting on the validation set
> Predicting on the validation set


  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)
  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)


> Recording performance metrics
Finished a fold.
Starting a fold...
> Assembling the datasets
> Recording performance metrics
Finished a fold.
Starting a fold...
> Assembling the datasets
> Fitting the model on the training set


  n_jobs = min(effective_n_jobs(n_jobs), n_estimators)


> Predicting on the validation set
> Recording performance metrics
Finished a fold.
Starting a fold...
> Assembling the datasets
