In [14]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
def create_X_y_single_year(tile,year,scheme_name,crop_of_interest_id):
    """Build the feature matrix and label vector for one target year.

    Loads the reflectance array for ``tile``/``year``/``scheme_name`` and the
    crop arrays for ``year`` and the four years before it. Every crop array is
    compared against ``crop_of_interest_id``, giving one boolean mask per year.

    Returns
    -------
    X : ndarray
        The reflectance array followed by the four prior-year masks, joined
        with ``np.column_stack``.
    y : ndarray
        The boolean mask for ``year`` itself.
    """
    reflectance = np.load(
        f'../data/composited_interpolated/Refl_{tile}_{year}_{scheme_name}.npy'
    )

    # One mask per year, oldest first; the final entry belongs to `year`.
    masks = [
        np.load(f'../data/processed_crop/Crop_{tile}_{yr}.npy') == crop_of_interest_id
        for yr in range(year - 4, year + 1)
    ]
    *history, target = masks

    features = np.column_stack([reflectance, *history])
    return features, target

In [3]:
def create_X_y_multiyear(tile,
                      years,
                      scheme_name,
                      crop_of_interest_id):
    """Stack the single-year datasets for every entry of ``years``.

    Calls ``create_X_y_single_year`` once per year, in the order given, and
    concatenates the resulting feature arrays and label arrays along the
    first axis.

    Returns
    -------
    X, y : ndarray
        Concatenated features and labels across all requested years.
    """
    per_year = [
        create_X_y_single_year(tile, yr, scheme_name, crop_of_interest_id)
        for yr in years
    ]

    X = np.concatenate([features for features, _ in per_year])
    y = np.concatenate([labels for _, labels in per_year])

    return X, y

In [4]:
def create_X_y(tile,
              years,
              scheme_name,
              crop_of_interest_id,
              ):
    """Build ``(X, y)`` for either a single year or a collection of years.

    Parameters
    ----------
    tile, scheme_name, crop_of_interest_id
        Forwarded unchanged to the single-year / multi-year builders.
    years : int or iterable of int
        A single year dispatches to ``create_X_y_single_year``; anything else
        is treated as an iterable of years and dispatches to
        ``create_X_y_multiyear``.

    Returns
    -------
    X, y
        Whatever the selected builder returns.
    """
    # isinstance (rather than an exact type comparison) also accepts NumPy
    # integer scalars, e.g. a year taken from an ndarray, which would
    # otherwise fall through to the multi-year path and fail when iterated.
    if isinstance(years, (int, np.integer)):
        return create_X_y_single_year(tile,
                                      years,
                                      scheme_name,
                                      crop_of_interest_id)

    return create_X_y_multiyear(tile,
                              years,
                              scheme_name,
                              crop_of_interest_id)

In [9]:
def fit_predict_report(model_name,
                      model,
                      training_sample_size,
                      tile,
                      years,
                      scheme_name,
                      crop_of_interest_id,
                      candidate_years=range(2018, 2023)
                      ):
    """Run a leave-one-year-out evaluation and append the results to a CSV.

    For every validation year in ``years`` the model is fitted on all
    ``candidate_years`` except that year, used to predict the validation year,
    and the four confusion counts are recorded. A ``Total`` row plus
    ``Precision`` and ``Recall`` columns are added, and the table is appended
    to ``../data/results/{model_name}.csv`` beneath a ``#``-prefixed header
    listing the run parameters.

    Parameters
    ----------
    model_name : str
        Used to build the output file name and recorded in the header.
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``; re-fitted each fold.
    training_sample_size : int, float or None
        Passed as ``train_size`` to ``train_test_split`` to subsample the
        training data; ``None`` trains on everything.
    tile, scheme_name, crop_of_interest_id
        Forwarded to ``create_X_y``.
    years : iterable of int
        Validation years, one fold each.
    candidate_years : iterable of int, optional
        Pool from which training years are drawn (the validation year is
        removed). Defaults to 2018-2022.

    Raises
    ------
    ValueError
        If ``years`` is empty.
    """
    years = list(years)
    if not years:
        raise ValueError('years must contain at least one validation year')

    conf = []

    for val_year in years:
        print('Starting a fold...')
        print('> Assembling the datasets')
        train_years = [yr for yr in candidate_years if yr != val_year]

        X_train0, y_train0 = create_X_y(tile, train_years,
                                        scheme_name, crop_of_interest_id)
        X_val, y_val = create_X_y(tile, val_year,
                                  scheme_name, crop_of_interest_id)

        if training_sample_size is None:
            X_train, y_train = X_train0, y_train0
        else:
            # Only the sampled portion is needed; the remainder is discarded.
            X_train, _, y_train, _ = train_test_split(
                X_train0,
                y_train0,
                train_size=training_sample_size,
                random_state=19)

        print('> Fitting the model on the training set')
        model.fit(X_train, y_train)
        print('> Predicting on the validation set')
        pred = np.asarray(model.predict(X_val))

        print('> Recording performance metrics')
        act = np.asarray(y_val)
        # np.count_nonzero counts in compiled code; the builtin sum() would
        # iterate over every element in Python.
        conf.append([
            np.count_nonzero((act == 0) & (pred == 0)),
            np.count_nonzero((act == 0) & (pred == 1)),
            np.count_nonzero((act == 1) & (pred == 0)),
            np.count_nonzero((act == 1) & (pred == 1)),
        ])
        print('Finished a fold.')

    carr = np.array(conf)
    # Append a totals row (np.vstack replaces the deprecated np.row_stack).
    carr = np.vstack([carr, carr.sum(axis=0)])

    cdf = pd.DataFrame(data=carr,
                       index=[f'ValYear{yr}' for yr in years] + ['Total'],
                       columns=['ActPred_00', 'ActPred_01',
                                'ActPred_10', 'ActPred_11'])

    cdf['Precision'] = cdf.ActPred_11 / (cdf.ActPred_01 + cdf.ActPred_11)
    cdf['Recall'] = cdf.ActPred_11 / (cdf.ActPred_10 + cdf.ActPred_11)

    # Describe the run from this call's own arguments rather than from a
    # notebook-level variable, so the header always matches what was executed.
    params = {
        'model_name': model_name,
        'model': model,
        'training_sample_size': training_sample_size,
        'tile': tile,
        'scheme_name': scheme_name,
        'crop_of_interest_id': crop_of_interest_id,
    }
    # A value's string form may span several lines; collapsing whitespace
    # keeps each header entry on a single '#'-prefixed line.
    param_strings = [f"# {k}: {' '.join(str(v).split())}"
                     for k, v in params.items()]
    comment = '\n'.join(param_strings) + '\n'
    with open(f'../data/results/{model_name}.csv', 'a') as f:
        f.write(comment)
        cdf.to_csv(f)

    print(f'Find results in ../data/results/{model_name}.csv')

## BROWN PAPER BAG v2.0

Draft run below: set the model and data parameters in `p`, then call `fit_predict_report`.

In [15]:
## Parameters dictionary p
# Keys mirror the parameter names of fit_predict_report.
p = {

    ## SPECIFY MODEL ##
    'model_name': 'BrownBag06--SGDLogistic',
    # random_state fixes the classifier's internal randomness so that
    # re-running the notebook reproduces the same fit; 19 matches the seed
    # already used for the training subsample.
    'model': make_pipeline(StandardScaler(),
                           SGDClassifier(loss='log_loss', random_state=19)),
    'training_sample_size': 0.01,

    ## SPECIFY TILE AND SCHEME ##
    'tile': '10SFH',
    'years': [2018, 2019, 2020, 2021, 2022],
    'scheme_name': '14day',
    'crop_of_interest_id': 75 # Almonds
}

In [16]:
# The keys of p match fit_predict_report's parameter names one-to-one,
# so the dictionary can be unpacked straight into the call.
fit_predict_report(**p)

Starting a fold...
> Assembling the datasets
> Fitting the model on the training set
> Predicting on the validation set
> Recording performance metrics
Finished a fold.
Starting a fold...
> Assembling the datasets
> Fitting the model on the training set
> Predicting on the validation set
> Recording performance metrics
Finished a fold.
Starting a fold...
> Assembling the datasets
> Fitting the model on the training set
> Predicting on the validation set
> Recording performance metrics
Finished a fold.
Starting a fold...
> Assembling the datasets
> Fitting the model on the training set
> Predicting on the validation set
> Recording performance metrics
Finished a fold.
Starting a fold...
> Assembling the datasets
> Fitting the model on the training set
> Predicting on the validation set
> Recording performance metrics
Finished a fold.
Find results in ../data/results/BrownBag06--SGDLogistic.csv


I think the above will work!

