In [1]:
import numpy as np
import pandas as pd


In [2]:
from sklearn.tree import DecisionTreeClassifier

### Remind myself how Scikit-learn works

In [3]:
# Toy data set: five samples with three features each, plus one binary label per sample.
X_train = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 4, 2],
    [3, 9, 5],
    [2, 2, 3],
])
y_train = np.array([0, 1, 0, 1, 1])

In [4]:
# Depth-1 tree, fitted and then scored on the very same rows.
# `fit` returns the estimator itself, so construction and fitting can be chained.
tree01 = DecisionTreeClassifier(max_depth=1).fit(X_train, y_train)
pred = tree01.predict(X_train)

In [5]:
pred

array([0, 1, 0, 1, 0])

#### Great, I remembered how to do that!

Try using kwargs.

In [11]:
# Keep the constructor arguments in a dict and unpack them into the model.
kwargs = {'max_depth': 2}
tree30 = DecisionTreeClassifier(**kwargs).fit(X_train, y_train)
pred = tree30.predict(X_train)

In [9]:
tree30

Insight: for each parameter combo, I can express those parameters as a dictionary, and then feed that dictionary into the model initialization.

In [12]:
# Registry of parameter dicts, keyed by a model code.
all_models_params = {'04000': {'max_depth': 3}}
tree40 = DecisionTreeClassifier(**all_models_params['04000']).fit(X_train, y_train)
pred = tree40.predict(X_train)

In [14]:
tree40

Insight: I could have separate parts of the dictionary for the params that go inside the SKL class versus the ones that stay outside of it (training_sample_size, period_scheme, tile)

### Read in Refls

### Read in Crops

### Create Input-Target for one year

In [None]:
# Settings that select which input files are read further down.
tile, scheme_name = '10SFH', '14day'
year, crop_of_interest_id = 2018, 75

In [24]:
coiid = crop_of_interest_id

In [25]:
# Load the Refl array for the configured tile, year and scheme.
refl = np.load(
    f'../data/composited_interpolated/Refl_{tile}_{year}_{scheme_name}.npy'
)


In [39]:
# One boolean mask per year, for the four years before `year` and `year` itself:
# True wherever the stored crop array equals the crop of interest.
# A comprehension keeps the loop variable out of the notebook namespace; the
# original loop left a stray integer `y` behind, the same name the target
# array uses later in this notebook.
crop = [
    np.load(f'../data/processed_crop/Crop_{tile}_{crop_year}.npy') == coiid
    for crop_year in range(year - 4, year + 1)
]

## Next: combine all of the above into an X and a y for 2018.

In [None]:

# first, make X: X is an array with all the Refl and also the Crop
# rotation from the previous 4 years


In [45]:
# Features: the Refl array followed by every crop mask except the last one.
X = np.column_stack([refl, *crop[:-1]])
# Target: the last crop mask, reshaped into a single column.
y = crop[-1].reshape(-1, 1)

y.shape

(13395600, 1)

## Next: put all of above in a function that allows you to just input the params -- results in X and y for 2018.

In [None]:
# Example arguments for the single-year builder defined in the next cell.
tile, scheme_name = '10SFH', '14day'
year, crop_of_interest_id = 2018, 75

In [171]:
def create_X_y_single_year(tile,year,scheme_name,crop_of_interest_id,
                           data_dir='../data',n_prior_years=4):
    """Build the feature matrix and target for a single year.

    Parameters
    ----------
    tile : str
        Tile name as it appears in the input file names.
    year : int
        Year whose Refl file supplies the features and whose crop file
        supplies the target.
    scheme_name : str
        Scheme name as it appears in the Refl file name.
    crop_of_interest_id : int
        Value the crop arrays are compared against (``crop == id``).
    data_dir : str, optional
        Folder containing ``composited_interpolated/`` and
        ``processed_crop/``. Defaults to ``'../data'``, the location that
        was previously hard-coded.
    n_prior_years : int, optional
        Number of years before `year` whose crop masks are appended to the
        features. Defaults to 4, the value that was previously hard-coded.

    Returns
    -------
    X : numpy.ndarray
        ``np.column_stack`` of the Refl array and one mask per prior year,
        oldest first.
    y : numpy.ndarray
        Boolean mask ``crop == crop_of_interest_id`` for `year`, in the same
        shape as the stored crop array (assumed 1-D, one entry per row of
        the Refl array -- TODO confirm).
    """
    refl = np.load(
        f'{data_dir}/composited_interpolated/Refl_{tile}_{year}_{scheme_name}.npy'
    )

    def _crop_mask(crop_year):
        # True where that year's crop array equals the crop of interest.
        stored = np.load(f'{data_dir}/processed_crop/Crop_{tile}_{crop_year}.npy')
        return stored == crop_of_interest_id

    prior_masks = [_crop_mask(crop_year)
                   for crop_year in range(year - n_prior_years, year)]
    target = _crop_mask(year)

    X = np.column_stack([refl] + prior_masks)

    return X, target

In [172]:
X, y = create_X_y_single_year('10SFH',2018,'14day',75)

## Then, loop over 2018-2022: now we have an X and a y for every year.

In [None]:
# Train on 2018 through 2021, hold out 2022 for validation.
train_years = list(range(2018, 2022))
val_year = 2022

In [184]:
def create_X_y_multiyear(tile,
                      years,
                      scheme_name,
                      crop_of_interest_id):
    """Build X and y for several years and stack them along the first axis.

    Parameters
    ----------
    tile : str
        Passed through to ``create_X_y_single_year``.
    years : iterable of int
        Years to build; one call to ``create_X_y_single_year`` per entry,
        concatenated in the order given.
    scheme_name : str
        Passed through to ``create_X_y_single_year``.
    crop_of_interest_id : int
        Passed through to ``create_X_y_single_year``.

    Returns
    -------
    X, y : numpy.ndarray
        ``np.concatenate`` of the per-year X arrays and of the per-year y
        arrays.

    Raises
    ------
    ValueError
        If `years` is empty (there is nothing to concatenate).
    """
    years = list(years)
    if not years:
        raise ValueError('years must contain at least one year')

    X_parts = []
    y_parts = []

    # Distinct names for the loop variable and the unpacked arrays: the
    # original reused `y` for both the year and the target.
    for single_year in years:
        X_year, y_year = create_X_y_single_year(tile, single_year, scheme_name,
                                                crop_of_interest_id)
        X_parts.append(X_year)
        y_parts.append(y_year)

    return np.concatenate(X_parts), np.concatenate(y_parts)

In [177]:
def create_X_y(tile,
              years,
              scheme_name,
              crop_of_interest_id):
    """Build X and y for either a single year or a collection of years.

    Parameters
    ----------
    tile : str
        Passed through to the underlying builder.
    years : int or iterable of int
        A single year (Python or NumPy integer) dispatches to
        ``create_X_y_single_year``; anything else dispatches to
        ``create_X_y_multiyear``.
    scheme_name : str
        Passed through to the underlying builder.
    crop_of_interest_id : int
        Passed through to the underlying builder.

    Returns
    -------
    X, y
        Whatever the selected builder returns.
    """
    # isinstance (rather than type(...) == int) also accepts NumPy integers,
    # e.g. a year taken out of an array.
    if isinstance(years, (int, np.integer)):
        # Pass the argument itself; the original passed the notebook-level
        # global `year`, so the requested year was silently ignored.
        return create_X_y_single_year(tile,
                                      years,
                                      scheme_name,
                                      crop_of_interest_id)
    return create_X_y_multiyear(tile,
                              years,
                              scheme_name,
                              crop_of_interest_id)

In [185]:
# Two training years stacked into one X / y pair.
X_train, y_train = create_X_y_multiyear(
    tile='10SFH',
    years=[2018, 2019],
    scheme_name='14day',
    crop_of_interest_id=75,
)

#X_val, y_val = create_X_y_single_year('10SFH',2020,'14day',75)

In [187]:
X_train.shape

(26791200, 106)

In [190]:
y_train[:200].shape

(200,)

## BROWN PAPER BAG v1.0

draft below.

In [163]:
from sklearn.ensemble import RandomForestClassifier

In [164]:
## SPECIFY MODEL ##
model_name = 'BrownBag01'
# Fixed seed so that the forest, and the results saved under `model_name`,
# come out the same on every Restart & Run All.
model = RandomForestClassifier(random_state=42)

In [None]:
## SPECIFY TILE AND SCHEME ##
tile, scheme_name = '10SFH', '14day'
years = list(range(2018, 2023))
crop_of_interest_id = 75 # Almonds

In [147]:
conf = []

In [149]:
# Leave-one-year-out validation: each year in `years` is held out once while
# the model is fitted on all the other years.
conf = []  # start empty so that re-running this cell does not append duplicate rows

for val_year in years:
    # Derive the training years from `years` rather than a hard-coded range,
    # so changing `years` in the config cell is enough.
    train_years = [yr for yr in years if yr != val_year]

    X_train, y_train = create_X_y(tile, train_years,
                                  scheme_name, crop_of_interest_id)
    X_val, y_val = create_X_y(tile, val_year,
                              scheme_name, crop_of_interest_id)

    # np.ravel accepts both a 1-D target and a single-column 2-D one; the
    # original `y_val[:,0]` raised IndexError on a 1-D target.
    model.fit(X_train, np.ravel(y_train))
    pred = model.predict(X_val)

    act = np.ravel(y_val)
    # ActPred_ab = number of rows with actual == a and predicted == b.
    # np.count_nonzero counts in compiled code; the builtin sum() iterates
    # element by element in Python, which is very slow on large arrays.
    ActPred_00 = np.count_nonzero((act == 0) & (pred == 0))
    ActPred_01 = np.count_nonzero((act == 0) & (pred == 1))
    ActPred_10 = np.count_nonzero((act == 1) & (pred == 0))
    ActPred_11 = np.count_nonzero((act == 1) & (pred == 1))
    conf_1yr = [ActPred_00, ActPred_01, ActPred_10, ActPred_11]

    conf.append(conf_1yr)

In [None]:
# One row of counts per validation year, in the order the loop appended them.
carr = np.array(conf)
totals = carr.sum(axis=0)

# Append the totals row. np.vstack is the supported spelling; np.row_stack is
# a deprecated alias of it.
carr = np.vstack([carr, totals])

# Name the rows and columns, then derive precision and recall per row
# (including the totals row).
cdf = pd.DataFrame(data = carr,
                  index = [f'ValYear{yr}' for yr in years]+['Total'],
                  columns = ['ActPred_00', 'ActPred_01',
                             'ActPred_10', 'ActPred_11']
                  )

# A row with a zero denominator yields NaN rather than raising.
cdf['Precision'] = cdf['ActPred_11'] / (cdf['ActPred_01'] + cdf['ActPred_11'])
cdf['Recall'] = cdf['ActPred_11'] / (cdf['ActPred_10'] + cdf['ActPred_11'])

cdf.to_csv(f'../data/results/{model_name}.csv')

I think the above will work!

Now put it in a separate clean notebook. (Duplicate this one and delete the stuff we don't need).

In [None]:
# Both ratios share the ActPred_11 numerator; only the second term of the
# denominator differs.
Recall = ActPred_11 / (ActPred_11 + ActPred_10)
Precision = ActPred_11 / (ActPred_11 + ActPred_01)

In [169]:
X_train[:200].shape

(200, 106)

In [170]:
y_train[:200].shape

(200, 1)

In [166]:
# Quick trial: fit on the first 200 rows only, then predict every validation row.
model.fit(X_train[:200], np.ravel(y_train[:200]))
pred = model.predict(X_val)

In [168]:
pred.shape

(13395600,)

In [89]:
# np.ravel works for a 1-D target as well as a single-column 2-D one, whereas
# `y_val[:,0]` raises IndexError on a 1-D array.
act = np.ravel(y_val)
# ActPred_ab = number of rows with actual == a and predicted == b, counted
# with np.count_nonzero instead of the slow element-by-element builtin sum().
ActPred_00 = np.count_nonzero((act == 0) & (pred == 0))
ActPred_01 = np.count_nonzero((act == 0) & (pred == 1))
ActPred_10 = np.count_nonzero((act == 1) & (pred == 0))
ActPred_11 = np.count_nonzero((act == 1) & (pred == 1))

In [143]:
# Collect the four counts as one row and add that row to `conf`.
conf_1yr = [
    ActPred_00,
    ActPred_01,
    ActPred_10,
    ActPred_11,
]
conf += [conf_1yr]

In [90]:
# Both ratios share the ActPred_11 numerator; only the second term of the
# denominator differs.
Recall = ActPred_11 / (ActPred_11 + ActPred_10)

Precision = ActPred_11 / (ActPred_11 + ActPred_01)

In [96]:
Precision

0.9821753339511807

In [97]:
Recall

0.3140381521898965

In [71]:
type([2,3,4])

list

In [104]:
import pandas as pd

In [133]:
df = pd.DataFrame(index=[f'ValYear{y}' for y in years])

Upshot: for some reason this is hard to do in pandas, so I'll do it all in numpy instead and add the nice names later!

## Then put all of the above in a nice neat function.

Still need functions for...
- create_X_y (agnostic of how many years)
- something see notes
- something see notes

In [None]:
def create_X_y(tile,years,scheme_name,crop_of_interest_id):
    """Build X and y whether `years` is a single year or several.

    A single integer year (Python or NumPy) is handed to
    ``create_X_y_single_year``; any other value is treated as a collection of
    years and handed to ``create_X_y_multiyear``. The selected builder's
    ``(X, y)`` result is returned unchanged.
    """
    if isinstance(years, (int, np.integer)):
        return create_X_y_single_year(tile, years, scheme_name,
                                      crop_of_interest_id)
    return create_X_y_multiyear(tile, years, scheme_name,
                                crop_of_interest_id)


#### Later: divide into train and test for cross validation

In [None]:
# sklearn.datasets exposes the iris data through the `load_iris` loader;
# there is no `Iris` name to import.
from sklearn.datasets import load_iris