## Rasters Sandbox

August Posch, Jan 2023

In [1]:
import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import rasterio

from sklearn.utils import shuffle

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.model_selection import validation_curve

import sys
sys.path.append('../src/')
import aposch_cv_src as aposch

# An actual machine learning script

We will collect 5 years of data and use the first 4 years to predict the last year.

In [7]:
county = 'Yolo'
years = range(2017,2022)
features = []

for year in years:
    tiff = f'../Data/{county}CountyCDL_{year}/clipped.TIF'
    print('Loading tiff file', tiff)
    with rasterio.open(tiff) as src:
        array = src.read(1)
        feature = array.flatten()
        features.append(feature)
        print(f'Feature for year {year} is length n={len(feature)}')

Loading tiff file ../Data/YoloCountyCDL_2017/clipped.TIF
Feature for year 2017 is length n=23090594
Loading tiff file ../Data/YoloCountyCDL_2018/clipped.TIF
Feature for year 2018 is length n=23090594
Loading tiff file ../Data/YoloCountyCDL_2019/clipped.TIF
Feature for year 2019 is length n=23090594
Loading tiff file ../Data/YoloCountyCDL_2020/clipped.TIF
Feature for year 2020 is length n=23090594
Loading tiff file ../Data/YoloCountyCDL_2021/clipped.TIF
Feature for year 2021 is length n=23090594


In [8]:
print('Cleaning data...')
X_y = np.stack(features).T
X_y = np.delete(X_y, X_y[:,-1]==255, axis=0) # remove the background part of the image
print(X_y.shape)

Cleaning data...
(4840554, 5)


In [9]:
# Problem: some categories have a very low count and
# won't work for our classifier
# we see this is 59

print('Looking for pixels with low counts (problem)')
unique, counts = np.unique(X_y[:,-1], return_counts=True)
unq_cnts = np.asarray((unique, counts))
print(unq_cnts)
problem_IDs = unique[counts<5]
print('Problem IDs are', problem_IDs)
print('Removing problem-ID pixels')
for pid in problem_IDs:
    X_y = np.delete(X_y, X_y[:,-1]==pid, axis=0)


Looking for pixels with low counts (problem)
[[     1      2      3      4      5      6     12     13     21     23
      24     27     28     29     33     36     37     42     43     44
      47     48     49     50     51     53     54     57     58     61
      66     67     68     69     71     72     74     75     76     77
      92    111    112    121    122    123    124    131    141    142
     143    152    176    190    195    204    205    206    208    209
     211    212    213    214    216    217    219    220    221    222
     224    225    226    227    228    229    236    243]
 [ 31671    178 150221   3085      4 158093   1136   1104  15452   1440
  299196   4650  29451    467  41766 238178  97915   4193      2  12483
     362   5740    482   4394    117    745 263633    940  57902 491366
     527    186   2713 152167    841    976    946 422944 188402   1572
       5  55918      4  97470  84719 109663  43817  14000    510  36296
   32523 800617 688450   7754  5

In [10]:
print('Last cleaning steps...')
X_y = shuffle(X_y, random_state=19)
X = X_y[:,0:-1]
y = X_y[:,-1]
print('All cleaned.')

Last cleaning steps...
All cleaned.


Classifier:

In [11]:
aposch.cv_readout(
    HistGradientBoostingClassifier(categorical_features=[1,1,1,1]),
                                            X,
                                            y)

Performing 5-fold cross-validation...
Doing the first fold...
Finished scoring that fold. Doing the next task...
Finished scoring that fold. Doing the next task...
Finished scoring that fold. Doing the next task...
Finished scoring that fold. Doing the next task...
Finished scoring that fold. Doing the next task...
Aggregating the scores...
               mean       std
precision  0.736575  0.002142
recall     0.723549  0.005089
f1         0.726503  0.003436


(array([[ 8414,    16,    64, ...,   101,    20,    13],
        [    1,     1,     2, ...,     0,     0,     3],
        [   88,   321, 99496, ...,    85,   747,    21],
        ...,
        [    2,     0,     0, ...,     8,     0,     1],
        [    3,     0,    89, ...,     3,   842,     0],
        [    0,     0,     0, ...,     0,     0,     3]]),
                mean       std
 precision  0.736575  0.002142
 recall     0.723549  0.005089
 f1         0.726503  0.003436)

We did it! So, what was odd about this toy example?
- we removed a few pixels (<20 out of 2 million) because they came from rare classes
- I'm "cheating" compared to Shashank in that I used non-crop pixels like buildings
- it would be difficult to get our predicted classes back into raster form and make an image
- for science, we need to hold out a test set

Next Steps:
- calculate more performance metrics (crop-wise performance, other metrics)

Already Fixed:
- we had limited performance metrics (i.e. only precision and recall and f1, and only weighted pixel-wise globally)
- precision ill-defined error

## Old lives below

In [None]:
with rasterio.open('Data/LeeCountyCDL_2021/clipped.TIF') as src:
    print(src.width, src.height)
    print(src.crs)
    print(src.transform)
    print(src.count)
    print(src.indexes)

In [None]:
special_nbr = 365
low_nbr = 8
interjection = "Aha!"

string = f"I work {special_nbr} days per year"

string2 = f"I work {special_nbr} days per year and I sleep {low_nbr} hours every night."

string3 = f"{interjection} I work {special_nbr} days per year and I sleep {low_nbr} hours every night."


print(string)
print(string2)
print(string3)

In [None]:
src = rasterio.open('Data/LeeCountyCDL_2021/clipped.TIF')
plt.imshow(src.read(1), cmap='pink')
plt.show()

In [None]:
src = rasterio.open('../Data/LeeCountyCDL_2021/clipped.TIF')
array1 = src.read(1)
plt.imshow(array1)
plt.show()

In [None]:
from rasterio.plot import show_hist
show_hist(
    src, bins=50, lw=0.0, stacked=False, alpha=0.3,
    histtype='stepfilled', title="Histogram")

In [None]:
array1.shape

255 is white, so this array must include the white border around the edge

In [None]:
array1

In [None]:
np.mean(array1)

In [None]:
flat = array1.flatten()

In [None]:
flat.shape

In [None]:
X_y = np.stack([flat, flat, flat]).T

Note: We don't yet have the legend/reference of which crop ID corresponds to which crop and which color. But no worries, we can still do machine learning!

In [None]:
X_y

In [None]:
X = X_y[:,0:-1]
y = X_y[:,-1].reshape(-1,1)


In [None]:
print(X.shape, y.shape)

In [None]:
plt.imshow(array1, cmap='Set3')

In [None]:
plt.imshow(array1, cmap='tab10')

In [None]:
array2 = array1.flatten()

In [None]:
array2

In [None]:
array2.shape

In [None]:
print('hello')