# About 

This notebook follows the recommendations in Olofsson et al. (2014, sec. 5, p. 52) to create a sampling design stratified by map class. 
The goal is to select a sampling design by exploring how accuracy metrics change with different sample sizes and different per-class sample allocations. 
The metrics used to compare sampling designs are the overall and class-specific accuracies, and the confidence intervals for each accuracy and area parameter.

In [1]:
import os
import numpy as np
import pandas as pd

# The repository is assumed to be cloned directly inside the user's home
# directory; every relative path below is resolved from the repository root.
home = os.path.expanduser("~")
repo_root = os.path.join(home, 'iceplant-detection-santa-barbara')
os.chdir(repo_root)

### Determine the sample size (Olofsson et al., 2014 - sec. 5.1.1)

In [2]:
# ---------------------------------------------
# --------------- PARAMETER -------------------
# Import data: Pixels per class in map of SB coast
# (location is given relative to the current working directory)
pixel_counts_parts = ('data',
                      'map',
                      'validation_data',
                      'pixel_counts',
                      'final_model_combined_pixel_counts_total.csv')
pixels_per_class_path = os.path.join(os.getcwd(), *pixel_counts_parts)

# one-row table: one column of pixel counts per map class
df = pd.read_csv(pixels_per_class_path)
df

Unnamed: 0,n_other_veg,n_ice,n_nonveg
0,120173466,5981423,188071487


In [3]:
# ---------------------------------------------
# --------------- PARAMETERS ------------------
# target standard error of the estimated overall accuracy (OA)
std_error = 0.0105

# anticipated user's accuracies, TP/(TP+FP), one per class
# classes are: [other vegetation, iceplant, non_vegetation]
U = [0.8184, 0.8468, 0.9]
# ---------------------------------------------
# ---------------------------------------------

# share of the map's pixels that belongs to each class
class_counts = list(df.iloc[0,])
total_pix = sum(class_counts)
pix_prop = [count/total_pix for count in class_counts]

# standard deviation implied by each user's accuracy
# Cochran, 1977, Eq (5.55)
stdv = [np.sqrt(acc*(1-acc)) for acc in U]

# sum of the standard deviations weighted by class proportion
numerator = sum(weight*sd for weight, sd in zip(pix_prop, stdv))

# total sample size (Olofsson et al., 2014, Eq. 13); left unrounded on purpose
sample_size = (numerator/std_error)**2
print('sample size: ', sample_size)
print('std_error: ', std_error)
# 196 = 1.96 (95% normal quantile) x 100 (fraction -> percent)
print('OA 95% conf interval (%): ', std_error*196)

sample size:  1010.9366977671175
std_error:  0.0105
OA 95% conf interval (%):  2.0580000000000003


### Determine sample allocation per class (Olofsson et al., 2014 - sec. 5.1.2)

In [4]:
def sample_allocation(n_pix_df, fixed_classes, fixed_n, sample_size):
    """ sample allocation per class combining fixed + proportional allocation
    
    Allocates classes in fixed_classes a fixed number of sample points and 
    distributes the remaining points proportionally (by pixel count) across 
    the rest of the classes.
    
    Parameters
    ----------
    n_pix_df : pandas.DataFrame
        one-row data frame with the number of pixels per class. 
        columns must be names of classes. Only the first row is read, 
        whatever its index label is.
        
    fixed_classes : list
        subset of n_pix_df.columns. these are the classes that will get a 
        fixed allocation of points.
    
    fixed_n : int
        the number of samples to allocate to each of the indicated classes
    
    sample_size : int or float
        number of points to be distributed among classes
        see Olofsson et al., 2014, Eq. 13
        
    Returns
    -------
    list
        number of sample points per class, in the order of n_pix_df.columns.
        Proportional shares are truncated to integers, so the allocation may 
        add up to slightly less than sample_size.
        
    Raises
    ------
    ValueError
        if fixed_classes has names that are not columns of n_pix_df, or if the
        fixed allocations alone require more points than sample_size.
    """
    
    columns = list(n_pix_df.columns)

    # an unknown class would silently shrink the proportional pool
    unknown = [c for c in fixed_classes if c not in columns]
    if unknown:
        raise ValueError(
            'fixed_classes not found in n_pix_df columns: {}'.format(unknown))

    # remaining points after allocating to fixed classes
    remain = sample_size-(fixed_n*len(fixed_classes))
    if remain < 0:
        raise ValueError(
            'fixed allocation ({} x {}) exceeds sample_size ({})'.format(
                fixed_n, len(fixed_classes), sample_size))

    # non-fixed allocation classes which will get proportional sample allocation
    # (kept in column order)
    prop_classes = [c for c in columns if c not in fixed_classes]

    # add total of pixels in non-fixed allocation classes
    d = n_pix_df[prop_classes].sum(axis=1).iloc[0]

    samples = []
    for col in columns:
        if col in fixed_classes:
            samples.append(fixed_n)
        else:
            # positional access: the row label does not have to be 0
            prop = n_pix_df[col].iloc[0]/d
            samples.append(int(prop*remain))
    return samples

# --------------------------------------------------------------------------------------
def strat_stderror(U, strat_sample):
    """ estimated standard error of the estimated user's accuracy of each class
    
    Computes sqrt(U_i*(1-U_i)/(n_i-1)) for every class i.
    See Olofsson et al., 2014, Eq. 6.
    
    Parameters:
    ----------
    U : list
        estimated user's accuracies per class, each a number in [0,1]
        
    strat_sample: list
        how many points are sampled from each class, in the same order as U
    
    Return:
    -------
    A list with one standard error per class, in the order of the inputs.
    
    """
    
    std_errors = []
    for accuracy, n_points in zip(U, strat_sample):
        variance = accuracy*(1-accuracy)/(n_points-1)
        std_errors.append(np.sqrt(variance))
    return std_errors
# --------------------------------------------------------------------------------------
def confidence_intervals(U, strat_sample):
    """ radius of the 95% conf. interval around each class's estimated user's accuracy
    
    Parameters
    ----------
    U : list
        estimated user's accuracies per class, each a number in [0,1]
        
    strat_sample: list
        how many points are sampled from each class, in the same order as U
    
    Return:
    -------
        a list with, for each class, the radius of the 95% confidence interval 
        of the estimated user's accuracy, expressed as a percentage and 
        rounded to two decimal places

    """
    
    radii = []
    for std_err in strat_stderror(U, strat_sample):
        # 196 = 1.96 (95% normal quantile) x 100 (fraction -> percent)
        radii.append(np.round(196*std_err, 2))
    return radii
    

In [5]:
# Distributing sample among classes
# Each scenario contributes one entry to three parallel lists:
# its label, its per-class sample counts and its per-class 95% CI radii.

conf_intrs = []
strat_samples = []
strat_title = []

n_classes = len(df.columns)

# ---------------------------------------------
# equal allocation: every class receives the same share of the sample
strat_title.append('equal')
sample_equal = [sample_size/n_classes for _ in range(n_classes)]
strat_samples.append(sample_equal)
conf_intrs.append(confidence_intervals(U, sample_equal))

# ---------------------------------------------
# fixed + proportional allocation, explored for two groups of fixed classes:
#   1. vegetation and iceplant get equal (fixed) allocations
#   2. only iceplant gets fixed allocation
# NOTE(review): labels are just str(n), so '300', '200', ... appear once per
# group and strat_title ends up with duplicate entries.
fixed_scenarios = [
    (['n_other_veg', 'n_ice'], [300, 200, 150, 140, 130, 120, 100]),
    (['n_ice'], [400, 300, 200, 150, 140, 130, 120, 100]),
]

for fixed_classes, fixed_sizes in fixed_scenarios:
    for n in fixed_sizes:
        sample = sample_allocation(df, fixed_classes, n, sample_size)
        strat_title.append(str(n))
        strat_samples.append(sample)
        conf_intrs.append(confidence_intervals(U, sample))

# ---------------------------------------------
# proportional allocation: sample split according to class pixel share
strat_title.append('prop')
sample_prop = [sample_size*share for share in pix_prop]
strat_samples.append(sample_prop)
conf_intrs.append(confidence_intervals(U, sample_prop))


In [6]:
# One column per allocation scenario, one row per class:
# label the scenarios as rows first, then transpose.
strat_df = pd.DataFrame(strat_samples, index=strat_title).T
strat_df

Unnamed: 0,equal,300,200,150,140,130,120,100,400,300.1,200.1,150.1,140.1,130.1,120.1,100.1,prop
0,336.978899,300.0,200.0,150.0,140.0,130.0,120.0,100.0,238.0,277.0,316.0,335.0,339.0,343.0,347.0,355.0,386.624982
1,336.978899,300.0,200.0,150.0,140.0,130.0,120.0,100.0,400.0,300.0,200.0,150.0,140.0,130.0,120.0,100.0,19.243579
2,336.978899,410.0,610.0,710.0,730.0,750.0,770.0,810.0,372.0,433.0,494.0,525.0,531.0,537.0,543.0,555.0,605.068137


In [7]:
# Same layout as the sample table: scenarios as columns, classes as rows;
# values are the 95% confidence-interval radii (in %).
conf_intrs_df = pd.DataFrame(conf_intrs, index=strat_title).T
conf_intrs_df

Unnamed: 0,equal,300,200,150,140,130,120,100,400,300.1,200.1,150.1,140.1,130.1,120.1,100.1,prop
0,4.12,4.37,5.36,6.19,6.41,6.65,6.93,7.59,4.91,4.55,4.26,4.13,4.11,4.09,4.06,4.02,3.85
1,3.85,4.08,5.0,5.78,5.99,6.22,6.47,7.1,3.53,4.08,5.0,5.78,5.99,6.22,6.47,7.1,16.53
2,3.21,2.91,2.38,2.21,2.18,2.15,2.12,2.07,3.05,2.83,2.65,2.57,2.55,2.54,2.53,2.5,2.39
