<a href="https://colab.research.google.com/github/boothmanrylan/accuracyAssessmentTools/blob/main/increasing_sample_size.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs and Imports

In [None]:
%cd /content
!git clone https://github.com/boothmanrylan/accuracyAssessmentTools.git
%cd accuracyAssessmentTools
!pip install .

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from acc_assessment.olofsson import Olofsson
from acc_assessment.utils import _expand_error_matrix

# Create Assessments

In [None]:
# Smallest number of samples an assessment is built from; with fewer than this
# it is very likely to encounter a divide by zero error.
MIN = 10

# Seed for the shuffle below so that the curves are reproducible across runs.
SEED = 0

classes = ["Deforestation", "Non-Forest", "Forest"]

# Sample counts per pair of classes, labelled by class on both axes.
error_matrix = pd.DataFrame(
    [[66, 3, 2],
     [5, 55, 2],
     [5, 8, 153]],
    index=classes,
    columns=classes,
)

# Mapped population of each class, keyed by class name.
mapped_populations = dict(zip(classes, [200000, 150000, 3200000]))

# Presumably expands the matrix into one row per sample with "map" and "ref"
# columns -- TODO confirm against acc_assessment.utils.
# sample(frac=1) shuffles the data to ensure they are in a random order;
# random_state pins the shuffle so re-running the notebook gives the same plots.
longform_data = _expand_error_matrix(error_matrix, "map", "ref").sample(
    frac=1, random_state=SEED
)

n_samples = longform_data.shape[0]

# Create one assessment per sample size i, based on the first i shuffled
# samples. The upper bound is n_samples + 1 so that the final assessment uses
# every sample (range() excludes its stop value).
all_assessments = [
    Olofsson(longform_data.iloc[:i, :], mapped_populations, "map", "ref")
    for i in range(MIN, n_samples + 1)
]

# Plot Results

In [None]:
# User's accuracy of the "Deforestation" class for every assessment; each
# result is indexed as (accuracy, standard error).
deforestation_users_accuracy = [
    assessment.users_accuracy("Deforestation") for assessment in all_assessments
]
ua = np.array([result[0] for result in deforestation_users_accuracy])
se = np.array([result[1] for result in deforestation_users_accuracy])

# The k-th assessment was built from the first MIN + k samples.
x = np.arange(MIN, MIN + len(ua))

fig, ax = plt.subplots()
ax.plot(x, ua)
# Shaded band: one standard error either side of the estimate.
ax.fill_between(x, ua - se, ua + se, alpha=0.3)
ax.set_xlabel("Num. Samples")
ax.set_ylabel("Deforestation User's Accuracy")
ax.set_ylim(0.7, 1)
ax.set_xlim(MIN, len(ua) + MIN)
plt.show()

In [None]:
# Plot the overall accuracy of every assessment against its sample size.
# Each result is indexed as (accuracy, standard error).
overall_accuracy = [x.overall_accuracy() for x in all_assessments]
oa = np.array([x[0] for x in overall_accuracy])
oa_se = np.array([x[1] for x in overall_accuracy])

# Size the x axis from this cell's own results (oa) rather than from `ua`,
# which belongs to the user's-accuracy cell; the k-th assessment was built
# from the first MIN + k samples.
x = np.arange(0, oa.shape[0]) + MIN

plt.plot(x, oa)
# Shaded band: one standard error either side of the estimate.
plt.fill_between(x, oa - oa_se, oa + oa_se, alpha=0.3)
plt.xlabel("Number of Samples")
plt.ylabel("Overall Accuracy")
plt.ylim(0.7, 1)
plt.xlim(MIN, oa.shape[0] + MIN)
plt.show()