# Accuracy Assessment Stats and Confusion Matrices for Sargassum Classification

In [None]:
import os
import pandas as pd
import geopandas as gpd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

## Load the AA shapefiles into a GeoDataFrame

In [None]:
# Folder that holds the accuracy assessment (AA) shapefiles
source_dir = r'/Users/arbailey/Google Drive/My Drive/sargassum/aa'

# Disabled alternative: assemble the table from the individual per-date files
# instead of reading the pre-merged shapefile below.
# files = sorted(
#     name for name in os.listdir(source_dir)
#     if name.startswith('aaPoints') and name.endswith('validated.shp')
# )
# print(files)
# sargassum_aa_gdf = pd.concat(
#     [gpd.read_file(os.path.join(source_dir, name)) for name in files]
# )

# Read the shapefile that already combines the validated points of all dates
file = 'aaPoints_validated_2019.shp'
sargassum_aa_gdf = gpd.read_file(os.path.join(source_dir, file)).dropna()  # remove rows w/null

# Summary statistics are printed before the validation columns are cast to int
print(sargassum_aa_gdf.describe())
validation_dtypes = {"validclass": int, "validpa": int}
sargassum_aa_gdf = sargassum_aa_gdf.astype(validation_dtypes)
sargassum_aa_gdf


In [None]:
# File names of the three patch layers read from source_dir:
# the original patches plus the SR and TOA layers of the updated training sites
orig_patch_file = 'aaPatches_2019.shp'
sr_patch_file = 'aaPatches_2019sr.shp'
toa_patch_file = 'aaPatches_2019toa.shp'

# Read the layers in the order original, SR, TOA
orig_patch_gdf, sr_patch_gdf, toa_patch_gdf = (
    gpd.read_file(os.path.join(source_dir, layer_name))
    for layer_name in (orig_patch_file, sr_patch_file, toa_patch_file)
)

# Preview the first rows of each layer
for patch_layer in (orig_patch_gdf, sr_patch_gdf, toa_patch_gdf):
    print(patch_layer.head())

In [None]:
# Left-join the attributes of each patch layer (geometry removed) onto the
# validated AA points, matching on "aa_id"
combined_aa_df = sargassum_aa_gdf
for patch_layer in (orig_patch_gdf, sr_patch_gdf, toa_patch_gdf):
    patch_attributes = patch_layer.drop(columns=['geometry'])
    combined_aa_df = pd.merge(combined_aa_df, patch_attributes, on="aa_id", how="left")

# Points without a matching patch record get 0 in place of the missing values
combined_aa_df = combined_aa_df.fillna(0)

# Cast the joined patch/classification columns to integers
# print(combined_aa_df.dtypes)
joined_int_columns = ["patch", "sr_sarg", "sr_patch", "toa_patch", "toa_sarg"]
combined_aa_df = combined_aa_df.astype(dict.fromkeys(joined_int_columns, 'int64'))
# print(combined_aa_df.dtypes)
combined_aa_df

In [None]:
# Same join as for combined_aa_df, but stored back into sargassum_aa_gdf,
# which is the table all later cells use.
# NOTE(review): this cell overwrites its own input, so running it a second
# time without reloading sargassum_aa_gdf would join the patch columns again.
for patch_layer in (orig_patch_gdf, sr_patch_gdf, toa_patch_gdf):
    patch_attributes = patch_layer.drop(columns=['geometry'])
    sargassum_aa_gdf = pd.merge(sargassum_aa_gdf, patch_attributes, on="aa_id", how="left")

# Points without a matching patch record get 0 in place of the missing values
sargassum_aa_gdf = sargassum_aa_gdf.fillna(0)

# Cast the joined patch/classification columns to integers
# print(sargassum_aa_gdf.dtypes)
joined_int_columns = ["patch", "sr_sarg", "sr_patch", "toa_patch", "toa_sarg"]
sargassum_aa_gdf = sargassum_aa_gdf.astype(dict.fromkeys(joined_int_columns, 'int64'))
# print(sargassum_aa_gdf.dtypes)
sargassum_aa_gdf

## Error Matrices and Accuracy Stats

### All Dates Together

In [None]:
def accuracy_report(df, groundtruth, predicted, labels=(0, 1)):
    """Print a confusion matrix, a per-class report and the overall accuracy.

    Args:
        df: Table holding both label columns.
        groundtruth: Name of the column with the reference labels.
        predicted: Name of the column with the classified labels.
        labels: Class labels to report, in display order. Defaults to
            ``(0, 1)``.

    Returns:
        None. The three results are written to stdout.
    """
    class_labels = list(labels)
    y_true = df[groundtruth]
    y_pred = df[predicted]

    # Pin the label set for the matrix as well: without it, a subset in which
    # only one class occurs yields a 1x1 matrix that no longer lines up with
    # the rows of the classification report below.
    print(confusion_matrix(y_true, y_pred, labels=class_labels))
    print(classification_report(y_true, y_pred, labels=class_labels))
    print(accuracy_score(y_true, y_pred))

In [None]:
# One report over all dates per classification column: (banner, predicted column)
all_dates_runs = (
    ('<------ All Dates - Original Training Sites ------>', 'sargassum'),
    ('\n<------ All Dates - New Training Sites - SR ------>', 'sr_sarg'),
    ('\n<------ All Dates - Original Training Sites - TOA ------>', 'toa_sarg'),
)
for banner, predicted_column in all_dates_runs:
    print(banner)
    accuracy_report(sargassum_aa_gdf, 'validpa', predicted_column)

### Individual Dates

In [None]:
# Distinct image dates, in order of first appearance; reused by later cells
image_dates = sargassum_aa_gdf['imagedate'].unique()
# print(image_dates)

print('<------ Individual Dates - Original Training Sites ------>')
for date2test in image_dates:
    # Restrict the table to the points of the current date
    on_this_date = sargassum_aa_gdf['imagedate'] == date2test
    sargassum_aa_subset_gdf = sargassum_aa_gdf.loc[on_this_date]
    print('\nDate: ' + date2test)
    accuracy_report(sargassum_aa_subset_gdf, groundtruth='validpa', predicted='sargassum')

In [None]:
print('<------ Individual Dates - New Training Sites - SR ------>')
for date2test in image_dates:
    # Restrict the table to the points of the current date
    on_this_date = sargassum_aa_gdf['imagedate'] == date2test
    sargassum_aa_subset_gdf = sargassum_aa_gdf.loc[on_this_date]
    print('\nDate: ' + date2test)
    accuracy_report(sargassum_aa_subset_gdf, groundtruth='validpa', predicted='sr_sarg')

In [None]:
print('<------ Individual Dates - Original Training Sites - TOA ------>')
for date2test in image_dates:
    # Restrict the table to the points of the current date
    on_this_date = sargassum_aa_gdf['imagedate'] == date2test
    sargassum_aa_subset_gdf = sargassum_aa_gdf.loc[on_this_date]
    print('\nDate: ' + date2test)
    accuracy_report(sargassum_aa_subset_gdf, groundtruth='validpa', predicted='toa_sarg')

In [None]:
# Non-null counts of the remaining columns for every combination of the
# 'sargassum', 'validclass', 'classdesc' and 'validpa' values
count_keys = ['sargassum', 'validclass', 'classdesc', 'validpa']
sargassum_aa_gdf.groupby(count_keys).count()
# .agg(['count'])

In [None]:
# Non-null counts of the remaining columns for every combination of the
# 'sr_sarg', 'validclass', 'classdesc' and 'validpa' values
count_keys = ['sr_sarg', 'validclass', 'classdesc', 'validpa']
sargassum_aa_gdf.groupby(count_keys).count()


In [None]:
# Non-null counts of the remaining columns for every combination of the
# 'toa_sarg', 'validclass', 'classdesc' and 'validpa' values
count_keys = ['toa_sarg', 'validclass', 'classdesc', 'validpa']
sargassum_aa_gdf.groupby(count_keys).count()

In [None]:
# df = df[['STNAME','CTYNAME']].groupby(['STNAME'])['CTYNAME'] \
#                              .count() \
#                              .reset_index(name='count') \
#                              .sort_values(['count'], ascending=False) \
#                              .head(5)

def patch_summary(df, sarg, patch_id, cutoff=10):
    """Print row counts per class, the number of patch ids and the largest patches.

    Args:
        df: Table to summarise.
        sarg: Name of the column whose values define the classes to count.
        patch_id: Name of the column holding the patch identifier.
        cutoff: How many of the most frequent patch ids to list.

    Returns:
        None. Everything is written to stdout.
    """
    # Number of rows for each value of the class column
    rows_per_class = df.groupby(sarg).size()
    print(rows_per_class)

    # Every distinct value of the patch column is counted here
    distinct_patch_ids = df[patch_id].unique()
    print('Patch Count: ' + str(len(distinct_patch_ids)))

    # Patch ids with the most rows, largest first
    rows_per_patch = df.groupby(patch_id).size()
    print(rows_per_patch.nlargest(cutoff))

In [None]:
# Patch overview for the 'sargassum' / 'patch' columns, listing the 25 largest patches
print('Sargassum Patches - Original Training Sites - SR')
patch_summary(df=sargassum_aa_gdf, sarg='sargassum', patch_id='patch', cutoff=25)

In [None]:
# Patch overview for the 'sr_sarg' / 'sr_patch' columns (default cutoff)
print('Sargassum Patches - New Training Sites - SR')
patch_summary(df=sargassum_aa_gdf, sarg='sr_sarg', patch_id='sr_patch')

In [None]:
# Patch overview for the 'toa_sarg' / 'toa_patch' columns (default cutoff)
print('Sargassum Patches - Original Training Sites - TOA')
patch_summary(df=sargassum_aa_gdf, sarg='toa_sarg', patch_id='toa_patch')

In [None]:
print('<------ Individual Dates - Original Training Sites - SR ------>')
for date2test in image_dates:
    # Restrict the table to the points of the current date
    on_this_date = sargassum_aa_gdf['imagedate'] == date2test
    sargassum_aa_subset_gdf = sargassum_aa_gdf.loc[on_this_date]
    print('\nDate: ' + date2test)
    patch_summary(sargassum_aa_subset_gdf, sarg='sargassum', patch_id='patch')

In [None]:
print('<------ Individual Dates - New Training Sites - SR ------>')
for date2test in image_dates:
    # Restrict the table to the points of the current date
    on_this_date = sargassum_aa_gdf['imagedate'] == date2test
    sargassum_aa_subset_gdf = sargassum_aa_gdf.loc[on_this_date]
    print('\nDate: ' + date2test)
    patch_summary(sargassum_aa_subset_gdf, sarg='sr_sarg', patch_id='sr_patch')

In [None]:
print('<------ Individual Dates - Original Training Sites - TOA ------>')
for date2test in image_dates:
    # Restrict the table to the points of the current date
    on_this_date = sargassum_aa_gdf['imagedate'] == date2test
    sargassum_aa_subset_gdf = sargassum_aa_gdf.loc[on_this_date]
    print('\nDate: ' + date2test)
    patch_summary(sargassum_aa_subset_gdf, sarg='toa_sarg', patch_id='toa_patch')

In [None]:
# Show the current AA table as the notebook cell's output
sargassum_aa_gdf

In [None]:
def by_patch(gdf,patch,date,predicted,groundtruth):
    """Reduce point records so each patch contributes one row per date.

    Rows whose ``patch`` value equals 0 are kept one-for-one (presumably the
    points that fall in no patch -- confirm against the join that fills
    missing patch ids with 0). Rows with a ``patch`` value greater than 0 are
    grouped by (patch, date) and collapsed to the first non-null ``predicted``
    and ``groundtruth`` value of each group. Rows with a negative or null
    ``patch`` value end up in neither part.

    Args:
        gdf: Point-level table.
        patch: Name of the patch id column.
        date: Name of the image date column.
        predicted: Name of the classified label column.
        groundtruth: Name of the reference label column.

    Returns:
        DataFrame with the columns ``patch``, ``date``, ``predicted`` and
        ``groundtruth``. The kept rows retain their original index while the
        grouped rows are numbered from 0, so index labels can repeat.
    """
    label_columns = [predicted, groundtruth]

    absent_gdf = gdf.loc[gdf[patch] == 0, [patch, date] + label_columns]

    # The column selector must be a list: selecting several groupby columns
    # with a bare tuple (``[predicted, groundtruth]`` without the inner
    # brackets) was removed in pandas 2.0 and raises there.
    present_gdf = (
        gdf[gdf[patch] > 0]
        .groupby([patch, date], as_index=False)[label_columns]
        .first()
    )

    polygons_gdf = pd.concat([absent_gdf, present_gdf], axis=0)
    return polygons_gdf

# Polygon-level accuracy for the original training sites:
# collapse the points per patch and date, then report on the reduced table
sargassum_aa_polygons_gdf = by_patch(
    sargassum_aa_gdf,
    patch='patch',
    date='imagedate',
    predicted='sargassum',
    groundtruth='validpa',
)
print(sargassum_aa_polygons_gdf)
accuracy_report(sargassum_aa_polygons_gdf, groundtruth='validpa', predicted='sargassum')

In [None]:
print('<------ Individual Dates - Original Training Sites -- Polygon AA ------>')
for date2test in image_dates:
    # Restrict the polygon-level table to the current date
    on_this_date = sargassum_aa_polygons_gdf['imagedate'] == date2test
    sargassum_aa_polygons_subset_gdf = sargassum_aa_polygons_gdf.loc[on_this_date]
    print('\nDate: ' + date2test)
    accuracy_report(sargassum_aa_polygons_subset_gdf, groundtruth='validpa', predicted='sargassum')

In [None]:
# Polygon-level accuracy for the TOA classification; this replaces the
# polygon table built for the original training sites
sargassum_aa_polygons_gdf = by_patch(
    sargassum_aa_gdf,
    patch='toa_patch',
    date='imagedate',
    predicted='toa_sarg',
    groundtruth='validpa',
)
print(sargassum_aa_polygons_gdf)
accuracy_report(sargassum_aa_polygons_gdf, groundtruth='validpa', predicted='toa_sarg')

In [None]:
print('<------ Individual Dates - Original Training Sites TOA -- Polygon AA ------>')
for date2test in image_dates:
    # Restrict the polygon-level table to the current date
    on_this_date = sargassum_aa_polygons_gdf['imagedate'] == date2test
    sargassum_aa_polygons_subset_gdf = sargassum_aa_polygons_gdf.loc[on_this_date]
    print('\nDate: ' + date2test)
    accuracy_report(sargassum_aa_polygons_subset_gdf, groundtruth='validpa', predicted='toa_sarg')