# Global Flood Validation Metrics Sept 2018 Updates with Otsu
Written in Python 2

In [89]:
# This notebook was built in Catherine's vectorenv
import pandas as pd
from pandas import DataFrame
import numpy as np
import os
import glob
import datetime
#import pylabb
import matplotlib as plt
import datetime as datetime  
import statsmodels.api as sm  
import matplotlib.pyplot as plt
import seaborn as sns
# plt.style.use('ggplot')

In [90]:
%matplotlib inline

# Data Preparation

In [91]:
%cd /Users/ckuhn/Downloads/

raw = pd.read_csv('gfdValidationPoints_10_02_18.csv',index_col=None, header=0)
raw = raw.drop(['system:index','.geo'], axis=1)
raw.head(1)

/Users/ckuhn/Downloads


Unnamed: 0,B1,B2,B3,B4,B5,B7,MNDWI,NDVI,analyst,date,...,pixel_lon,point_lat,point_lon,random,std_2day,std_3day,otsu_2day,otsu_3day,strata,validation
0,792.15651,896.579047,761.223987,1833.576595,980.842213,516.20095,-0.338543,0.408565,SiL,1534720000000.0,...,87.277107,25.681778,87.277107,0.853473,0.0,0.0,0.0,0.0,0,0.0


In [128]:
# Validation** = student classification  
# Strata** = DFO classification

print('no. of unique floods:', len(raw['dfoID'].unique()))
print('no of validation points:', len(raw))

print('representative breakdown:')
print('no of dry points:', len(raw.loc[raw['std_2day'] == 0]))
print('no of permanent water points:', len(raw.loc[raw['std_2day'] == 1]))
print('no of flooded points:', len(raw.loc[raw['std_2day'] == 2]))

no. of unique floods: 123
no of validation points: 34141
representative breakdown:
no of dry points: 8320
no of permanent water points: 8352
no of flooded points: 17465


## Cleaning

#### Drop floods with few points

In [129]:
df = raw.groupby('dfoID').filter(lambda g: g.dfoID.count() >= 200)
#df.dfoID.unique()

#### Drop NA points

In [130]:
# Pixels got labeled "NAN" were coded as either -99 or 99 
df = df.loc[df['validation'] > -1]
df = df.loc[df['validation'] < 99]

## Collections 


The points were stratified into three classes or strata. They are 50% not flooded (permanent water + dry) and 50% flooded. 
- not flooded/dry (0)
- permanent water (1)
- flooded (2)

#### Doing one method at a time

In [98]:
# # Select which collection to analyze: 
# They all have the same number of floods
st2 = df.filter(['dfoID', 'std_2day','strata', 'validation' ], axis=1)
st3 = df.filter(['dfoID', 'std_3day','strata', 'validation' ], axis=1)
ot2 = df.filter(['dfoID', 'otsu_2day','strata', 'validation' ], axis=1)
ot3 = df.filter(['dfoID', 'otsu_3day','strata', 'validation' ], axis=1)

# Create a label for the method to use in filenaming
method = st2
method_name = 'std_2day'

method['model_score'] = method[method_name]
method['student_score'] = method['validation']

print(method.groupby('model_score')['dfoID'].count())
# strata = 0 = dry, 1 = permanent water, 2 = flood
print(method.groupby(['student_score'])['dfoID'].count())

model_score
0.0     7343
1.0     7475
2.0    15926
Name: dfoID, dtype: int64
student_score
0.0    11653
1.0    19091
Name: dfoID, dtype: int64


Drop NaN values (-99, 99)

### Prepare Inputs for Confusion Matrix

## Calculate matrix components


- true positive = Validation (1) + Strata(4) = 5
- true negative = Validation (0) + Strata (0) =  0 
- false positive = Validation (0) + Strata (4) = 4
- false negative = Validation (1) + Strata (0) = 1

### Calculate aggregate metrics

In [85]:
method['model_score'] = method['model_score'].multiply(2)
method['model_score'].value_counts()
# 0 - dry, 2 - permanent water - 4 - floods
# 0 - dry, 1 - flooded
method['student_score'].value_counts()
# Add them together
method['score']  = method['model_score'] + method['student_score'] 
method['score'].value_counts()

# True Positive = Validation (1) + Strata (4)
tp = len(method.loc[method['score']== 5])
# True Negative = Validation (0) + Strata (0)
tn = len(method.loc[method['score']== 0])
# False Positive = Validation (0) + Strata (4)
fp = len(method.loc[method['score']== 4])
# False Negative = Validation (1) + Strata (0)
fn = len(method.loc[method['score']== 1])


precision = tp / (tp + fp)
recall  = tp / (tp + fn)  # % of flooded pixels it ID'ed correctly
accuracy = (tn + tp) / (tn+tp+fp+fn)
commission = fp / (tp + fp)  # modeled positives over all true positives
omission = fn / (tp + fn)

total = tn+tp+fn+fp
print('True Negatives(%)',tn/total) #32   Sep 2018 updates: 30%
print('True Positives(%)',tp/total) #52   Sep 2018 updates: 52%
print('False Positives(%)',fp/total) #15  Sep 2018 updates: 16%
print('False Negatives(%)',fn/total) # <0.01  Sep 2018 updates: 1.6%
print(total)
print(accuracy)
print(commission)
print(omission)

True Negatives(%) 0.3002277708539258
True Positives(%) 0.5174695947397825
False Positives(%) 0.16696033349091066
False Negatives(%) 0.015342300915380979
23269
0.8176973655937083
0.24394072585708904
0.028794966930150025


In [86]:
col_names =  ['Method', 'tp', 'tn','fp','fn',
             'precision','recall','overall_accuracy',
             'commission','omission'] ### Column names for making data frames for each flood
my_df  = pd.DataFrame(columns = col_names)## Make empty data frame to put values for each sampling level in

#std_2day = [method_name, tp, tn, fp, fn, precision, recall, accuracy, commission, omission]
#std_3day = [method_name, tp, tn, fp, fn, precision, recall, accuracy, commission, omission]
#otsu_2day = [method_name, tp, tn, fp, fn, precision, recall, accuracy, commission, omission]
otsu_3day = [method_name, tp, tn, fp, fn, precision, recall, accuracy, commission, omission]

In [88]:
#std2  = pd.DataFrame([std_2day], columns = col_names)## Make empty data frame to put values for each sampling level in
#std3  = pd.DataFrame([std_3day], columns = col_names)## Make empty data frame to put values for each sampling level in
#ot2  = pd.DataFrame([otsu_2day], columns = col_names)## Make empty data frame to put values for each sampling level in
ot3  = pd.DataFrame([otsu_3day], columns = col_names)## Make empty data frame to put values for each sampling level in

In [None]:
win = std2.append(std3).append(ot2).append(ot3)
win

# Trying to make a loop!

In [132]:
df = st2 # st2, st3, ot2, ot3
df['score'] = df[ method_name]

floods = list(set(df.dfoID)) ## Get a list of all the unique values of floods
sampling_levels = np.arange(0, 200, 1) # list(set(df.method))

col_names =  ['Flood','NumofPoints', 'tp', 'tn','fp','fn',
             'precision','recall','overall_accuracy',
             'commission','omission'] ### Column names for making data frames for each flood

my_df  = pd.DataFrame(columns = col_names)## Make empty data frame to put values for each sampling level in

for i in floods:  ### Loop through floods
    df_full = raw.loc[raw['dfoID']== i] ### subset data frame by single flood
    for j in sampling_levels: ## Loop through sampling levels
        df = df_full.sample(j) ##Apparently pandas has a built-in dataframe random sampler!
        tp = float(len(df.loc[df['score']== 5])) 
        tn = float(len(df.loc[df['score']== 0]))
        fp = float(len(df.loc[df['score']== 4]))
        fn = float(len(df.loc[df['score']== 1]))
        ### Calculating values!
        if tp == 0: ###Hmm, sometimes there are no tp scores, so you get a zero divide error
            continue ###I've put this in for now but it's gonna bite your butt eventually
        precision = tp / (tp + fp)
        recall  = tp / (tp + fn)  # % of flooded pixels it ID'ed correctly
        accuracy = (tn + tp) / (tn+tp+fp+fn)
        commission = fp / (tp + fp)  # modeled positives over all true positives
        omission = fn / (tp + fn)
        ##add a new row to the dataframe based on this sampling level!
        my_df.loc[len(my_df)] = [i,j, tp, tn, fp, fn, precision, recall, accuracy, commission, omission]

        ###Now you should have a full dataframe with all your sampling levels for each flood
        ###SO
        
#Export Results
#%cd '/Users/ckuhn/Desktop/gfd_accuracy_rename/lumped/'
#Anna's original - my_df.to_csv('myprecious.csv', encoding='utf-8')
#my_df.to_csv('gfd_summary_validaton_metrics_resampled.csv', encoding='utf-8')

KeyError: 'score'

In [133]:
my_df

Unnamed: 0,Flood,NumofPoints,tp,tn,fp,fn,precision,recall,overall_accuracy,commission,omission


# Make a new dataframe with values for each method

In [None]:
# def set_style():
#     plt.style.use(['seaborn-white', 'seaborn-talk'])
#     plt.rc("font", family="Times New Roman", size = 50)
    
# set_style()  

# def plot_confusion_matrix(cm, target_names, title='Confusion matrix', cmap=plt.cm.Blues):
#     plt.imshow(cm, interpolation='nearest', cmap=cmap)
#     plt.title(title)
#     plt.colorbar()
#     tick_marks = np.arange(len(target_names))
#     plt.xticks(tick_marks, target_names, rotation=45)
#     plt.yticks(tick_marks, target_names)
#     plt.tight_layout()

#     width, height = cm.shape

#     for x in range(width):
#         for y in range(height):
#             plt.annotate(str(cm[x][y]), xy=(y, x), 
#                         horizontalalignment='center',
#                         verticalalignment='center')
#     plt.ylabel('True label', fontsize=24)
#     plt.xlabel('Predicted label',  fontsize=24)

#     tp,
# cm = np.array([[tp, fn],[fp, tn]])
# plot_confusion_matrix(cm, ['Flooded', 'Not Flooded'])

# #%cd /Users/ckuhn/Desktop/gfd_accuracy_rename/figures
# #plt.savefig('GFD_overall_confusion_matrix_Sep2018.png', dpi = 300)

## Calculate Summarized Error Metrics By Flood

In [None]:
floods = list(set(raw.dfoID)) ## Get a list of all the unique values of floods

In [None]:
# floods = list(set(raw.dfoID)) ## Get a list of all the unique values of floods
# sampling_levels = np.arange(0, 200, 1)

# col_names =  ['Flood','Method', 'NumofPoints', 'tp', 'tn','fp','fn',
#              'precision','recall','overall_accuracy',
#              'commission','omission'] ### Column names for making data frames for each flood
# my_df  = pd.DataFrame(columns = col_names)## Make empty data frame to put values for each sampling level in

# for i in floods:  ### Loop through floods
#     df_full = raw.loc[raw['dfoID']== i] ### subset data frame by single flood
#     for j in sampling_levels: ## Loop through sampling levels
#         df = df_full.sample(j) ##Apparently pandas has a built-in dataframe random sampler!
#         tp = float(len(df.loc[df['score']== 5])) 
#         tn = float(len(df.loc[df['score']== 0]))
#         fp = float(len(df.loc[df['score']== 4]))
#         fn = float(len(df.loc[df['score']== 1]))
#         ### Calculating values!
#         if tp == 0: ###Hmm, sometimes there are no tp scores, so you get a zero divide error
#             continue ###I've put this in for now but it's gonna bite your butt eventually
#         precision = tp / (tp + fp)
#         recall  = tp / (tp + fn)  # % of flooded pixels it ID'ed correctly
#         accuracy = (tn + tp) / (tn+tp+fp+fn)
#         commission = fp / (tp + fp)  # modeled positives over all true positives
#         omission = fn / (tp + fn)
#         ##add a new row to the dataframe based on this sampling level!
#         my_df.loc[len(my_df)] = [i,j, tp, tn, fp, fn, precision, recall, accuracy, commission, omission]

#         ###Now you should have a full dataframe with all your sampling levels for each flood
#         ###SO
        
# #Export Results
# #%cd '/Users/ckuhn/Desktop/gfd_accuracy_rename/lumped/'
# #Anna's original - my_df.to_csv('myprecious.csv', encoding='utf-8')
# #my_df.to_csv('gfd_summary_validaton_metrics_resampled.csv', encoding='utf-8')

In [None]:
floods = list(set(raw.dfoID)) ## Get a list of all the unique values of floods
#sampling_levels = np.arange(0, 200, 1)

col_names =  ['Flood','NumofPoints', 'tp', 'tn','fp','fn',
             'precision','recall','overall_accuracy',
             'commission','omission'] ### Column names for making data frames for each flood

my_df  = pd.DataFrame(columns = col_names)## Make empty data frame to put values for each sampling level in

for i in floods:  ### Loop through floods
    for col in df.columns[18:22]:
    
    df = df_full.loc(j) ##Apparently pandas has a built-in dataframe random sampler!
    df = raw.loc[raw['dfoID']== i] ### subset data frame by single flood
    tp = float(len(df.loc[df['score']== 5])) 
    tn = float(len(df.loc[df['score']== 0]))
    fp = float(len(df.loc[df['score']== 4]))
    fn = float(len(df.loc[df['score']== 1]))
        ### Calculating values!
    if tp == 0: ###Hmm, sometimes there are no tp scores, so you get a zero divide error
        continue ###I've put this in for now but it's gonna bite your butt eventually
    precision = tp / (tp + fp)
    recall  = tp / (tp + fn)  # % of flooded pixels it ID'ed correctly
    accuracy = (tn + tp) / (tn+tp+fp+fn)
    commission = fp / (tp + fp)  # modeled positives over all true positives
    omission = fn / (tp + fn)
    ##add a new row to the dataframe based on this sampling level!
    my_df.loc[len(my_df)] = [i,col, tp, tn, fp, fn, precision, recall, accuracy, commission, omission]

        ###Now you should have a full dataframe with all your sampling levels for each flood
        ###SO
        
#Export Results
#%cd '/Users/ckuhn/Desktop/gfd_accuracy_rename/lumped/'
#Anna's original - my_df.to_csv('myprecious.csv', encoding='utf-8')
#my_df.to_csv('gfd_summary_validaton_metrics_resampled.csv', encoding='utf-8')

In [None]:
my_df.head()

### Sub-set resampled floods to just the ones with 200 points

In [None]:
# Extract the dataframes with 200 points
my_df = my_df.loc[my_df['NumofPoints'] == 199]

# Drop the one garbage dataframe
#my_df = my_df.loc[my_df['Flood'] != 'SiL']

len(my_df)

In [None]:
# Recast as an integer
my_df['Flood'] = my_df['Flood'].astype(int)

# Export results
%cd '/Users/ckuhn/Desktop/gfd_accuracy_rename/lumped/'
my_df.to_csv('gfd_summary_validaton_metrics.csv', encoding='utf-8')  #aka my_df.to_csv('myprecious3.csv', encoding='utf-8')

In [None]:
my_df.head(1)

### Prep Data for Export to Long Form

In [None]:
prepped = my_df.copy()     
prepped.rename(columns={'tp': 'True Positive', 'fp': 'False Positive', 'fn':'False Negative', 'tn': 'True Negative', 'overall_accuracy':'Overall Accuracy', 'commission':'Commission', 'omission':'Omission', 'precision':'Precision', 'recall':'Recall'}, inplace=True)

In [None]:
dfmelted = prepped.melt(id_vars=['Flood', 'NumofPoints'], var_name='Metric')
dfmelted.head(1)

In [None]:
# Export results
%cd '/Users/ckuhn/Desktop/gfd_accuracy_rename/lumped/'
dfmelted.to_csv('gfd_summary_validaton_metrics_longform.csv', encoding='utf-8')  #aka my_df.to_csv('myprecious3.csv', encoding='utf-8')

### Analyze results

In [None]:
# 59 floods

print('% of Floods with >90% accuracy', (len(my_df.loc[my_df['overall_accuracy'] >= 0.9]))/len(my_df))
print('% of Floods with >75% accuracy', (len(my_df.loc[my_df['overall_accuracy'] >= 0.75]))/len(my_df))
my_df['overall_accuracy'].describe()

# For 59 floods, mean accuracy is 82%, median accuracy is 86%, 75% of floods have 94% accuracy
    # 41% of the floods have 90% or better accuracy 
    # 73% of the flood have 80% or better accuracy 
# for 37 floods:  mean accuracy is 85%, median accuracy is 89%, 75% of the floods have 92% accuracy. 
    # and 49% of the floods have 90% or better accuracy 
    # 78% of the flood have 80% or better accuracy 

### Errors of Commission

In [None]:
my_df['commission'].mean()

#From 37 floods: 0.22182899539761314
#From 59 floods: 0.25010345838907294

In [None]:
params = {'axes.titlesize':'32',
          'xtick.labelsize':'24',
          'ytick.labelsize':'24'}
plt.rcParams.update(params)

# Go back to normal
#matplotlib.rcParams.update(matplotlib.rcParamsDefault)  # to revert to default settings


my_df['commission'].hist(color='grey')
plt.ylabel('count', fontsize=24)
plt.xlabel('Rate of Commission Errors', fontsize=24)

plt.axvline(x=0.22182899539761314, color='black', ls='--')
plt.axvline(x=0.25010345838907294, color='red', ls='--')

# mean errors of commission is 25%, 

#%cd /Users/ckuhn/Desktop/gfd_accuracy_rename/figures/
#plt.savefig('GFD_overall_commission.png', dpi = 300)

### Errors of Omission

In [None]:
my_df['omission'].mean()

#From 37 floods: 0.02238782782112279
#From 59 floods: 0.022641702349266005

In [None]:
my_df['omission'].hist(color='grey')
plt.ylabel('count', fontsize=24)
plt.xlabel('Errors of Omission', fontsize=24)
plt.axvline(x=0.02238782782112279, color='black', ls='--')
plt.axvline(x=0.022641702349266005, color='red', ls='--')

%cd /Users/ckuhn/Desktop/gfd_accuracy_rename/figures
plt.savefig('GFD_overall_omission.png', dpi = 300)

In [None]:
def set_style():
    plt.style.use(['seaborn-white', 'seaborn-talk'])
    plt.rc("font", family="Times New Roman", size = 22)
    plt.rc('xtick', labelsize=22) 
    plt.rc('ytick', labelsize=22) 
set_style()  

fig, ax = plt.subplots(figsize=(20,7))

my_df.groupby(['Flood'])['overall_accuracy'].mean().plot(kind='bar', color='grey')
plt.ylabel('Overall Accuracy', fontsize = 30)
plt.xlabel('DFO ID', fontsize = 30)
plt.axhline(y=0.9, color='red', ls='--')
plt.axhline(y=0.75, color='#FFA07A', ls='--')

plt.xticks(rotation=60)

plt.tight_layout()

%cd /Users/ckuhn/Desktop/gfd_accuracy_rename/figures/
plt.savefig('GFD_overall_accuracy_by_flood.png', dpi = 300)

In [None]:
def set_style():
    plt.style.use(['seaborn-white', 'seaborn-talk'])
    plt.rc("font", family="Times New Roman", size = 10)
    plt.rc('xtick', labelsize=10) 
    plt.rc('ytick', labelsize=10) 
set_style()  

plt.figure(figsize=(40, 12))


fig, ax = plt.subplots(2, 2, figsize=(20,7))

my_df.groupby(['Flood'])['tp'].mean().plot(kind='bar', color = 
                                                              ['grey', 'black'], ax = ax[0,0])
ax[0, 0].set_ylabel('True Positives')
ax[0, 0].set_xlabel('')
ax[0, 0].set_xticklabels([])

my_df.groupby(['Flood'])['tn'].mean().plot(kind='bar', color = 'grey', ax=ax[0, 1])
ax[0, 1].set_ylabel('True Negatives')
ax[0, 1].set_xlabel('')
ax[0, 1].set_xticklabels([])

my_df.groupby(['Flood'])['fp'].mean().plot(kind='bar', color = 'grey', ax=ax[1, 0])
ax[1, 0].set_ylabel('False Positives')
ax[1, 0].set_xlabel('DFO ID')

my_df.groupby(['Flood'])['fn'].mean().plot(kind='bar', color = 'grey', ax=ax[1, 1])
ax[1, 1].set_ylabel('False Negatives')
ax[1, 1].set_xlabel('DFO ID')

plt.tight_layout()

%cd /Users/ckuhn/Desktop/gfd_accuracy_rename/figures/
plt.savefig('GFD_base_metrics_by_flood.png', dpi = 300)

In [None]:
%cd /Users/ckuhn/Desktop/gfd_accuracy_rename/lumped/
test = pd.read_csv('gfd_summary_validaton_metrics_longform.csv', low_memory=False)
test.groupby(['Metric'])['value'].mean()

In [None]:
test.head(1)

In [None]:
error = test.loc[test['Metric'].isin(['Commission','Omission'])]
overall = test.loc[test['Metric'].isin(['Overall Accuracy','Precision','Recall' ])]
basics = test.loc[test['Metric'].isin(['True Positive','True Negative','False Positive', 'False Negative' ])]

In [None]:
test.Metric.unique()

In [None]:
def set_style():
    plt.style.use(['seaborn-white', 'seaborn-talk'])
    plt.rc("font", family="Times New Roman", size = 24)
    
set_style()    

ax = sns.barplot(y="value", x="Metric", palette= ['#388e3c', '#6abf69', '#00600f'],   data=overall)
ax.set_ylabel('')
ax.tick_params(labelsize=24)
ax.set_title('')
plt.tight_layout()
plt.show()

In [None]:
def set_style():
    plt.style.use(['seaborn-white', 'seaborn-talk'])
    plt.rc("font", family="Times New Roman", size = 30)
    plt.rc('xtick', labelsize=30) 
    plt.rc('ytick', labelsize=30) 
set_style()  

order = 'True Positive', 'True Negative', 'False Negative', 'False Positive'

g = sns.factorplot(kind='box',        # Boxplot
               y='value',       # Y-axis - values for boxplot
               x='Metric',        # X-axis - first factor
               hue = 'Metric',
               #order= metric,
               data=basics,        # Dataframe 
               size=8,            # Figure size (x100px)      
               aspect=1.5, 
               palette= ['#01579b', '#0d47a1', '#c30000', '#c30000'],  # Width = size * aspect 
               legend_out=False)  # Make legend inside the plot
#g.fig.get_axes()[0].set_yscale('log')

g.fig.text(0.75, 0.75,'n = 117 floods', fontsize=30, color = '#ff3d00') #add text

g.set_xticklabels(rotation=30)

plt.ylabel('Count', fontsize = 45)
plt.xlabel('', fontsize = 45)
plt.title('', fontsize = 45)
#plt.axhline(y=0, color='grey', ls='--')
#plt.legend(loc='upper left', fontsize=45)

%cd /Users/ckuhn/Desktop/gfd_accuracy_rename/figures/
plt.savefig('GFD_basics.png', dpi = 300)

plt.show()

#http://queirozf.com/entries/matplotlib-pyplot-by-example#change-tick-label-rotation # ADD LABELS

In [None]:
def set_style():
    plt.style.use(['seaborn-white', 'seaborn-talk'])
    plt.rc("font", family="Times New Roman", size = 30)
    plt.rc('xtick', labelsize=30) 
    plt.rc('ytick', labelsize=30) 
set_style()  


g = sns.factorplot(kind='bar',        # Boxplot
               y='value',       # Y-axis - values for boxplot
               x='Metric',        # X-axis - first factor
               hue = 'Metric',
               #order= metric,
               data=overall,        # Dataframe 
               size=8,            # Figure size (x100px)      
               aspect=1.5, 
               palette= ['#388e3c', '#6abf69', '#00600f'], 
               legend_out=False)  # Make legend inside the plot
#g.fig.get_axes()[0].set_yscale('log')

g.fig.text(0.2, 0.90,'n = 117 floods', fontsize=30, color = 'darkblue') #add text
g.set_xticklabels(rotation=30)

plt.ylabel('Score', fontsize = 45)
plt.xlabel('', fontsize = 45)
plt.title('', fontsize = 45)
#plt.axhline(y=0, color='grey', ls='--')
#plt.legend(loc='upper left', fontsize=45)



%cd /Users/ckuhn/Desktop/gfd_accuracy_rename/figures/
plt.savefig('GFD_overall_.png', dpi = 300)

plt.show()


In [None]:
def set_style():
    plt.style.use(['seaborn-white', 'seaborn-talk'])
    plt.rc("font", family="Times New Roman", size = 30)
    plt.rc('xtick', labelsize=30) 
    plt.rc('ytick', labelsize=30) 
set_style()  


g = sns.factorplot(kind='box',        # Boxplot
               y='value',       # Y-axis - values for boxplot
               x='Metric',        # X-axis - first factor
               hue = 'Metric',
               #order= metric,
               data=error,        # Dataframe 
               size=8,            # Figure size (x100px)      
               aspect=1.5, 
               palette= [ '#c30000','#c30000'],  # Width = size * aspect 
               legend_out=False)  # Make legend inside the plot
#g.fig.get_axes()[0].set_yscale('log')

g.fig.text(0.8, 0.8,'n = 117 floods', fontsize=30, color = 'darkblue') #add text

plt.ylabel('Score', fontsize = 45)
plt.xlabel('', fontsize = 45)
plt.title('', fontsize = 45)
#plt.axhline(y=0, color='grey', ls='--')
#plt.legend(loc='upper left', fontsize=45)

plt.show()

# %cd /Users/ckuhn/Desktop/gfd_accuracy_rename/figures/
# plt.savefig('GFD_error.png', dpi = 300)

In [None]:
# q = df.loc[df['NumofPoints'] == 199]
# x = df['NDVI'].values
# y = df['overall_accuracy'].values
# plt.plot(x, y, "o", color = 'green')

# NDVI

In [None]:
# Filter to the scores of interest (excludes permanent water category)
q = (inputData.loc[inputData['score']!= 2]) # permanent water
q = (q.loc[q['score']!= 3]) 
q['score'].value_counts()

#5 = true positive, 4 = true negative, 1 - false negative, 0 = true negative    
params = {'axes.titlesize':'32',
          'xtick.labelsize':'24',
          'ytick.labelsize':'24'}
plt.rcParams.update(params)


q.groupby(['score'])['NDVI'].mean().plot(kind='bar', color='grey')
plt.xticks([0, 1, 2, 3], ['True Negative', 'False Negative', 'False Positive', 'True Positive'])
plt.ylabel('Average NDVI', fontsize=24)

### Number of Sampling Points


In [None]:
# Check the number of points for each flood
fig, ax = plt.subplots(figsize=(15,7))
df.groupby(['dfoID']).count()['score'].plot(ax=ax, kind='bar')
ax.hlines(y=200, xmin=0, xmax=117, linewidth=2, color='r')
ax.set_ylabel('Number of Points')
ax.set_xlabel('DFO ID')
ax.set_title('')

### Count floods with fewer than 200 points

In [None]:
df2 = df.groupby('dfoID').filter(lambda g: g.dfoID.count() > 200)
df2.dfoID.unique()

p = df.groupby('dfoID').filter(lambda g: g.dfoID.count() < 100)
p.dfoID.unique()

In [None]:
#%cd '/Users/ckuhn/Desktop/gfd_accuracy_rename/lumped/'
#df2.to_csv('all_student_data.csv', encoding='utf-8')

### Look into NAs

In [121]:
print(raw.groupby(['validation'])['dfoID'].count())

validation
-99.0     1589
 0.0     12111
 1.0     19550
 99.0      557
Name: dfoID, dtype: int64


In [122]:
drops1 = raw.loc[raw['validation'] < -1]
drops2 = raw.loc[raw['validation'] > 98]
drops = drops1.append(drops2)
print('total NA pixels', len(drops))
print('NA/total pixels', float(len(drops)/34141))
print('DFO IDs with NAs', len(drops['dfoID'].unique()))

total NA pixels 2146
NA/total pixels 0.06285697548402214
DFO IDs with NAs 105


In [123]:
dry = raw.loc[raw['validation'] == 0]
water = raw.loc[raw['validation'] == 1]

print('dry/total pixels', len(dry)/34141)
print('water/total pixels', len(water)/34141)

dry/total pixels 0.3547347763685891
water/total pixels 0.5726252892416742


In [124]:
student = drops['validation'] 
model = drops['otsu_2day'] 
df_confusion = pd.crosstab(student, model)
df_confusion
# 0 = dry, 1 = permanent water, 2 = flood

otsu_2day,0.0,1.0,2.0
validation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-99.0,459,350,776
99.0,179,112,266


In [125]:
model = drops['otsu_3day'] 
df_confusion = pd.crosstab(student, model)
df_confusion

otsu_3day,0.0,1.0,2.0
validation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-99.0,547,350,688
99.0,216,112,229


In [126]:
model = drops['std_2day'] 
df_confusion = pd.crosstab(student, model)
df_confusion

std_2day,0.0,1.0,2.0
validation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-99.0,389,350,846
99.0,166,112,279


In [127]:
model = drops['std_3day'] 
df_confusion = pd.crosstab(student, model)
df_confusion

std_3day,0.0,1.0,2.0
validation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-99.0,399,350,836
99.0,168,112,277
