# Analysis of results using GroupBy function per cluster

In [1]:
# Numerical
import numpy as np
import pandas as pd

# System & Other
import os

# Spatial
import geopandas as gpd

#Plotting
import plotly.graph_objects as go
import plotly.express as px
from IPython.display import display, Markdown, HTML
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Input / output folders, resolved against the current working directory
ROOT_DIR = os.path.abspath(os.curdir)
in_path, out_path = (
    os.path.join(ROOT_DIR, folder) for folder in ('sample_input', 'sample_output')
)

# File name of the shapefile to analyse
shp_nm = "Ethiopia_vector_clusters_with_attributes.shp"

In [3]:
# Read the shapefile from the output folder into a new geo-dataframe.
# os.path.join inserts the separator of the running OS; the previous hard-coded
# "\\" separator only produced a valid path on Windows.
data_gdf = gpd.read_file(os.path.join(out_path, shp_nm))

In [4]:
def calc_LC_sqkm(df, col_list):
    """
    Convert the per-row land cover (LC) values into estimated areas in sqkm.

    For every row, the values of the columns in col_list are added up and stored in a new
    "LC_sum" column. Each listed column is then replaced by its share of that sum
    multiplied by the row's "sqkm" value.

    INPUT:
    df -> Pandas dataframe holding the LC class columns and a "sqkm" column
    col_list -> list of LC columns to convert (e.g. LC0-LC16)

    OUTPUT: The same dataframe (modified in place) with the listed LC columns expressed in sqkm
    """
    row_total = df[col_list].sum(axis=1)
    df["LC_sum"] = row_total

    row_area = df["sqkm"]
    for lc_class in col_list:
        # share of the row total, scaled to the area the row represents
        df[lc_class] = df[lc_class].div(row_total).mul(row_area)

    return df

In [5]:
# LC columns to convert to sqkm: LC0-LC16, leaving out LC15
lc_cols = ['LC{}'.format(code) for code in range(17) if code != 15]

In [6]:
# Convert the LC columns to sqkm. calc_LC_sqkm modifies the dataframe it receives and
# returns it, so data_gdf_LCsqkm and data_gdf refer to the same object afterwards.
data_gdf_LCsqkm = calc_LC_sqkm(data_gdf, lc_cols)

## Global summaries

In [7]:
# Columns of the summary table: area, water table depth and the LC classes LC0-LC16
sum_cols = ['sqkm', 'WTD'] + ['LC{}'.format(code) for code in range(17)]

# Rows of the summary table: one per statistic
sum_rows = ['sum', 'mean', 'min', 'max']

In [8]:
# Empty table with one row per statistic (sum_rows) and one column per variable (sum_cols)
summary_table = pd.DataFrame(index=sum_rows, columns=sum_cols)

In [9]:
# Fill the summary table with the sum / mean / min / max of every variable, rounded to
# 2 decimals. Cells are written with .loc (one explicit row/column label pair per write)
# instead of chained `summary_table.col[i] = ...` assignments, which are not guaranteed to
# write through to the table and rely on positional integer keys on a label index.
#
# Special cases:
#   WTD  - Water Table Depth, read from the 'wtdmean' column; its sum is not reported
#   LC15 - Snow and Ice, no values reported
# Every other summary column is read from the column with the same name:
#   sqkm, LC0 - Water, LC1 - Evergreen Needleleaf Forest, LC2 - Evergreen Broadleaf Forest,
#   LC3 - Deciduous Needleleaf Forest, LC4 - Deciduous Broadleaf Forest, LC5 - Mixed Forests,
#   LC6 - Closed Shrublands, LC7 - Open Shrublands, LC8 - Woody Savannas, LC9 - Savannas,
#   LC10 - Grasslands, LC11 - Permanent Wetlands, LC12 - Croplands, LC13 - Urban and Built-Up,
#   LC14 - Cropland/Natural Vegetation Mosaic, LC16 - Barren or Sparsely Vegetated
placeholder = "NaN"                            # text shown for cells without a value
stat_names = ['sum', 'mean', 'min', 'max']     # row labels of summary_table, in row order

for col in summary_table.columns:
    src = 'wtdmean' if col == 'WTD' else col
    for stat in stat_names:
        if col == 'LC15' or (col == 'WTD' and stat == 'sum'):
            summary_table.loc[stat, col] = placeholder
        else:
            # stat is the name of the Series method to call (e.g. Series.sum)
            summary_table.loc[stat, col] = round(getattr(data_gdf_LCsqkm[src], stat)(), 2)

In [10]:
# Heading, summary table and the legend of the land cover classes
display(Markdown('### Country Summary \n These are the summarized results for Ethiopia'))
display(summary_table)
display(Markdown(
    '### Class Description \n'
    ' ##### Percentage \n'
    ' LC0 - Water \n\n'
    ' LC1 - Evergreen Needleleaf Forest \n\n'
    ' LC2 - Evergreen Broadleaf Forest \n\n'
    '  LC3 - Deciduous Needleleaf Forest \n\n'
    ' LC4 - Deciduous Broadleaf Forest \n\n'
    ' LC5 - Mixed Forests \n\n'
    ' LC6 - Closed Shrublands \n\n'
    ' LC7 - Open Shrublands \n\n'
    ' LC8 - Woody Savannas \n\n'
    ' LC9 - Savannas \n\n'
    ' LC10 - Grasslands \n\n'
    ' LC11 - Permanent Wetlands \n\n'
    ' LC12 - Croplands \n\n'
    ' LC13 - Urban and Built-Up \n\n'
    ' LC14 - Cropland/Natural Vegetation Mosaic \n\n'
    ' LC15 - Snow and Ice \n\n'
    ' LC16 - Barren or Sparsely Vegetated'
))

### Country Summary 
 These are the summarized results for Ethiopia

Unnamed: 0,sqkm,WTD,LC0,LC1,LC2,LC3,LC4,LC5,LC6,LC7,LC8,LC9,LC10,LC11,LC12,LC13,LC14,LC15,LC16
sum,1102710.0,,6226.09,0.2,27274.5,7.42,1289.1,1016.55,30871.3,307832.0,99267.3,163427.0,163084.0,913.66,122489.0,711.14,111814.0,,66492.0
mean,80.17,42.59,32.09,0.2,23.93,0.32,1.71,1.02,16.27,44.52,19.57,24.76,22.02,1.67,18.22,2.39,15.35,,30.46
min,80.17,0.0,0.2,0.2,0.2,0.2,0.2,0.2,0.19,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.19,,0.2
max,80.17,140.65,80.17,0.2,80.17,1.4,35.48,21.85,80.17,80.17,80.17,80.17,80.17,38.68,80.17,72.36,80.17,,80.17


### Class Description 
 ##### Percentage 
 LC0 - Water 

 LC1 - Evergreen Needleleaf Forest 

 LC2 - Evergreen Broadleaf Forest 

  LC3 - Deciduous Needleleaf Forest 

 LC4 - Deciduous Broadleaf Forest 

 LC5 - Mixed Forests 

 LC6 - Closed Shrublands 

 LC7 - Open Shrublands 

 LC8 - Woody Savannas 

 LC9 - Savannas 

 LC10 - Grasslands 

 LC11 - Permanent Wetlands 

 LC12 - Croplands 

 LC13 - Urban and Built-Up 

 LC14 - Cropland/Natural Vegetation Mosaic 

 LC15 - Snow and Ice 

 LC16 - Barren or Sparsely Vegetated

## Cluster Summaries

In [11]:
# Cast the cluster labels to str; rows whose label reads "None" after the cast
# are treated as not assigned to any cluster
data_gdf_LCsqkm["cluster"] = data_gdf_LCsqkm["cluster"].astype(str)
is_unassigned = data_gdf_LCsqkm["cluster"] == "None"
non_clustered_data = data_gdf_LCsqkm[is_unassigned]

note_template = '**Note** that there are {} polygons that are not assigned to a cluster  -- classified as "None"'
display(Markdown(note_template.format(len(non_clustered_data))))

**Note** that there are 405 polygons that are not assigned to a cluster  -- classified as "None"

#### Groupby on clusters

In [12]:
# Group the rows by cluster name. The key is passed as a plain column name (not a
# one-element list) so that get_group() can be called with a scalar cluster name;
# with a length-1 list key, recent pandas versions expect a 1-tuple in get_group().
clusters = data_gdf_LCsqkm.groupby('cluster')

In [13]:
# Columns summed per cluster: area plus LC0-LC16 without LC15
clust_sum_cols = ['sqkm'] + ['LC{}'.format(code) for code in range(17) if code != 15]

In [14]:
# Per-cluster totals of area and land cover, ordered from largest to smallest area
cluster_totals = clusters[clust_sum_cols].sum()
clusters_sum = cluster_totals.sort_values(by='sqkm', ascending=False).reset_index()
display(Markdown('#### Cluster summary statistics for area and land cover'))
clusters_sum

#### Cluster summary statistics for area and land cover

Unnamed: 0,cluster,sqkm,LC0,LC1,LC2,LC3,LC4,LC5,LC6,LC7,LC8,LC9,LC10,LC11,LC12,LC13,LC14,LC16
0,OROC03,167563.90244,1039.256979,0.0,1137.559762,0.801741,421.515415,183.059451,6604.449699,17936.56038,20060.819637,15694.286907,85257.951424,107.834186,5620.400312,165.359114,12678.824471,655.222963
1,SOMC01,139182.265376,0.0,0.0,0.0,0.0,258.561524,1.803918,3613.665154,122560.516104,61.734069,14.431341,963.867512,0.801741,33.407279,9.220023,366.846725,11297.409986
2,SOMC03,100217.645,0.0,0.0,0.0,0.0,18.039176,0.0,1975.690654,91799.385027,16.636129,20.4444,2383.138022,0.0,129.29316,15.84281,309.442262,3549.733361
3,AMHC02,99335.729724,2848.385906,0.0,12.026117,4.008706,1.403047,45.498811,7.816976,500.086049,916.990952,50859.832748,1450.972104,91.999798,32514.682047,43.093587,9965.974429,72.958446
4,OROC01,79212.026608,143.912538,0.200435,7766.667052,1.603482,2.806094,402.073192,40.888799,146.117326,11841.316063,3268.297839,13964.727525,166.561726,17939.960631,287.624641,23224.437052,14.832211
5,OROC02,76967.15136,921.801899,0.0,6795.357637,0.400871,3.808271,61.132763,20.84527,337.933899,32304.55656,12149.986409,5424.179818,75.964975,4856.346641,46.100117,13886.758197,81.978034
6,AFAC02,67346.25744,329.921986,0.0,1.202612,0.0,0.801741,8.819153,21.045705,13957.482667,3.607835,102.221998,9724.308523,43.694893,2412.880943,8.819153,1368.570668,39362.879564
7,SOMC02,53796.831836,0.0,0.0,0.0,0.0,513.916084,6.213494,16712.695351,19398.728672,549.794,163.956067,12169.629068,0.400871,113.847245,5.812623,501.489096,3660.349266
8,SOUC03,49948.474268,366.195275,0.0,3206.96464,0.0,5.411753,8.017412,17.085985,2531.746544,13894.215131,7103.475395,10337.555306,91.765724,1292.996171,3.808271,11062.179818,27.056844
9,AMHC01,48104.4696,0.0,0.0,4.008706,0.200435,0.200435,125.672927,46.901858,257.158477,363.389181,10514.234008,2403.219127,13.028294,29357.957362,31.468341,4986.228709,0.801741


In [15]:
# Per-cluster mean of the 'wtdmean' column, highest values first
wtd_per_cluster = clusters['wtdmean'].mean()
clusters_mean = wtd_per_cluster.sort_values(ascending=False).reset_index()
display(Markdown('#### Cluster summary statistics for Water Table Depth (in m)'))
clusters_mean

#### Cluster summary statistics for Water Table Depth (in m)

Unnamed: 0,cluster,wtdmean
0,TIGC01,76.71003
1,TIGC02,61.306492
2,AMHC01,58.198648
3,AFAC01,56.126008
4,AMHC02,49.518228
5,BENC02,48.478069
6,AFAC02,47.413433
7,SOMC02,45.57461
8,OROC03,45.480629
9,SOUC02,43.116401


### Explore particular clusters

In [16]:
# Column totals (area and land cover) for a single cluster; change the name to inspect another one
clusters.get_group('BENC01')[clust_sum_cols].sum()

sqkm    19321.961956
LC0         0.000000
LC1         0.000000
LC2         0.000000
LC3         0.000000
LC4         3.407400
LC5         7.416106
LC6         0.000000
LC7         0.000000
LC8       801.367427
LC9     18465.765988
LC10        0.801741
LC11        0.000000
LC12       30.469184
LC13        0.000000
LC14       12.734110
LC16        0.000000
dtype: float64

In [17]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the same columns for that cluster
clusters.get_group('BENC01')[clust_sum_cols].describe()

Unnamed: 0,sqkm,LC0,LC1,LC2,LC3,LC4,LC5,LC6,LC7,LC8,LC9,LC10,LC11,LC12,LC13,LC14,LC16
count,241.0,0.0,0.0,0.0,0.0,11.0,14.0,0.0,0.0,86.0,241.0,1.0,0.0,42.0,0.0,22.0,0.0
mean,80.174116,,,,,0.309764,0.529722,,,9.318226,76.621436,0.801741,,0.725457,,0.578823,
std,0.0,,,,,0.207596,0.482117,,,13.787101,9.621854,,,1.059642,,0.666079,
min,80.174116,,,,,0.200435,0.200435,,,0.200435,9.220023,0.801741,,0.200435,,0.200435,
25%,80.174116,,,,,0.200435,0.200435,,,0.400871,78.771069,0.801741,,0.200435,,0.200435,
50%,80.174116,,,,,0.200435,0.300653,,,2.605659,80.174116,0.801741,,0.31497,,0.300653,
75%,80.174116,,,,,0.300653,0.751632,,,12.677532,80.174116,0.801741,,0.601306,,0.601306,
max,80.174116,,,,,0.801741,1.803918,,,70.152351,80.174116,0.801741,,5.812623,,3.096854,


In [18]:
#variable = "LC12"     #  Cropland
#
#fig, ax = plt.subplots(figsize=(10, 10))
#clusters[variable].sum().sort_values(ascending=False).reset_index().plot.bar(ax=ax, 
#                                                                                                       x='cluster', 
#                                                                                                       y=variable)
#txt = ax.set_title('{} per cluster in Ethiopia'.format(variable))

### Interactive graphs

In [19]:
# Get cluster names
clust_names = list(data_gdf_LCsqkm.cluster.unique())

# Explude non clustered data
clust_names.remove("None")

In [23]:
# Column to include in the graph
parameter = "LC12"

# Dictionary with the cluster names as keys and the selected parameter as value
clust_dict = {
    name: round(clusters.get_group(name)[parameter].sum(), 2)   # Note that this calculates the sum, change if needed
    for name in clust_names
}

# One bar per cluster; the clusters become the index and the values go in the 'Sum' column
cluster_values = pd.DataFrame.from_dict(clust_dict, orient='index', columns=['Sum'])
fig_Cluster = px.bar(cluster_values, title="Distribution of {} over clusters in Ethiopia".format(parameter))
fig_Cluster.show()

In [21]:
# Save the interactive bar chart as an HTML file; the relative name resolves against the current working directory.
# NOTE(review): the file name says "Total area" while the chart shows the column selected in `parameter` -- confirm the intended name.
fig_Cluster.write_html("Total area (sqkm) per cluster.html")

### Export data - tables - summary - graphs as needed

In [22]:
#data_gdf_clusters.to_csv("testFun.csv")