# Analysis of results using GroupBy function per cluster

In [200]:
# Numerical
import numpy as np
import pandas as pd

# System & Other
import os

# Spatial
import geopandas as gpd

#Plotting
import ipywidgets
import plotly.graph_objects as go
import plotly.express as px
from IPython.display import display, Markdown, HTML
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Country to analyse; this value is used to build the folder and file names in the cells below.
country_name = "lka"     # suggest using the UN 3-letter ISO code
# Clustering scheme whose files are loaded; any value other than 1 or 2 selects the CLEWs files.
clust_method = 1         # choose 1 for admin level 1 clustering, 2 for admin level 2 clustering and 3 for CLEWs clustering

In [3]:
# Directories
ROOT_DIR = os.path.abspath(os.curdir)
# Let os.path.join insert the separator so the path is valid on every OS
# (a hard-coded "\\" only works on Windows).
out_path = os.path.join(ROOT_DIR, country_name, 'output')

# Names of the cluster files to load.
# clust_method 1 -> admin1, 2 -> admin2, anything else -> clews.
clust_labels = {1: "admin1", 2: "admin2"}
clust_label = clust_labels.get(clust_method, "clews")

input_nm = "{}_vector_{}_clusters.gpkg".format(country_name, clust_label)
result_nm = "{}_vector_{}_clusters_with_attributes.gpkg".format(country_name, clust_label)

In [4]:
# Load the original cluster layer and the layer enriched with extracted attributes.
# Paths are built with os.path.join so they do not depend on the Windows "\\" separator.
origin_data_gdf = gpd.read_file(os.path.join(out_path, input_nm))
final_data_gdf = gpd.read_file(os.path.join(out_path, result_nm))

In [5]:
# Column names of both layers; their difference is used further down to find
# the attributes that are only present in the result file.
origin_list_of_cols = origin_data_gdf.columns.tolist()
final_list_of_cols = final_data_gdf.columns.tolist()

## National summary stats

### Land cover and area stats

In [6]:
# Land cover area estimator
def calc_LC_sqkm(df, col_list):
    """
    Convert the land cover (LC) values of every row into an estimated area in sq.km.

    The values of all columns in col_list are added up per row and stored in a new
    "LC_sum" column. Every LC column is then replaced by its share of that row total
    multiplied by the row's "sqkm" value.

    INPUT:
    df -> Pandas dataframe holding the LC columns and a "sqkm" column; it is modified in place
    col_list -> list of LC columns to convert (e.g. LC0-LC16)

    OUTPUT: The same dataframe object, with the LC columns expressed in sqkm and "LC_sum" added
    """
    # Row-wise total over all land cover classes
    df["LC_sum"] = df[col_list].sum(axis=1)

    for lc_col in col_list:
        # share of the class within the row, scaled by the area the row represents
        class_share = df[lc_col].div(df["LC_sum"])
        df[lc_col] = class_share.mul(df["sqkm"])

    return df

In [7]:
# Collect every column whose name contains the land cover marker "LCType"
landCover_cols = [name for name in final_list_of_cols if "LCType" in name]

# Tell the user when the result file carries no land cover column at all
if len(landCover_cols) == 0:
    print ("There is not any Land Cover associated column in the dataframe; please revise")

In [8]:
# Convert the land cover columns to sq.km per row.
# calc_LC_sqkm modifies final_data_gdf in place and returns that same object, so
# data_gdf_LCsqkm and final_data_gdf refer to the same dataframe from here on.
data_gdf_LCsqkm = calc_LC_sqkm(final_data_gdf, landCover_cols)

In [9]:
# List of stats to be calculated (these are also the row labels of the summary table)
lc_sum_rows = ['sum', 'min', 'max']

# Build the summary table in one step: one column per land cover class, one row per stat.
# Series.agg returns the stats labelled by name, so each value lands in the matching row.
# This replaces the chained positional assignment `table[col][0] = ...`, which is
# deprecated and does not write into the table under pandas Copy-on-Write.
LC_summary_table = pd.DataFrame(
    {col: data_gdf_LCsqkm[col].agg(lc_sum_rows) for col in landCover_cols},
    index=lc_sum_rows,
    columns=landCover_cols,
).round(2)

In [10]:
# Heading for the national land cover summary
display(Markdown('###  These are the summarized results for land cover (sq.km) in **{}**'.format(country_name)))
# Total area = sum of the "sqkm" column over all rows
display(Markdown(' **Total area:** {:0.1f} sq.km'.format(data_gdf_LCsqkm.sqkm.sum())))
display(LC_summary_table)
# Legend mapping the LCType<N> column names to land cover class names
# NOTE(review): the class names look like the IGBP land cover scheme -- confirm against the data source
display(Markdown('#### Class Description \n\n LCType0 - Water \n\n LCType1 - Evergreen Needleleaf Forest \n\n LCType2 - Evergreen Broadleaf Forest \n\n  LCType3 - Deciduous Needleleaf Forest \n\n LCType4 - Deciduous Broadleaf Forest \n\n LCType5 - Mixed Forests \n\n LCType6 - Closed Shrublands \n\n LCType7 - Open Shrublands \n\n LCType8 - Woody Savannas \n\n LCType9 - Savannas \n\n LCType10 - Grasslands \n\n LCType11 - Permanent Wetlands \n\n LCType12 - Croplands \n\n LCType13 - Urban and Built-Up \n\n LCType14 - Cropland/Natural Vegetation Mosaic \n\n LCType15 - Snow and Ice \n\n LCType16 - Barren or Sparsely Vegetated'))

###  These are the summarized results for land cover (sq.km) in **lka**

 **Total area:** 66126.9 sq.km

Unnamed: 0,LCType11,LCType13,LCType0,LCType10,LCType12,LCType14,LCType16,LCType8,LCType4,LCType9,LCType2,LCType6,LCType7,LCType5,LCType3,LCType1
sum,1371.24,304.99,688.86,304.34,5118.25,28107.9,209.7,1696.13,64.31,3369.52,24653.5,43.92,50.84,129.46,13.39,0.48
min,0.0,0.04,0.0,0.0,0.05,0.03,0.03,0.1,0.1,0.1,0.1,0.05,0.12,0.0,0.09,0.19
max,26.46,45.6,23.01,15.2,66.68,84.68,8.16,62.87,3.91,43.61,84.68,2.98,5.0,2.89,1.0,0.29


#### Class Description 

 LCType0 - Water 

 LCType1 - Evergreen Needleleaf Forest 

 LCType2 - Evergreen Broadleaf Forest 

  LCType3 - Deciduous Needleleaf Forest 

 LCType4 - Deciduous Broadleaf Forest 

 LCType5 - Mixed Forests 

 LCType6 - Closed Shrublands 

 LCType7 - Open Shrublands 

 LCType8 - Woody Savannas 

 LCType9 - Savannas 

 LCType10 - Grasslands 

 LCType11 - Permanent Wetlands 

 LCType12 - Croplands 

 LCType13 - Urban and Built-Up 

 LCType14 - Cropland/Natural Vegetation Mosaic 

 LCType15 - Snow and Ice 

 LCType16 - Barren or Sparsely Vegetated

### Other variables

In [11]:
# Refresh the column list from the dataframe returned by calc_LC_sqkm
final_list_of_cols = list(data_gdf_LCsqkm.columns)

# Variables to summarise: columns that exist only in the result file, without the
# land cover columns and the bookkeeping columns "id" and "LC_sum".
# Filtering (instead of list.remove) does not raise when one of them is absent.
excluded_cols = set(origin_list_of_cols) | set(landCover_cols) | {"id", "LC_sum"}
sum_cols = [x for x in final_list_of_cols if x not in excluded_cols]

# Stats to be calculated (also the row labels of the summary table)
sum_rows = ['mean', 'min', 'max']

# One column per variable, one row per stat. Series.agg labels each stat by name, so no
# positional chained assignment (`table[col][0] = ...`) is needed; that form is deprecated
# and does not write into the table under pandas Copy-on-Write.
other_summary_table = pd.DataFrame(
    {col: data_gdf_LCsqkm[col].agg(sum_rows) for col in sum_cols},
    index=sum_rows,
    columns=sum_cols,
).round(2)

In [12]:
# Heading and table for the national summary of the non-land-cover variables
display(Markdown('###  \n These are the summarized results for the rest of the variables collected for **{}**'.format(country_name)))
display(other_summary_table)
# NOTE(review): the markdown link below has an empty target and "documantation" is misspelled;
# both end up in the rendered note -- fill in the documentation URL when it is known.
display(Markdown('### Note! \n Units are similar to the original source; you may refer to the [documantation]() for more info.'))

###  
 These are the summarized results for the rest of the variables collected for **lka**

Unnamed: 0,cwd_whe_rai_int_basmean,rai_basmean,evt_whe_rai_int_basemean,evt_mai_irr_hig_basmean,yie_whe_irr_hig_basmean,yie_whe_rai_int_basmean,cwd_mai_irr_hig_basmean
mean,0.03,1708.81,13.4,558.77,8.0,0.06,98.66
min,0.0,973.0,0.0,415.0,-0.01,-0.01,1.0
max,1.0,2642.0,507.0,729.0,11.48,2.87,344.0


### Note! 
 Units are similar to the original source; you may refer to the [documantation]() for more info.

### Export national stats to csv

In [13]:
# Write both national summary tables as csv files into the output folder
lc_national_csv = os.path.join(out_path, "{}_LandCover_National_summary.csv".format(country_name))
var_national_csv = os.path.join(out_path, "{}_Variable_National_summary.csv".format(country_name))

LC_summary_table.to_csv(lc_national_csv)
other_summary_table.to_csv(var_national_csv)

## Cluster Summaries

In [14]:
# Cast the cluster labels to text so they can be compared with the string "None" below
data_gdf_LCsqkm["cluster"] = data_gdf_LCsqkm["cluster"].astype(str)

# Rows whose cluster label reads "None" are treated as not assigned to any cluster
is_unassigned = data_gdf_LCsqkm["cluster"] == "None"
non_clustered_data = data_gdf_LCsqkm[is_unassigned]

unassigned_note = '**Note** that there are {} polygons that are not assigned to a cluster  -- classified as "None"'.format(len(non_clustered_data))
display(Markdown(unassigned_note))

**Note** that there are 0 polygons that are not assigned to a cluster  -- classified as "None"

### Groupby on clusters

In [15]:
# Group rows by cluster name. Grouping by the plain column label (not a one-element list)
# keeps the group keys scalar, so the get_group("<cluster name>") calls used further down
# work without the tuple keys that recent pandas versions expect for list groupers.
clusters = data_gdf_LCsqkm.groupby('cluster')

#### Land cover and area

In [20]:
# Per-cluster totals: area of each land cover class plus the total cluster area
lc_area_by_cluster = clusters[landCover_cols].sum()
total_area_by_cluster = clusters["sqkm"].sum()

# Join both on the cluster name and round to one decimal
clusters_lc = lc_area_by_cluster.merge(total_area_by_cluster, on="cluster").round(decimals = 1)

In [25]:
# sort_values/reset_index return new objects; the original cell discarded that result, so
# the displayed table was never sorted. Keep the sorted view under its own name so that
# clusters_lc (which is exported to csv) stays unchanged and the cell can be re-run safely.
clusters_lc_sorted = clusters_lc.sort_values(by='sqkm', ascending=False).reset_index()
display(Markdown('#### Cluster summary statistics for area and land cover in {}'.format(country_name)))
clusters_lc_sorted

#### Cluster summary statistics for area and land cover

Unnamed: 0,cluster,LCType11,LCType13,LCType0,LCType10,LCType12,LCType14,LCType16,LCType8,LCType4,LCType9,LCType2,LCType6,LCType7,LCType5,LCType3,LCType1,sqkm
0,North_Central,236.3,9.2,18.0,0.2,1155.6,5501.3,0.4,153.9,2.4,736.2,2927.2,2.5,0.0,9.7,0.8,0.0,10753.7
1,Eastern,316.5,66.9,172.9,31.4,1435.3,4185.0,75.2,217.5,16.2,1165.4,2051.3,21.0,4.5,33.1,2.8,0.3,9795.1
2,Northern,234.3,8.9,258.9,215.7,1032.4,2524.0,100.3,272.8,12.7,295.9,4097.1,14.5,42.9,38.0,4.3,0.0,9152.6
3,Uva,95.0,4.4,16.9,0.2,149.9,3539.4,0.4,529.9,14.2,904.1,3025.9,1.3,0.2,16.1,0.2,0.0,8298.2
4,North_Western,135.2,16.8,111.5,45.0,559.5,5351.7,22.0,56.7,0.6,38.3,1714.9,2.0,2.3,9.4,1.6,0.0,8067.4
5,Central,78.1,16.5,1.7,0.2,228.8,1894.7,0.0,89.8,0.4,59.3,3384.0,0.4,0.2,3.6,0.2,0.0,5757.9
6,Southern,191.8,14.9,46.8,3.4,443.5,2308.9,9.9,324.1,14.8,142.4,2055.0,1.6,0.2,13.9,3.3,0.2,5574.7
7,Sabaragamuwa,28.6,4.7,1.7,0.0,23.5,1099.5,0.0,21.0,0.4,21.8,3709.4,0.0,0.0,0.6,0.0,0.0,4911.2
8,Western,55.4,162.7,60.3,8.2,89.8,1703.5,1.4,30.6,2.5,6.2,1688.9,0.8,0.6,5.0,0.2,0.0,3816.2


In [23]:
# Export the per-cluster land cover summary to csv
lc_cluster_csv = os.path.join(out_path, "{}_LandCover_byCluster_summary.csv".format(country_name))
clusters_lc.to_csv(lc_cluster_csv)

#### Other variable summaries

In [42]:
# Per-cluster mean of every non-land-cover variable, rounded to two decimals
cluster_means = clusters[sum_cols].mean()
clusters_other = cluster_means.round(decimals = 2)

In [45]:
# Heading followed by the per-cluster table of variable means
other_heading = '#### Cluster summary statistics for other variables in {}'.format(country_name)
display(Markdown(other_heading))
clusters_other

#### Cluster summary statistics for other variables in lka

Unnamed: 0_level_0,cwd_whe_rai_int_basmean,rai_basmean,evt_whe_rai_int_basemean,evt_mai_irr_hig_basmean,yie_whe_irr_hig_basmean,yie_whe_rai_int_basmean,cwd_mai_irr_hig_basmean
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Central,0.28,1992.68,122.34,551.33,5.69,0.52,7.24
Eastern,0.0,1656.41,0.0,571.7,8.24,0.0,135.36
North_Central,0.0,1464.41,0.0,570.26,9.93,0.0,129.23
North_Western,0.0,1601.55,0.0,560.64,8.17,-0.0,79.29
Northern,0.0,1138.42,0.0,578.11,11.24,-0.0,252.56
Sabaragamuwa,0.0,2333.16,0.0,527.84,5.59,0.0,3.66
Southern,0.0,2098.95,0.0,543.99,6.02,0.0,26.42
Uva,0.09,1831.47,35.18,538.47,5.96,0.14,20.53
Western,0.0,2398.43,0.0,542.23,5.61,0.0,12.94


In [46]:
# Export the per-cluster variable summary to csv
param_cluster_csv = os.path.join(out_path, "{}_Parameter_byCluster_summary.csv".format(country_name))
clusters_other.to_csv(param_cluster_csv)

### Explore individual clusters

**Provide the name of the cluster you want to retrieve values for**

In [80]:
# Provide a cluster name or code.
# The default picks the first cluster: unique() yields one array of labels per group, so
# .iloc[0] takes the first group and [0] its single label. Explicit .iloc replaces the
# plain [0] lookup, which relied on the deprecated positional fallback of a
# string-labelled Series.
cluster_name = clusters["cluster"].unique().iloc[0][0]      # You can replace with any string e.g. "Central"

**Land cover info**

In [81]:
lc_heading = 'Land cover summary stats (sum) for **{}** cluster'.format(cluster_name)
display(Markdown(lc_heading))

# Sum of every land cover column over the rows of the selected cluster
rows_of_cluster = clusters.get_group(cluster_name)
rows_of_cluster[landCover_cols].sum().round(decimals = 2)

Land cover summary stats (sum) for **Central** cluster

LCType11      78.11
LCType13      16.51
LCType0        1.69
LCType10       0.21
LCType12     228.78
LCType14    1894.70
LCType16       0.00
LCType8       89.76
LCType4        0.42
LCType9       59.27
LCType2     3384.01
LCType6        0.42
LCType7        0.21
LCType5        3.60
LCType3        0.21
LCType1        0.00
dtype: float64

**Other Parameter info**

In [82]:
param_heading = 'Parameter summary stats (mean) for **{}** cluster'.format(cluster_name)
display(Markdown(param_heading))

# Mean of every non-land-cover variable over the rows of the selected cluster
rows_of_cluster = clusters.get_group(cluster_name)
rows_of_cluster[sum_cols].mean().round(decimals = 2)

Parameter summary stats (mean) for **Central** cluster

cwd_whe_rai_int_basmean        0.28
rai_basmean                 1992.68
evt_whe_rai_int_basemean     122.34
evt_mai_irr_hig_basmean      551.33
yie_whe_irr_hig_basmean        5.69
yie_whe_rai_int_basmean        0.52
cwd_mai_irr_hig_basmean        7.24
dtype: float64

### Provide the parameter and the statistic you want to generate a graph for

In [211]:
def make_interactive_graph(clust_dict, parameter, method, name):
    """
    Draw a bar chart with one statistic of one column per cluster and export it as html.

    The function reads the notebook-level variables `clusters` (the groupby object),
    `country_name` (used in the chart title) and `out_path` (export folder).

    INPUT:
    clust_dict -> dictionary keyed by cluster name; its values are overwritten in place
                  with the computed statistic (rounded to two decimals)
    parameter -> name of the column to aggregate
    method -> statistic to compute: "sum", "mean" or "median"
    name -> prefix of the exported file, "<name>_<parameter>_<method>_perCluster.html"

    OUTPUT: None; the figure is shown and written to out_path

    RAISES: ValueError if method is not one of the supported statistics
    """
    supported_methods = ("sum", "mean", "median")
    # Fail early with a clear message; previously an unknown method fell through every
    # branch and crashed on the export line because no figure had been created.
    if method not in supported_methods:
        raise ValueError("method must be one of {}; got {!r}".format(supported_methods, method))

    # A single code path for all statistics: Series.agg dispatches on the method name
    for key in clust_dict:
        clust_dict[key] = round(clusters.get_group(key)[parameter].agg(method), 2)

    stat_per_cluster = pd.DataFrame.from_dict(clust_dict, orient='index', columns=[method])
    fig_Cluster = px.bar(stat_per_cluster, title="Distribution of {} over clusters in {}".format(parameter, country_name))
    fig_Cluster.show()

    # Export figure as html
    fig_Cluster.write_html(os.path.join(out_path, "{}_{}_{}_perCluster.html".format(name, parameter, method)))

In [212]:
# Get cluster names
clust_names = list(data_gdf_LCsqkm.cluster.unique())

# Exclude non clustered data (optional; list.remove raises ValueError when there is no "None" entry)
#clust_names.remove("None")

# Create a dictionary keyed by cluster name; the placeholder values are overwritten
# by make_interactive_graph with the selected statistic
clust_dict = dict.fromkeys(clust_names, 1)

# Offer only numeric columns: sum/mean/median followed by round() fail on text or
# geometry columns such as "cluster"
numeric_cols = list(data_gdf_LCsqkm.select_dtypes(include="number").columns)

param = ipywidgets.Dropdown(options=numeric_cols)
method = ipywidgets.Dropdown(options=["sum", "mean", "median"])
display(param)
display(method)

Dropdown(options=('id', 'buf_val_1', 'cluster', 'col', 'index', 'index_left', 'lat', 'lon', 'row', 'sqkm', 'cw…

Dropdown(options=('sum', 'mean', 'median'), value='sum')

In [213]:
# Plot the statistic currently selected in the two dropdowns; re-run this cell after
# changing a selection. The figure is also written as an html file by the function.
make_interactive_graph(clust_dict,param.value, method.value, country_name)