# Analysis and Visualization of Complex Agro-Environmental Data
---
### Exercise #7 - correction

In [10]:
import numpy as np # for getting our distribution
import pandas as pd # to handle data frames
import matplotlib.pyplot as plt # for plotting
import seaborn as sns # for plotting
from scipy import stats # to compute statistics
from scipy.cluster.hierarchy import dendrogram, linkage # to run the linkage method and create dendrograms
from sklearn.cluster import AgglomerativeClustering # to perform agglomerative clustering
from sklearn.preprocessing import StandardScaler # to standardize variables measured in different units

#### Exercise 7.1

In [11]:
# Read the semicolon-separated table from the zipped CSV and keep only the rows without missing data
df = pd.read_csv('EFIplus_medit.zip', sep=";", compression='zip').dropna()

In [None]:
# Keep only the sites of the Douro and Tejo catchments ...
dfsub = df[df['Catchment_name'].isin(['Douro', 'Tejo'])]
# ... and, from those, only the environmental variables used for clustering
env_columns = ["Altitude", "Actual_river_slope", "Elevation_mean_catch",
               "prec_ann_catch", "temp_ann", "temp_jan", "temp_jul"]
df_env = dfsub[env_columns]
df_env

In [None]:
# Build the Ward linkage matrix first, then draw the dendrogram from it
ward_linkage_matrix = linkage(df_env, method='ward')
dendrogram_plot = dendrogram(ward_linkage_matrix)

# Axis labels: clustered objects along x, merge distance along y
plt.xlabel('objects')
plt.ylabel('Distance')

In [None]:
# run linkage
clust_link = linkage(df_env,
                     metric='euclidean',  # Euclidean distance for the dissimilarity matrix
                     method='average')    # average linkage between clusters

# run dendrogram
plt.figure(figsize=(10, 60))
dendrogram_plot = dendrogram(clust_link,
                             truncate_mode='lastp',  # show only the last p merged clusters - important when there are too many objects
                             p=400,  # p merged clusters to show
                             leaf_font_size=8.,
                             show_contracted=True,  # to get a distribution impression in truncated branches
                             orientation='right')  # horizontal layout: distance is read along the x-axis
plt.title('Average')
plt.xlabel('Distance')
plt.ylabel('Objects')

# mark the cut-off used to read the number and composition of the clusters:
# a maximum distance of 800, drawn as a vertical line (x=800)
cut_distance = 800
plt.axvline(x=cut_distance, color='r', linestyle='--');  # trailing ';' hides the Line2D repr

In [None]:
# run cluster analysis and define 3 clusters (Euclidean distance, average linkage)
cluster3 = AgglomerativeClustering(n_clusters=3,
                                   metric='euclidean',
                                   linkage='average')

# fit_predict fits the model and returns the cluster label of every row,
# so its result is kept directly instead of being discarded and re-read from labels_
group3_labels = cluster3.fit_predict(df_env)
group3_labels # labels of each group of objects (each line in df_env)

In [None]:
# Let's now check if the clusters that were defined have anything to do with the river basins (Tejo and Douro)
mydict2 = {'Catch_name': dfsub['Catchment_name'], 'label': group3_labels}
mydf = pd.DataFrame(mydict2)

# Cross-tabulate catchment against cluster label from the data frame built above,
# so that both axes of the table carry their column names ('Catch_name' and 'label')
cross_df = pd.crosstab(index=mydf['Catch_name'], columns=mydf['label'])
cross_df



#### Exercise 7.2

In [None]:
# Heatmap of the non-scaled variables with a dendrogram on the rows only:
# rows are clustered with average linkage, columns keep their original order
sns.clustermap(
    df_env,
    method='average',
    row_cluster=True,
    col_cluster=False,
)

In [20]:
# need to standardize the data (variables with different units)
# StandardScaler is imported together with the other libraries in the first code cell
df_env_scaled = StandardScaler().fit_transform(df_env)

In [None]:
# Same heatmap + row dendrogram as before, now on the standardized variables
sns.clustermap(
    df_env_scaled,
    method='average',
    row_cluster=True,
    col_cluster=False,
)

Compare dendrograms obtained with non-scaled and with scaled variables:

In [None]:
# dendrogram with non-scaled variables
dendrogram_plot = dendrogram(linkage(df_env, method='ward'))
# the title lets this figure be told apart from the scaled one when the two are compared
plt.title('Ward linkage - non-scaled variables')
plt.xlabel('objects')
plt.ylabel('Distance');  # trailing ';' hides the Text repr

In [None]:
# dendrogram with scaled variables
dendrogram_plot = dendrogram(linkage(df_env_scaled, method='ward'))
# the title lets this figure be told apart from the non-scaled one when the two are compared
plt.title('Ward linkage - scaled variables')
plt.xlabel('objects')
plt.ylabel('Distance');  # trailing ';' hides the Text repr