# Analysis and Visualization of Complex Agro-Environmental Data
---
### Exercise #8 - correction

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import manhattan_distances, euclidean_distances
from matplotlib.offsetbox import OffsetImage, AnnotationBbox

Exercise 8.1 

In [2]:
# Read the semicolon-separated table directly from the zip archive.
df = pd.read_csv('EFIplus_medit.zip', sep=";", compression='zip')
# Discard every row that has a missing value in any column.
df = df.dropna()

# Keep only the rows belonging to the Douro and Tejo catchments.
dfsub = df[df['Catchment_name'].isin(['Douro', 'Tejo'])]

# Environmental variables used as input for the ordinations.
env_columns = [
    "Altitude",
    "Actual_river_slope",
    "Elevation_mean_catch",
    "prec_ann_catch",
    "temp_ann",
    "temp_jan",
    "temp_jul",
]
df_env = dfsub[env_columns]
# Catchment label of each retained row (used to colour the plots).
df_catch = dfsub[["Catchment_name"]]

In [None]:
# Standardise each environmental variable with StandardScaler so that all of
# them are on a comparable scale before the ordination.
scaler = StandardScaler()
efi_scaled = scaler.fit_transform(df_env)

# fit_transform returns a plain 2-D NumPy array; wrap it in a DataFrame that
# reuses the original column names for nicer display. Note that the new frame
# gets a fresh 0..n-1 index rather than the index of df_env.
df_scaled = pd.DataFrame(efi_scaled, columns=df_env.columns)
df_scaled.head()

In [None]:
# Select the number of PCs: fit a PCA that keeps one component per input
# variable so the variance explained by each component can be compared.
pca = PCA(n_components=df_scaled.shape[1])
pca.fit(df_scaled)  # only the fitted attributes are needed here, not the scores
eigenvalues = pca.explained_variance_  # eigenvalues
prop_var = pca.explained_variance_ratio_  # proportion of explained variance

# Scree Plot: component number (1-based) against proportion of variance.
PC_numbers = np.arange(pca.n_components_) + 1

plt.plot(PC_numbers,
         prop_var,
         'ro-')
plt.title('Scree Plot', fontsize=20)
plt.xlabel('Principal Component', fontsize=16)
plt.ylabel('Proportion of Variance', fontsize=16)
plt.xticks(PC_numbers)  # one integer tick per component
plt.show()

In [None]:
# Refit the PCA keeping only the first two components and collect the
# resulting scores (one row per observation) in a labelled DataFrame.
pca = PCA(n_components=2)
PC = pca.fit_transform(df_scaled)

score_names = ['PC1', 'PC2']
pca_efi = pd.DataFrame(PC, columns=score_names)
pca_efi.head(6)

In [None]:
# Biplot: scores of the observations plus one loading arrow per variable.
# Each score axis is divided by its own range so that the points and the
# loading arrows can share the same plot.
pc1_range = pca_efi['PC1'].max() - pca_efi['PC1'].min()
pc2_range = pca_efi['PC2'].max() - pca_efi['PC2'].min()
PC1 = pca_efi['PC1'] / pc1_range
PC2 = pca_efi['PC2'] / pc2_range

plt.figure(figsize=(10, 8))
plt.title('Biplot of PCA')
catchment_labels = df_catch['Catchment_name'].tolist()
sns.scatterplot(x=PC1, y=PC2, hue=catchment_labels, linewidth=0)

# Transpose once: one row per original variable, one column per component.
loadings = np.transpose(pca.components_)
n = loadings.shape[0]  # number of variables (rows of the loading matrix)
variable_names = list(df_env.columns)
arrow_colour = (0.1, 0.1, 0.1, 0.8)
for i in range(n):
        load_pc1 = loadings[i, 0]
        load_pc2 = loadings[i, 1]
        # arrow from the origin to the loading of variable i
        plt.arrow(0, 0, load_pc1, load_pc2,
                  color = arrow_colour,
                  head_width=0.02)
        # variable name placed slightly beyond the arrow tip
        plt.text(load_pc1 * 1.15,
                 load_pc2 * 1.15,
                 variable_names[i],
                 color = arrow_colour,
                 ha = 'center',
                 va = 'center')
plt.legend(title='Catchment name')
plt.xlabel("PC{}".format(1))
plt.ylabel("PC{}".format(2))


Exercise 8.2

In [None]:
# Run metric MDS
# by default the 'metric' argument is set to 'True', which means a metric MDS is run
# Euclidean distances are also the default
# Only two axes are extracted by default
# random_state is fixed so that the configuration is reproducible
mds = MDS(random_state=0, normalized_stress = False)
mds_transf = mds.fit_transform(df_scaled)
# plot the MDS configuration, coloured by catchment
sns.scatterplot(x=mds_transf[:,0],
              y=mds_transf[:,1],
              hue = df_catch['Catchment_name'].tolist(),
              linewidth=0,
              )
plt.title('Metric MDS')
plt.xlabel('MDS1')
plt.ylabel('MDS2')
plt.legend(title='Catchment name')

In [8]:
# Run NMDS (non-metric MDS: metric=False)
# 5 components are extracted so that the stress drops below the usual 0.2
# rule-of-thumb threshold (lower stress means a better fit).
# random_state is fixed, as for the metric MDS, so that re-running the
# notebook gives the same configuration and the same stress value.
nmds = MDS(n_components=5, metric = False, normalized_stress="auto", random_state=0)
nmds_transf = nmds.fit_transform(df_scaled)


In [None]:
# Stress of the fitted NMDS solution, read from the estimator's `stress_` attribute.
# NOTE(review): the model was created with normalized_stress="auto"; presumably this
# yields the normalized (Stress-1) value for non-metric MDS — confirm against the
# installed scikit-learn version before comparing with the 0.2 threshold.
stress = nmds.stress_
print(stress)

In [None]:
# Plot the NMDS configuration, coloured by catchment.
# Only the first two columns of nmds_transf are shown.
sns.scatterplot(x=nmds_transf[:,0],
              y=nmds_transf[:,1],
              hue = df_catch['Catchment_name'].tolist(),
              linewidth=0,
              )
plt.title('Non-metric MDS (first two dimensions)')
plt.xlabel('NMDS1')
plt.ylabel('NMDS2')
plt.legend(title='Catchment name')