# Analysis and Visualization of Complex Agro-Environmental Data
---
### Exercise #9 - correction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis 
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

Exercise 9.1 - NOTE: to make the exercise more interesting, more catchments are considered: the Minho and Mondego catchments were added to the Douro and Tejo catchments.

In [None]:
# Read the zipped, semicolon-separated table and keep only the rows without any missing value
df = pd.read_csv('EFIplus_medit.zip', compression='zip', sep=";").dropna()

# Keep the sites located in the four catchments under study
dfsub = df[df['Catchment_name'].isin(['Douro', 'Tejo', 'Minho', 'Mondego'])]

# Environmental predictors ...
df_env = dfsub[[
    "Altitude",
    "Actual_river_slope",
    "Elevation_mean_catch",
    "prec_ann_catch",
    "temp_ann",
    "temp_jan",
    "temp_jul",
]]
# ... and the grouping variable (catchment name), kept as a one-column DataFrame
df_catch = dfsub[["Catchment_name"]]

In [None]:
# Standardise the environmental variables.
# The scaler returns a plain two-dimensional NumPy array, so the result is wrapped back into a
# pandas DataFrame (same column names, fresh default index) for a better display.
efi_scaled = StandardScaler().fit(df_env).transform(df_env)
df_scaled = pd.DataFrame(efi_scaled, columns=df_env.columns)
df_scaled.head()

Define predictor and response variables

In [4]:
# Predictors: the standardised environmental variables
X = df_scaled
# Response: catchment of each site - 4 classes (Tagus, Douro and Minho and Mondego)
y = dfsub.loc[:, 'Catchment_name']

# Fit the LDA model keeping two discriminant axes, then project the sites onto them
model = LinearDiscriminantAnalysis(n_components=2)
DLA = model.fit(X, y).transform(X)

Extract the first two discriminant axes to a DataFrame

In [None]:
# One row per site, one named column per discriminant axis
DLA_scores = pd.DataFrame({'LD1': DLA[:, 0], 'LD2': DLA[:, 1]})
DLA_scores.head(6)

Define method to evaluate model

In [6]:
# Cross-validation scheme handed to 'cross_val_score' below:
# stratified 10-fold, repeated 3 times, with a fixed seed so the folds are reproducible
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)

# Accuracy of the model on every held-out fold (all available cores are used)
scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy', n_jobs=-1)

# Average accuracy over all folds and repeats
print(scores.mean())

0.8869966209501093


Plot the first discriminant plane

In [None]:
# Display-only settings for the variable arrows drawn on top of the scores.
ARROW_SCALE = 2                     # scalings are multiplied by 2 just to facilitate the visualization
LABEL_SCALE = 2.1                   # labels are placed slightly beyond the arrow tips
ARROW_COLOR = (0.1, 0.1, 0.1, 0.8)  # dark grey, slightly transparent (RGBA)

fig, ax = plt.subplots(figsize=(10, 8))

# Site scores on the first two discriminant axes, coloured by catchment
sns.scatterplot(x=DLA_scores['LD1'],
                y=DLA_scores['LD2'],
                hue=dfsub['Catchment_name'].tolist(),
                linewidth=0,
                ax=ax)

# One arrow and one label per environmental variable: each row of `scalings_`
# holds the coefficients of a variable on the discriminant axes, and only the
# first two axes (LD1, LD2) are drawn.
for var_name, (ld1, ld2) in zip(df_scaled.columns, model.scalings_[:, :2]):
    ax.arrow(0, 0, ld1 * ARROW_SCALE, ld2 * ARROW_SCALE,
             color=ARROW_COLOR,
             head_width=0.02)
    ax.text(ld1 * LABEL_SCALE, ld2 * LABEL_SCALE,
            var_name,
            color=ARROW_COLOR,
            ha='center',
            va='center')

ax.set_xlabel('LD1')
ax.set_ylabel('LD2')
plt.show()

Exercise 9.2 Interactive plot using Dash

Copy the following code to a python file (e.g. "DLA-visual.py")

In [13]:
# Import modules
# import Dash, dcc (stands for Dash Core Components - this module includes a Graph component called dcc.Graph, 
# which is used to render interactive graphs, and dcc.Slider to render an interactive slider), html, Input and Output.
# We also import sklearn's LinearDiscriminantAnalysis to run an LDA, StandardScaler to standardize the variables,
# the plotly.express library to build the interactive graphs, and pandas to work with DataFrames.

from functools import lru_cache

import pandas as pd
import plotly.express as px
from dash import Dash, dcc, html, Input, Output
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler


# Create the application
# Dash(__name__) is the Dash constructor: it initializes the app and is written
# almost identically in every Dash application.
app = Dash(__name__)

# Describe the page
# The layout lists the components displayed in the web browser, wrapped in a single html.Div:
# a heading, the graph, a caption and the slider used to choose the number of components.
app.layout = html.Div(children=[
    html.H4(children="Visualization of LDA explained variance", style={'textAlign': 'center'}),
    dcc.Graph(id="LDA-visualization-x-graph"),
    html.P(children="Number of components:"),
    dcc.Slider(id='LDA-visualization-x-slider', min=2, max=3, step=1, value=2),
])

# Add controls to build the interaction
# The inputs and outputs of our app are the properties of a particular component. 
# The output is the figure property of the component with the ID "LDA-visualization-x-graph".
# The input is the value property of the component that has the ID "LDA-visualization-x-slider".
# The callback function's argument 'n_components' receives the value property of the input. 
# We build the LDA plots inside the callback function, using the value chosen in the slider. 
# This means that every time the user selects the number of components with the slider, the figure is rebuilt
# to show more or fewer components.
# Finally, we return the scatter plots at the end of the function. 
# This assigns the plots to the figure property of the dcc.Graph, thus displaying the figure in the app.
@lru_cache(maxsize=1)
def _load_catchment_data():
    """Load the data once and return the LDA inputs.

    The file is read, rows with missing values are dropped, the sites of the
    four studied catchments are kept and the environmental variables are
    standardised. The result is cached so that moving the slider does not
    re-read and re-scale the file on every callback.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is a DataFrame with the standardised environmental
        variables and ``y`` a NumPy array with the catchment name of each row
        (4 classes: Tagus, Douro, Minho and Mondego), in the same row order.
    """
    df = pd.read_csv('EFIplus_medit.zip', compression='zip', sep=";").dropna()
    dfsub = df[df['Catchment_name'].isin(['Douro', 'Tejo', 'Minho', 'Mondego'])]
    df_env = dfsub[["Altitude", "Actual_river_slope", "Elevation_mean_catch",
                    "prec_ann_catch", "temp_ann", "temp_jan", "temp_jul"]]
    # StandardScaler returns a NumPy array; wrap it to keep the column names.
    X = pd.DataFrame(StandardScaler().fit_transform(df_env), columns=df_env.columns)
    # Plain array: rows are matched by position, not by the filtered frame's index.
    y = dfsub['Catchment_name'].to_numpy()
    return X, y


@app.callback(
    Output(component_id="LDA-visualization-x-graph", component_property="figure"),
    Input(component_id="LDA-visualization-x-slider", component_property="value"))
def run_and_plot(n_components):
    """Fit an LDA with ``n_components`` axes and return its scatter-matrix figure.

    Parameters
    ----------
    n_components : int
        Number of discriminant components to keep, as selected in the slider.

    Returns
    -------
    plotly figure
        Scatter matrix of the discriminant scores coloured by catchment; the
        title reports the summed explained-variance ratio of the kept components.
    """
    X, y = _load_catchment_data()

    model = LinearDiscriminantAnalysis(n_components=n_components)  # number of discriminant components
    components = model.fit_transform(X, y)  # fits the LDA and projects the sites
    var = model.explained_variance_ratio_.sum() * 100  # % of variance explained by the kept components

    # `components` is a bare array, so its columns are addressed as 0, 1, ...
    labels = {str(i): f"DC {i+1}" for i in range(n_components)}  # axis labels
    labels['color'] = 'Catchment'  # legend title: the colours encode the catchment

    fig = px.scatter_matrix(
        components,
        color=y,
        dimensions=range(n_components),
        labels=labels,
        title=f'Total Explained Variance: {var:.2f}%',
        width=1400, height=1300
        )
    fig.update_traces(diagonal_visible=False)
    return fig

# Run the app - These lines are for running your app, and they are almost always the same for any Dash app you create.
if __name__ == "__main__":
    # Prefer `app.run`; fall back to `app.run_server` on older Dash releases
    # that do not provide `run` (recent releases no longer accept `run_server`).
    start_server = getattr(app, "run", None) or app.run_server
    start_server(debug=True)