# General EDA

In [None]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import rasterio
import geopandas as geopd
import rasterio.rio
import seaborn as sns
import datetime as dt 

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score


from rasterio.plot import show

import pyreadr

# Fixed random seed; passed as random_state to train_test_split further down.
RSEED = 42



First, we import the final dataframes.

In [None]:
# Read the full and the resampled modelling shapefiles into GeoDataFrames.
df_all, df_resamp = (
    geopd.read_file(shapefile)
    for shapefile in (
        "../data/final_shapefiles/foxes_modelling_all.shp",
        "../data/final_shapefiles/foxes_modelling_resamp.shp",
    )
)
#sample_points = geopd.read_file("../data/cleaned_shapefiles/sample_points.shp")

## Create dummy variables
### Bin aspect feature
First, we put the aspect feature into bins: one bin for the -1 values (where the slope is zero) and eight bins for the eight compass directions.

In [None]:
# Bin edges in degrees. pd.cut builds right-closed intervals, so the first
# interval (-1.1, -0.5] captures only the -1 "flat" marker; an aspect of
# exactly 0 (due north) falls into the first "N" interval instead of "Flat".
ASPECT_BIN_EDGES = [-1.1, -0.5, 22.5, 67.5, 112.5, 157.5, 202.5, 247.5, 292.5, 337.5, 360]
# North wraps around 0/360 and therefore needs two intervals. "N2" is only a
# temporary label for the upper one, because pd.cut needs unique labels for
# an ordered result.
ASPECT_BIN_LABELS = ["Flat", "N", "NE", "E", "SE", "S", "SW", "W", "NW", "N2"]
# Final categories, ordered and with "Flat" first, so that
# pd.get_dummies(..., drop_first=True) drops "Flat" as the reference level.
ASPECT_BIN_DTYPE = pd.CategoricalDtype(ASPECT_BIN_LABELS[:-1], ordered=True)


def bin_aspect(aspect):
    """Bin aspect angles into "Flat" plus the eight compass directions.

    Parameters
    ----------
    aspect : pandas.Series
        Aspect in degrees (0-360), with -1 marking cells that have no slope.

    Returns
    -------
    pandas.Series
        Ordered categorical with categories Flat, N, NE, E, SE, S, SW, W, NW.
        Values outside (-1.1, 360] become NaN.
    """
    binned = pd.cut(aspect, bins=ASPECT_BIN_EDGES, labels=ASPECT_BIN_LABELS)
    # Merge the two north intervals on plain objects: Series.replace on a
    # categorical column is deprecated in recent pandas versions.
    as_text = binned.astype(object)
    merged = as_text.where(as_text != "N2", "N")
    return merged.astype(ASPECT_BIN_DTYPE)


df_all["aspect_bin"] = bin_aspect(df_all.aspect)
# Same binning for the resampled data.
df_resamp["aspect_bin"] = bin_aspect(df_resamp.aspect)


### Create dummy variables for all categorical variables

In [None]:
# Categorical columns that are turned into dummy variables.
cat_variables = ["soil", "veg", "aspect_bin"]

In [None]:
# One-hot encode the categorical columns of both tables. drop_first removes
# the first level of every variable, which then acts as the reference level.
categories_all, categories_resamp = (
    pd.get_dummies(frame[cat_variables], drop_first=True)
    for frame in (df_all, df_resamp)
)

In [None]:
# Append the dummy columns to the right of the existing columns.
df_all = pd.concat(objs=[df_all, categories_all], axis="columns")
df_resamp = pd.concat(objs=[df_resamp, categories_resamp], axis="columns")

In [None]:
# Encoded copies without the raw veg/soil columns (their dummies remain).
already_encoded = ["veg", "soil"]
df_all_enc = df_all.drop(columns=already_encoded)
df_resamp_enc = df_resamp.drop(columns=already_encoded)

In [None]:
# Preview the first rows of the encoded table.
df_all_enc.head()

## Start of Correlation Analysis

In [None]:
# Correlation matrix of the numeric features and the target.
corr_columns = ["NDVI", "NDMI", "slope", "aspect", "elev", "target"]
corr_all = df_all_enc[corr_columns].corr()

# Colour scale pinned to [-1, 1] so that 0 sits at the centre of the palette.
plt.figure(figsize=(14, 12))
ax = sns.heatmap(
    data=corr_all,
    vmin=-1,
    vmax=1,
    cmap="coolwarm",
    annot=True,
    linewidths=0.5,
)

In [None]:
# Rows whose aspect fell into the "Flat" bin, counted per vegetation class.
is_flat = df_all["aspect_bin"] == "Flat"
df_query = df_all[is_flat]
df_query.groupby("veg").count()

In [None]:
# Rows with vegetation class "Water", counted per aspect bin.
is_water = df_all["veg"] == "Water"
df_query = df_all[is_water]
df_query.groupby("aspect_bin")["veg"].count()

In [None]:
# Names of the 18 columns in positions -19 to -2 (the last column is excluded).
# NOTE(review): presumably the dummy columns; the positions depend on column order.
df_all.iloc[:, -19:-1].columns

In [None]:
# Dummy columns to correlate with the target, spelled out by name instead of
# sliced by position.
# NOTE(review): 'aspect_bin_NW' is not part of this list; confirm that is intended.
soil_dummies = ['soil_Rest', 'soil_Roesberg', 'soil_Stone', 'soil_Water']
veg_dummies = [
    'veg_Bush', 'veg_Dry Shrub', 'veg_Grassland', 'veg_Moist Shrub',
    'veg_Snow', 'veg_Stone', 'veg_Water',
]
aspect_dummies = [
    'aspect_bin_N', 'aspect_bin_NE', 'aspect_bin_E', 'aspect_bin_SE',
    'aspect_bin_S', 'aspect_bin_SW', 'aspect_bin_W',
]
corr_columns_veg_soil = soil_dummies + veg_dummies + aspect_dummies + ["target"]

corr_all = df_all_enc[corr_columns_veg_soil].corr()

# Colour scale pinned to [-1, 1] so that 0 sits at the centre of the palette.
plt.figure(figsize=(14, 12))
ax = sns.heatmap(
    data=corr_all,
    vmin=-1,
    vmax=1,
    cmap="coolwarm",
    annot=True,
    linewidths=0.5,
)

In [None]:
# Same numeric-feature correlation matrix, computed on the resampled table.
corr_resamp = df_resamp_enc[corr_columns].corr()

plt.figure(figsize=(14, 12))
ax = sns.heatmap(
    data=corr_resamp,
    vmin=-1,
    vmax=1,
    cmap="coolwarm",
    annot=True,
    linewidths=0.5,
)

In [None]:
# Pairwise plots of the numeric features, coloured by target class.
df_all_pair = df_all_enc[corr_columns]
sns.pairplot(data=df_all_pair, hue="target")

In [None]:
# Elevation histogram per target class; common_norm=False normalises each
# class on its own, so the two outlines are comparable despite class imbalance.
ax = sns.histplot(
    data=df_all,
    x="elev",
    hue="target",
    stat="proportion",
    common_norm=False,
    element="poly",
)
ax.set_xlabel("Elevation")
ax.set_title("Elevation Distribution in Available (0) and Used (1) Areas")

In [None]:
# NDMI histogram per target class, each class normalised on its own.
ax = sns.histplot(
    data=df_all,
    x="NDMI",
    hue="target",
    stat="proportion",
    common_norm=False,
    element="poly",
)
#ax.set(xlim = [-0.99, 1], ylim = [0, 0.05])
ax.set_title("NDMI Distribution in Available (0) and Used (1) Areas")



In [None]:
# NDVI histogram per target class, each class normalised on its own.
ax = sns.histplot(
    data=df_all,
    x="NDVI",
    hue="target",
    stat="probability",
    common_norm=False,
    element="poly",
)
ax.set_title("NDVI Distribution in Available (0) and Used (1) Areas")



In [None]:
# Scatter of NDMI against elevation over all rows.
sns.scatterplot(data = df_all, x = "NDMI", y = "elev")

In [None]:
# Summary statistics of the rows with target == 1.
df_all[df_all["target"] == 1].describe()

In [None]:
# Summary statistics of the rows with target == 0.
df_all[df_all["target"] == 0].describe()

In [None]:
# Number of rows (non-null soil values) per target / vegetation combination.
df_all.groupby(["target", "veg"])["soil"].count()

In [None]:
# Inspect the binned aspect column.
df_all.aspect_bin

In [None]:
x, y = 'veg', 'target'

# Share of each vegetation class within each target group, in percent.
veg_share = df_all.groupby(y)[x].value_counts(normalize=True)
veg_percent = (veg_share * 100).rename('percent').reset_index()

# catplot returns a FacetGrid; the setters below are applied to the grid.
ax = sns.catplot(data=veg_percent, x=x, y='percent', hue=y, kind='bar')
ax.set(
    xlabel="Vegetation",
    ylabel="Percent",
    title="Vegetation Distribution in Available (0) and Used (1) Areas",
)
ax.set_xticklabels(rotation=30, horizontalalignment="right")

In [None]:
x, y = 'soil', 'target'

# Share of each soil class within each target group, in percent.
soil_share = df_all.groupby(y)[x].value_counts(normalize=True)
soil_percent = (soil_share * 100).rename('percent').reset_index()

# catplot returns a FacetGrid; the setters below are applied to the grid.
ax = sns.catplot(data=soil_percent, x=x, y='percent', hue=y, kind='bar')
ax.set(xlabel="soil")
ax.set_xticklabels(rotation=30, horizontalalignment="right")

In [None]:
# Plain-object copy of the categorical aspect bins.
# NOTE(review): presumably so the grouped value_counts / catplot treat the
# bins as ordinary strings rather than a categorical; confirm.
df_all["aspect_bin_obj"] = df_all.aspect_bin.astype("object")

In [None]:
x, y = 'aspect_bin_obj', 'target'

# Share of each aspect bin within each target group, in percent.
aspect_share = df_all.groupby(y)[x].value_counts(normalize=True)
aspect_percent = (aspect_share * 100).rename('percent').reset_index()

# catplot returns a FacetGrid; the setter below is applied to the grid.
ax = sns.catplot(data=aspect_percent, x=x, y='percent', hue=y, kind='bar')
ax.set(xlabel="Aspect")
#ax.set_xticklabels(rotation = 30, horizontalalignment = "right")

In [None]:
# Absolute row counts per vegetation class, split by target.
ax = sns.countplot(data=df_all, x="veg", hue="target")
ax.set_xlabel("vegetation")
# Re-apply the existing tick labels, rotated so that long names do not overlap.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=30, ha="right")

None

In [None]:
# Absolute row counts per soil class, split by target.
sns.countplot(data = df_all, x = "soil", hue = "target")


In [None]:
# Absolute row counts per aspect bin, split by target.
sns.countplot(data = df_all, x = "aspect_bin", hue = "target")


In [None]:
# Absolute row counts per vegetation class, split by soil class.
sns.countplot(data = df_all, x = "veg", hue = "soil")


## __Baseline Model__

In [None]:
# Inspect the aspect_bin column still present in the encoded table
# (it is dropped from the feature matrix in the next cell).
df_all_enc.aspect_bin

In [None]:
# Feature matrix: every column from position 5 onwards, minus the target and
# the columns that are not used as predictors.
not_predictors = ["target", "geometry", "aspect_bin", "area", "timestamp"]
X = df_all_enc.iloc[:, 5:].drop(columns=not_predictors)
y = df_all["target"]

In [None]:
# Hold out 25 % of the rows for testing; stratifying on y keeps the class
# ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=RSEED,
)

In [None]:
# Baseline model: a decision tree with default hyperparameters.
# random_state is pinned to RSEED (as for the train/test split) because the
# tree permutes the features randomly at each split; without a fixed seed the
# fitted tree, and therefore the scores below, can change between runs.
dsc_tree = DecisionTreeClassifier(random_state=RSEED)
dsc_tree.fit(X_train, y_train)

# Predictions on both parts, to compare training and test performance.
y_pred_train = dsc_tree.predict(X_train)
y_pred = dsc_tree.predict(X_test)

In [None]:
# Confusion matrix on the training data (rows: actual, columns: predicted).
results = confusion_matrix(y_true=y_train, y_pred=y_pred_train)
print(results)

In [None]:
# Confusion matrix on the held-out test data (rows: actual, columns: predicted).
results_test = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(results_test)

In [None]:
# fmt="d" prints the integer counts in full; seaborn's default ".2g" format
# would abbreviate counts of 100 or more to scientific notation (e.g. 1.2e+03).
ax = sns.heatmap(results_test, annot=True, fmt="d", cmap="Blues")
ax.set_title('Seaborn Confusion Matrix with labels\n\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')

# Tick labels: the list must follow the sorted class order of the matrix
# (0 -> 'False', 1 -> 'True').
ax.xaxis.set_ticklabels(['False', 'True'])
ax.yaxis.set_ticklabels(['False', 'True'])

# Display the confusion-matrix figure.
plt.show()

In [None]:
# Report the test-set scores of the baseline tree.
test_metrics = (("Accuracy: ", accuracy_score), ("F1 score: ", f1_score))
for metric_label, metric_fn in test_metrics:
    print(metric_label, metric_fn(y_test, y_pred))

In [None]:
# Raw feature importances of the fitted tree, in the column order of X_train.
dsc_tree.feature_importances_

In [None]:
# Feature importances labelled by column name, largest first, as a bar chart.
feat_importances = pd.DataFrame(
    {"Importance": dsc_tree.feature_importances_},
    index=X_train.columns,
).sort_values(by="Importance", ascending=False)
feat_importances.plot(kind="bar", figsize=(8, 6))