## Importing libraries
---------------

In [1]:
import pandas as pd
import pathlib
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from functools import reduce

## Reading Files
---------------

In [2]:
# Locate every CABra attribute CSV below the data directory.
# `rglob` returns a one-shot generator whose order depends on the filesystem,
# so it is materialised into a sorted list here: the reading cell can then be
# re-run without silently iterating over nothing, and the files (and hence the
# merge order of their columns) come back in a stable order.
path_files = pathlib.Path('CABRa_attributes_csv/')
cabra_attributes_files = sorted(path_files.rglob('*.csv'))

In [3]:
# Load every attribute table into memory, one DataFrame per CSV file.
# The files are read with ';' as the field delimiter.
dfs = []
for csv_path in cabra_attributes_files:
    print(csv_path)  # echo the path so the set of loaded files is visible
    df = pd.read_csv(csv_path, delimiter=';', engine='python')
    dfs.append(df)

CABRa_attributes_csv\CABra_dataset _Additional_attributes.csv
CABRa_attributes_csv\CABra_dataset _Climate_attributes.csv
CABRa_attributes_csv\CABra_dataset_Groundwater_attributes.csv
CABRa_attributes_csv\CABra_dataset_Hydrologic_disturbance_attributes.csv
CABRa_attributes_csv\CABra_dataset_Land_cover_attributes.csv
CABRa_attributes_csv\CABra_dataset_Streamflow_attributes.csv
CABRa_attributes_csv\cabra_general_attributes.csv
CABRa_attributes_csv\cabra_Geology_attributes.csv
CABRa_attributes_csv\CABra_soil_attributes.csv
CABRa_attributes_csv\CABra_topography_attributes.csv


## Creating Dataset and preparing for PCA
-----------------------

In [4]:
# Fold all attribute tables into one wide table, joining on the two
# catchment identifiers. The outer join keeps a catchment even when it is
# absent from some of the tables.
# NOTE: non-key columns that occur in more than one table are kept twice
# with pandas' default '_x' / '_y' suffixes.
merge_keys = ['CABra ID', 'ANA ID']


def _merge_on_ids(left, right):
    """Outer-join two attribute tables on the catchment identifier columns."""
    return pd.merge(left, right, on=merge_keys, how='outer')


df_merged = reduce(_merge_on_ids, dfs)

In [5]:
# Preview the first rows of the merged table as a quick check of the merge.
df_merged.head()

Unnamed: 0,CABra ID,ANA ID,longitude_centroid,latitude_centroid,dist_coast,clim_p,clim_tmin,clim_tmax,clim_rh,clim_wind,...,soil_carbon,soil_bulk,soil_depth,catch_area,elev_mean,elev_min,elev_max,elev_gauge,catch_slope,catch_order
0,1,13880000,-68.028,-8.989,1407.55,5.64,21.61,30.48,83.17,0.82,...,34.439,25.456,1.335,243.410.279,178.97,44.65,564.03,44.65,3.14,7
1,2,14110000,-66.695,2.016,1009.09,8.88,21.83,28.6,86.41,0.42,...,21.926,17.609,1.256,76.898.617,222.97,80.4,2892.44,80.4,4.72,6
2,3,14230000,-68.936,1.573,1009.7,8.52,22.18,29.67,85.71,0.6,...,23.241,20.53,1.219,24.018.785,151.47,80.86,952.2,80.86,2.81,4
3,4,14250000,-67.305,1.734,1095.42,8.86,22.0,29.13,86.26,0.49,...,22.725,18.612,1.235,127.540.913,184.31,74.32,2892.44,74.32,3.71,6
4,5,14260000,-70.894,1.201,964.52,7.79,21.91,28.8,85.44,0.67,...,22.066,20.859,1.248,41.762.638,206.08,85.09,615.36,85.09,2.3,5


In [35]:
# List every column of the merged table (all CABra attribute files combined)
df_merged.columns

Index(['CABra ID', 'ANA ID', 'longitude_centroid', 'latitude_centroid',
       'dist_coast', 'clim_p', 'clim_tmin', 'clim_tmax', 'clim_rh',
       'clim_wind', 'clim_srad', 'clim_et', 'clim_pet', 'aridity_index',
       'p_seasonality', 'clim_quality', 'aquif_name', 'aquif_type',
       'catch_wtd', 'catch_hand', 'hand_class', 'dist_urban', 'cover_urban_x',
       'cover_crops_x', 'res_number', 'res_area', 'res_volume',
       'res_regulation', 'water_demand', 'hdisturb_index', 'cover_main',
       'cover_bare', 'cover_forest', 'cover_crops_y', 'cover_grass',
       'cover_moss', 'cover_shrub', 'cover_urban_y', 'cover_snow',
       'cover_waterp', 'cover_waters', 'ndvi_djf', 'ndvi_mam', 'ndvi_jja',
       'ndvi_son', 'q_mean', 'q_1', 'q_5', 'q_95', 'q_99', 'q_lf', 'q_ld',
       'q_hf', 'q_hd', 'q_hfd', 'q_zero', 'q_cv', 'q_lcv', 'q_hcv',
       'q_elasticity', 'fdc_slope', 'baseflow_index', 'runoff_coef',
       'longitude', 'latitude', 'gauge_hreg', 'gauge_biome', 'gauge_state',
    

In [13]:
# 'catch_area' is read as text because '.' is used both as the thousands
# separator and as the decimal mark (e.g. '243.410.279'). Strip every dot,
# parse the digits as an integer and divide by 1000 to restore the three
# decimal places, which leaves the column as float.
#
# regex=False makes the '.' a literal dot on every pandas version (as a regex
# it would match any character). The dtype guard makes the cell safe to
# re-run: once the column is numeric the `.str` accessor would raise.
if not pd.api.types.is_numeric_dtype(df_merged['catch_area']):
    df_merged['catch_area'] = (
        df_merged['catch_area']
        .str.replace('.', '', regex=False)
        .astype('int64')
        / 1000
    )

In [18]:
# Candidate CABra columns considered for the analysis. The order is kept
# because later cells derive their variable order from this list.
variables = [
    'dist_coast',
    'aridity_index',
    'p_seasonality',
    'aquif_name',
    'aquif_type',
    'catch_wtd',
    'catch_hand',
    'hand_class',
    'res_area',
    'hdisturb_index',
    'q_elasticity',
    'baseflow_index',
    'gauge_hreg',
    'gauge_biome',
    'gauge_state',
    'catch_lith',
    'sub_porosity',
    'sub_permeability',
    'sub_hconduc',
    'soil_type',
    'soil_textclass',
    'elev_mean',
    'catch_slope',
    'catch_order',
]

In [36]:
# Keep only the numeric candidates, in their original order; the scaler and
# PCA below cannot take the text/categorical columns.
# `select_dtypes(include='number')` asks pandas for numeric dtypes directly
# instead of searching the dtype's name for 'int' / 'float', which misses
# differently-spelled dtypes such as 'Int64' and would wrongly accept any
# non-numeric dtype whose name merely contains 'int'.
number_variables = (
    df_merged[variables]
    .select_dtypes(include='number')
    .columns
    .tolist()
)

In [37]:
# Show which of the pre-selected variables were kept as numeric.
number_variables

['dist_coast',
 'aridity_index',
 'p_seasonality',
 'catch_wtd',
 'catch_hand',
 'res_area',
 'hdisturb_index',
 'q_elasticity',
 'baseflow_index',
 'sub_porosity',
 'sub_permeability',
 'sub_hconduc',
 'elev_mean',
 'catch_slope',
 'catch_order']

In [22]:
# Standardise the numeric variables before PCA.
# The fitted scaler stays available under `scaler`.
scaler = StandardScaler()
X = scaler.fit_transform(df_merged[number_variables])

In [41]:
# Fit a PCA with as many components as there are input variables and
# project the standardised samples onto those components.
n_features = len(number_variables)
pca = PCA(n_components=n_features)
X_new = pca.fit_transform(X)

In [42]:
# Cumulative explained-variance ratio: entry k is the share of the total
# variance captured by the first k+1 principal components.
np.cumsum(pca.explained_variance_ratio_)

array([0.27763393, 0.43708579, 0.55017914, 0.64985548, 0.74024437,
       0.79615541, 0.84618548, 0.88420249, 0.91846367, 0.94590118,
       0.96834024, 0.98711237, 0.99690331, 1.        , 1.        ])

In [27]:
# Singular values of the fitted components, one per component.
# NOTE(review): in the recorded output the last value is ~1e-14 (effectively
# zero) — presumably one input variable is a linear combination of the
# others; confirm and consider dropping the redundant variable.
pca.singular_values_

array([5.53255284e+01, 4.19279953e+01, 3.53108212e+01, 3.31501392e+01,
       3.15679818e+01, 2.48277907e+01, 2.34857727e+01, 2.04728496e+01,
       1.94352649e+01, 1.73924856e+01, 1.57286552e+01, 1.43862012e+01,
       1.03896629e+01, 5.84302855e+00, 2.74382360e-14])

In [28]:
# For every principal component, find the variable with the largest absolute
# loading (`most_important`, as a column index into the PCA input) and the
# size of that loading (`value`).
abs_loadings = np.abs(pca.components_)
n_pcs = abs_loadings.shape[0]
most_important = [row.argmax() for row in abs_loadings]
value = [row.max() for row in abs_loadings]

In [55]:
# For each principal component print its explained-variance ratio, then the
# five variables with the largest absolute loadings (smallest of the five
# first). Each line shows the signed loading and the loading multiplied by
# the component's explained-variance ratio.
for pc_idx in range(n_pcs):
    ratio = pca.explained_variance_ratio_[pc_idx]
    loadings = pca.components_[pc_idx]
    print(f'PCA #{pc_idx}\t | {ratio:.2f}')
    top_five = np.argsort(np.abs(loadings))[-5:]
    for var_idx in top_five:
        loading = loadings[var_idx]
        print(f'{number_variables[var_idx]}\t: {loading:.2f}\t|{loading*ratio:.2f}')
    print()

PCA #0	 | 0.28
elev_mean	: 0.30	|0.08
sub_porosity	: -0.33	|-0.09
catch_wtd	: -0.39	|-0.11
catch_hand	: 0.42	|0.12
catch_slope	: 0.46	|0.13

PCA #1	 | 0.16
q_elasticity	: 0.25	|0.04
p_seasonality	: -0.28	|-0.05
hdisturb_index	: -0.30	|-0.05
sub_hconduc	: -0.54	|-0.09
sub_permeability	: -0.54	|-0.09

PCA #2	 | 0.11
p_seasonality	: -0.27	|-0.03
dist_coast	: -0.36	|-0.04
hdisturb_index	: 0.36	|0.04
res_area	: 0.45	|0.05
baseflow_index	: -0.47	|-0.05

PCA #3	 | 0.10
q_elasticity	: -0.37	|-0.04
sub_hconduc	: -0.39	|-0.04
sub_permeability	: -0.39	|-0.04
hdisturb_index	: 0.42	|0.04
res_area	: 0.49	|0.05

PCA #4	 | 0.09
catch_order	: 0.23	|0.02
baseflow_index	: 0.32	|0.03
q_elasticity	: 0.36	|0.03
p_seasonality	: 0.42	|0.04
aridity_index	: 0.64	|0.06

PCA #5	 | 0.06
baseflow_index	: -0.26	|-0.01
catch_wtd	: 0.28	|0.02
q_elasticity	: 0.30	|0.02
elev_mean	: 0.32	|0.02
catch_order	: -0.69	|-0.04

PCA #6	 | 0.05
aridity_index	: 0.23	|0.01
hdisturb_index	: 0.23	|0.01
res_area	: -0.29	|-0.01
catch_w

In [33]:
# Map each component's dominant-variable index back to its column name.
most_important_names = [number_variables[var_idx] for var_idx in most_important]

In [34]:
# One line per component:
# dominant variable |explained-variance ratio |absolute loading
for name, loading, ratio in zip(most_important_names, value, pca.explained_variance_ratio_):
    print(name, f'|{ratio:.2f}', f'|{loading:.2f}')

catch_slope |0.28 |0.46
sub_permeability |0.16 |0.54
baseflow_index |0.11 |0.47
res_area |0.10 |0.49
aridity_index |0.09 |0.64
catch_order |0.06 |0.69
sub_porosity |0.05 |0.73
dist_coast |0.04 |0.54
q_elasticity |0.03 |0.51
elev_mean |0.03 |0.60
res_area |0.02 |0.57
p_seasonality |0.02 |0.56
catch_hand |0.01 |0.67
catch_slope |0.00 |0.78
sub_hconduc |0.00 |0.71
