# Predict extreme snowmelt events in climate change scenarios

In [15]:
import pandas as pd
import numpy as np
import geopandas as gpd
from collections import Counter
import ast
import dateutil.parser as parser

from imblearn.under_sampling import NearMiss

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV, KFold, cross_validate, PredefinedSplit
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import make_scorer, precision_recall_curve, auc, classification_report
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

from matplotlib import pyplot as plt


In [2]:
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction.settings import MinimalFCParameters
from tsfresh.utilities.dataframe_functions import roll_time_series


### Train best model from previous analysis

In [9]:
## Recreate the rolled dataframe and extract tsfresh's minimal feature set
## from every rolling window (10 days, time lag zero).
## NOTE: with max_timeshift=10 and min_timeshift=9 the windows hold 10 or 11
## rows, which is what the flow__length feature of the result reports.
all_data_clean = pd.read_csv('../all_data_clean.csv')

rolling_input = all_data_clean[['date', 'flow_site_id', 'flow', 'binary']]
df_rolled = roll_time_series(
    rolling_input,
    column_id='flow_site_id',
    column_sort='date',
    max_timeshift=10,
    min_timeshift=9,
    n_jobs=20,
)

# Only the flow values (plus the window id and the sort column) go into the
# feature extraction; the label and the site id are dropped first.
window_flows = df_rolled.drop(columns=['binary', 'flow_site_id'])
X_features_all = extract_features(
    window_flows,
    column_id='id',
    column_sort='date',
    default_fc_parameters=MinimalFCParameters(),
    disable_progressbar=False,
    n_jobs=20,
)
X_features_all.head()


Rolling: 100%|██████████| 100/100 [00:38<00:00,  2.59it/s]
Feature Extraction: 100%|██████████| 100/100 [00:59<00:00,  1.67it/s]


In [11]:
# Keep each window's identifier (the index of the extracted features) as a
# regular column so it can be used as the join key below.
X_features_all['unique_id'] = X_features_all.index

# Build the matching (site id, 'YYYY-MM-DD') key on the raw observations.
# The offset is zero days here (time lag zero).
lagged_dates = pd.to_datetime(all_data_clean['date']) + pd.Timedelta(days=0)
all_data_clean['shifted_date'] = lagged_dates.dt.strftime('%Y-%m-%d')
all_data_clean['unique_id'] = list(
    zip(all_data_clean['flow_site_id'], all_data_clean['shifted_date']))
all_data_clean = all_data_clean.dropna()

# Attach the 'binary' label to every window, then index by the identifier
# again and discard windows that ended up with any missing value.
label_lookup = all_data_clean[['binary', 'unique_id']]
X_features_all = X_features_all.reset_index(drop=True).merge(
    label_lookup, how='left', on='unique_id')
X_features_all = X_features_all.set_index(
    X_features_all['unique_id'], drop=True).dropna()
X_features_all.head()


Unnamed: 0_level_0,flow__sum_values,flow__median,flow__mean,flow__length,flow__standard_deviation,flow__variance,flow__root_mean_square,flow__maximum,flow__minimum,unique_id,binary
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"(114.0, 1909-01-10)",77.6,7.7,7.76,10.0,0.18,0.0324,7.762087,8.0,7.4,"(114.0, 1909-01-10)",0
"(114.0, 1909-01-11)",85.0,7.7,7.727273,11.0,0.200413,0.040165,7.729871,8.0,7.4,"(114.0, 1909-01-11)",0
"(114.0, 1909-01-12)",84.4,7.7,7.672727,11.0,0.200413,0.040165,7.675344,8.0,7.4,"(114.0, 1909-01-12)",0
"(114.0, 1909-01-13)",83.8,7.7,7.618182,11.0,0.184973,0.034215,7.620427,8.0,7.4,"(114.0, 1909-01-13)",0
"(114.0, 1909-01-14)",83.2,7.7,7.563636,11.0,0.149379,0.022314,7.565111,7.7,7.4,"(114.0, 1909-01-14)",0


In [43]:
## Balance the classes with NearMiss (version 3) undersampling.
## Only 'flow__sum_values', 'flow__standard_deviation', 'flow__variance' and
## 'flow__minimum' are kept as model features.
y1 = X_features_all['binary']
unused_columns = [
    'binary', 'unique_id', 'flow__median', 'flow__mean', 'flow__length',
    'flow__root_mean_square', 'flow__maximum',
]
undersample = NearMiss(version=3, n_neighbors=3)
X_under, y_under = undersample.fit_resample(
    X_features_all.drop(columns=unused_columns), y1)

# sample_indices_ holds the row *positions* of the retained samples, so the
# identifiers have to be looked up positionally with .iloc.  Plain [] on this
# tuple-labelled Series only works through pandas' deprecated positional
# fallback for integer keys.
sampled_ids = X_features_all['unique_id'].iloc[undersample.sample_indices_]
X_under.index = sampled_ids
y_under.index = sampled_ids
Counter(y_under)


Counter({0: 5516, 1: 5516})

In [44]:
## Fit the random forest with the parameters selected in the previous
## analysis: {'max_depth': 1, 'max_features': 2, 'n_estimators': 500}
rf_settings = dict(
    n_estimators=500,
    max_depth=1,
    max_features=2,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)
clf = RandomForestClassifier(**rf_settings)
clf.fit(X_under, y_under)

# Present-day baseline: predictions for the same rows the model was fitted on.
present_all = clf.predict(X_under)
present_all

array([0, 0, 0, ..., 1, 0, 0])

In [None]:
## check feature importance

### Create scenario data and predict extreme snowmelt for those

In [49]:
## Attach the per-site scenario ranges (RCP 2.6 / 4.5 / 8.5 columns) to the
## undersampled data, joining on the site id.
scenarios = gpd.read_file('../flow_sites_scenarios/flow_sites_scenarios.shp')
# Cast to float so the key has the same type as the site ids in the index tuples.
scenarios['site_id'] = scenarios['site_id'].astype(float)

# Split the (site id, date) index tuples into two columns.
# NOTE(review): this adds 'site_id' and 'date' to X_under itself, so X_under
# no longer holds only the four model features after this cell has run.
X_under[['site_id', 'date']] = X_under.index.to_list()

scenario_ranges = scenarios[['site_id', 'rcp26_rang', 'rcp45_rang', 'rcp85_rang']]
X_under_scenarios = X_under.drop(columns=['date']).merge(
    scenario_ranges, how='left', on='site_id')

# A column-on-column merge discards the left index; put the identifiers back
# before dropping the rows whose site has no scenario values.
X_under_scenarios.index = X_under.index
X_under_scenarios = X_under_scenarios.dropna()
X_under_scenarios


Unnamed: 0_level_0,flow__sum_values,flow__standard_deviation,flow__variance,flow__minimum,site_id,rcp26_rang,rcp45_rang,rcp85_rang
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"(1315.0, 1959-10-28)",4240.000,329.537517,108594.975207,74.0000,1315.0,3.08,7.80,13.65
"(2012.0, 1992-06-06)",5011.897,325.292310,105815.087258,64.9905,2012.0,3.08,7.80,13.65
"(1315.0, 1959-10-31)",4267.000,327.285379,107115.719008,95.0000,1315.0,3.08,7.80,13.65
"(1909.0, 1967-10-28)",2021.000,233.188215,54376.743802,41.0000,1909.0,3.38,7.69,13.49
"(1909.0, 1967-10-24)",3710.000,230.753348,53247.107438,41.0000,1909.0,3.38,7.69,13.49
...,...,...,...,...,...,...,...,...
"(2372.0, 1995-02-03)",371.000,7.886593,62.198347,26.0000,2372.0,-4.16,-1.04,-1.84
"(2372.0, 1995-02-04)",383.000,8.515421,72.512397,26.0000,2372.0,-4.16,-1.04,-1.84
"(2372.0, 1996-01-11)",38.100,0.308288,0.095041,3.0000,2372.0,-4.16,-1.04,-1.84
"(2372.0, 1998-02-05)",111.900,2.383310,5.680165,6.3000,2372.0,-4.16,-1.04,-1.84


In [50]:
# Restrict the labels to the identifiers that are still present after the
# scenario merge dropped rows without scenario values.
has_scenario = y_under.index.isin(X_under_scenarios.index)
y_under = y_under.loc[has_scenario]
Counter(y_under)

Counter({0: 5408, 1: 5369})

In [51]:
# Sanity check: number of missing values per column after the dropna above.
missing_per_column = X_under_scenarios.isna().sum()
missing_per_column

flow__sum_values            0
flow__standard_deviation    0
flow__variance              0
flow__minimum               0
site_id                     0
rcp26_rang                  0
rcp45_rang                  0
rcp85_rang                  0
dtype: int64

In [56]:
## predict extreme snowmelt for all data after changing sum_values and minimum by range value (for all 3 scenarios)
def _apply_scenario_range(features, range_column,
                          scaled_columns=('flow__sum_values', 'flow__minimum')):
    """Return the model features adjusted by one scenario's range value.

    Args:
        features: DataFrame with the model features plus 'site_id' and the
            three scenario range columns.
        range_column: Name of the scenario column to apply; its values are
            treated as a percentage change (they are divided by 100).
        scaled_columns: Feature columns that receive the percentage change.

    Returns:
        A new DataFrame without 'site_id' and without any scenario range
        column, in which every scaled column holds
        ``value + value * range / 100``.  ``features`` is not modified.
    """
    percent_change = features[range_column]
    adjusted = features.drop(
        columns=['site_id', 'rcp26_rang', 'rcp45_rang', 'rcp85_rang'])
    for column in scaled_columns:
        # Same expression as before the refactoring, so results are unchanged.
        adjusted[column] = features[column] + (
            features[column] * percent_change / 100)
    return adjusted


# NOTE(review): flow__standard_deviation and flow__variance are left as they
# are, following the stated intent of this cell; a uniform percentage change
# of the flow would scale them as well -- confirm this is intended.
X_under_scenarios_26 = _apply_scenario_range(X_under_scenarios, 'rcp26_rang')
X_under_scenarios_45 = _apply_scenario_range(X_under_scenarios, 'rcp45_rang')
X_under_scenarios_85 = _apply_scenario_range(X_under_scenarios, 'rcp85_rang')


In [57]:
# Predict extreme snowmelt for each scenario's adjusted features, in the
# order RCP 2.6, RCP 4.5, RCP 8.5.
scenario_frames = (
    X_under_scenarios_26,
    X_under_scenarios_45,
    X_under_scenarios_85,
)
rcp26_all, rcp45_all, rcp85_all = [
    clf.predict(frame) for frame in scenario_frames
]

In [63]:
# Class distribution of the RCP 2.6 predictions.
rcp26_counts = Counter(rcp26_all)
rcp26_counts

Counter({0: 7349, 1: 3428})

### Remove data outside training range and predict again

In [None]:
## calculate min and sum_values for original training data and remove scenario data that is not within the range


## predict extreme snowmelt with reduced data

### Compare results of different scenarios

In [None]:
## compare the present-day predictions with the scenario predictions, once for all data and once for the data restricted to the training range