# Extract time series parameters from flow and use to predict extreme snowmelt

In [226]:
import pandas as pd
import numpy as np
from collections import Counter
import ast
import dateutil.parser as parser

from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV, KFold, cross_validate, PredefinedSplit
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import make_scorer, precision_recall_curve, auc, classification_report
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

from matplotlib import pyplot as plt


In [227]:
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction.settings import MinimalFCParameters
from tsfresh.utilities.dataframe_functions import roll_time_series


In [228]:
# build scorer function
def auc_pr_score(y_true, y_pred):
	precision, recall, _ = precision_recall_curve(y_true, y_pred)
	return auc(recall, precision)


auc_pr = make_scorer(auc_pr_score, greater_is_better=True)

N_DAYS = 30
TIME_LAG = 10


In [229]:
## Define functions

def random_forest_site(X_under, y_under, results, options):
	indx = X_under.index.unique()
	sites = [x[0] for x in indx]
	sites = list(set(sites))
	sites_train = sites[:len(sites)//10*6]
	sites_test = sites[len(sites)//10*6:]
	idx_train = [x for x in indx if x[0] in sites_train]
	idx_test = [x for x in indx if x[0] in sites_test]
	X_filtered_train = X_under[X_under.index.isin(idx_train)]
	X_filtered_test = X_under[X_under.index.isin(idx_test)]
	y_train = y_under[y_under.index.isin(idx_train)]
	y_test = y_under[y_under.index.isin(idx_test)]

	X_all = pd.concat([X_filtered_train, X_filtered_test]).reset_index(drop=True)
	y_all = pd.concat([y_train, y_test]).reset_index(drop=True)
	clf = RandomForestClassifier(n_jobs=-1, random_state=42, verbose=0)

	split_index = [-1 if x in X_filtered_train.index else 0 for x in X_under.index]
	ps = PredefinedSplit(test_fold=split_index)

	param_grid = {
		'max_depth': (1, 5, 10, 25),
		'n_estimators': (100, 500, 750, 1500),
		'max_features': (2, 3, 5, 10)}

	gs = GridSearchCV(clf, param_grid=param_grid, cv=ps,
					scoring=auc_pr, n_jobs=-1, verbose=0)
	gs.fit(X_all, y_all)
	print(gs.best_params_, gs.best_score_)
	results = results.append({'n_days': N_DAYS, 'time_lag': TIME_LAG, 'eval_type': 'site', 'param_type': options['param_type'], 'year': options['year'],
                           'params': gs.best_params_, 'score': gs.best_score_, 'model': 'Random Forest', 
						   'variables': X_under.columns, 'n_features': len(X_under.columns), 'n_samples': len(X_under)}, ignore_index=True)
	
	return results

def random_forest_time(X_under, y_under, results, options):
	X_filtered_sorted = X_under.sort_index(key=lambda d: d.map(lambda x: x[1]))
	y_under_sorted = y_under.sort_index(key=lambda d: d.map(lambda x: x[1]))
	tscv = TimeSeriesSplit(n_splits=5)

	param_grid = {
		'max_depth': (1, 5, 10, 25),
		'n_estimators': (100, 500, 750, 1500),
		'max_features': (2, 3, 5, 10)}

	clf = RandomForestClassifier(n_jobs=-1, random_state=42, verbose=0)

	gs = GridSearchCV(clf, param_grid=param_grid, cv=tscv,
					scoring=auc_pr, n_jobs=-1, verbose=0)
	gs.fit(X_filtered_sorted, y_under_sorted)
	
	results = results.append({'n_days': N_DAYS, 'time_lag': TIME_LAG, 'eval_type': 'time', 'param_type': options['param_type'], 'year': options['year'], 
                           'params': gs.best_params_, 'score': gs.best_score_, 'model': 'Random Forest', 
                           'variables': X_under.columns, 'n_features': len(X_under.columns), 'n_samples': len(X_under)}, ignore_index=True)
	print(gs.best_params_, gs.best_score_)

	return results


def gradient_boost_site(X_under, y_under, results, options):
	indx = X_under.index.unique()
	sites = [x[0] for x in indx]
	sites = list(set(sites))
	sites_train = sites[:len(sites)//10*6]
	sites_test = sites[len(sites)//10*6:]
	idx_train = [x for x in indx if x[0] in sites_train]
	idx_test = [x for x in indx if x[0] in sites_test]
	X_filtered_train = X_under[X_under.index.isin(idx_train)]
	X_filtered_test = X_under[X_under.index.isin(idx_test)]
	y_train = y_under[y_under.index.isin(idx_train)]
	y_test = y_under[y_under.index.isin(idx_test)]

	X_all = pd.concat([X_filtered_train, X_filtered_test]).reset_index(drop=True)
	y_all = pd.concat([y_train, y_test]).reset_index(drop=True)
	clf = HistGradientBoostingClassifier(random_state=42, verbose=0, early_stopping=False)

	split_index = [-1 if x in X_filtered_train.index else 0 for x in X_under.index]
	ps = PredefinedSplit(test_fold=split_index)

	param_grid = {'max_iter': (100, 1000, 1500),
               'learning_rate': (0.01, 0.1, 1),
               'max_depth': (1, 5, 10, 25, 50),
               }

	gs = GridSearchCV(clf, param_grid=param_grid, cv=ps,
                   scoring=auc_pr, n_jobs=-1, verbose=0)
	gs.fit(X_all, y_all)
	print(gs.best_params_, gs.best_score_)
	results = results.append({'n_days': N_DAYS, 'time_lag': TIME_LAG, 'eval_type': 'site', 'param_type': options['param_type'], 'year': options['year'],
                           'params': gs.best_params_, 'score': gs.best_score_, 'model': 'Gradient Boost', 
                           'variables': X_under.columns, 'n_features': len(X_under.columns), 'n_samples': len(X_under)}, ignore_index=True)

	return results


def gradient_boost_time(X_under, y_under, results, options):
	X_filtered_sorted = X_under.sort_index(key=lambda d: d.map(lambda x: x[1]))
	y_under_sorted = y_under.sort_index(key=lambda d: d.map(lambda x: x[1]))
	tscv = TimeSeriesSplit(n_splits=5)

	param_grid = {'max_iter': (100, 1000, 1500),
               'learning_rate': (0.01, 0.1, 1),
               'max_depth': (1, 5, 10, 25, 50),
               }
	clf = HistGradientBoostingClassifier(random_state=42, verbose=0, early_stopping=False)

	gs = GridSearchCV(clf, param_grid=param_grid, cv=tscv,
                   scoring=auc_pr, n_jobs=-1, verbose=0)
	gs.fit(X_filtered_sorted, y_under_sorted)

	results = results.append({'n_days': N_DAYS, 'time_lag': TIME_LAG, 'eval_type': 'time', 'param_type': options['param_type'], 'year': options['year'],
                           'params': gs.best_params_, 'score': gs.best_score_, 'model': 'Gradient Boost', 
                           'variables': X_under.columns, 'n_features': len(X_under.columns), 'n_samples': len(X_under)}, ignore_index=True)
	print(gs.best_params_, gs.best_score_)

	return results


def svm_site(X_under, y_under, results, options):
	indx = X_under.index.unique()
	sites = [x[0] for x in indx]
	sites = list(set(sites))
	sites_train = sites[:len(sites)//10*6]
	sites_test = sites[len(sites)//10*6:]
	idx_train = [x for x in indx if x[0] in sites_train]
	idx_test = [x for x in indx if x[0] in sites_test]
	X_filtered_train = X_under[X_under.index.isin(idx_train)]
	X_filtered_test = X_under[X_under.index.isin(idx_test)]
	y_train = y_under[y_under.index.isin(idx_train)]
	y_test = y_under[y_under.index.isin(idx_test)]

	X_all = pd.concat([X_filtered_train, X_filtered_test]).reset_index(drop=True)
	y_all = pd.concat([y_train, y_test]).reset_index(drop=True)

	split_index = [-1 if x in X_filtered_train.index else 0 for x in X_under.index]
	ps = PredefinedSplit(test_fold=split_index)

	scaler = StandardScaler()
	param_grid = {
               'svc__C': (1, 10, 100),
			   'svc__max_iter': (100, 1000, 1500, 2000),
			   'svc__dual': (True, False),
               }
	svc = LinearSVC(random_state=42, verbose=0)

	pipe = Pipeline([('scaler', scaler), ('svc', svc)])

	gs = GridSearchCV(pipe, param_grid=param_grid, cv=ps,
                   scoring=auc_pr, n_jobs=-1, verbose=0)
	gs.fit(X_all, y_all)
	print(gs.best_params_, gs.best_score_)
	results = results.append({'n_days': N_DAYS, 'time_lag': TIME_LAG, 'eval_type': 'site', 'param_type': options['param_type'], 'year': options['year'],
                           'params': gs.best_params_, 'score': gs.best_score_, 'model': 'SVC',
                           'variables': X_under.columns, 'n_features': len(X_under.columns), 'n_samples': len(X_under)}, ignore_index=True)

	return results


def svm_time(X_under, y_under, results, options):
	X_filtered_sorted = X_under.sort_index(key=lambda d: d.map(lambda x: x[1]))
	y_under_sorted = y_under.sort_index(key=lambda d: d.map(lambda x: x[1]))
	tscv = TimeSeriesSplit(n_splits=5)

	scaler = StandardScaler()
	param_grid = {
            'svc__C': (1, 10, 100),
            'svc__max_iter': (100, 1000, 1500, 2000),
         	'svc__dual': (True, False),
        }
	svc = LinearSVC(random_state=42, verbose=0)

	pipe = Pipeline([('scaler', scaler), ('svc', svc)])

	gs = GridSearchCV(pipe, param_grid=param_grid, cv=tscv,
                   scoring=auc_pr, n_jobs=-1, verbose=0)
	gs.fit(X_filtered_sorted, y_under_sorted)

	results = results.append({'n_days': N_DAYS, 'time_lag': TIME_LAG, 'eval_type': 'time', 'param_type': options['param_type'], 'year': options['year'],
                           'params': gs.best_params_, 'score': gs.best_score_, 'model': 'SVC',
                           'variables': X_under.columns, 'n_features': len(X_under.columns), 'n_samples': len(X_under)}, ignore_index=True)
	print(gs.best_params_, gs.best_score_)

	return results


## Create rolled dataframe 

In [230]:
all_data_clean = pd.read_csv('../all_data_clean.csv')

all_data_clean


Unnamed: 0,date,snow_site_id,snow_depth,depth_diff,binary,flow_site_id,temp_site_id,prec_site_id,flow,temp,prec
0,1965-01-14,75280,0.00,0.00,0,1878.0,74240.0,74240.0,0.0510,1.9,1.5
1,1965-01-15,75280,0.00,0.00,0,1878.0,74240.0,74240.0,0.0510,1.2,4.0
2,1965-01-16,75280,0.00,0.00,0,1878.0,74240.0,74240.0,0.0580,1.5,4.5
3,1965-01-20,75280,0.03,0.03,0,1878.0,74240.0,74240.0,0.0760,-0.9,0.0
4,1965-01-21,75280,0.03,0.03,0,1878.0,74240.0,74240.0,0.0760,-1.7,2.0
...,...,...,...,...,...,...,...,...,...,...,...
108249,2021-06-09,160970,0.00,0.00,0,1387.0,160970.0,160960.0,340.1524,17.2,0.0
108250,2021-06-23,160970,0.00,0.00,0,1387.0,160970.0,160960.0,354.6912,14.6,0.0
108251,2021-06-24,160970,0.00,0.00,0,1387.0,160970.0,160960.0,340.7535,17.3,0.0
108252,2021-06-30,160970,0.00,0.00,0,1387.0,160970.0,160960.0,269.8846,15.0,0.0


In [231]:
## Here can change parameters only once
df_rolled = roll_time_series(
    all_data_clean[['date', 'flow_site_id', 'prec', 'temp', 'binary']], column_id="flow_site_id", column_sort="date", max_timeshift=N_DAYS, min_timeshift=N_DAYS - 1, n_jobs=20)


Rolling: 100%|██████████| 100/100 [00:40<00:00,  2.46it/s]


In [232]:
df_rolled

Unnamed: 0,date,flow_site_id,prec,temp,binary,id
0,1909-01-01,114.0,0.3,-16.0,0,"(114.0, 1909-02-12)"
1,1909-01-02,114.0,0.0,-3.9,0,"(114.0, 1909-02-12)"
2,1909-01-03,114.0,0.0,-14.5,0,"(114.0, 1909-02-12)"
3,1909-01-04,114.0,0.0,-2.5,0,"(114.0, 1909-02-12)"
4,1909-01-05,114.0,0.0,-7.8,0,"(114.0, 1909-02-12)"
...,...,...,...,...,...,...
2045230,1998-04-16,2372.0,1.0,5.9,0,"(2372.0, 1998-04-29)"
2045231,1998-04-17,2372.0,0.0,5.6,0,"(2372.0, 1998-04-29)"
2045232,1998-04-22,2372.0,0.0,7.2,0,"(2372.0, 1998-04-29)"
2045233,1998-04-23,2372.0,0.0,9.7,0,"(2372.0, 1998-04-29)"


In [233]:
df_rolled.to_csv('../df_rolled_' + str(N_DAYS) + '.csv', index=False)

## Extract minimal timeseries features

In [234]:
df_rolled = pd.read_csv('../df_rolled_' + str(N_DAYS) + '.csv')
all_data_clean = pd.read_csv('../all_data_clean.csv')

In [235]:
df_rolled

Unnamed: 0,date,flow_site_id,prec,temp,binary,id
0,1909-01-01,114.0,0.3,-16.0,0,"(114.0, '1909-02-12')"
1,1909-01-02,114.0,0.0,-3.9,0,"(114.0, '1909-02-12')"
2,1909-01-03,114.0,0.0,-14.5,0,"(114.0, '1909-02-12')"
3,1909-01-04,114.0,0.0,-2.5,0,"(114.0, '1909-02-12')"
4,1909-01-05,114.0,0.0,-7.8,0,"(114.0, '1909-02-12')"
...,...,...,...,...,...,...
3336504,1998-04-16,2372.0,1.0,5.9,0,"(2372.0, '1998-04-29')"
3336505,1998-04-17,2372.0,0.0,5.6,0,"(2372.0, '1998-04-29')"
3336506,1998-04-22,2372.0,0.0,7.2,0,"(2372.0, '1998-04-29')"
3336507,1998-04-23,2372.0,0.0,9.7,0,"(2372.0, '1998-04-29')"


In [236]:
# extract timeseries features

X_features_all = extract_features(
	df_rolled.drop(["binary", "flow_site_id"], axis=1), column_id='id', column_sort='date',
	n_jobs=20, disable_progressbar=False, default_fc_parameters=MinimalFCParameters())


X_features_all.head()


Feature Extraction: 100%|██████████| 100/100 [01:16<00:00,  1.30it/s]


Unnamed: 0,prec__sum_values,prec__median,prec__mean,prec__length,prec__standard_deviation,prec__variance,prec__root_mean_square,prec__maximum,prec__minimum,temp__sum_values,temp__median,temp__mean,temp__length,temp__standard_deviation,temp__variance,temp__root_mean_square,temp__maximum,temp__minimum
"(114.0, '1909-02-12')",11.9,0.0,0.396667,30.0,0.762663,0.581656,0.859651,2.9,0.0,-314.2,-11.45,-10.473333,30.0,6.19876,38.424622,12.170264,3.2,-23.3
"(114.0, '1909-02-13')",11.9,0.0,0.383871,31.0,0.753528,0.567804,0.845672,2.9,0.0,-316.8,-10.9,-10.219355,31.0,6.25462,39.120271,11.981464,3.2,-23.3
"(114.0, '1909-02-14')",11.9,0.0,0.383871,31.0,0.753528,0.567804,0.845672,2.9,0.0,-302.5,-9.8,-9.758065,31.0,6.338046,40.170822,11.635749,3.2,-23.3
"(114.0, '1909-02-15')",11.9,0.0,0.383871,31.0,0.753528,0.567804,0.845672,2.9,0.0,-300.9,-9.8,-9.706452,31.0,6.391826,40.855442,11.62199,3.2,-23.3
"(114.0, '1909-02-16')",11.9,0.0,0.383871,31.0,0.753528,0.567804,0.845672,2.9,0.0,-296.5,-9.8,-9.564516,31.0,6.332382,40.099063,11.47079,3.2,-23.3


In [237]:
## Add binary response variable back based on unique id

X_features_all['unique_id'] = X_features_all.index
X_features_all['unique_id'] = X_features_all['unique_id'].apply(ast.literal_eval)

all_data_clean['shifted_date'] = pd.to_datetime(
    all_data_clean.date) + pd.Timedelta(days=TIME_LAG)
all_data_clean['shifted_date'] = all_data_clean['shifted_date'].dt.strftime(
    '%Y-%m-%d')
all_data_clean['unique_id'] = list(
    zip(all_data_clean.flow_site_id, all_data_clean.shifted_date))
all_data_clean = all_data_clean.dropna()

X_features_all = X_features_all.reset_index(drop=True)
	
X_features_all = pd.merge(X_features_all, all_data_clean[[
                          'binary', 'unique_id']], how='left', on='unique_id')
X_features_all = X_features_all.set_index(
    X_features_all['unique_id'], drop=True)
X_features_all = X_features_all.dropna()
X_features_all.head()


Unnamed: 0_level_0,prec__sum_values,prec__median,prec__mean,prec__length,prec__standard_deviation,prec__variance,prec__root_mean_square,prec__maximum,prec__minimum,temp__sum_values,temp__median,temp__mean,temp__length,temp__standard_deviation,temp__variance,temp__root_mean_square,temp__maximum,temp__minimum,unique_id,binary
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
"(114.0, 1909-02-12)",11.9,0.0,0.396667,30.0,0.762663,0.581656,0.859651,2.9,0.0,-314.2,-11.45,-10.473333,30.0,6.19876,38.424622,12.170264,3.2,-23.3,"(114.0, 1909-02-12)",0.0
"(114.0, 1909-02-13)",11.9,0.0,0.383871,31.0,0.753528,0.567804,0.845672,2.9,0.0,-316.8,-10.9,-10.219355,31.0,6.25462,39.120271,11.981464,3.2,-23.3,"(114.0, 1909-02-13)",0.0
"(114.0, 1909-02-14)",11.9,0.0,0.383871,31.0,0.753528,0.567804,0.845672,2.9,0.0,-302.5,-9.8,-9.758065,31.0,6.338046,40.170822,11.635749,3.2,-23.3,"(114.0, 1909-02-14)",0.0
"(114.0, 1909-02-15)",11.9,0.0,0.383871,31.0,0.753528,0.567804,0.845672,2.9,0.0,-300.9,-9.8,-9.706452,31.0,6.391826,40.855442,11.62199,3.2,-23.3,"(114.0, 1909-02-15)",0.0
"(114.0, 1909-02-16)",11.9,0.0,0.383871,31.0,0.753528,0.567804,0.845672,2.9,0.0,-296.5,-9.8,-9.564516,31.0,6.332382,40.099063,11.47079,3.2,-23.3,"(114.0, 1909-02-16)",0.0


In [238]:
X_features_all.to_csv('../df_extracted_min_' + str(N_DAYS) + '.csv', index=False)

## Undersample minimal timeseries feature dataset and run Random Forest model

In [239]:
X_features_all = pd.read_csv('../df_extracted_min_' + str(N_DAYS) + '.csv')


In [240]:
X_features_all['unique_id'] = X_features_all['unique_id'].apply(ast.literal_eval)
X_features_all.head()


Unnamed: 0,prec__sum_values,prec__median,prec__mean,prec__length,prec__standard_deviation,prec__variance,prec__root_mean_square,prec__maximum,prec__minimum,temp__sum_values,temp__median,temp__mean,temp__length,temp__standard_deviation,temp__variance,temp__root_mean_square,temp__maximum,temp__minimum,unique_id,binary
0,11.9,0.0,0.396667,30.0,0.762663,0.581656,0.859651,2.9,0.0,-314.2,-11.45,-10.473333,30.0,6.19876,38.424622,12.170264,3.2,-23.3,"(114.0, 1909-02-12)",0.0
1,11.9,0.0,0.383871,31.0,0.753528,0.567804,0.845672,2.9,0.0,-316.8,-10.9,-10.219355,31.0,6.25462,39.120271,11.981464,3.2,-23.3,"(114.0, 1909-02-13)",0.0
2,11.9,0.0,0.383871,31.0,0.753528,0.567804,0.845672,2.9,0.0,-302.5,-9.8,-9.758065,31.0,6.338046,40.170822,11.635749,3.2,-23.3,"(114.0, 1909-02-14)",0.0
3,11.9,0.0,0.383871,31.0,0.753528,0.567804,0.845672,2.9,0.0,-300.9,-9.8,-9.706452,31.0,6.391826,40.855442,11.62199,3.2,-23.3,"(114.0, 1909-02-15)",0.0
4,11.9,0.0,0.383871,31.0,0.753528,0.567804,0.845672,2.9,0.0,-296.5,-9.8,-9.564516,31.0,6.332382,40.099063,11.47079,3.2,-23.3,"(114.0, 1909-02-16)",0.0


In [241]:
y1 = X_features_all['binary']
Counter(y1)

Counter({0.0: 93399, 1.0: 4680})

In [242]:
## undersample

from imblearn.under_sampling import NearMiss

undersample = NearMiss(version=3, n_neighbors=3)
X_under, y_under = undersample.fit_resample(
    X_features_all.drop(columns=['binary', 'unique_id']), y1)


In [243]:
X_under.index = X_features_all['unique_id'][undersample.sample_indices_]
y_under.index = X_features_all['unique_id'][undersample.sample_indices_]
Counter(y_under)


Counter({0.0: 4680, 1.0: 4680})

In [244]:
results = pd.DataFrame(columns=['n_days', 'time_lag', 'eval_type', 'param_type', 'year', 'params', 'score', 'model', 'variables', 'n_features', 'n_samples'])


In [245]:
# Remove correlated features
X_under = X_under.drop(['prec__length', 'temp__length'], axis=1)
corr_matrix = X_under.corr().abs()
upper = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]
X_selected = X_under.drop(to_drop, axis=1)
X_selected


Unnamed: 0_level_0,prec__sum_values,prec__median,prec__standard_deviation,prec__minimum,temp__sum_values,temp__standard_deviation,temp__root_mean_square,temp__maximum,temp__minimum
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"(2340.0, 1999-12-25)",317.7,5.0,11.673319,0.0,27.6,5.343513,5.417177,7.6,-14.0
"(2340.0, 1999-12-26)",316.5,5.0,11.671093,0.0,22.1,5.254844,5.302982,7.6,-14.0
"(2340.0, 1999-12-28)",314.2,5.0,11.726770,0.0,7.8,4.984872,4.991218,6.3,-14.0
"(2340.0, 1999-12-31)",284.3,1.7,11.506133,0.0,-12.4,4.770947,4.787686,6.3,-14.0
"(2340.0, 2000-01-03)",263.4,1.9,11.028979,0.0,-17.1,4.647640,4.680261,6.3,-14.0
...,...,...,...,...,...,...,...,...,...
"(591.0, 2021-05-24)",46.0,0.0,4.066234,0.0,60.4,2.859943,3.460561,8.0,-2.2
"(591.0, 2021-05-27)",46.0,0.0,4.066234,0.0,85.7,3.093025,4.148416,8.8,-2.0
"(591.0, 2021-05-28)",46.0,0.0,4.066234,0.0,93.3,3.180014,4.378430,8.8,-2.0
"(591.0, 2021-05-29)",46.0,0.0,4.066234,0.0,101.6,3.197349,4.578702,8.8,-2.0


In [246]:
## run model without year and split by site and time
results = random_forest_site(X_selected, y_under, results, options={'param_type': 'Minimal', 'year': 'No'})
results = random_forest_time(X_selected, y_under, results, options={'param_type': 'Minimal', 'year': 'No'})


16 fits failed out of a total of 64.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
16 fits failed with the following error:
Traceback (most recent call last):
  File "/home/cecilia/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/cecilia/.local/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 442, in fit
    trees = Parallel(
  File "/home/cecilia/.local/lib/python3.8/site-packages/joblib/parallel.py", line 1061, in __call__
    self.retrieve()
  File "/home/cecilia/.local/lib/python3.8/site-packages/joblib/parallel.py", line 940, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "

{'max_depth': 10, 'max_features': 3, 'n_estimators': 750} 0.836455694682998


80 fits failed out of a total of 320.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "/home/cecilia/.local/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/cecilia/.local/lib/python3.8/site-packages/sklearn/ensemble/_forest.py", line 442, in fit
    trees = Parallel(
  File "/home/cecilia/.local/lib/python3.8/site-packages/joblib/parallel.py", line 1061, in __call__
    self.retrieve()
  File "/home/cecilia/.local/lib/python3.8/site-packages/joblib/parallel.py", line 940, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File 

{'max_depth': 10, 'max_features': 2, 'n_estimators': 1500} 0.7649896542596559


In [247]:
## run model without year and split by site and time
results = svm_site(X_selected, y_under, results, options={'param_type': 'Minimal', 'year': 'No'})
results = svm_time(X_selected, y_under, results, options={'param_type': 'Minimal', 'year': 'No'})




{'svc__C': 100, 'svc__dual': True, 'svc__max_iter': 2000} 0.8369555590648399
{'svc__C': 1, 'svc__dual': True, 'svc__max_iter': 100} 0.746728373309892




In [248]:
## add year to features
dates = [parser.parse(x[1]).year for x in X_selected.index]
X_selected['year'] = dates


In [249]:
## run model with year and split by site and time
results = random_forest_site(X_selected, y_under, results, options={'param_type': 'Minimal', 'year': 'Yes'})
results = random_forest_time(X_selected, y_under, results, options={'param_type': 'Minimal', 'year': 'Yes'})


{'max_depth': 5, 'max_features': 2, 'n_estimators': 500} 0.7952029706174917
{'max_depth': 5, 'max_features': 5, 'n_estimators': 1500} 0.7613223148729052


In [250]:
## run model with year and split by site and time
results = svm_site(X_selected, y_under, results, options={'param_type': 'Minimal', 'year': 'Yes'})
results = svm_time(X_selected, y_under, results, options={'param_type': 'Minimal', 'year': 'Yes'})




{'svc__C': 100, 'svc__dual': True, 'svc__max_iter': 1500} 0.8156570543775972
{'svc__C': 10, 'svc__dual': True, 'svc__max_iter': 1000} 0.7476022723428198




In [251]:
df_rolled['id'] = df_rolled['id'].apply(ast.literal_eval)
df_rolled.id

0           (114.0, 1909-02-12)
1           (114.0, 1909-02-12)
2           (114.0, 1909-02-12)
3           (114.0, 1909-02-12)
4           (114.0, 1909-02-12)
                   ...         
3336504    (2372.0, 1998-04-29)
3336505    (2372.0, 1998-04-29)
3336506    (2372.0, 1998-04-29)
3336507    (2372.0, 1998-04-29)
3336508    (2372.0, 1998-04-29)
Name: id, Length: 3336509, dtype: object

In [252]:
X_under_all = df_rolled[df_rolled.id.isin(X_under.index)]
X_under_all


Unnamed: 0,date,flow_site_id,prec,temp,binary,id
805,1909-02-08,114.0,0.0,-14.3,0,"(114.0, 1909-03-10)"
806,1909-02-09,114.0,0.3,-12.6,0,"(114.0, 1909-03-10)"
807,1909-02-10,114.0,0.0,-6.0,0,"(114.0, 1909-03-10)"
808,1909-02-11,114.0,0.0,-18.2,0,"(114.0, 1909-03-10)"
809,1909-02-12,114.0,0.0,-6.8,0,"(114.0, 1909-03-10)"
...,...,...,...,...,...,...
3329157,1997-02-16,2372.0,0.0,-12.2,0,"(2372.0, 1997-02-20)"
3329158,1997-02-17,2372.0,0.0,-5.9,0,"(2372.0, 1997-02-20)"
3329159,1997-02-18,2372.0,16.0,0.5,0,"(2372.0, 1997-02-20)"
3329160,1997-02-19,2372.0,12.5,3.3,0,"(2372.0, 1997-02-20)"


In [253]:
X_under_all.to_csv('../df_undersampled_nearmiss_' +
                   str(N_DAYS) + '_time_lag_' + str(TIME_LAG) + '.csv', index=False)


## Extract complete set of timeseries features

In [254]:
X_under_all = pd.read_csv('../df_undersampled_nearmiss_' +
                          str(N_DAYS) + '_time_lag_' + str(TIME_LAG) + '.csv')
X_under_all.head()

Unnamed: 0,date,flow_site_id,prec,temp,binary,id
0,1909-02-08,114.0,0.0,-14.3,0,"(114.0, '1909-03-10')"
1,1909-02-09,114.0,0.3,-12.6,0,"(114.0, '1909-03-10')"
2,1909-02-10,114.0,0.0,-6.0,0,"(114.0, '1909-03-10')"
3,1909-02-11,114.0,0.0,-18.2,0,"(114.0, '1909-03-10')"
4,1909-02-12,114.0,0.0,-6.8,0,"(114.0, '1909-03-10')"


In [255]:
# extract timeseries features

X_features_all = extract_features(
	X_under_all.drop(["binary", "flow_site_id"], axis=1), column_id='id', column_sort='date',
	n_jobs=20, disable_progressbar=False)


X_features_all.head()

Feature Extraction: 100%|██████████| 100/100 [01:40<00:00,  1.01s/it]


Unnamed: 0,prec__variance_larger_than_standard_deviation,prec__has_duplicate_max,prec__has_duplicate_min,prec__has_duplicate,prec__sum_values,prec__abs_energy,prec__mean_abs_change,prec__mean_change,prec__mean_second_derivative_central,prec__median,...,temp__permutation_entropy__dimension_5__tau_1,temp__permutation_entropy__dimension_6__tau_1,temp__permutation_entropy__dimension_7__tau_1,temp__query_similarity_count__query_None__threshold_0.0,"temp__matrix_profile__feature_""min""__threshold_0.98","temp__matrix_profile__feature_""max""__threshold_0.98","temp__matrix_profile__feature_""mean""__threshold_0.98","temp__matrix_profile__feature_""median""__threshold_0.98","temp__matrix_profile__feature_""25""__threshold_0.98","temp__matrix_profile__feature_""75""__threshold_0.98"
"(114.0, '1909-03-10')",0.0,0.0,1.0,1.0,5.6,6.74,0.193333,0.0,-0.008621,0.0,...,3.039116,3.204778,3.218876,,,,,,,
"(114.0, '1909-04-03')",1.0,0.0,1.0,1.0,51.8,275.62,1.676667,-0.036667,-0.012069,0.8,...,3.193148,3.258097,3.218876,,,,,,,
"(114.0, '1909-04-17')",1.0,0.0,1.0,1.0,28.9,134.49,1.036667,-0.003333,0.001724,0.0,...,2.987771,3.151459,3.218876,,,,,,,
"(114.0, '1909-04-18')",1.0,0.0,1.0,1.0,31.9,144.09,1.136667,0.103333,0.053448,0.0,...,2.895104,3.078015,3.218876,,,,,,,
"(114.0, '1909-04-19')",1.0,0.0,1.0,1.0,32.7,144.73,1.213333,0.026667,-0.039655,0.0,...,2.895104,3.078015,3.218876,,,,,,,


In [256]:
X_features_all = X_features_all.dropna(axis=1)
X_features_all['unique_id'] = X_features_all.index

In [257]:
X_features_all.to_csv(
    '../df_extracted_all_nearmiss_' + str(N_DAYS) + '_time_lag_' + str(TIME_LAG) + '.csv', index=False)


## Build Random Forest model with complete set of extracted timeseries parameters

In [258]:
X_features_all = pd.read_csv(
    '../df_extracted_all_nearmiss_' + str(N_DAYS) + '_time_lag_' + str(TIME_LAG) + '.csv')

all_data_clean = pd.read_csv('../all_data_clean.csv')
all_data_clean['year'] = pd.to_datetime(all_data_clean['date']).dt.year

### to get the binary labels for n days after end of rolled time series
all_data_clean['shifted_date'] = pd.to_datetime(
    all_data_clean.date) + pd.Timedelta(days=TIME_LAG)
all_data_clean['shifted_date'] = all_data_clean['shifted_date'].dt.strftime(
    '%Y-%m-%d')
all_data_clean['unique_id'] = list(
    zip(all_data_clean.flow_site_id, all_data_clean.shifted_date))
all_data_clean = all_data_clean.dropna()
all_data_clean.head()


Unnamed: 0,date,snow_site_id,snow_depth,depth_diff,binary,flow_site_id,temp_site_id,prec_site_id,flow,temp,prec,year,shifted_date,unique_id
0,1965-01-14,75280,0.0,0.0,0,1878.0,74240.0,74240.0,0.051,1.9,1.5,1965,1965-01-24,"(1878.0, 1965-01-24)"
1,1965-01-15,75280,0.0,0.0,0,1878.0,74240.0,74240.0,0.051,1.2,4.0,1965,1965-01-25,"(1878.0, 1965-01-25)"
2,1965-01-16,75280,0.0,0.0,0,1878.0,74240.0,74240.0,0.058,1.5,4.5,1965,1965-01-26,"(1878.0, 1965-01-26)"
3,1965-01-20,75280,0.03,0.03,0,1878.0,74240.0,74240.0,0.076,-0.9,0.0,1965,1965-01-30,"(1878.0, 1965-01-30)"
4,1965-01-21,75280,0.03,0.03,0,1878.0,74240.0,74240.0,0.076,-1.7,2.0,1965,1965-01-31,"(1878.0, 1965-01-31)"


In [259]:
X_features_all['unique_id'] = X_features_all['unique_id'].apply(ast.literal_eval)

In [260]:
X_features_under_all = pd.merge(X_features_all, all_data_clean[[
    'binary', 'year', 'unique_id']], how='left', on='unique_id')
X_features_under_all = X_features_under_all.set_index(
    X_features_under_all['unique_id'], drop=True)
y_under = X_features_under_all['binary']
X_features_under_filtered  =X_features_under_all.replace(np.inf, np.nan)
X_features_under_filtered = X_features_under_filtered.dropna(axis=1)
X_features_under_filtered = X_features_under_filtered.drop(
    columns=['unique_id', 'binary'])
X_features_under_filtered.head()


Unnamed: 0_level_0,prec__variance_larger_than_standard_deviation,prec__has_duplicate_max,prec__has_duplicate_min,prec__has_duplicate,prec__sum_values,prec__abs_energy,prec__mean_abs_change,prec__mean_change,prec__mean_second_derivative_central,prec__median,...,temp__fourier_entropy__bins_3,temp__fourier_entropy__bins_5,temp__fourier_entropy__bins_10,temp__fourier_entropy__bins_100,temp__permutation_entropy__dimension_3__tau_1,temp__permutation_entropy__dimension_4__tau_1,temp__permutation_entropy__dimension_5__tau_1,temp__permutation_entropy__dimension_6__tau_1,temp__permutation_entropy__dimension_7__tau_1,year
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(114.0, 1909-03-10)",0.0,0.0,1.0,1.0,5.6,6.74,0.193333,0.0,-0.008621,0.0,...,0.601924,0.918046,1.440235,2.220025,1.657161,2.49611,3.039116,3.204778,3.218876,1909
"(114.0, 1909-04-03)",1.0,0.0,1.0,1.0,51.8,275.62,1.676667,-0.036667,-0.012069,0.8,...,1.039721,1.353591,1.786808,2.566599,1.693247,2.651193,3.193148,3.258097,3.218876,1909
"(114.0, 1909-04-17)",1.0,0.0,1.0,1.0,28.9,134.49,1.036667,-0.003333,0.001724,0.0,...,0.482578,0.984087,1.602056,2.685945,1.636381,2.483974,2.987771,3.151459,3.218876,1909
"(114.0, 1909-04-18)",1.0,0.0,1.0,1.0,31.9,144.09,1.136667,0.103333,0.053448,0.0,...,0.482578,1.043353,1.474057,2.599302,1.595254,2.461779,2.895104,3.078015,3.218876,1909
"(114.0, 1909-04-19)",1.0,0.0,1.0,1.0,32.7,144.73,1.213333,0.026667,-0.039655,0.0,...,0.601924,1.143275,1.580819,2.772589,1.648098,2.529976,2.895104,3.078015,3.218876,1909


In [261]:
## run model without year and split by site and time
#results = random_forest_site(X_features_under_filtered.drop(columns=['year']), y_under, results, options={'param_type': 'All', 'year': 'No'})
#results = random_forest_time(X_features_under_filtered.drop(columns=['year']), y_under, results, options={'param_type': 'All', 'year': 'No'})


In [262]:
## run model without year and split by site and time
#results = gradient_boost_site(X_features_under_filtered.drop(columns=['year']), y_under, results, options={'param_type': 'All', 'year': 'No'})
#results = gradient_boost_time(X_features_under_filtered.drop(columns=['year']), y_under, results, options={'param_type': 'All', 'year': 'No'})

In [263]:
## run model with year and split by site and time
#results = random_forest_site(X_features_under_filtered, y_under, results, options={'param_type': 'All', 'year': 'Yes'})
#results = random_forest_time(X_features_under_filtered, y_under, results, options={'param_type': 'All', 'year': 'Yes'})


In [264]:
## run model with year and split by site and time
#results = gradient_boost_site(X_features_under_filtered, y_under, results, options={'param_type': 'All', 'year': 'Yes'})
#results = gradient_boost_time(X_features_under_filtered, y_under, results, options={'param_type': 'All', 'year': 'Yes'})

### Run the model with selected features

In [265]:
# Remove correlated features
X_features_under_filtered = X_features_under_filtered.drop(['prec__length', 'temp__length'], axis=1)
corr_matrix = X_features_under_filtered.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]
X_selected = X_features_under_filtered.drop(to_drop, axis=1)
X_selected['year'] = X_features_under_filtered['year']
X_selected


Unnamed: 0_level_0,prec__variance_larger_than_standard_deviation,prec__has_duplicate_max,prec__has_duplicate_min,prec__has_duplicate,prec__sum_values,prec__abs_energy,prec__mean_change,prec__mean_second_derivative_central,prec__median,prec__skewness,...,temp__fourier_entropy__bins_3,temp__fourier_entropy__bins_5,temp__fourier_entropy__bins_10,temp__fourier_entropy__bins_100,temp__permutation_entropy__dimension_3__tau_1,temp__permutation_entropy__dimension_4__tau_1,temp__permutation_entropy__dimension_5__tau_1,temp__permutation_entropy__dimension_6__tau_1,temp__permutation_entropy__dimension_7__tau_1,year
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(114.0, 1909-03-10)",0.0,0.0,1.0,1.0,5.6,6.74,0.000000,-0.008621,0.0,2.865845,...,0.601924,0.918046,1.440235,2.220025,1.657161,2.496110,3.039116,3.204778,3.218876,1909
"(114.0, 1909-04-03)",1.0,0.0,1.0,1.0,51.8,275.62,-0.036667,-0.012069,0.8,2.004597,...,1.039721,1.353591,1.786808,2.566599,1.693247,2.651193,3.193148,3.258097,3.218876,1909
"(114.0, 1909-04-17)",1.0,0.0,1.0,1.0,28.9,134.49,-0.003333,0.001724,0.0,3.028967,...,0.482578,0.984087,1.602056,2.685945,1.636381,2.483974,2.987771,3.151459,3.218876,1909
"(114.0, 1909-04-18)",1.0,0.0,1.0,1.0,31.9,144.09,0.103333,0.053448,0.0,2.770898,...,0.482578,1.043353,1.474057,2.599302,1.595254,2.461779,2.895104,3.078015,3.218876,1909
"(114.0, 1909-04-19)",1.0,0.0,1.0,1.0,32.7,144.73,0.026667,-0.039655,0.0,2.771989,...,0.601924,1.143275,1.580819,2.772589,1.648098,2.529976,2.895104,3.078015,3.218876,1909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(591.0, 2021-05-27)",1.0,0.0,1.0,1.0,46.0,580.82,0.000000,0.000000,0.0,3.680835,...,0.233792,0.463414,0.918046,2.393312,1.577478,2.453151,3.019736,3.258097,3.218876,2021
"(591.0, 2021-05-28)",1.0,0.0,1.0,1.0,46.0,580.82,0.000000,0.000000,0.0,3.680835,...,0.463414,0.463414,0.918046,2.306669,1.615951,2.453151,3.019736,3.258097,3.218876,2021
"(591.0, 2021-05-29)",1.0,0.0,1.0,1.0,46.0,580.82,0.000000,0.000000,0.0,3.680835,...,0.463414,0.688567,1.037392,2.252728,1.570535,2.453151,3.019736,3.258097,3.218876,2021
"(591.0, 2021-05-30)",1.0,0.0,1.0,1.0,46.0,580.82,0.000000,0.000000,0.0,3.680835,...,0.463414,0.822265,1.037392,2.096353,1.597908,2.502661,3.019736,3.258097,3.218876,2021


In [266]:
## run model without year and split by site and time
results = random_forest_site(X_selected.drop(columns=['year']), y_under, results, options={'param_type': 'Selected', 'year': 'No'})
results = random_forest_time(X_selected.drop(columns=['year']), y_under, results, options={'param_type': 'Selected', 'year': 'No'})

{'max_depth': 25, 'max_features': 10, 'n_estimators': 500} 0.8374069743016002




{'max_depth': 25, 'max_features': 5, 'n_estimators': 500} 0.8050098341070981


In [267]:
## run model without year and split by site and time
results = svm_site(X_selected.drop(columns=['year']), y_under, results, options={'param_type': 'Selected', 'year': 'No'})
results = svm_time(X_selected.drop(columns=['year']), y_under, results, options={'param_type': 'Selected', 'year': 'No'})


{'svc__C': 1, 'svc__dual': False, 'svc__max_iter': 100} 0.8302085038973711
{'svc__C': 100, 'svc__dual': False, 'svc__max_iter': 100} 0.7868251309693197


In [268]:
## run model with year and split by site and time
results = random_forest_site(X_selected, y_under, results, options={'param_type': 'Selected', 'year': 'Yes'})
results = random_forest_time(X_selected, y_under, results, options={'param_type': 'Selected', 'year': 'Yes'})

{'max_depth': 25, 'max_features': 10, 'n_estimators': 1500} 0.8354930425884517




{'max_depth': 25, 'max_features': 10, 'n_estimators': 750} 0.8064745158651967


In [269]:
## run model with year and split by site and time
results = svm_site(X_selected, y_under, results, options={'param_type': 'Selected', 'year': 'Yes'})
results = svm_time(X_selected, y_under, results, options={'param_type': 'Selected', 'year': 'Yes'})


{'svc__C': 10, 'svc__dual': False, 'svc__max_iter': 1000} 0.8309723615185435




{'svc__C': 1, 'svc__dual': False, 'svc__max_iter': 100} 0.7854986646508018


In [270]:
results.to_csv('../results_' + str(N_DAYS) + '_time_lag_' + str(TIME_LAG) + '_prec_temp.csv')