# Extract time series parameters from flow and use to predict extreme snowmelt

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import ast
import dateutil.parser as parser

from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV, KFold, cross_validate, PredefinedSplit
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import make_scorer, precision_recall_curve, auc, classification_report
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

from matplotlib import pyplot as plt


In [2]:
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction.settings import MinimalFCParameters
from tsfresh.utilities.dataframe_functions import roll_time_series


In [3]:
# build scorer function
def auc_pr_score(y_true, y_pred):
	precision, recall, _ = precision_recall_curve(y_true, y_pred)
	return auc(recall, precision)


auc_pr = make_scorer(auc_pr_score, greater_is_better=True)

N_DAYS = 2
TIME_LAG = 10

In [4]:
## Define functions

def random_forest_site(X_under, y_under, results, options):
	indx = X_under.index.unique()
	sites = [x[0] for x in indx]
	sites = list(set(sites))
	sites_train = sites[:len(sites)//10*6]
	sites_test = sites[len(sites)//10*6:]
	idx_train = [x for x in indx if x[0] in sites_train]
	idx_test = [x for x in indx if x[0] in sites_test]
	X_filtered_train = X_under[X_under.index.isin(idx_train)]
	X_filtered_test = X_under[X_under.index.isin(idx_test)]
	y_train = y_under[y_under.index.isin(idx_train)]
	y_test = y_under[y_under.index.isin(idx_test)]

	X_all = pd.concat([X_filtered_train, X_filtered_test]).reset_index(drop=True)
	y_all = pd.concat([y_train, y_test]).reset_index(drop=True)
	clf = RandomForestClassifier(n_jobs=-1, random_state=42, verbose=0)

	split_index = [-1 if x in X_filtered_train.index else 0 for x in X_under.index]
	ps = PredefinedSplit(test_fold=split_index)

	param_grid = {
		'max_depth': (1, 5, 10, 25),
		'n_estimators': (100, 500, 750, 1500),
		'max_features': (2, 3, 5, 10)}

	gs = GridSearchCV(clf, param_grid=param_grid, cv=ps,
					scoring=auc_pr, n_jobs=-1, verbose=0)
	gs.fit(X_all, y_all)
	print(gs.best_params_, gs.best_score_)
	results = results.append({'n_days': N_DAYS, 'time_lag': TIME_LAG, 'eval_type': 'site', 'param_type': options['param_type'], 'year': options['year'],
                           'params': gs.best_params_, 'score': gs.best_score_, 'model': 'Random Forest', 
						   'variables': X_under.columns, 'n_features': len(X_under.columns), 'n_samples': len(X_under)}, ignore_index=True)
	
	return results

def random_forest_time(X_under, y_under, results, options):
	X_filtered_sorted = X_under.sort_index(key=lambda d: d.map(lambda x: x[1]))
	y_under_sorted = y_under.sort_index(key=lambda d: d.map(lambda x: x[1]))
	tscv = TimeSeriesSplit(n_splits=5)

	param_grid = {
		'max_depth': (1, 5, 10, 25),
		'n_estimators': (100, 500, 750, 1500),
		'max_features': (2, 3, 5, 10)}

	clf = RandomForestClassifier(n_jobs=-1, random_state=42, verbose=0)

	gs = GridSearchCV(clf, param_grid=param_grid, cv=tscv,
					scoring=auc_pr, n_jobs=-1, verbose=0)
	gs.fit(X_filtered_sorted, y_under_sorted)
	
	results = results.append({'n_days': N_DAYS, 'time_lag': TIME_LAG, 'eval_type': 'time', 'param_type': options['param_type'], 'year': options['year'], 
                           'params': gs.best_params_, 'score': gs.best_score_, 'model': 'Random Forest', 
                           'variables': X_under.columns, 'n_features': len(X_under.columns), 'n_samples': len(X_under)}, ignore_index=True)
	print(gs.best_params_, gs.best_score_)

	return results


def gradient_boost_site(X_under, y_under, results, options):
	indx = X_under.index.unique()
	sites = [x[0] for x in indx]
	sites = list(set(sites))
	sites_train = sites[:len(sites)//10*6]
	sites_test = sites[len(sites)//10*6:]
	idx_train = [x for x in indx if x[0] in sites_train]
	idx_test = [x for x in indx if x[0] in sites_test]
	X_filtered_train = X_under[X_under.index.isin(idx_train)]
	X_filtered_test = X_under[X_under.index.isin(idx_test)]
	y_train = y_under[y_under.index.isin(idx_train)]
	y_test = y_under[y_under.index.isin(idx_test)]

	X_all = pd.concat([X_filtered_train, X_filtered_test]).reset_index(drop=True)
	y_all = pd.concat([y_train, y_test]).reset_index(drop=True)
	clf = HistGradientBoostingClassifier(random_state=42, verbose=0, early_stopping=False)

	split_index = [-1 if x in X_filtered_train.index else 0 for x in X_under.index]
	ps = PredefinedSplit(test_fold=split_index)

	param_grid = {'max_iter': (100, 1000, 1500),
               'learning_rate': (0.01, 0.1, 1),
               'max_depth': (1, 5, 10, 25, 50),
               }

	gs = GridSearchCV(clf, param_grid=param_grid, cv=ps,
                   scoring=auc_pr, n_jobs=-1, verbose=0)
	gs.fit(X_all, y_all)
	print(gs.best_params_, gs.best_score_)
	results = results.append({'n_days': N_DAYS, 'time_lag': TIME_LAG, 'eval_type': 'site', 'param_type': options['param_type'], 'year': options['year'],
                           'params': gs.best_params_, 'score': gs.best_score_, 'model': 'Gradient Boost', 
                           'variables': X_under.columns, 'n_features': len(X_under.columns), 'n_samples': len(X_under)}, ignore_index=True)

	return results


def gradient_boost_time(X_under, y_under, results, options):
	X_filtered_sorted = X_under.sort_index(key=lambda d: d.map(lambda x: x[1]))
	y_under_sorted = y_under.sort_index(key=lambda d: d.map(lambda x: x[1]))
	tscv = TimeSeriesSplit(n_splits=5)

	param_grid = {'max_iter': (100, 1000, 1500),
               'learning_rate': (0.01, 0.1, 1),
               'max_depth': (1, 5, 10, 25, 50),
               }
	clf = HistGradientBoostingClassifier(random_state=42, verbose=0, early_stopping=False)

	gs = GridSearchCV(clf, param_grid=param_grid, cv=tscv,
                   scoring=auc_pr, n_jobs=-1, verbose=0)
	gs.fit(X_filtered_sorted, y_under_sorted)

	results = results.append({'n_days': N_DAYS, 'time_lag': TIME_LAG, 'eval_type': 'time', 'param_type': options['param_type'], 'year': options['year'],
                           'params': gs.best_params_, 'score': gs.best_score_, 'model': 'Gradient Boost', 
                           'variables': X_under.columns, 'n_features': len(X_under.columns), 'n_samples': len(X_under)}, ignore_index=True)
	print(gs.best_params_, gs.best_score_)

	return results


def svm_site(X_under, y_under, results, options):
	indx = X_under.index.unique()
	sites = [x[0] for x in indx]
	sites = list(set(sites))
	sites_train = sites[:len(sites)//10*6]
	sites_test = sites[len(sites)//10*6:]
	idx_train = [x for x in indx if x[0] in sites_train]
	idx_test = [x for x in indx if x[0] in sites_test]
	X_filtered_train = X_under[X_under.index.isin(idx_train)]
	X_filtered_test = X_under[X_under.index.isin(idx_test)]
	y_train = y_under[y_under.index.isin(idx_train)]
	y_test = y_under[y_under.index.isin(idx_test)]

	X_all = pd.concat([X_filtered_train, X_filtered_test]).reset_index(drop=True)
	y_all = pd.concat([y_train, y_test]).reset_index(drop=True)
	clf = HistGradientBoostingClassifier(
		random_state=42, verbose=0, early_stopping=False)

	split_index = [-1 if x in X_filtered_train.index else 0 for x in X_under.index]
	ps = PredefinedSplit(test_fold=split_index)

	scaler = StandardScaler()
	param_grid = {
               'svc__C': (1, 10, 100),
			   'svc__max_iter': (100, 1000, 1500, 2000),
			   'svc__dual': (True, False),
               }
	svc = LinearSVC(random_state=42, verbose=0)

	pipe = Pipeline([('scaler', scaler), ('svc', svc)])

	gs = GridSearchCV(pipe, param_grid=param_grid, cv=ps,
                   scoring=auc_pr, n_jobs=-1, verbose=0)
	gs.fit(X_all, y_all)
	print(gs.best_params_, gs.best_score_)
	results = results.append({'n_days': N_DAYS, 'time_lag': TIME_LAG, 'eval_type': 'site', 'param_type': options['param_type'], 'year': options['year'],
                           'params': gs.best_params_, 'score': gs.best_score_, 'model': 'SVC',
                           'variables': X_under.columns, 'n_features': len(X_under.columns), 'n_samples': len(X_under)}, ignore_index=True)

	return results


def svm_time(X_under, y_under, results, options):
	X_filtered_sorted = X_under.sort_index(key=lambda d: d.map(lambda x: x[1]))
	y_under_sorted = y_under.sort_index(key=lambda d: d.map(lambda x: x[1]))
	tscv = TimeSeriesSplit(n_splits=5)

	scaler = StandardScaler()
	param_grid = {
            'svc__C': (1, 10, 100),
            'svc__max_iter': (100, 1000, 1500, 2000),
         	'svc__dual': (True, False),
        }
	svc = LinearSVC(random_state=42, verbose=0)

	pipe = Pipeline([('scaler', scaler), ('svc', svc)])

	gs = GridSearchCV(pipe, param_grid=param_grid, cv=tscv,
                   scoring=auc_pr, n_jobs=-1, verbose=0)
	gs.fit(X_filtered_sorted, y_under_sorted)

	results = results.append({'n_days': N_DAYS, 'time_lag': TIME_LAG, 'eval_type': 'time', 'param_type': options['param_type'], 'year': options['year'],
                           'params': gs.best_params_, 'score': gs.best_score_, 'model': 'SVC',
                           'variables': X_under.columns, 'n_features': len(X_under.columns), 'n_samples': len(X_under)}, ignore_index=True)
	print(gs.best_params_, gs.best_score_)

	return results


## Create rolled dataframe 

In [5]:
all_data_clean = pd.read_csv('../all_data_clean.csv')

all_data_clean


Unnamed: 0,date,snow_site_id,snow_depth,depth_diff,binary,flow_site_id,temp_site_id,prec_site_id,flow,temp,prec
0,1965-01-14,75280,0.00,0.00,0,1878.0,74240.0,74240.0,0.0510,1.9,1.5
1,1965-01-15,75280,0.00,0.00,0,1878.0,74240.0,74240.0,0.0510,1.2,4.0
2,1965-01-16,75280,0.00,0.00,0,1878.0,74240.0,74240.0,0.0580,1.5,4.5
3,1965-01-20,75280,0.03,0.03,0,1878.0,74240.0,74240.0,0.0760,-0.9,0.0
4,1965-01-21,75280,0.03,0.03,0,1878.0,74240.0,74240.0,0.0760,-1.7,2.0
...,...,...,...,...,...,...,...,...,...,...,...
108249,2021-06-09,160970,0.00,0.00,0,1387.0,160970.0,160960.0,340.1524,17.2,0.0
108250,2021-06-23,160970,0.00,0.00,0,1387.0,160970.0,160960.0,354.6912,14.6,0.0
108251,2021-06-24,160970,0.00,0.00,0,1387.0,160970.0,160960.0,340.7535,17.3,0.0
108252,2021-06-30,160970,0.00,0.00,0,1387.0,160970.0,160960.0,269.8846,15.0,0.0


In [6]:

## Here can change parameters only once
df_rolled = roll_time_series(
    all_data_clean[['date', 'flow_site_id', 'flow', 'temp', 'prec', 'binary']], column_id="flow_site_id", column_sort="date", max_timeshift=N_DAYS, min_timeshift=N_DAYS - 1, n_jobs=20)


Rolling: 100%|██████████| 100/100 [00:39<00:00,  2.56it/s]


In [7]:
df_rolled

Unnamed: 0,date,flow_site_id,flow,temp,prec,binary,id
54705,1909-01-01,114.0,8.0,-16.0,0.3,0,"(114.0, 1909-01-02)"
54706,1909-01-02,114.0,8.0,-3.9,0.0,0,"(114.0, 1909-01-02)"
54749,1909-01-01,114.0,8.0,-16.0,0.3,0,"(114.0, 1909-01-03)"
54750,1909-01-02,114.0,8.0,-3.9,0.0,0,"(114.0, 1909-01-03)"
54751,1909-01-03,114.0,8.0,-14.5,0.0,0,"(114.0, 1909-01-03)"
...,...,...,...,...,...,...,...
55539,1993-06-05,2445.0,41.0,15.1,0.0,0,"(2445.0, 1993-06-09)"
55540,1993-06-09,2445.0,31.0,16.5,0.0,0,"(2445.0, 1993-06-09)"
55604,1993-06-05,2445.0,41.0,15.1,0.0,0,"(2445.0, 1993-06-18)"
55605,1993-06-09,2445.0,31.0,16.5,0.0,0,"(2445.0, 1993-06-18)"


In [8]:
df_rolled.to_csv('../df_rolled_' + str(N_DAYS) + '.csv', index=False)

## Extract minimal timeseries features

In [9]:
df_rolled = pd.read_csv('../df_rolled_' + str(N_DAYS) + '.csv')
all_data_clean = pd.read_csv('../all_data_clean.csv')

In [10]:
df_rolled

Unnamed: 0,date,flow_site_id,flow,temp,prec,binary,id
0,1909-01-01,114.0,8.0,-16.0,0.3,0,"(114.0, '1909-01-02')"
1,1909-01-02,114.0,8.0,-3.9,0.0,0,"(114.0, '1909-01-02')"
2,1909-01-01,114.0,8.0,-16.0,0.3,0,"(114.0, '1909-01-03')"
3,1909-01-02,114.0,8.0,-3.9,0.0,0,"(114.0, '1909-01-03')"
4,1909-01-03,114.0,8.0,-14.5,0.0,0,"(114.0, '1909-01-03')"
...,...,...,...,...,...,...,...
324669,1993-06-05,2445.0,41.0,15.1,0.0,0,"(2445.0, '1993-06-09')"
324670,1993-06-09,2445.0,31.0,16.5,0.0,0,"(2445.0, '1993-06-09')"
324671,1993-06-05,2445.0,41.0,15.1,0.0,0,"(2445.0, '1993-06-18')"
324672,1993-06-09,2445.0,31.0,16.5,0.0,0,"(2445.0, '1993-06-18')"


In [11]:
# extract timeseries features

X_features_all = extract_features(
	df_rolled.drop(["binary", "flow_site_id"], axis=1), column_id='id', column_sort='date',
	n_jobs=20, disable_progressbar=False, default_fc_parameters=MinimalFCParameters())


X_features_all.head()


Feature Extraction: 100%|██████████| 100/100 [01:26<00:00,  1.15it/s]


Unnamed: 0,flow__sum_values,flow__median,flow__mean,flow__length,flow__standard_deviation,flow__variance,flow__root_mean_square,flow__maximum,flow__minimum,temp__sum_values,...,temp__minimum,prec__sum_values,prec__median,prec__mean,prec__length,prec__standard_deviation,prec__variance,prec__root_mean_square,prec__maximum,prec__minimum
"(114.0, '1909-01-02')",16.0,8.0,8.0,2.0,0.0,0.0,8.0,8.0,8.0,-19.9,...,-16.0,0.3,0.15,0.15,2.0,0.15,0.0225,0.212132,0.3,0.0
"(114.0, '1909-01-03')",24.0,8.0,8.0,3.0,0.0,0.0,8.0,8.0,8.0,-34.4,...,-16.0,0.3,0.0,0.1,3.0,0.141421,0.02,0.173205,0.3,0.0
"(114.0, '1909-01-04')",23.7,8.0,7.9,3.0,0.141421,0.02,7.901266,8.0,7.7,-20.9,...,-14.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
"(114.0, '1909-01-05')",23.4,7.7,7.8,3.0,0.141421,0.02,7.801282,8.0,7.7,-24.8,...,-14.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
"(114.0, '1909-01-06')",23.1,7.7,7.7,3.0,0.0,0.0,7.7,7.7,7.7,-22.3,...,-12.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0


In [12]:
## Add binary response variable back based on unique id

X_features_all['unique_id'] = X_features_all.index
X_features_all['unique_id'] = X_features_all['unique_id'].apply(ast.literal_eval)

all_data_clean['shifted_date'] = pd.to_datetime(
    all_data_clean.date) + pd.Timedelta(days=TIME_LAG)
all_data_clean['shifted_date'] = all_data_clean['shifted_date'].dt.strftime(
    '%Y-%m-%d')
all_data_clean['unique_id'] = list(
    zip(all_data_clean.flow_site_id, all_data_clean.shifted_date))
all_data_clean = all_data_clean.dropna()

X_features_all = X_features_all.reset_index(drop=True)
	
X_features_all = pd.merge(X_features_all, all_data_clean[[
                          'binary', 'unique_id']], how='left', on='unique_id')
X_features_all = X_features_all.set_index(
    X_features_all['unique_id'], drop=True)
X_features_all = X_features_all.dropna()
X_features_all.head()


Unnamed: 0_level_0,flow__sum_values,flow__median,flow__mean,flow__length,flow__standard_deviation,flow__variance,flow__root_mean_square,flow__maximum,flow__minimum,temp__sum_values,...,prec__median,prec__mean,prec__length,prec__standard_deviation,prec__variance,prec__root_mean_square,prec__maximum,prec__minimum,unique_id,binary
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(114.0, 1909-01-11)",22.5,7.4,7.5,3.0,0.1414214,0.02,7.501333,7.7,7.4,-16.0,...,0.0,0.066667,3.0,0.094281,0.008889,0.11547,0.2,0.0,"(114.0, 1909-01-11)",0.0
"(114.0, 1909-01-12)",22.2,7.4,7.4,3.0,8.881784e-16,7.888609e-31,7.4,7.4,7.4,-10.5,...,0.2,0.133333,3.0,0.094281,0.008889,0.163299,0.2,0.0,"(114.0, 1909-01-12)",0.0
"(114.0, 1909-01-13)",22.2,7.4,7.4,3.0,8.881784e-16,7.888609e-31,7.4,7.4,7.4,-30.2,...,0.2,0.133333,3.0,0.094281,0.008889,0.163299,0.2,0.0,"(114.0, 1909-01-13)",0.0
"(114.0, 1909-01-14)",22.2,7.4,7.4,3.0,8.881784e-16,7.888609e-31,7.4,7.4,7.4,-47.6,...,0.2,0.233333,3.0,0.20548,0.042222,0.310913,0.5,0.0,"(114.0, 1909-01-14)",0.0
"(114.0, 1909-01-15)",21.9,7.4,7.3,3.0,0.1414214,0.02,7.30137,7.4,7.1,-47.1,...,0.5,0.8,3.0,0.804156,0.646667,1.134313,1.9,0.0,"(114.0, 1909-01-15)",0.0


In [13]:
X_features_all.to_csv('../df_extracted_min_' + str(N_DAYS) + '.csv', index=False)

## Undersample minimal timeseries feature dataset and run Random Forest model

In [14]:
X_features_all = pd.read_csv('../df_extracted_min_' + str(N_DAYS) + '.csv')


In [15]:
X_features_all['unique_id'] = X_features_all['unique_id'].apply(ast.literal_eval)
X_features_all.head()


Unnamed: 0,flow__sum_values,flow__median,flow__mean,flow__length,flow__standard_deviation,flow__variance,flow__root_mean_square,flow__maximum,flow__minimum,temp__sum_values,...,prec__median,prec__mean,prec__length,prec__standard_deviation,prec__variance,prec__root_mean_square,prec__maximum,prec__minimum,unique_id,binary
0,22.5,7.4,7.5,3.0,0.1414214,0.02,7.501333,7.7,7.4,-16.0,...,0.0,0.066667,3.0,0.094281,0.008889,0.11547,0.2,0.0,"(114.0, 1909-01-11)",0.0
1,22.2,7.4,7.4,3.0,8.881784e-16,7.888609e-31,7.4,7.4,7.4,-10.5,...,0.2,0.133333,3.0,0.094281,0.008889,0.163299,0.2,0.0,"(114.0, 1909-01-12)",0.0
2,22.2,7.4,7.4,3.0,8.881784e-16,7.888609e-31,7.4,7.4,7.4,-30.2,...,0.2,0.133333,3.0,0.094281,0.008889,0.163299,0.2,0.0,"(114.0, 1909-01-13)",0.0
3,22.2,7.4,7.4,3.0,8.881784e-16,7.888609e-31,7.4,7.4,7.4,-47.6,...,0.2,0.233333,3.0,0.20548,0.042222,0.310913,0.5,0.0,"(114.0, 1909-01-14)",0.0
4,21.9,7.4,7.3,3.0,0.1414214,0.02,7.30137,7.4,7.1,-47.1,...,0.5,0.8,3.0,0.804156,0.646667,1.134313,1.9,0.0,"(114.0, 1909-01-15)",0.0


In [16]:
y1 = X_features_all['binary']
Counter(y1)

Counter({0.0: 93713, 1.0: 4688})

In [17]:
## undersample

from imblearn.under_sampling import NearMiss

undersample = NearMiss(version=3, n_neighbors=3)
X_under, y_under = undersample.fit_resample(
    X_features_all.drop(columns=['binary', 'unique_id']), y1)


In [18]:
X_under.index = X_features_all['unique_id'][undersample.sample_indices_]
y_under.index = X_features_all['unique_id'][undersample.sample_indices_]
Counter(y_under)


Counter({0.0: 4688, 1.0: 4688})

In [19]:
results = pd.DataFrame(columns=['n_days', 'time_lag', 'eval_type', 'param_type', 'year', 'params', 'score', 'model', 'variables', 'n_features', 'n_samples'])


In [20]:
# Remove correlated features
X_under = X_under.drop(['flow__length', 'temp__length','prec__length'], axis=1)
corr_matrix = X_under.corr().abs()
upper = corr_matrix.where(
    np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]
X_selected = X_under.drop(to_drop, axis=1)
X_selected


Unnamed: 0_level_0,flow__sum_values,flow__standard_deviation,flow__variance,temp__sum_values,temp__standard_deviation,temp__root_mean_square,prec__sum_values,prec__median,prec__standard_deviation,prec__variance,prec__minimum
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"(1147.0, 1932-07-06)",1642.0000,185.749533,34502.888889,42.6,2.546894,14.426596,0.0,0.0,0.000000,0.00,0.0
"(1147.0, 1932-07-05)",2074.0000,193.098133,37286.888889,34.6,1.763204,11.667333,0.0,0.0,0.000000,0.00,0.0
"(2012.0, 1984-05-17)",2855.5252,168.226277,28300.080332,37.4,1.014342,12.507864,0.0,0.0,0.000000,0.00,0.0
"(1315.0, 1953-05-21)",2516.0000,166.557965,27741.555556,30.8,0.821922,10.299515,10.2,0.5,4.101219,16.82,0.5
"(1315.0, 1958-05-28)",1680.0000,165.730705,27466.666667,32.6,1.744197,11.005756,1.5,0.0,0.707107,0.50,0.0
...,...,...,...,...,...,...,...,...,...,...,...
"(591.0, 2021-05-24)",390.0506,6.687880,44.727736,9.6,0.778888,3.293428,0.0,0.0,0.000000,0.00,0.0
"(591.0, 2021-05-27)",312.8142,2.141603,4.586465,22.2,1.042433,7.473063,0.0,0.0,0.000000,0.00,0.0
"(591.0, 2021-05-28)",318.9124,4.987923,24.879377,23.7,0.697615,7.930742,0.0,0.0,0.000000,0.00,0.0
"(591.0, 2021-05-29)",336.6982,6.847591,46.889502,24.1,0.555778,8.052536,0.0,0.0,0.000000,0.00,0.0


In [21]:
## run model without year and split by site and time
results = random_forest_site(X_selected, y_under, results, options={'param_type': 'Minimal', 'year': 'No'})
results = random_forest_time(X_selected, y_under, results, options={'param_type': 'Minimal', 'year': 'No'})


{'max_depth': 10, 'max_features': 3, 'n_estimators': 1500} 0.825436933369911
{'max_depth': 10, 'max_features': 2, 'n_estimators': 750} 0.8153242872656286


In [22]:
## run model without year and split by site and time
results = svm_site(X_selected, y_under, results, options={'param_type': 'Minimal', 'year': 'No'})
results = svm_time(X_selected, y_under, results, options={'param_type': 'Minimal', 'year': 'No'})




{'svc__C': 1, 'svc__dual': True, 'svc__max_iter': 100} 0.8221397762280059
{'svc__C': 10, 'svc__dual': True, 'svc__max_iter': 1000} 0.7719839657474779




In [23]:
## add year to features
dates = [parser.parse(x[1]).year for x in X_selected.index]
X_selected['year'] = dates


In [24]:
## run model with year and split by site and time
results = random_forest_site(X_selected, y_under, results, options={'param_type': 'Minimal', 'year': 'Yes'})
results = random_forest_time(X_selected, y_under, results, options={'param_type': 'Minimal', 'year': 'Yes'})


{'max_depth': 10, 'max_features': 2, 'n_estimators': 100} 0.7912097583268535
{'max_depth': 5, 'max_features': 3, 'n_estimators': 500} 0.7986259164935218


In [25]:
## run model with year and split by site and time
results = svm_site(X_selected, y_under, results, options={'param_type': 'Minimal', 'year': 'Yes'})
results = svm_time(X_selected, y_under, results, options={'param_type': 'Minimal', 'year': 'Yes'})




{'svc__C': 100, 'svc__dual': True, 'svc__max_iter': 1000} 0.723634185201772
{'svc__C': 100, 'svc__dual': True, 'svc__max_iter': 2000} 0.7496035078277279




In [26]:
df_rolled['id'] = df_rolled['id'].apply(ast.literal_eval)
df_rolled.id

0          (114.0, 1909-01-02)
1          (114.0, 1909-01-02)
2          (114.0, 1909-01-03)
3          (114.0, 1909-01-03)
4          (114.0, 1909-01-03)
                  ...         
324669    (2445.0, 1993-06-09)
324670    (2445.0, 1993-06-09)
324671    (2445.0, 1993-06-18)
324672    (2445.0, 1993-06-18)
324673    (2445.0, 1993-06-18)
Name: id, Length: 324674, dtype: object

In [27]:
X_under_all = df_rolled[df_rolled.id.isin(X_under.index)]
X_under_all


Unnamed: 0,date,flow_site_id,flow,temp,prec,binary,id
35,1909-01-12,114.0,7.4,-7.0,0.2,0,"(114.0, 1909-01-14)"
36,1909-01-13,114.0,7.4,-23.3,0.0,0,"(114.0, 1909-01-14)"
37,1909-01-14,114.0,7.4,-17.3,0.5,0,"(114.0, 1909-01-14)"
38,1909-01-13,114.0,7.4,-23.3,0.0,0,"(114.0, 1909-01-15)"
39,1909-01-14,114.0,7.4,-17.3,0.5,0,"(114.0, 1909-01-15)"
...,...,...,...,...,...,...,...
323941,1997-02-26,2372.0,30.9,4.1,0.0,0,"(2372.0, 1997-02-27)"
323942,1997-02-27,2372.0,21.0,3.0,0.0,0,"(2372.0, 1997-02-27)"
323943,1997-02-26,2372.0,30.9,4.1,0.0,0,"(2372.0, 1997-02-28)"
323944,1997-02-27,2372.0,21.0,3.0,0.0,0,"(2372.0, 1997-02-28)"


In [28]:
X_under_all.to_csv('../df_undersampled_nearmiss_' +
                   str(N_DAYS) + '_time_lag_' + str(TIME_LAG) + '.csv', index=False)


## Extract complete set of timeseries features

In [29]:
X_under_all = pd.read_csv('../df_undersampled_nearmiss_' +
                          str(N_DAYS) + '_time_lag_' + str(TIME_LAG) + '.csv')
X_under_all.head()

Unnamed: 0,date,flow_site_id,flow,temp,prec,binary,id
0,1909-01-12,114.0,7.4,-7.0,0.2,0,"(114.0, '1909-01-14')"
1,1909-01-13,114.0,7.4,-23.3,0.0,0,"(114.0, '1909-01-14')"
2,1909-01-14,114.0,7.4,-17.3,0.5,0,"(114.0, '1909-01-14')"
3,1909-01-13,114.0,7.4,-23.3,0.0,0,"(114.0, '1909-01-15')"
4,1909-01-14,114.0,7.4,-17.3,0.5,0,"(114.0, '1909-01-15')"


In [30]:
# extract timeseries features

X_features_all = extract_features(
	X_under_all.drop(["binary", "flow_site_id"], axis=1), column_id='id', column_sort='date',
	n_jobs=20, disable_progressbar=False)


X_features_all.head()

Feature Extraction: 100%|██████████| 100/100 [02:04<00:00,  1.25s/it]


Unnamed: 0,flow__variance_larger_than_standard_deviation,flow__has_duplicate_max,flow__has_duplicate_min,flow__has_duplicate,flow__sum_values,flow__abs_energy,flow__mean_abs_change,flow__mean_change,flow__mean_second_derivative_central,flow__median,...,prec__permutation_entropy__dimension_5__tau_1,prec__permutation_entropy__dimension_6__tau_1,prec__permutation_entropy__dimension_7__tau_1,prec__query_similarity_count__query_None__threshold_0.0,"prec__matrix_profile__feature_""min""__threshold_0.98","prec__matrix_profile__feature_""max""__threshold_0.98","prec__matrix_profile__feature_""mean""__threshold_0.98","prec__matrix_profile__feature_""median""__threshold_0.98","prec__matrix_profile__feature_""25""__threshold_0.98","prec__matrix_profile__feature_""75""__threshold_0.98"
"(114.0, '1909-01-14')",0.0,1.0,1.0,1.0,22.2,164.28,0.0,0.0,0.0,7.4,...,,,,,,,,,,
"(114.0, '1909-01-15')",0.0,1.0,0.0,1.0,21.9,159.93,0.15,-0.15,-0.15,7.4,...,,,,,,,,,,
"(114.0, '1909-01-18')",0.0,1.0,1.0,1.0,21.3,151.23,0.0,0.0,0.0,7.1,...,,,,,,,,,,
"(114.0, '1909-02-12')",0.0,1.0,1.0,1.0,17.7,104.43,0.0,0.0,0.0,5.9,...,,,,,,,,,,
"(114.0, '1909-03-05')",0.0,1.0,1.0,1.0,15.9,84.27,0.0,0.0,0.0,5.3,...,,,,,,,,,,


In [31]:
X_features_all = X_features_all.dropna(axis=1)
X_features_all['unique_id'] = X_features_all.index

In [32]:
X_features_all.to_csv(
    '../df_extracted_all_nearmiss_' + str(N_DAYS) + '_time_lag_' + str(TIME_LAG) + '.csv', index=False)


## Build Random Forest model with complete set of extracted timeseries parameters

In [33]:
X_features_all = pd.read_csv(
    '../df_extracted_all_nearmiss_' + str(N_DAYS) + '_time_lag_' + str(TIME_LAG) + '.csv')

all_data_clean = pd.read_csv('../all_data_clean.csv')
all_data_clean['year'] = pd.to_datetime(all_data_clean['date']).dt.year

### to get the binary labels for n days after end of rolled time series
all_data_clean['shifted_date'] = pd.to_datetime(
    all_data_clean.date) + pd.Timedelta(days=TIME_LAG)
all_data_clean['shifted_date'] = all_data_clean['shifted_date'].dt.strftime(
    '%Y-%m-%d')
all_data_clean['unique_id'] = list(
    zip(all_data_clean.flow_site_id, all_data_clean.shifted_date))
all_data_clean = all_data_clean.dropna()
all_data_clean.head()


Unnamed: 0,date,snow_site_id,snow_depth,depth_diff,binary,flow_site_id,temp_site_id,prec_site_id,flow,temp,prec,year,shifted_date,unique_id
0,1965-01-14,75280,0.0,0.0,0,1878.0,74240.0,74240.0,0.051,1.9,1.5,1965,1965-01-24,"(1878.0, 1965-01-24)"
1,1965-01-15,75280,0.0,0.0,0,1878.0,74240.0,74240.0,0.051,1.2,4.0,1965,1965-01-25,"(1878.0, 1965-01-25)"
2,1965-01-16,75280,0.0,0.0,0,1878.0,74240.0,74240.0,0.058,1.5,4.5,1965,1965-01-26,"(1878.0, 1965-01-26)"
3,1965-01-20,75280,0.03,0.03,0,1878.0,74240.0,74240.0,0.076,-0.9,0.0,1965,1965-01-30,"(1878.0, 1965-01-30)"
4,1965-01-21,75280,0.03,0.03,0,1878.0,74240.0,74240.0,0.076,-1.7,2.0,1965,1965-01-31,"(1878.0, 1965-01-31)"


In [34]:
X_features_all['unique_id'] = X_features_all['unique_id'].apply(ast.literal_eval)

In [35]:
X_features_under_all = pd.merge(X_features_all, all_data_clean[[
    'binary', 'year', 'unique_id']], how='left', on='unique_id')
X_features_under_all = X_features_under_all.set_index(
    X_features_under_all['unique_id'], drop=True)
y_under = X_features_under_all['binary']
X_features_under_filtered  =X_features_under_all.replace(np.inf, np.nan)
X_features_under_filtered = X_features_under_filtered.dropna(axis=1)
X_features_under_filtered = X_features_under_filtered.drop(
    columns=['unique_id', 'binary'])
X_features_under_filtered.head()


In [None]:
## run model without year and split by site and time
#results = random_forest_site(X_features_under_filtered.drop(columns=['year']), y_under, results, options={'param_type': 'All', 'year': 'No'})
#results = random_forest_time(X_features_under_filtered.drop(columns=['year']), y_under, results, options={'param_type': 'All', 'year': 'No'})


In [None]:
## run model without year and split by site and time
#results = gradient_boost_site(X_features_under_filtered.drop(columns=['year']), y_under, results, options={'param_type': 'All', 'year': 'No'})
#results = gradient_boost_time(X_features_under_filtered.drop(columns=['year']), y_under, results, options={'param_type': 'All', 'year': 'No'})

In [None]:
## run model with year and split by site and time
#results = random_forest_site(X_features_under_filtered, y_under, results, options={'param_type': 'All', 'year': 'Yes'})
#results = random_forest_time(X_features_under_filtered, y_under, results, options={'param_type': 'All', 'year': 'Yes'})


In [None]:
## run model with year and split by site and time
#results = gradient_boost_site(X_features_under_filtered, y_under, results, options={'param_type': 'All', 'year': 'Yes'})
#results = gradient_boost_time(X_features_under_filtered, y_under, results, options={'param_type': 'All', 'year': 'Yes'})

### Run the model with selected features

In [None]:
# Remove correlated features
X_features_under_filtered = X_features_under_filtered.drop(['flow__length', 'temp__length', 'prec__length'], axis=1)
corr_matrix = X_features_under_filtered.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]
X_selected = X_features_under_filtered.drop(to_drop, axis=1)
X_selected['year'] = X_features_under_filtered['year']
X_selected


In [None]:
## run model without year and split by site and time
results = random_forest_site(X_selected.drop(columns=['year']), y_under, results, options={'param_type': 'Selected', 'year': 'No'})
results = random_forest_time(X_selected.drop(columns=['year']), y_under, results, options={'param_type': 'Selected', 'year': 'No'})

In [None]:
## run model without year and split by site and time
results = svm_site(X_selected.drop(columns=['year']), y_under, results, options={'param_type': 'Selected', 'year': 'No'})
results = svm_time(X_selected.drop(columns=['year']), y_under, results, options={'param_type': 'Selected', 'year': 'No'})


In [None]:
## run model with year and split by site and time
results = random_forest_site(X_selected, y_under, results, options={'param_type': 'Selected', 'year': 'Yes'})
results = random_forest_time(X_selected, y_under, results, options={'param_type': 'Selected', 'year': 'Yes'})

In [None]:
## run model with year and split by site and time
results = svm_site(X_selected, y_under, results, options={'param_type': 'Selected', 'year': 'Yes'})
results = svm_time(X_selected, y_under, results, options={'param_type': 'Selected', 'year': 'Yes'})


In [None]:
results.to_csv('../results_' + str(N_DAYS) + '_time_lag_' + str(TIME_LAG) + '_flow_temp_prec.csv')