# Extract time series parameters from flow and use to predict extreme snowmelt

In [2]:
import pandas as pd
import numpy as np
from collections import Counter
import ast
import dateutil.parser as parser

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV, KFold, cross_validate, PredefinedSplit
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import make_scorer, precision_recall_curve, auc, classification_report
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from matplotlib import pyplot as plt


In [3]:
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction.settings import MinimalFCParameters
from tsfresh.utilities.dataframe_functions import roll_time_series


In [4]:
# build scorer function
def auc_pr_score(y_true, y_pred):
	precision, recall, _ = precision_recall_curve(y_true, y_pred)
	return auc(recall, precision)


auc_pr = make_scorer(auc_pr_score, greater_is_better=True)

N_DAYS = 30
TIME_LAG = 2

In [5]:
## Define functions

def random_forest_site(X_under, y_under, results, options):
	indx = X_under.index.unique()
	sites = [x[0] for x in indx]
	sites = list(set(sites))
	sites_train = sites[:len(sites)//10*6]
	sites_test = sites[len(sites)//10*6:]
	idx_train = [x for x in indx if x[0] in sites_train]
	idx_test = [x for x in indx if x[0] in sites_test]
	X_filtered_train = X_under[X_under.index.isin(idx_train)]
	X_filtered_test = X_under[X_under.index.isin(idx_test)]
	y_train = y_under[y_under.index.isin(idx_train)]
	y_test = y_under[y_under.index.isin(idx_test)]

	X_all = pd.concat([X_filtered_train, X_filtered_test]).reset_index(drop=True)
	y_all = pd.concat([y_train, y_test]).reset_index(drop=True)
	clf = RandomForestClassifier(n_jobs=-1, random_state=42, verbose=0)

	split_index = [-1 if x in X_filtered_train.index else 0 for x in X_under.index]
	ps = PredefinedSplit(test_fold=split_index)

	param_grid = {
		'max_depth': (1, 5, 10),
		'n_estimators': (100, 500, 750, 1500),
		'max_features': (2, 3, 5, 10)}

	gs = GridSearchCV(clf, param_grid=param_grid, cv=ps,
					scoring=auc_pr, n_jobs=-1, verbose=1)
	gs.fit(X_all, y_all)
	print(gs.best_params_, gs.best_score_)
	results = results.append({'n_days': N_DAYS, 'time_lag': TIME_LAG, 'eval_type': 'site', 'param_type': options['param_type'], 'year': options['year'],
                           'params': gs.best_params_, 'score': gs.best_score_}, ignore_index=True)
	
	return results

def random_forest_time(X_under, y_under, results, options):
	X_filtered_sorted = X_under.sort_index(key=lambda d: d.map(lambda x: x[1]))
	y_under_sorted = y_under.sort_index(key=lambda d: d.map(lambda x: x[1]))
	tscv = TimeSeriesSplit(n_splits=5)

	param_grid = {
		'max_depth': (1, 5, 10),
		'n_estimators': (100, 500, 750, 1500),
		'max_features': (2, 3, 5, 10)}

	clf = RandomForestClassifier(n_jobs=-1, random_state=42, verbose=0)

	gs = GridSearchCV(clf, param_grid=param_grid, cv=tscv,
					scoring=auc_pr, n_jobs=-1, verbose=1)
	gs.fit(X_filtered_sorted, y_under_sorted)
	
	results = results.append({'n_days': N_DAYS, 'time_lag': TIME_LAG, 'eval_type': 'time', 'param_type': options['param_type'], 'year': options['year'], 
                           'params': gs.best_params_, 'score': gs.best_score_}, ignore_index=True)
	print(gs.best_params_, gs.best_score_)

	return results



## Create rolled dataframe 

In [6]:
all_data_clean = pd.read_csv('../all_data_clean.csv')

all_data_clean


Unnamed: 0,date,snow_site_id,snow_depth,depth_diff,binary,flow_site_id,temp_site_id,prec_site_id,flow,temp,prec,year,month
0,1965-01-14,75280,0.00,0.00,0,1878.0,74240.0,75280.0,0.0510,1.9,1.0,1965,1
1,1965-01-15,75280,0.00,0.00,0,1878.0,74240.0,75280.0,0.0510,1.2,0.0,1965,1
2,1965-01-16,75280,0.00,0.00,0,1878.0,74240.0,75280.0,0.0580,1.5,6.2,1965,1
3,1965-01-20,75280,0.03,0.03,0,1878.0,74240.0,75280.0,0.0760,-0.9,0.0,1965,1
4,1965-01-21,75280,0.03,0.03,0,1878.0,74240.0,75280.0,0.0760,-1.7,2.7,1965,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
166815,2021-06-09,160970,0.00,0.00,0,1387.0,160970.0,160960.0,340.1524,17.2,0.0,2021,6
166816,2021-06-23,160970,0.00,0.00,0,1387.0,160970.0,160960.0,354.6912,14.6,0.0,2021,6
166817,2021-06-24,160970,0.00,0.00,0,1387.0,160970.0,160960.0,340.7535,17.3,0.0,2021,6
166818,2021-06-30,160970,0.00,0.00,0,1387.0,160970.0,160960.0,269.8846,15.0,0.0,2021,6


In [7]:

## Here can change parameters only once
df_rolled = roll_time_series(
    all_data_clean[['date', 'flow_site_id', 'flow', 'temp', 'prec', 'binary']], column_id="flow_site_id", column_sort="date", max_timeshift=N_DAYS, min_timeshift=N_DAYS - 1, n_jobs=20)


Rolling: 100%|██████████| 100/100 [00:55<00:00,  1.81it/s]


In [8]:
df_rolled

Unnamed: 0,date,flow_site_id,flow,temp,prec,binary,id
0,1969-11-03,436.0,13.4037,-10.6,0.0,0,"(436.0, 1970-01-08)"
1,1969-11-11,436.0,8.3640,-2.8,0.0,0,"(436.0, 1970-01-08)"
2,1969-11-12,436.0,7.9604,-6.3,11.7,0,"(436.0, 1970-01-08)"
3,1969-11-13,436.0,7.7586,-1.4,0.1,0,"(436.0, 1970-01-08)"
4,1969-11-14,436.0,7.3548,-7.7,0.0,0,"(436.0, 1970-01-08)"
...,...,...,...,...,...,...,...
739373,2021-07-28,10006.0,48.0000,18.3,0.0,0,"(10006.0, 2021-08-01)"
739374,2021-07-29,10006.0,46.0000,12.6,0.0,0,"(10006.0, 2021-08-01)"
739375,2021-07-30,10006.0,45.0000,10.0,0.0,0,"(10006.0, 2021-08-01)"
739376,2021-07-31,10006.0,43.0000,9.1,9.6,0,"(10006.0, 2021-08-01)"


In [9]:
df_rolled.to_csv('../df_rolled_' + str(N_DAYS) + '.csv', index=False)

## Extract minimal timeseries features

In [10]:
df_rolled = pd.read_csv('../df_rolled_' + str(N_DAYS) + '.csv')
all_data_clean = pd.read_csv('../all_data_clean.csv')

In [11]:
df_rolled

Unnamed: 0,date,flow_site_id,flow,temp,prec,binary,id
0,1969-11-03,436.0,13.4037,-10.6,0.0,0,"(436.0, '1970-01-08')"
1,1969-11-11,436.0,8.3640,-2.8,0.0,0,"(436.0, '1970-01-08')"
2,1969-11-12,436.0,7.9604,-6.3,11.7,0,"(436.0, '1970-01-08')"
3,1969-11-13,436.0,7.7586,-1.4,0.1,0,"(436.0, '1970-01-08')"
4,1969-11-14,436.0,7.3548,-7.7,0.0,0,"(436.0, '1970-01-08')"
...,...,...,...,...,...,...,...
5139482,2021-07-28,10006.0,48.0000,18.3,0.0,0,"(10006.0, '2021-08-01')"
5139483,2021-07-29,10006.0,46.0000,12.6,0.0,0,"(10006.0, '2021-08-01')"
5139484,2021-07-30,10006.0,45.0000,10.0,0.0,0,"(10006.0, '2021-08-01')"
5139485,2021-07-31,10006.0,43.0000,9.1,9.6,0,"(10006.0, '2021-08-01')"


In [12]:
# extract timeseries features

X_features_all = extract_features(
	df_rolled.drop(["binary", "flow_site_id"], axis=1), column_id='id', column_sort='date',
	n_jobs=20, disable_progressbar=False, default_fc_parameters=MinimalFCParameters())


X_features_all.head()


Feature Extraction: 100%|██████████| 100/100 [02:12<00:00,  1.33s/it]


Unnamed: 0,flow__sum_values,flow__median,flow__mean,flow__length,flow__standard_deviation,flow__variance,flow__root_mean_square,flow__maximum,flow__minimum,temp__sum_values,...,temp__minimum,prec__sum_values,prec__median,prec__mean,prec__length,prec__standard_deviation,prec__variance,prec__root_mean_square,prec__maximum,prec__minimum
"(10006.0, '2008-12-01')",1697.74,58.95,56.591333,30.0,14.401176,207.393865,58.394973,75.7,31.96,-254.5,...,-21.2,44.1,0.5,1.47,30.0,2.102562,4.420767,2.565476,9.9,0.0
"(10006.0, '2008-12-02')",1728.25,57.23,55.75,31.0,14.897617,221.938994,57.706165,75.7,30.51,-270.6,...,-21.2,44.1,0.5,1.422581,31.0,2.084615,4.345619,2.523758,9.9,0.0
"(10006.0, '2008-12-03')",1682.01,55.27,54.258387,31.0,15.138392,229.17091,56.330662,75.7,29.46,-271.8,...,-21.2,47.1,0.5,1.519355,31.0,2.085962,4.351238,2.580635,9.9,0.0
"(10006.0, '2008-12-04')",1635.43,52.75,52.755806,31.0,15.246897,232.467863,54.91487,75.7,29.12,-271.8,...,-21.2,45.7,0.5,1.474194,31.0,2.083726,4.341915,2.552481,9.9,0.0
"(10006.0, '2008-12-05')",1588.51,51.54,51.242258,31.0,15.222957,231.73843,53.455659,75.7,28.78,-280.3,...,-21.2,45.7,0.5,1.474194,31.0,2.083726,4.341915,2.552481,9.9,0.0


In [13]:
## Add binary response variable back based on unique id

X_features_all['unique_id'] = X_features_all.index
X_features_all['unique_id'] = X_features_all['unique_id'].apply(ast.literal_eval)

all_data_clean['shifted_date'] = pd.to_datetime(
    all_data_clean.date) + pd.Timedelta(days=TIME_LAG)
all_data_clean['shifted_date'] = all_data_clean['shifted_date'].dt.strftime(
    '%Y-%m-%d')
all_data_clean['unique_id'] = list(
    zip(all_data_clean.flow_site_id, all_data_clean.shifted_date))
all_data_clean = all_data_clean.dropna()

X_features_all = X_features_all.reset_index(drop=True)
	
X_features_all = pd.merge(X_features_all, all_data_clean[[
                          'binary', 'unique_id']], how='left', on='unique_id')
X_features_all = X_features_all.set_index(
    X_features_all['unique_id'], drop=True)
X_features_all = X_features_all.dropna()
X_features_all.head()


Unnamed: 0_level_0,flow__sum_values,flow__median,flow__mean,flow__length,flow__standard_deviation,flow__variance,flow__root_mean_square,flow__maximum,flow__minimum,temp__sum_values,...,prec__median,prec__mean,prec__length,prec__standard_deviation,prec__variance,prec__root_mean_square,prec__maximum,prec__minimum,unique_id,binary
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(10006.0, 2008-12-01)",1697.74,58.95,56.591333,30.0,14.401176,207.393865,58.394973,75.7,31.96,-254.5,...,0.5,1.47,30.0,2.102562,4.420767,2.565476,9.9,0.0,"(10006.0, 2008-12-01)",0.0
"(10006.0, 2008-12-02)",1728.25,57.23,55.75,31.0,14.897617,221.938994,57.706165,75.7,30.51,-270.6,...,0.5,1.422581,31.0,2.084615,4.345619,2.523758,9.9,0.0,"(10006.0, 2008-12-02)",0.0
"(10006.0, 2008-12-03)",1682.01,55.27,54.258387,31.0,15.138392,229.17091,56.330662,75.7,29.46,-271.8,...,0.5,1.519355,31.0,2.085962,4.351238,2.580635,9.9,0.0,"(10006.0, 2008-12-03)",0.0
"(10006.0, 2008-12-04)",1635.43,52.75,52.755806,31.0,15.246897,232.467863,54.91487,75.7,29.12,-271.8,...,0.5,1.474194,31.0,2.083726,4.341915,2.552481,9.9,0.0,"(10006.0, 2008-12-04)",0.0
"(10006.0, 2008-12-05)",1588.51,51.54,51.242258,31.0,15.222957,231.73843,53.455659,75.7,28.78,-280.3,...,0.5,1.474194,31.0,2.083726,4.341915,2.552481,9.9,0.0,"(10006.0, 2008-12-05)",0.0


In [14]:
X_features_all.to_csv('../df_extracted_min_' + str(N_DAYS) + '.csv', index=False)

## Undersample minimal timeseries feature dataset and run Random Forest model

In [15]:
X_features_all = pd.read_csv('../df_extracted_min_' + str(N_DAYS) + '.csv')


In [16]:
X_features_all['unique_id'] = X_features_all['unique_id'].apply(ast.literal_eval)
X_features_all.head()


Unnamed: 0,flow__sum_values,flow__median,flow__mean,flow__length,flow__standard_deviation,flow__variance,flow__root_mean_square,flow__maximum,flow__minimum,temp__sum_values,...,prec__median,prec__mean,prec__length,prec__standard_deviation,prec__variance,prec__root_mean_square,prec__maximum,prec__minimum,unique_id,binary
0,1697.74,58.95,56.591333,30.0,14.401176,207.393865,58.394973,75.7,31.96,-254.5,...,0.5,1.47,30.0,2.102562,4.420767,2.565476,9.9,0.0,"(10006.0, 2008-12-01)",0.0
1,1728.25,57.23,55.75,31.0,14.897617,221.938994,57.706165,75.7,30.51,-270.6,...,0.5,1.422581,31.0,2.084615,4.345619,2.523758,9.9,0.0,"(10006.0, 2008-12-02)",0.0
2,1682.01,55.27,54.258387,31.0,15.138392,229.17091,56.330662,75.7,29.46,-271.8,...,0.5,1.519355,31.0,2.085962,4.351238,2.580635,9.9,0.0,"(10006.0, 2008-12-03)",0.0
3,1635.43,52.75,52.755806,31.0,15.246897,232.467863,54.91487,75.7,29.12,-271.8,...,0.5,1.474194,31.0,2.083726,4.341915,2.552481,9.9,0.0,"(10006.0, 2008-12-04)",0.0
4,1588.51,51.54,51.242258,31.0,15.222957,231.73843,53.455659,75.7,28.78,-280.3,...,0.5,1.474194,31.0,2.083726,4.341915,2.552481,9.9,0.0,"(10006.0, 2008-12-05)",0.0


In [17]:
y1 = X_features_all['binary']
Counter(y1)

Counter({0.0: 149146, 1.0: 9297})

In [18]:
## undersample

from imblearn.under_sampling import NearMiss

undersample = NearMiss(version=3, n_neighbors=3)
X_under, y_under = undersample.fit_resample(
    X_features_all.drop(columns=['binary', 'unique_id']), y1)


In [19]:
X_under.index = X_features_all['unique_id'][undersample.sample_indices_]
y_under.index = X_features_all['unique_id'][undersample.sample_indices_]
Counter(y_under)


Counter({0.0: 9297, 1.0: 9297})

In [20]:
results = pd.DataFrame(columns=['n_days', 'time_lag', 'eval_type', 'param_type', 'year', 'params', 'score'])

In [21]:
## run model without year and split by site and time
results = random_forest_site(X_under, y_under, results, options={'param_type': 'Minimal', 'year': 'No'})
results = random_forest_time(X_under, y_under, results, options={'param_type': 'Minimal', 'year': 'No'})


Fitting 1 folds for each of 48 candidates, totalling 48 fits
{'max_depth': 10, 'max_features': 2, 'n_estimators': 750} 0.8137492917831822
Fitting 5 folds for each of 48 candidates, totalling 240 fits
{'max_depth': 10, 'max_features': 3, 'n_estimators': 750} 0.7602084094693671


In [22]:
## add year to features
dates = [parser.parse(x[1]).year for x in X_under.index]
X_under['year'] = dates


In [23]:
## run model with year and split by site and time
results = random_forest_site(X_under, y_under, results, options={'param_type': 'Minimal', 'year': 'Yes'})
results = random_forest_time(X_under, y_under, results, options={'param_type': 'Minimal', 'year': 'Yes'})


Fitting 1 folds for each of 48 candidates, totalling 48 fits
{'max_depth': 10, 'max_features': 2, 'n_estimators': 750} 0.8128651777294058
Fitting 5 folds for each of 48 candidates, totalling 240 fits
{'max_depth': 10, 'max_features': 3, 'n_estimators': 1500} 0.7627111300681941


In [24]:
df_rolled['id'] = df_rolled['id'].apply(ast.literal_eval)
df_rolled.id

0            (436.0, 1970-01-08)
1            (436.0, 1970-01-08)
2            (436.0, 1970-01-08)
3            (436.0, 1970-01-08)
4            (436.0, 1970-01-08)
                   ...          
5139482    (10006.0, 2021-08-01)
5139483    (10006.0, 2021-08-01)
5139484    (10006.0, 2021-08-01)
5139485    (10006.0, 2021-08-01)
5139486    (10006.0, 2021-08-01)
Name: id, Length: 5139487, dtype: object

In [25]:
X_under_all = df_rolled[df_rolled.id.isin(X_under.index)]
X_under_all


Unnamed: 0,date,flow_site_id,flow,temp,prec,binary,id
1270,1970-01-19,436.0,3.0093,-14.5,1.6,0,"(436.0, 1970-03-09)"
1271,1970-01-20,436.0,2.9076,-9.0,0.6,0,"(436.0, 1970-03-09)"
1272,1970-01-21,436.0,2.9078,-0.7,0.0,0,"(436.0, 1970-03-09)"
1273,1970-01-22,436.0,2.9079,-4.3,0.3,0,"(436.0, 1970-03-09)"
1274,1970-01-23,436.0,2.9080,-9.4,0.0,0,"(436.0, 1970-03-09)"
...,...,...,...,...,...,...,...
5137436,2021-05-23,10006.0,296.0000,1.1,0.0,0,"(10006.0, 2021-05-27)"
5137437,2021-05-24,10006.0,269.0000,2.0,0.4,0,"(10006.0, 2021-05-27)"
5137438,2021-05-25,10006.0,247.0000,3.6,0.0,0,"(10006.0, 2021-05-27)"
5137439,2021-05-26,10006.0,229.0000,5.2,0.0,0,"(10006.0, 2021-05-27)"


In [26]:
X_under_all.to_csv('../df_undersampled_nearmiss_' +
                   str(N_DAYS) + '_time_lag_' + str(TIME_LAG) + '.csv', index=False)


## Extract complete set of timeseries features

In [27]:
X_under_all = pd.read_csv('../df_undersampled_nearmiss_' +
                          str(N_DAYS) + '_time_lag_' + str(TIME_LAG) + '.csv')
X_under_all.head()

Unnamed: 0,date,flow_site_id,flow,temp,prec,binary,id
0,1970-01-19,436.0,3.0093,-14.5,1.6,0,"(436.0, '1970-03-09')"
1,1970-01-20,436.0,2.9076,-9.0,0.6,0,"(436.0, '1970-03-09')"
2,1970-01-21,436.0,2.9078,-0.7,0.0,0,"(436.0, '1970-03-09')"
3,1970-01-22,436.0,2.9079,-4.3,0.3,0,"(436.0, '1970-03-09')"
4,1970-01-23,436.0,2.908,-9.4,0.0,0,"(436.0, '1970-03-09')"


In [28]:
# extract timeseries features

X_features_all = extract_features(
	X_under_all.drop(["binary", "flow_site_id"], axis=1), column_id='id', column_sort='date',
	n_jobs=20, disable_progressbar=False)


X_features_all.head()

Feature Extraction: 100%|██████████| 100/100 [04:55<00:00,  2.95s/it]


Unnamed: 0,flow__variance_larger_than_standard_deviation,flow__has_duplicate_max,flow__has_duplicate_min,flow__has_duplicate,flow__sum_values,flow__abs_energy,flow__mean_abs_change,flow__mean_change,flow__mean_second_derivative_central,flow__median,...,prec__permutation_entropy__dimension_5__tau_1,prec__permutation_entropy__dimension_6__tau_1,prec__permutation_entropy__dimension_7__tau_1,prec__query_similarity_count__query_None__threshold_0.0,"prec__matrix_profile__feature_""min""__threshold_0.98","prec__matrix_profile__feature_""max""__threshold_0.98","prec__matrix_profile__feature_""mean""__threshold_0.98","prec__matrix_profile__feature_""median""__threshold_0.98","prec__matrix_profile__feature_""25""__threshold_0.98","prec__matrix_profile__feature_""75""__threshold_0.98"
"(10006.0, '2008-12-19')",1.0,0.0,1.0,1.0,985.4,34576.9774,1.105333,-1.105333,0.020862,29.12,...,2.968392,3.204778,3.218876,,,,,,,
"(10006.0, '2008-12-20')",1.0,0.0,0.0,1.0,951.14,32136.295,1.101667,-1.101667,0.011552,28.78,...,3.019736,3.204778,3.218876,,,,,,,
"(10006.0, '2008-12-21')",1.0,0.0,0.0,1.0,917.66,29806.087,1.057,-1.057,0.012414,27.79,...,3.019736,3.204778,3.218876,,,,,,,
"(10006.0, '2008-12-28')",1.0,0.0,0.0,1.0,725.22,18080.373,0.693667,-0.693667,0.032241,21.26,...,3.07108,3.204778,3.218876,,,,,,,
"(10006.0, '2008-12-29')",1.0,0.0,0.0,1.0,703.53,16974.8337,0.654667,-0.654667,0.005172,20.77,...,3.019736,3.204778,3.218876,,,,,,,


In [29]:
X_features_all = X_features_all.dropna(axis=1)
X_features_all['unique_id'] = X_features_all.index

In [30]:
X_features_all.to_csv(
    '../df_extracted_all_nearmiss_' + str(N_DAYS) + '_time_lag_' + str(TIME_LAG) + '.csv', index=False)


## Build Random Forest model with complete set of extracted timeseries parameters

In [31]:
X_features_all = pd.read_csv(
    '../df_extracted_all_nearmiss_' + str(N_DAYS) + '_time_lag_' + str(TIME_LAG) + '.csv')

all_data_clean = pd.read_csv('../all_data_clean.csv')

### to get the binary labels for n days after end of rolled time series
all_data_clean['shifted_date'] = pd.to_datetime(
    all_data_clean.date) + pd.Timedelta(days=TIME_LAG)
all_data_clean['shifted_date'] = all_data_clean['shifted_date'].dt.strftime(
    '%Y-%m-%d')
all_data_clean['unique_id'] = list(
    zip(all_data_clean.flow_site_id, all_data_clean.shifted_date))
all_data_clean = all_data_clean.dropna()
all_data_clean.head()


Unnamed: 0,date,snow_site_id,snow_depth,depth_diff,binary,flow_site_id,temp_site_id,prec_site_id,flow,temp,prec,year,month,shifted_date,unique_id
0,1965-01-14,75280,0.0,0.0,0,1878.0,74240.0,75280.0,0.051,1.9,1.0,1965,1,1965-01-16,"(1878.0, 1965-01-16)"
1,1965-01-15,75280,0.0,0.0,0,1878.0,74240.0,75280.0,0.051,1.2,0.0,1965,1,1965-01-17,"(1878.0, 1965-01-17)"
2,1965-01-16,75280,0.0,0.0,0,1878.0,74240.0,75280.0,0.058,1.5,6.2,1965,1,1965-01-18,"(1878.0, 1965-01-18)"
3,1965-01-20,75280,0.03,0.03,0,1878.0,74240.0,75280.0,0.076,-0.9,0.0,1965,1,1965-01-22,"(1878.0, 1965-01-22)"
4,1965-01-21,75280,0.03,0.03,0,1878.0,74240.0,75280.0,0.076,-1.7,2.7,1965,1,1965-01-23,"(1878.0, 1965-01-23)"


In [32]:
X_features_all['unique_id'] = X_features_all['unique_id'].apply(ast.literal_eval)


In [33]:
X_features_under_all = pd.merge(X_features_all, all_data_clean[[
    'binary', 'year', 'unique_id']], how='left', on='unique_id')
X_features_under_all = X_features_under_all.set_index(
    X_features_under_all['unique_id'], drop=True)
y_under = X_features_under_all['binary']
X_features_under_filtered  =X_features_under_all.replace(np.inf, np.nan)
X_features_under_filtered = X_features_under_filtered.dropna(axis=1)
X_features_under_filtered = X_features_under_filtered.drop(
    columns=['unique_id', 'binary'])
X_features_under_filtered.head()


Unnamed: 0_level_0,flow__variance_larger_than_standard_deviation,flow__has_duplicate_max,flow__has_duplicate_min,flow__has_duplicate,flow__sum_values,flow__abs_energy,flow__mean_abs_change,flow__mean_change,flow__mean_second_derivative_central,flow__median,...,prec__lempel_ziv_complexity__bins_3,prec__lempel_ziv_complexity__bins_5,prec__lempel_ziv_complexity__bins_10,prec__lempel_ziv_complexity__bins_100,prec__permutation_entropy__dimension_3__tau_1,prec__permutation_entropy__dimension_4__tau_1,prec__permutation_entropy__dimension_5__tau_1,prec__permutation_entropy__dimension_6__tau_1,prec__permutation_entropy__dimension_7__tau_1,year
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(10006.0, 2008-12-19)",1.0,0.0,1.0,1.0,985.4,34576.9774,1.105333,-1.105333,0.020862,29.12,...,0.419355,0.451613,0.516129,0.645161,1.674369,2.462814,2.968392,3.204778,3.218876,2008
"(10006.0, 2008-12-20)",1.0,0.0,0.0,1.0,951.14,32136.295,1.101667,-1.101667,0.011552,28.78,...,0.419355,0.483871,0.548387,0.677419,1.720621,2.552172,3.019736,3.204778,3.218876,2008
"(10006.0, 2008-12-21)",1.0,0.0,0.0,1.0,917.66,29806.087,1.057,-1.057,0.012414,27.79,...,0.419355,0.516129,0.548387,0.677419,1.720621,2.632506,3.019736,3.204778,3.218876,2008
"(10006.0, 2008-12-28)",1.0,0.0,0.0,1.0,725.22,18080.373,0.693667,-0.693667,0.032241,21.26,...,0.451613,0.483871,0.516129,0.612903,1.693799,2.651193,3.07108,3.204778,3.218876,2008
"(10006.0, 2008-12-29)",1.0,0.0,0.0,1.0,703.53,16974.8337,0.654667,-0.654667,0.005172,20.77,...,0.419355,0.483871,0.483871,0.580645,1.633344,2.570859,3.019736,3.204778,3.218876,2008


In [34]:
## run model without year and split by site and time
results = random_forest_site(X_features_under_filtered.drop(columns=['year']), y_under, results, options={'param_type': 'All', 'year': 'No'})
results = random_forest_time(X_features_under_filtered.drop(columns=['year']), y_under, results, options={'param_type': 'All', 'year': 'No'})


Fitting 1 folds for each of 48 candidates, totalling 48 fits
{'max_depth': 10, 'max_features': 10, 'n_estimators': 1500} 0.8363553534105892
Fitting 5 folds for each of 48 candidates, totalling 240 fits
{'max_depth': 10, 'max_features': 10, 'n_estimators': 100} 0.7944154579574428


In [35]:
## run model with year and split by site and time
results = random_forest_site(X_features_under_filtered, y_under, results, options={'param_type': 'All', 'year': 'Yes'})
results = random_forest_time(X_features_under_filtered, y_under, results, options={'param_type': 'All', 'year': 'Yes'})


Fitting 1 folds for each of 48 candidates, totalling 48 fits
{'max_depth': 10, 'max_features': 10, 'n_estimators': 500} 0.8364200451679122
Fitting 5 folds for each of 48 candidates, totalling 240 fits
{'max_depth': 10, 'max_features': 10, 'n_estimators': 1500} 0.7930323832727459


### Run the model with selected features

In [36]:
sel = SelectFromModel(RandomForestClassifier(n_jobs=-1, random_state=42))
sel.fit(X_features_under_filtered.drop(columns=['year']), y_under)
selected_feat = X_features_under_filtered.drop(
    columns=['year']).columns[(sel.get_support())]
X_selected = X_features_under_filtered[selected_feat]
X_selected['year'] = X_features_under_filtered['year']
X_selected

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_selected['year'] = X_features_under_filtered['year']


Unnamed: 0_level_0,flow__abs_energy,flow__mean_change,flow__mean_second_derivative_central,flow__median,flow__mean,flow__variance,flow__skewness,flow__kurtosis,flow__root_mean_square,flow__absolute_sum_of_changes,...,"prec__linear_trend__attr_""intercept""","prec__linear_trend__attr_""slope""","prec__linear_trend__attr_""stderr""","prec__agg_linear_trend__attr_""rvalue""__chunk_len_5__f_agg_""mean""","prec__agg_linear_trend__attr_""rvalue""__chunk_len_5__f_agg_""var""","prec__agg_linear_trend__attr_""rvalue""__chunk_len_10__f_agg_""mean""","prec__agg_linear_trend__attr_""intercept""__chunk_len_10__f_agg_""var""","prec__agg_linear_trend__attr_""slope""__chunk_len_5__f_agg_""max""","prec__agg_linear_trend__attr_""stderr""__chunk_len_10__f_agg_""var""",year
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"(10006.0, 2008-12-19)",34576.977400,-1.105333,0.020862,29.1200,31.787097,104.966846,0.666122,-0.735178,33.397401,33.1600,...,2.576411,-0.050040,0.049233,-0.694684,-0.187292,-0.964455,6.03413,-0.621429,2.208811,2008
"(10006.0, 2008-12-20)",32136.295000,-1.101667,0.011552,28.7800,30.681935,95.273512,0.707253,-0.592379,32.197122,33.0500,...,2.883669,-0.069234,0.048160,-0.788692,-0.232711,-0.996016,5.95328,-0.775000,2.285604,2008
"(10006.0, 2008-12-21)",29806.087000,-1.057000,0.012414,27.7900,29.601935,85.212093,0.728868,-0.483604,31.007849,31.7100,...,3.010685,-0.074476,0.047706,-0.435232,-0.325023,-0.365434,6.29468,-0.742857,0.836975,2008
"(10006.0, 2008-12-28)",18080.373000,-0.693667,0.032241,21.2600,23.394194,35.949547,0.493074,-0.890704,24.150318,20.8100,...,2.616129,-0.077419,0.047224,-0.767213,-0.741838,-0.956290,9.17665,-1.239286,0.626653,2008
"(10006.0, 2008-12-29)",16974.833700,-0.654667,0.005172,20.7700,22.694516,32.534218,0.443320,-1.012500,23.400327,19.6400,...,2.308266,-0.066573,0.046641,-0.830542,-0.828098,-0.998268,9.05583,-1.425000,0.676297,2008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(591.0, 2021-05-25)",80088.783429,3.369363,-0.230514,6.2219,28.099781,1793.911471,1.797640,1.619215,50.828232,156.8491,...,0.329234,0.076976,0.083202,0.161076,0.206865,0.009717,8.09065,0.717857,9.221192,2021
"(591.0, 2021-05-27)",101145.702559,3.237840,0.019233,6.5219,34.333561,2083.971168,1.306977,-0.053901,57.120614,162.9002,...,0.885685,0.039879,0.084095,0.094906,0.170083,-0.051790,7.81605,0.403571,9.403951,2021
"(591.0, 2021-05-28)",113950.377716,3.570217,0.172198,7.0266,37.788768,2247.827669,1.100367,-0.622573,60.628530,172.8715,...,1.163911,0.021331,0.084328,-0.114942,-0.062179,-0.258199,14.76880,-0.321429,9.768643,2021
"(591.0, 2021-05-30)",143508.778124,3.906463,0.059507,8.8010,45.240410,2582.620756,0.749833,-1.367912,68.039073,182.9363,...,1.720363,-0.015766,0.084370,-0.156928,-0.041968,-0.261960,14.84316,-0.335714,9.815616,2021


In [37]:
# Remove correlated features
corr_matrix = X_selected.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]
X_selected.drop(to_drop, axis=1, inplace=True)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [38]:
## run model without year and split by site and time
results = random_forest_site(X_selected.drop(columns=['year']), y_under, results, options={'param_type': 'Selected', 'year': 'No'})
results = random_forest_time(X_selected.drop(columns=['year']), y_under, results, options={'param_type': 'Selected', 'year': 'No'})

Fitting 1 folds for each of 48 candidates, totalling 48 fits
{'max_depth': 10, 'max_features': 10, 'n_estimators': 100} 0.8388884764344147
Fitting 5 folds for each of 48 candidates, totalling 240 fits
{'max_depth': 10, 'max_features': 10, 'n_estimators': 500} 0.7949286243885687


In [39]:
## run model with year and split by site and time
results = random_forest_site(X_selected, y_under, results, options={'param_type': 'Selected', 'year': 'Yes'})
results = random_forest_time(X_selected, y_under, results, options={'param_type': 'Selected', 'year': 'Yes'})

Fitting 1 folds for each of 48 candidates, totalling 48 fits
{'max_depth': 10, 'max_features': 10, 'n_estimators': 1500} 0.8378582906428896
Fitting 5 folds for each of 48 candidates, totalling 240 fits
{'max_depth': 10, 'max_features': 10, 'n_estimators': 750} 0.7952719292470424


In [40]:
results.to_csv('../results_' + str(N_DAYS) + '_time_lag_' + str(TIME_LAG) + '_flow_temp_prec.csv')