### Extract time-series features from flow and use them to predict extreme snowmelt

In [1]:
import ast

import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction.settings import MinimalFCParameters
from tsfresh.utilities.dataframe_functions import roll_time_series 


In [10]:
# Load the cleaned flow / snow-depth observations and preview the first rows.
clean_data_path = '../all_data_clean.csv'
all_data_clean = pd.read_csv(clean_data_path)

all_data_clean.head()


Unnamed: 0.1,Unnamed: 0,date,snow_depth,depth_diff,binary,flow_site_id,snow_site_id,distance,flow,year,month,flow_prev_month,flow_prev_month_max,site_id
0,0,1945-05-23,0.0,0.0,0,51.0,157010.0,35296.19718,182.0,1945,5,413.3518,1746.0,51.0 157010.0
1,1,1945-05-24,0.0,0.0,0,51.0,157010.0,35296.19718,176.0,1945,5,413.3518,1746.0,51.0 157010.0
2,2,1945-05-25,0.0,0.0,0,51.0,157010.0,35296.19718,164.0,1945,5,413.3518,1746.0,51.0 157010.0
3,3,1945-05-26,0.0,0.0,0,51.0,157010.0,35296.19718,152.0,1945,5,413.3518,1746.0,51.0 157010.0
4,4,1945-05-27,0.0,0.0,0,51.0,157010.0,35296.19718,143.0,1945,5,413.3518,1746.0,51.0 157010.0


In [11]:

# Cut each site's flow record into rolling windows, one sub-series per
# (site_id, date) pair; `binary` is carried along so it stays next to the flow values.
# min_timeshift / max_timeshift bound how far back each window reaches.
flow_by_site = all_data_clean[['date', 'site_id', 'flow', 'binary']]

df_rolled = roll_time_series(
    flow_by_site,
    column_id="site_id",
    column_sort="date",
    min_timeshift=29,
    max_timeshift=30,
    n_jobs=20,
)


Rolling: 100%|██████████| 100/100 [20:02<00:00, 12.02s/it] 


In [12]:
# Inspect the rolled frame: roll_time_series has added an `id` column that
# pairs the site with a date identifying the window.
df_rolled

Unnamed: 0,date,site_id,flow,binary,id
12202251,1945-01-23,100.0 122510.0,97.0,0,"(100.0 122510.0, 1945-02-21)"
12202252,1945-01-24,100.0 122510.0,97.0,0,"(100.0 122510.0, 1945-02-21)"
12202253,1945-01-25,100.0 122510.0,95.0,0,"(100.0 122510.0, 1945-02-21)"
12202254,1945-01-26,100.0 122510.0,95.0,0,"(100.0 122510.0, 1945-02-21)"
12202255,1945-01-27,100.0 122510.0,94.0,1,"(100.0 122510.0, 1945-02-21)"
...,...,...,...,...,...
15189511,1959-10-24,937.0 104720.0,2.7,0,"(937.0 104720.0, 1959-10-31)"
15189512,1959-10-28,937.0 104720.0,3.3,0,"(937.0 104720.0, 1959-10-31)"
15189513,1959-10-29,937.0 104720.0,3.5,0,"(937.0 104720.0, 1959-10-31)"
15189514,1959-10-30,937.0 104720.0,2.8,0,"(937.0 104720.0, 1959-10-31)"


In [13]:
# Cache the rolled windows on disk so the slow rolling step does not have to be repeated.
rolled_cache_path = '../df_rolled.csv'
df_rolled.to_csv(rolled_cache_path, index=False)

In [17]:
# Reload the cached rolled windows.
# NOTE: after the CSV round trip the `id` column holds the *string* form of
# each (site_id, date) tuple, not a tuple object.
rolled_cache_path = '../df_rolled.csv'
df_rolled = pd.read_csv(rolled_cache_path)


In [18]:
# Inspect the reloaded frame; `id` is now text such as "('100.0 122510.0', '1945-02-21')".
df_rolled

Unnamed: 0,date,site_id,flow,binary,id
0,1945-01-23,100.0 122510.0,97.0,0,"('100.0 122510.0', '1945-02-21')"
1,1945-01-24,100.0 122510.0,97.0,0,"('100.0 122510.0', '1945-02-21')"
2,1945-01-25,100.0 122510.0,95.0,0,"('100.0 122510.0', '1945-02-21')"
3,1945-01-26,100.0 122510.0,95.0,0,"('100.0 122510.0', '1945-02-21')"
4,1945-01-27,100.0 122510.0,94.0,1,"('100.0 122510.0', '1945-02-21')"
...,...,...,...,...,...
46453011,1959-10-24,937.0 104720.0,2.7,0,"('937.0 104720.0', '1959-10-31')"
46453012,1959-10-28,937.0 104720.0,3.3,0,"('937.0 104720.0', '1959-10-31')"
46453013,1959-10-29,937.0 104720.0,3.5,0,"('937.0 104720.0', '1959-10-31')"
46453014,1959-10-30,937.0 104720.0,2.8,0,"('937.0 104720.0', '1959-10-31')"


In [None]:
# TODO: undersample to balance classes


In [19]:
# Extract time-series features from the flow values of every rolled window.
# The label column is dropped first so only `flow` is summarised; the result
# has one row per window id.
flow_windows = df_rolled.drop(columns=["binary"])

X_features_all = extract_features(
    flow_windows,
    column_id='id',
    column_sort='date',
    column_value="flow",
    default_fc_parameters=MinimalFCParameters(),
    n_jobs=20,
    disable_progressbar=False,
)

X_features_all.head()


Feature Extraction:  44%|████▍     | 44/100 [06:47<08:46,  9.40s/it]

In [None]:
# Split every window id (the index returned by extract_features) into the
# site it belongs to and the date that identifies the window.
#
# Because df_rolled was reloaded from CSV, an id may be the *string*
# "('site', 'date')" rather than a tuple. Indexing such a string with [0] / [1]
# returns the characters "(" and "'" instead of the site and the date, so
# string ids are parsed back into tuples before being unpacked.
def _split_window_id(window_id):
    """Return (site_id, last_date) for a window id given as a tuple or as its repr string."""
    if isinstance(window_id, str):
        window_id = ast.literal_eval(window_id)
    site_id, last_date = window_id
    return site_id, last_date


window_ids = [_split_window_id(window_id) for window_id in X_features_all.index]

X_features_all['site_id'] = [site_id for site_id, _ in window_ids]
X_features_all = X_features_all.set_index(
    pd.Index([last_date for _, last_date in window_ids], name="last_date"), drop=True)

X_features_all.head()


In [None]:
# Target: for every observation, the `binary` flag of the *following*
# observation at the same site (the record can have gaps, so "following" is
# the next recorded row, not necessarily the next calendar day).
# The shift is done per site; shifting the whole date-sorted frame would take
# the "next" label from a different site.
y = (
    all_data_clean
    .sort_values(["site_id", "date"])
    .set_index(["site_id", "date"])
    .groupby(level="site_id")["binary"]
    .shift(-1)
)
assert y.index.is_unique, "expected a single row per (site_id, date) in all_data_clean"

# Look the label up for each feature window through its (site_id, last_date) key.
window_keys = pd.MultiIndex.from_arrays(
    [X_features_all["site_id"], X_features_all.index])
window_labels = y.reindex(window_keys).to_numpy()

# A window ending on a site's final observation has nothing to predict, so it
# is dropped from the features and the labels together to keep them row-aligned.
has_label = ~pd.isna(window_labels)
X_features_all = X_features_all.loc[has_label]

# y1 is the label vector used by the feature-selection and modelling cells.
# Casting to int keeps it a class label; a float target would be treated as a
# regression target by tsfresh's automatic task inference.
y1 = pd.Series(
    window_labels[has_label].astype(int),
    index=X_features_all.index,
    name="binary",
)


In [None]:
# Select the features that are statistically relevant for the target.
# Only numeric/bool columns are handed to the relevance tests: bookkeeping
# columns such as the string `site_id` are not features and cannot be tested.
feature_columns = X_features_all.select_dtypes(include=["number", "bool"])
X_filtered = select_features(feature_columns, y1, n_jobs=20)

X_filtered.head()

In [None]:
# Check the class balance of the target before modelling.
y1.value_counts()

In [None]:
# Split into train and test sets (40% held out), fit a random forest on the
# selected features and report precision/recall on the held-out windows.
# Both the split and the forest are randomised; random_state is pinned so the
# report is the same on every Restart & Run All.
X_filtered_train, X_filtered_test, y_train, y_test = train_test_split(
    X_filtered, y1, test_size=.4, random_state=42)

classifier_filtered = RandomForestClassifier(random_state=42)
classifier_filtered.fit(X_filtered_train, y_train)

y_test_pred = classifier_filtered.predict(X_filtered_test)
print(classification_report(y_test, y_test_pred))
