diff --git a/notebooks/pipeline_example.ipynb b/notebooks/basic_pipeline_example.ipynb similarity index 56% rename from notebooks/pipeline_example.ipynb rename to notebooks/basic_pipeline_example.ipynb index 2d486aaa4..61b7b0410 100644 --- a/notebooks/pipeline_example.ipynb +++ b/notebooks/basic_pipeline_example.ipynb @@ -1,28 +1,28 @@ { "cells": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], + "cell_type": "markdown", + "metadata": {}, "source": [ - "import pandas as pd\n", - "from sklearn.pipeline import Pipeline\n", - "from sklearn.cross_validation import train_test_split\n", - "from sklearn.ensemble import RandomForestClassifier\n", - "from sklearn.metrics import classification_report" + "# Basic example of the RelevantFeatureAugmenter in sklearn pipeline" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ + "import pandas as pd\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.cross_validation import train_test_split\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import classification_report\n", + "from tsfresh.examples.robot_execution_failures import download_robot_execution_failures\n", "from tsfresh.examples import load_robot_execution_failures\n", "from tsfresh.transformers import RelevantFeatureAugmenter" ] }, { @@ -31,10 +31,15 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ + "# Download the dataset if you haven't already\n", + "download_robot_execution_failures() \n", + "# Load data\n", "df_ts, y = load_robot_execution_failures()" ] }, @@ -42,10 +47,13 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": 
true }, "outputs": [], "source": [ + "# We create an empty feature matrix that has the proper index\n", "X = pd.DataFrame(index=y.index)" ] }, @@ -53,10 +61,13 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ + "# Split data into train and test set\n", "X_train, X_test, y_train, y_test = train_test_split(X, y)" ] }, @@ -64,10 +75,13 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ + "# We have a pipeline that consists of a feature extraction step with a subsequent Random Forest Classifier \n", "ppl = Pipeline([('fresh', RelevantFeatureAugmenter(column_id='id', column_sort='time')),\n", " ('clf', RandomForestClassifier())])" ] @@ -76,10 +90,16 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ + "# Here comes the tricky part, due to limitations of the sklearn pipeline API, we can not pass the dataframe\n", + "# containing the time series dataframe but instead have to use the set_params method\n", + "# In this case, df_ts contains the time series of both train and test set, if you have different dataframes for \n", + "# train and test set, you have to call set_params two times (see the notebook pipeline_with_two_datasets.ipynb)\n", "ppl.set_params(fresh__timeseries_container=df_ts)" ] }, @@ -87,10 +107,13 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ + "# We fit the pipeline\n", "ppl.fit(X_train, y_train)" ] }, @@ -98,10 +121,13 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": 
true }, "outputs": [], "source": [ + "# Predicting works as well\n", "y_pred = ppl.predict(X_test)" ] }, @@ -109,10 +135,13 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ + "# So, finally we inspect the performance\n", "print(classification_report(y_test, y_pred))" ] } @@ -133,7 +162,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.3" + "version": "2.7.11" } }, "nbformat": 4, diff --git a/notebooks/pipeline_with_two_datasets.ipynb b/notebooks/pipeline_with_two_datasets.ipynb index b2e53cb6f..3a6933085 100644 --- a/notebooks/pipeline_with_two_datasets.ipynb +++ b/notebooks/pipeline_with_two_datasets.ipynb @@ -2,20 +2,26 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ - "# Using Pipeline with separate datasets for train and test data\n", + "# Using the RelevantFeatureAugmenter with separate datasets for train and test data\n", "\n", - "This notebook shows how to use the RelevantFeatureAugmenter in pipelines where you first train on samples from dataset `df_train` but then want to test using samples from `df_test`.\n", + "This notebook illustrates the RelevantFeatureAugmenter in pipelines where you first train on samples from dataset `df_train` but then want to test using samples from another `df_test`.\n", + "(Here `df_train` and `df_test` refer to the dataframes that contain the time series data)\n", "\n", - "The trick is just to call `ppl.set_params(fresh__timeseries_container=df)` for each of the datasets." + "Due to limitations in the sklearn pipeline API, one has to use the `ppl.set_params(fresh__timeseries_container=df)` method for those two dataframes between the train and the test run." 
] }, { "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -24,58 +30,51 @@ "from sklearn.cross_validation import train_test_split\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import classification_report\n", - "from tsfresh.examples.robot_execution_failures import download_robot_execution_failures" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ + "from tsfresh.examples.robot_execution_failures import download_robot_execution_failures\n", "from tsfresh.examples import load_robot_execution_failures\n", "from tsfresh.transformers import RelevantFeatureAugmenter" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We are going to use the same dataset initialized twice, but lets pretend that we are initializing two separate datasets `df_train` and `df_test`:" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ "download_robot_execution_failures\n", - "df_train, y_train = load_robot_execution_failures()\n", - "df_test, y_test = load_robot_execution_failures()" + "df, y = load_robot_execution_failures()\n", + "df.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": false }, "outputs": [], "source": [ + "# Here, df contains the time series of both train and test set. 
\n", + "# We will split it into a train set df_train and a test set df_test:\n", + "y_train, y_test = train_test_split(y)\n", + "df_train = df.loc[df.id.isin(y_train.index)]\n", + "df_test = df.loc[df.id.isin(y_test.index)]\n", "X_train = pd.DataFrame(index=y_train.index)\n", - "X_test = pd.DataFrame(index=y_test.index)" + "X_test = pd.DataFrame(index=y_test.index)\n", + "df_train.shape, df_test.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -87,21 +86,14 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "ppl.set_params(fresh__timeseries_container=df_train)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ + "# for the fit on the train set, we set the fresh__timeseries_container to `df_train`\n", + "ppl.set_params(fresh__timeseries_container=df_train)\n", + "ppl.fit(X_train, y_train)" ] }, @@ -109,21 +101,14 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "ppl.set_params(fresh__timeseries_container=df_test)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ + "# for the predict on the test set, we set the fresh__timeseries_container to `df_test`\n", + "ppl.set_params(fresh__timeseries_container=df_test)\n", "y_pred = ppl.predict(X_test)" ] }, @@ -131,49 +116,33 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ "print(classification_report(y_test, y_pred))" ] - }, - { 
"cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.4.3" + "pygments_lexer": "ipython2", + "version": "2.7.11" } }, "nbformat": 4,