Skip to content

Commit

Permalink
polished pipeline example notebooks (#158)
Browse files Browse the repository at this point in the history
  • Loading branch information
MaxBenChrist committed Feb 20, 2017
1 parent 9f06f22 commit 5078f26
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 98 deletions.
Original file line number Diff line number Diff line change
@@ -1,28 +1,28 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"cell_type": "markdown",
"metadata": {},
"source": [
"import pandas as pd\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.cross_validation import train_test_split\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import classification_report"
"# Basic example of the RelevantFeatureAugmenter in sklearn pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.cross_validation import train_test_split\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import classification_report\n",
"from tsfresh.examples.robot_execution_failures import download_robot_execution_failures\n",
"from tsfresh.examples import load_robot_execution_failures\n",
"from tsfresh.transformers import RelevantFeatureAugmenter"
]
Expand All @@ -31,43 +31,57 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Download the dataset if you haven't already\n",
"download_robot_execution_failures() \n",
"# Load data\n",
"df_ts, y = load_robot_execution_failures()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# We create an empty feature matrix that has the proper index\n",
"X = pd.DataFrame(index=y.index)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Split data into train and test set\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# We have a pipeline that consists of a feature extraction step with a subsequent Random Forest Classifier \n",
"ppl = Pipeline([('fresh', RelevantFeatureAugmenter(column_id='id', column_sort='time')),\n",
" ('clf', RandomForestClassifier())])"
]
Expand All @@ -76,43 +90,58 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Here comes the tricky part, due to limitations of the sklearn pipeline API, we can not pass the dataframe\n",
"# containing the time series dataframe but instead have to use the set_params method\n",
"# In this case, df_ts contains the time series of both train and test set, if you have different dataframes for \n",
"# train and test set, you have to call set_params two times (see the notebook pipeline_with_two_datasets.ipynb)\n",
"ppl.set_params(fresh__timeseries_container=df_ts)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# We fit the pipeline\n",
"ppl.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Predicting works as well\n",
"y_pred = ppl.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# So, finally we inspect the performance\n",
"print(classification_report(y_test, y_pred))"
]
}
Expand All @@ -133,7 +162,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.3"
"version": "2.7.11"
}
},
"nbformat": 4,
Expand Down
123 changes: 46 additions & 77 deletions notebooks/pipeline_with_two_datasets.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,26 @@
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Using Pipeline with separate datasets for train and test data\n",
"# Using the RelevantFeatureAugmenter with separate datasets for train and test data\n",
"\n",
"This notebook shows how to use the RelevantFeatureAugmenter in pipelines where you first train on samples from dataset `df_train` but then want to test using samples from `df_test`.\n",
"This notebook illustrates the RelevantFeatureAugmenter in pipelines where you first train on samples from dataset `df_train` but then want to test using samples from another `df_test`.\n",
"(Here `df_train` and `df_test` refer to the dataframes that contain the time series data)\n",
"\n",
"The trick is just to call `ppl.set_params(fresh__timeseries_container=df)` for each of the datasets."
"Due to limitations in the sklearn pipeline API, one has to use the `ppl.set_params(fresh__timeseries_container=df)` method to switch between the two dataframes for the train and test runs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
Expand All @@ -24,58 +30,51 @@
"from sklearn.cross_validation import train_test_split\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import classification_report\n",
"from tsfresh.examples.robot_execution_failures import download_robot_execution_failures"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from tsfresh.examples.robot_execution_failures import download_robot_execution_failures\n",
"from tsfresh.examples import load_robot_execution_failures\n",
"from tsfresh.transformers import RelevantFeatureAugmenter"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We are going to use the same dataset initialized twice, but let's pretend that we are initializing two separate datasets `df_train` and `df_test`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"download_robot_execution_failures\n",
"df_train, y_train = load_robot_execution_failures()\n",
"df_test, y_test = load_robot_execution_failures()"
"df, y = load_robot_execution_failures()\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
"# Here, df contains the time series of both train and test set. \n",
"# We will split it into a train df_train and a test set df_test:\n",
"y_train, y_test = train_test_split(y)\n",
"df_train = df.loc[df.id.isin(y_train.index)]\n",
"df_test = df.loc[df.id.isin(y_test.index)]\n",
"X_train = pd.DataFrame(index=y_train.index)\n",
"X_test = pd.DataFrame(index=y_test.index)"
"X_test = pd.DataFrame(index=y_test.index)\n",
"df_train.shape, df_test.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
Expand All @@ -87,93 +86,63 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"ppl.set_params(fresh__timeseries_container=df_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# for the fit on the training set, we set the fresh__timeseries_container to `df_train`\n",
"ppl.set_params(fresh__timeseries_container=df_train)\n",
"ppl.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"ppl.set_params(fresh__timeseries_container=df_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# for the predict on the test set, we set the fresh__timeseries_container to `df_test`\n",
"ppl.set_params(fresh__timeseries_container=df_test)\n",
"y_pred = ppl.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"print(classification_report(y_test, y_pred))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 2",
"language": "python",
"name": "python3"
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 5078f26

Please sign in to comment.