Add a notebook demonstrating how to use the Pipeline with two separat…

…e datasets (train and test) (#156) It helped me understand it...
blue-yonder · Feb 20, 2017 · 9f06f22 · 9f06f22
1 parent 0b08d81
commit 9f06f22
Showing 1 changed file with 181 additions and 0 deletions.
diff --git a/notebooks/pipeline_with_two_datasets.ipynb b/notebooks/pipeline_with_two_datasets.ipynb
@@ -0,0 +1,181 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Using Pipeline with separate datasets for train and test data\n",
+    "\n",
+    "This notebook shows how to use the RelevantFeatureAugmenter in pipelines where you first train on samples from dataset `df_train` but then want to test using samples from `df_test`.\n",
+    "\n",
+    "The trick is just to call `ppl.set_params(fresh__timeseries_container=df)` for each of the datasets."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.pipeline import Pipeline\n",
+    "from sklearn.cross_validation import train_test_split\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "from sklearn.metrics import classification_report\n",
+    "from tsfresh.examples.robot_execution_failures import download_robot_execution_failures"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from tsfresh.examples import load_robot_execution_failures\n",
+    "from tsfresh.transformers import RelevantFeatureAugmenter"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We are going to use the same dataset initialized twice, but lets pretend that we are initializing two separate datasets `df_train` and `df_test`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "download_robot_execution_failures\n",
+    "df_train, y_train = load_robot_execution_failures()\n",
+    "df_test, y_test = load_robot_execution_failures()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "X_train = pd.DataFrame(index=y_train.index)\n",
+    "X_test = pd.DataFrame(index=y_test.index)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "ppl = Pipeline([('fresh', RelevantFeatureAugmenter(column_id='id', column_sort='time')),\n",
+    "                ('clf', RandomForestClassifier())])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "ppl.set_params(fresh__timeseries_container=df_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "ppl.fit(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "ppl.set_params(fresh__timeseries_container=df_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "y_pred = ppl.predict(X_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "print(classification_report(y_test, y_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.4.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}