Skip to content

Commit

Permalink
polished pipeline example notebooks (#158)
Browse files Browse the repository at this point in the history
  • Loading branch information
MaxBenChrist committed Feb 20, 2017
1 parent 9f06f22 commit 5078f26
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 98 deletions.
Original file line number Diff line number Diff line change
@@ -1,28 +1,28 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"cell_type": "markdown",
"metadata": {},
"source": [
"import pandas as pd\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.cross_validation import train_test_split\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import classification_report"
"# Basic example of the RelevantFeatureAugmenter in sklearn pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.cross_validation import train_test_split\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import classification_report\n",
"from tsfresh.examples.robot_execution_failures import download_robot_execution_failures\n",
"from tsfresh.examples import load_robot_execution_failures\n",
"from tsfresh.transformers import RelevantFeatureAugmenter"
]
Expand All @@ -31,43 +31,57 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Download the dataset if you haven't already\n",
"download_robot_execution_failures() \n",
"# Load data\n",
"df_ts, y = load_robot_execution_failures()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# We create an empty feature matrix that has the proper index\n",
"X = pd.DataFrame(index=y.index)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": true,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Split data into train and test set\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# We have a pipeline that consists of a feature extraction step with a subsequent Random Forest Classifier \n",
"ppl = Pipeline([('fresh', RelevantFeatureAugmenter(column_id='id', column_sort='time')),\n",
" ('clf', RandomForestClassifier())])"
]
Expand All @@ -76,43 +90,58 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Here comes the tricky part, due to limitations of the sklearn pipeline API, we can not pass the dataframe\n",
"# containing the time series dataframe but instead have to use the set_params method\n",
"# In this case, df_ts contains the time series of both train and test set, if you have different dataframes for \n",
"# train and test set, you have to call set_params two times (see the notebook pipeline_with_two_datasets.ipynb)\n",
"ppl.set_params(fresh__timeseries_container=df_ts)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# We fit the pipeline\n",
"ppl.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# Predicting works as well\n",
"y_pred = ppl.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# So, finally we inspect the performance\n",
"print(classification_report(y_test, y_pred))"
]
}
Expand All @@ -133,7 +162,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.3"
"version": "2.7.11"
}
},
"nbformat": 4,
Expand Down
123 changes: 46 additions & 77 deletions notebooks/pipeline_with_two_datasets.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,26 @@
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"metadata": {
"deletable": true,
"editable": true
},
"source": [
"# Using Pipeline with separate datasets for train and test data\n",
"# Using the RelevantFeatureAugmenter with separate datasets for train and test data\n",
"\n",
"This notebook shows how to use the RelevantFeatureAugmenter in pipelines where you first train on samples from dataset `df_train` but then want to test using samples from `df_test`.\n",
"This notebook illustrates the RelevantFeatureAugmenter in pipelines where you first train on samples from dataset `df_train` but then want to test using samples from another `df_test`.\n",
"(Here `df_train` and `df_test` refer to the dataframes that contain the time series data)\n",
"\n",
"The trick is just to call `ppl.set_params(fresh__timeseries_container=df)` for each of the datasets."
"Due to limitations in the sklearn pipeline API, one has to use the `ppl.set_params(fresh__timeseries_container=df)` method to switch between the two dataframes for the train and test runs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
Expand All @@ -24,58 +30,51 @@
"from sklearn.cross_validation import train_test_split\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import classification_report\n",
"from tsfresh.examples.robot_execution_failures import download_robot_execution_failures"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from tsfresh.examples.robot_execution_failures import download_robot_execution_failures\n",
"from tsfresh.examples import load_robot_execution_failures\n",
"from tsfresh.transformers import RelevantFeatureAugmenter"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We are going to use the same dataset initialized twice, but let's pretend that we are initializing two separate datasets `df_train` and `df_test`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"download_robot_execution_failures\n",
"df_train, y_train = load_robot_execution_failures()\n",
"df_test, y_test = load_robot_execution_failures()"
"df, y = load_robot_execution_failures()\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
"# Here, df contains the time series of both train and test set. \n",
"# We will split it into a train df_train and a test set df_test:\n",
"y_train, y_test = train_test_split(y)\n",
"df_train = df.loc[df.id.isin(y_train.index)]\n",
"df_test = df.loc[df.id.isin(y_test.index)]\n",
"X_train = pd.DataFrame(index=y_train.index)\n",
"X_test = pd.DataFrame(index=y_test.index)"
"X_test = pd.DataFrame(index=y_test.index)\n",
"df_train.shape, df_test.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
Expand All @@ -87,93 +86,63 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"ppl.set_params(fresh__timeseries_container=df_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# for the fit on the training set, we set the fresh__timeseries_container to `df_train`\n",
"ppl.set_params(fresh__timeseries_container=df_train)\n",
"ppl.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"ppl.set_params(fresh__timeseries_container=df_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"# for the predict on the test set, we set the fresh__timeseries_container to `df_test`\n",
"ppl.set_params(fresh__timeseries_container=df_test)\n",
"y_pred = ppl.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
"collapsed": false,
"deletable": true,
"editable": true
},
"outputs": [],
"source": [
"print(classification_report(y_test, y_pred))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 2",
"language": "python",
"name": "python3"
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.4.3"
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 5078f26

Please sign in to comment.