In [1]:
# Outside modules
import warnings; warnings.simplefilter('ignore') # suppress warnings
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import joblib
import pandas as pd

import dask.dataframe as dd

# Home-made modules
from utils import *
from preprocessing import *
from feature_creation import *
from feature_selection import  *
from dim_reduction import *

In [2]:
# NOTE(review): `Client` is not imported by name in the import cell, so it must
# arrive through one of the star imports -- presumably dask.distributed.Client;
# confirm and import it explicitly.
client = Client()

In [3]:
# Load the hourly bike-share CSV from the gist into a Dask dataframe.
# read_csv documents `blocksize` as str, int or None, so the byte count is
# passed as an int (same value as the former float literal 25e4).
data = dd.read_csv(
    'https://gist.githubusercontent.com/catyselman/9353e4e480ddf2db44b44a79e14718b5/raw/ded23e586ca5db1b4a566b1e289acd12ebf69357/bikeshare_hourly.csv',
    blocksize=250000,
)

In [4]:
# Display the column dtypes of the frame loaded by read_csv.
data.dtypes

instant         int64
dteday         object
season          int64
yr              int64
mnth            int64
hr              int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object

In [5]:
# Add a `realtemp` column equal to `temp` scaled by 41.
# NOTE(review): 41 is presumably the factor that maps the normalized `temp`
# column back to degrees Celsius -- confirm against the dataset documentation.
data['realtemp'] = data['temp'] * 41

In [6]:
# Preprocessing steps passed to pipeline_casero; the recorded run logs show
# them being applied in the order listed here.
pp = [
    drop_registered,
    drop_casual,
    drop_date,
    year_as_bool,
    season_as_category,
    month_as_category,
    weekday_as_category,
    hour_as_category,
    holiday_as_bool,
    working_day_as_bool,
    weather_sit_as_category,
    categorize_columns,
]

# Each candidate model is described by a display name, an estimator instance
# and a "params" dict (presumably a parameter grid consumed by
# pipeline_casero -- confirm in utils).
lm = dict(
    name="Linear Regression",
    model=LinearRegression(),
    params={},
)

rfr = dict(
    name="Random Forest",
    model=RandomForestRegressor(),
    params={"random_state": [SEED]},
)

# Candidates handed to every pipeline run below.
models = [lm, rfr]

### Baseline - No feature engineering

In [7]:
# Baseline run: only the preprocessing list and the candidate models are supplied.
best_model = pipeline_casero(
    data,
    preprocessing=pp,
    models=models,
)

Beginning pipeline at 2019-05-18 19:06:17.771852

Performing preprocessing steps...
	Dropping the registered variable since we won't have this information
	Dropping the causual variable since we won't have this information
	Dropping the date variable since this information is encoded in other variables
	Converting year to a boolean variable...
	Converting season to a categorical variable...
	Converting month to a categorical variable...
	Converting day of week to a categorical variable...
	Converting hour of day to a categorical variable...
	Converting holiday or not to a boolean variable...
	Converting holiday or not to a boolean variable...
	Converting weather situation to a categorical variable...
Preprocessing completed at 2019-05-18 19:06:17.927393, performed 12 steps
New Shape of data: 15

Performing feature creation...
Feature Creation completed at 2019-05-18 19:06:17.928171, performed 0 steps
New Shape of data: 15

Dummifying...
New Shape of data: 61

Performing dimensionality 

### Basic Feature Engineering

In [8]:
# Feature creation: commute-hours variable plus the weather-based cluster variable.
fc = [
    commute_hours,
    weather_cluster,
]
best_model = pipeline_casero(
    data,
    preprocessing=pp,
    creation=fc,
    models=models,
)

Beginning pipeline at 2019-05-18 19:06:46.646287

Performing preprocessing steps...
	Dropping the registered variable since we won't have this information
	Dropping the causual variable since we won't have this information
	Dropping the date variable since this information is encoded in other variables
	Converting year to a boolean variable...
	Converting season to a categorical variable...
	Converting month to a categorical variable...
	Converting day of week to a categorical variable...
	Converting hour of day to a categorical variable...
	Converting holiday or not to a boolean variable...
	Converting holiday or not to a boolean variable...
	Converting weather situation to a categorical variable...
Preprocessing completed at 2019-05-18 19:06:46.882527, performed 12 steps
New Shape of data: 15

Performing feature creation...
	Adding variable for Commute Hours, 1 for yes and 0 for false
	Adding clustering variable based on weather-related features...

Dummifying to create clusters...
N

### Predicting Casual and Registered Counts

In [9]:
# Feature creation: prediction_forecasts only (the run log shows it adding
# casual_forecast and registered_forecast).
fc = [
    prediction_forecasts,
]
best_model = pipeline_casero(
    data,
    preprocessing=pp,
    creation=fc,
    models=models,
)

Beginning pipeline at 2019-05-18 19:07:37.348076

Performing preprocessing steps...
	Dropping the registered variable since we won't have this information
	Dropping the causual variable since we won't have this information
	Dropping the date variable since this information is encoded in other variables
	Converting year to a boolean variable...
	Converting season to a categorical variable...
	Converting month to a categorical variable...
	Converting day of week to a categorical variable...
	Converting hour of day to a categorical variable...
	Converting holiday or not to a boolean variable...
	Converting holiday or not to a boolean variable...
	Converting weather situation to a categorical variable...
Preprocessing completed at 2019-05-18 19:07:37.507121, performed 12 steps
New Shape of data: 15

Performing feature creation...
	Adding casual_forecast variable...
Dummifying...
New Shape of data: 60

		Predictions for GridSearchCV on casual: 0.48931 +/- 0.08596
	Adding registered_forecast

### Basic Feature Engineering + Predicting Casual and Registered Counts

In [10]:
# Feature creation: commute hours, weather cluster and the prediction forecasts together.
fc = [
    commute_hours,
    weather_cluster,
    prediction_forecasts,
]
best_model = pipeline_casero(
    data,
    preprocessing=pp,
    creation=fc,
    models=models,
)

Beginning pipeline at 2019-05-18 19:08:17.531158

Performing preprocessing steps...
	Dropping the registered variable since we won't have this information
	Dropping the causual variable since we won't have this information
	Dropping the date variable since this information is encoded in other variables
	Converting year to a boolean variable...
	Converting season to a categorical variable...
	Converting month to a categorical variable...
	Converting day of week to a categorical variable...
	Converting hour of day to a categorical variable...
	Converting holiday or not to a boolean variable...
	Converting holiday or not to a boolean variable...
	Converting weather situation to a categorical variable...
Preprocessing completed at 2019-05-18 19:08:17.705451, performed 12 steps
New Shape of data: 15

Performing feature creation...
	Adding variable for Commute Hours, 1 for yes and 0 for false
	Adding clustering variable based on weather-related features...

Dummifying to create clusters...
N

### Genetic Programming

In [11]:
# Feature creation: genetic_programming as the only creation step.
fc = [
    genetic_programming,
]
best_model = pipeline_casero(
    data,
    preprocessing=pp,
    creation=fc,
    models=models,
)

Beginning pipeline at 2019-05-18 19:10:02.314983

Performing preprocessing steps...
	Dropping the registered variable since we won't have this information
	Dropping the causual variable since we won't have this information
	Dropping the date variable since this information is encoded in other variables
	Converting year to a boolean variable...
	Converting season to a categorical variable...
	Converting month to a categorical variable...
	Converting day of week to a categorical variable...
	Converting hour of day to a categorical variable...
	Converting holiday or not to a boolean variable...
	Converting holiday or not to a boolean variable...
	Converting weather situation to a categorical variable...
Preprocessing completed at 2019-05-18 19:10:02.484383, performed 12 steps
New Shape of data: 15

Performing feature creation...
Creating features through genetic programming...
    |    Population Average   |             Best Individual              |
---- ------------------------- -------

### Basic Features + Tree Selection

In [12]:
# Feature creation: commute hours and weather cluster; selection: tree_selection.
fc = [
    commute_hours,
    weather_cluster,
]
fs = [
    tree_selection,
]
best_model = pipeline_casero(
    data,
    preprocessing=pp,
    creation=fc,
    selection=fs,
    models=models,
)

Beginning pipeline at 2019-05-18 19:14:47.365245

Performing preprocessing steps...
	Dropping the registered variable since we won't have this information
	Dropping the causual variable since we won't have this information
	Dropping the date variable since this information is encoded in other variables
	Converting year to a boolean variable...
	Converting season to a categorical variable...
	Converting month to a categorical variable...
	Converting day of week to a categorical variable...
	Converting hour of day to a categorical variable...
	Converting holiday or not to a boolean variable...
	Converting holiday or not to a boolean variable...
	Converting weather situation to a categorical variable...
Preprocessing completed at 2019-05-18 19:14:47.672917, performed 12 steps
New Shape of data: 15

Performing feature creation...
	Adding variable for Commute Hours, 1 for yes and 0 for false
	Adding clustering variable based on weather-related features...

Dummifying to create clusters...
N

### Genetic Programming + Tree Selection

In [13]:
# Feature creation: genetic_programming; selection: tree_selection.
fc = [
    genetic_programming,
]
fs = [
    tree_selection,
]
best_model = pipeline_casero(
    data,
    preprocessing=pp,
    creation=fc,
    selection=fs,
    models=models,
)

Beginning pipeline at 2019-05-18 19:16:27.331855

Performing preprocessing steps...
	Dropping the registered variable since we won't have this information
	Dropping the causual variable since we won't have this information
	Dropping the date variable since this information is encoded in other variables
	Converting year to a boolean variable...
	Converting season to a categorical variable...
	Converting month to a categorical variable...
	Converting day of week to a categorical variable...
	Converting hour of day to a categorical variable...
	Converting holiday or not to a boolean variable...
	Converting holiday or not to a boolean variable...
	Converting weather situation to a categorical variable...
Preprocessing completed at 2019-05-18 19:16:27.546202, performed 12 steps
New Shape of data: 15

Performing feature creation...
Creating features through genetic programming...
    |    Population Average   |             Best Individual              |
---- ------------------------- -------

### Commute Hours + Recursive Feature Elimination

In [14]:
# Feature creation: the commute-hours variable only; selection: rfe.
fc = [
    commute_hours,
]
fs = [
    rfe,
]
best_model = pipeline_casero(
    data,
    preprocessing=pp,
    creation=fc,
    selection=fs,
    models=models,
)

Beginning pipeline at 2019-05-18 19:23:17.892729

Performing preprocessing steps...
	Dropping the registered variable since we won't have this information
	Dropping the causual variable since we won't have this information
	Dropping the date variable since this information is encoded in other variables
	Converting year to a boolean variable...
	Converting season to a categorical variable...
	Converting month to a categorical variable...
	Converting day of week to a categorical variable...
	Converting hour of day to a categorical variable...
	Converting holiday or not to a boolean variable...
	Converting holiday or not to a boolean variable...
	Converting weather situation to a categorical variable...
Preprocessing completed at 2019-05-18 19:23:18.198455, performed 12 steps
New Shape of data: 15

Performing feature creation...
	Adding variable for Commute Hours, 1 for yes and 0 for false
Feature Creation completed at 2019-05-18 19:23:18.232103, performed 1 steps
New Shape of data: 16

D

### Recursive Feature Elimination

In [15]:
# No feature-creation steps; selection: rfe.
fc = []
fs = [
    rfe,
]
best_model = pipeline_casero(
    data,
    preprocessing=pp,
    creation=fc,
    selection=fs,
    models=models,
)

Beginning pipeline at 2019-05-18 19:24:43.957467

Performing preprocessing steps...
	Dropping the registered variable since we won't have this information
	Dropping the causual variable since we won't have this information
	Dropping the date variable since this information is encoded in other variables
	Converting year to a boolean variable...
	Converting season to a categorical variable...
	Converting month to a categorical variable...
	Converting day of week to a categorical variable...
	Converting hour of day to a categorical variable...
	Converting holiday or not to a boolean variable...
	Converting holiday or not to a boolean variable...
	Converting weather situation to a categorical variable...
Preprocessing completed at 2019-05-18 19:24:44.229765, performed 12 steps
New Shape of data: 15

Performing feature creation...
Feature Creation completed at 2019-05-18 19:24:44.230998, performed 0 steps
New Shape of data: 15

Dummifying...
New Shape of data: 61

Performing dimensionality 

### Tree Selection

In [16]:
# No feature-creation steps; selection: tree_selection.
fc = []
fs = [
    tree_selection,
]
best_model = pipeline_casero(
    data,
    preprocessing=pp,
    creation=fc,
    selection=fs,
    models=models,
)

Beginning pipeline at 2019-05-18 19:25:42.728609

Performing preprocessing steps...
	Dropping the registered variable since we won't have this information
	Dropping the causual variable since we won't have this information
	Dropping the date variable since this information is encoded in other variables
	Converting year to a boolean variable...
	Converting season to a categorical variable...
	Converting month to a categorical variable...
	Converting day of week to a categorical variable...
	Converting hour of day to a categorical variable...
	Converting holiday or not to a boolean variable...
	Converting holiday or not to a boolean variable...
	Converting weather situation to a categorical variable...
Preprocessing completed at 2019-05-18 19:25:42.876259, performed 12 steps
New Shape of data: 15

Performing feature creation...
Feature Creation completed at 2019-05-18 19:25:42.877564, performed 0 steps
New Shape of data: 15

Dummifying...
New Shape of data: 61

Performing dimensionality 

### Deep Feature Synthesis

This is the only feature-engineering step that does not use Dask: the `deep_features` function converts the dataset into a pandas DataFrame for processing.

In [18]:
# Feature creation: deep_features; no selection steps; reduction: pca.
fc = [
    deep_features,
]
fs = []
dr = [
    pca,
]
best_model = pipeline_casero(
    data,
    preprocessing=pp,
    creation=fc,
    selection=fs,
    reduction=dr,
    models=models,
)

Beginning pipeline at 2019-05-18 19:26:10.457795

Performing preprocessing steps...
	Dropping the registered variable since we won't have this information
	Dropping the causual variable since we won't have this information
	Dropping the date variable since this information is encoded in other variables
	Converting year to a boolean variable...
	Converting season to a categorical variable...
	Converting month to a categorical variable...
	Converting day of week to a categorical variable...
	Converting hour of day to a categorical variable...
	Converting holiday or not to a boolean variable...
	Converting holiday or not to a boolean variable...
	Converting weather situation to a categorical variable...
Preprocessing completed at 2019-05-18 19:26:10.658936, performed 12 steps
New Shape of data: 15

Performing feature creation...
	Performing Deep Feature Synthesis...
		Created feature seasons.STD(bikeshare_hourly.holiday)
		Created feature seasons.STD(bikeshare_hourly.workingday)
		Created