# Predict Bike Trips

In [None]:
from demo.chicago_bike import load_sample
from matplotlib.pyplot import subplots
import composeml as cp
import featuretools as ft
import evalml

In [None]:
# Load the sample data through the demo helper and preview its first rows.
df = load_sample()

df.head()

## Prediction Engineering

In [None]:
def trip_count(ds):
    """Labeling function: return the number of records in the slice ``ds``."""
    n_records = len(ds)
    return n_records

In [None]:
# Label maker settings: labels are computed per 'from_station_id' by
# `trip_count`, using 'starttime' as the time index and a 13-hour window.
label_maker_options = dict(
    target_entity='from_station_id',
    time_index='starttime',
    labeling_function=trip_count,
    window_size='13h',
)
lm = cp.LabelMaker(**label_maker_options)

In [None]:
# Sort the trips by start time before handing them to the label search.
trips_by_start = df.sort_values('starttime')

lt = lm.search(
    trips_by_start,
    minimum_data='2014-06-30 08:00',
    num_examples_per_instance=-1,
    verbose=False,
)

lt.head()

In [None]:
# Describe the label times produced by the search.
lt.describe()

In [None]:
# Two stacked plots: the label distribution on the first axis and the
# label count over time on the second.
fig, ax = subplots(nrows=2, ncols=1, figsize=(6, 8))
distribution_axis, count_axis = ax
lt.plot.distribution(ax=distribution_axis)
lt.plot.count_by_time(ax=count_axis)
fig.tight_layout(pad=2)

## Feature Engineering

In [None]:
# Build the entity set: a 'trips' entity holding the raw data, plus three
# entities normalized out of its station, weather-event and gender columns.
es = ft.EntitySet('chicago_bike')

es.entity_from_dataframe(
    entity_id='trips',
    dataframe=df.reset_index(),
    index='trip_id',
    time_index='starttime',
)

# Pairs of (new entity id, column of 'trips' used as that entity's index).
derived_entities = [
    ('from_station_id', 'from_station_id'),
    ('weather', 'events'),
    ('gender', 'gender'),
]
for entity_name, index_column in derived_entities:
    es.normalize_entity(
        base_entity_id='trips',
        new_entity_id=entity_name,
        index=index_column,
        make_time_index=False,
    )

# NOTE(review): interesting values are presumably only consumed by
# where-primitives during feature synthesis -- confirm they are needed here.
es["trips"]["gender"].interesting_values = ['Male', 'Female']
es["trips"]["events"].interesting_values = ['tstorms']
es.plot()

In [None]:
# Transform primitives requested from deep feature synthesis.
time_primitives = ['hour', 'week', 'is_weekend']

# Build one feature row per station and cutoff time; the cutoff times are
# taken from the label times `lt`.
fm, fd = ft.dfs(
    entityset=es,
    target_entity='from_station_id',
    trans_primitives=time_primitives,
    cutoff_time=lt,
    include_cutoff_time=False,
    cutoff_time_in_index=True,
    verbose=False,
)

fm.head()

## Machine Learning

In [None]:
# Pull the label column out of the feature matrix (pop removes it from `fm`),
# then split features and labels with 10% reserved as the holdout set.
y = fm.pop('trip_count')
splits = evalml.preprocessing.split_data(
    fm,
    y,
    regression=True,
    test_size=0.1,
    random_state=0,
)
X_train, X_holdout, y_train, y_holdout = splits

In [None]:
# Search for a regression pipeline on the training split, ranked by R2.
automl = evalml.AutoMLSearch(
    problem_type='regression',
    objective='r2',
    random_state=0,
)
automl.search(
    X_train,
    y_train,
    data_checks='disabled',
    show_iteration_plot=False,
)

In [None]:
# Describe the best pipeline found by the search and render its graph.
automl.best_pipeline.describe()
automl.best_pipeline.graph()

In [None]:
# Fit the best pipeline on the training split, then score it on the
# holdout split using the R2 objective.
best_pipeline = automl.best_pipeline.fit(X_train, y_train)
score = best_pipeline.score(X_holdout, y_holdout, objectives=['r2'])
# Convert the returned scores to a plain dict for display.
dict(score)

In [None]:
# Plot the 20 features with the largest absolute importance as a horizontal
# bar chart (signed importances are shown, ordered by magnitude).
importance_table = best_pipeline.feature_importance
feature_importance = importance_table.set_index('feature')['importance']
ranked_by_magnitude = feature_importance.abs().sort_values()
top_k = ranked_by_magnitude.tail(20).index
# The trailing semicolon suppresses the notebook's text output for the plot.
feature_importance[top_k].plot.barh(figsize=(8, 8), fontsize=14, width=.7);