# Model Development

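This notebook is used for model development: it loads the processed trip dataset, encodes the categorical features, and builds the train, test, and validation splits used to model trip duration.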

In [2]:
import pickle

import numpy as np
import pandas as pd

#Models
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor

#Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

#Model Selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import GridSearchCV

In [3]:
# Load the processed dataset (gzip-compressed CSV) into a DataFrame.
dataset_path = './../data/processed/final_dataset.csv.gz'
df = pd.read_csv(dataset_path)

In [4]:
# Columns treated as categorical: they are integer-recoded and then one-hot
# encoded further down in this notebook.
categorical_features = [
    'Start Station ID',
    'Gender',
    'Day of Week',
    'Hour',
    'Month',
    'Age Missing',
    'Customer',
    'Subscriber',
    'User Type Missing',
    'WT01',
    'WT08',
]

In [5]:
# Target: the 'Trip Duration' column.
y = df['Trip Duration']

# Features: every column except the target and the 'Start Time' / 'DATE' columns.
non_feature_columns = ['Trip Duration', 'Start Time', 'DATE']
X = df.drop(non_feature_columns, axis=1)

In [8]:
def map_cat_feature_to_target_range(series):
    """Recode a categorical Series as consecutive integer codes.

    Each distinct value is assigned a code in ``0 .. n_unique - 1``, numbered
    in the order in which ``series.unique()`` returns the values.

    Parameters
    ----------
    series : pandas.Series
        Column of categorical values to recode.

    Returns
    -------
    pandas.Series
        The result of looking up every element of ``series`` in the
        value-to-code table.
    """
    codes = {value: code for code, value in enumerate(series.unique())}

    def to_code(value):
        # Plain dict lookup: a value that is not a key raises KeyError.
        return codes[value]

    return series.apply(to_code)

# Recode every categorical column to consecutive integer codes.
#
# `X` was already created from `df.drop(...)` in an earlier cell, and `drop`
# returns a new DataFrame, so updating `df` alone would leave `X` (the frame
# that is actually fed to the encoder) with the original, un-recoded values.
# Write the codes to both frames so they stay consistent.
for feature in categorical_features:
    encoded = map_cat_feature_to_target_range(df[feature])
    df[feature] = encoded
    X[feature] = encoded

In [20]:
# Boolean mask over the columns of X: True where the column is one of the
# categorical features, False otherwise.
mask = np.array(
    [column in categorical_features for column in X.columns],
    dtype=bool,
)

In [21]:
# One-hot encode only the columns flagged in `mask` and pass every other
# column through unchanged.
#
# `OneHotEncoder(categorical_features=...)` was deprecated in scikit-learn 0.20
# and removed in 0.22; ColumnTransformer is the supported replacement. As with
# the old parameter, the encoded columns come first in the output and the
# remaining columns are appended to their right.
enc = ColumnTransformer(
    [('onehot', OneHotEncoder(), mask)],
    remainder='passthrough',
    # Keep the stacked result sparse, as the old encoder returned by default.
    sparse_threshold=1.0,
)
X = enc.fit_transform(X)

In [25]:
# Hold out 20% of the rows as the test set; from here on `X` / `y` refer to the
# remaining training rows only.
# A fixed random_state makes the split reproducible across kernel restarts
# (without it, every run produces a different train/test partition).
# 1354 is the seed value this notebook already uses for NumPy.
X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=1354)

In [26]:
# Build a single fixed train/validation split over the training rows.
#
# PredefinedSplit semantics: test_fold[i] == -1 keeps row i in the training set
# of every split, while test_fold[i] == 0 places row i in validation fold 0.
# The original call drew 0 with probability 0.8, which validated on ~80% of the
# rows and trained on only ~20%; the probabilities below give the conventional
# ~80% train / ~20% validation split.
np.random.seed(1354)
test_fold = np.random.choice([-1, 0], X.shape[0], p=[0.8, 0.2])
validation = PredefinedSplit(test_fold)