In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge, Lasso

In [2]:
df = pd.read_pickle('data/cleaned_data_spacy.p')

In [3]:
df.drop(['timestamp'], axis = 1, inplace = True)

### Sorting the cleaned dataframe by time

In [4]:
df.sort_values('time', ascending=True, inplace = True)
df.reset_index(inplace = True)

### Separating the datetime components for training

In [5]:
df['year'] = df.time.apply(lambda x: x.year)
df['month'] = df.time.apply(lambda x: x.month)
df['day'] = df.time.apply(lambda x: x.day)
df['hour'] = df.time.apply(lambda x: x.hour)

### Grouping all the training labels with the spacy vector of the weather_description into a single vector

In [6]:
def group_vecs(row):
    return list([row['trip_distance'], row['total_amount'],row['amount_rate'],row['year'], row['month'], row['day'], row['hour'], row['humidity'], row['pressure'], row['temperature'], row['wind_speed']])\
            +list(row['weather_vec'])

In [7]:
df['vecs'] = df.apply(group_vecs, axis=1)

### Extracting the training vectors and scaling the values with a MinMaxScaler

In [8]:
X = np.vstack(df['vecs'])
y = np.vstack(df['passenger_count'])
X_scaler = MinMaxScaler(feature_range=(0, 1))
y_scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = X_scaler.fit_transform(X)
y_scaled = y_scaler.fit_transform(y)
final_vals = np.concatenate((y_scaled,X_scaled), axis = 1)



### Function to reframe the vectors to prepare for a multivariate supervised training of time series

source: https://machinelearningmastery.com/convert-time-series-supervised-learning-problem-python/

In [9]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

In [10]:
vals_reframed = series_to_supervised(final_vals)
cols_filtered = [c for c in vals_reframed.columns if c.endswith('(t-1)')]
vals_reframed_arr = vals_reframed[cols_filtered].values

### Splitting data into training and test sets

In [11]:
test_frac = 0.20
train_size = int(round(vals_reframed_arr.shape[0]*(1-test_frac)))

In [12]:
train_set = vals_reframed_arr[:train_size, :]
test_set = vals_reframed_arr[train_size:, :]

In [13]:
X_train, y_train = train_set[:,1:], train_set[:,:1] 
X_test, y_test = test_set[:,1:], test_set[:,:1]

### Training a Ridge regression model

In [14]:
mod_1 = Ridge()
mod_1.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [15]:
mod_1.score(X_test, y_test)

0.8967306444305283

### Training a Lasso regression model

In [16]:
mod_2 = Lasso()
mod_2.fit(X_train, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [17]:
mod_2.score(X_test, y_test)

-0.2079907319522709

### Conclusions

The ridge regressor seems to be the best prediction model based on the model scores