In [1]:
import pandas as pd
import numpy as np
import datetime as datetime
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
# Import train and test data sets
sales_train = pd.read_csv('../Data/sales_train_merge.csv', index_col = 0, parse_dates=['date'])
sales_test = pd.read_csv('../Data/sales_test_merge.csv', index_col = 0, parse_dates=['date'])

  mask |= (ar1 == a)


In [3]:
sales_train.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,profits,item_category_id
0,2013-01-31,0,0,32,884.0,6.0,1326.0,40
1,2013-01-31,0,0,33,1041.0,3.0,1041.0,37
2,2013-01-31,0,0,35,247.0,1.0,247.0,40
3,2013-01-31,0,0,43,221.0,1.0,221.0,40
4,2013-01-31,0,0,51,257.0,2.0,257.0,57


In [4]:
sales_test.head()

Unnamed: 0,ID,shop_id,item_id,item_category_id,date,date_block_num
0,0,5,5037,19,2015-11-30,34
1,1,5,5320,55,2015-11-30,34
2,2,5,5233,19,2015-11-30,34
3,3,5,5232,23,2015-11-30,34
4,4,5,5268,20,2015-11-30,34


# Preprocessing

Before a machine learning algorithm can be developed to fit the data, the data needs to be formatted and preprocessed to remove redundancies, standardize the data, or add extra columns. In this instance, columns like ``item_price``, ``profits``, and ``item_category_id`` could be removed. The ``item_category_id`` column could be removed since each item belongs to a specific column and therefore is correlated to ``item_id``. If kept, the model could deceivingly perform better than implied. The other two columns, ``item_price`` and ``profits``, could be removed since the ``item_price`` could depend on the date due to depreciation or inflation and ``profits`` dependent on ``item_price``.

In [5]:
# Remove unnecessary columns from sales_train
sales_train.drop(['item_price', 'profits', 'item_category_id'], axis=1, inplace=True)

Now that those columns have been removed, the next issue to handle are the ``date`` and ``date_block_num`` columns. As with the ``profits`` and ``item_category_id`` columns, these two columns are correlated to each other and redundant. It'll suit the problem better if the ``date_block_num`` column was removed and the ``date`` column split into two new ones, ``month`` and ``year``.

In [6]:
# Split date column into month and year columns
sales_train['month'] = sales_train['date'].dt.month
sales_train['year'] = sales_train['date'].dt.year

In [7]:
# Remove date_block_num from training data
sales_train.drop(['date', 'date_block_num'], axis=1, inplace=True)

Now that the training set has been preprocessed, the same needs to be done to the test set.

In [8]:
# Create month and year columns in test data
sales_test['month'] = sales_test['date'].dt.month
sales_test['year'] = sales_test['date'].dt.year

In [9]:
# Preprocess test set
sales_test.drop(['ID', 'item_category_id', 'date_block_num', 'date'], axis=1, inplace=True)

In [10]:
sales_train.head()

Unnamed: 0,shop_id,item_id,item_cnt_day,month,year
0,0,32,6.0,1,2013
1,0,33,3.0,1,2013
2,0,35,1.0,1,2013
3,0,43,1.0,1,2013
4,0,51,2.0,1,2013


In [11]:
sales_test.head()

Unnamed: 0,shop_id,item_id,month,year
0,5,5037,11,2015
1,5,5320,11,2015
2,5,5233,11,2015
3,5,5232,11,2015
4,5,5268,11,2015


Finally, before modeling, the *sales_train* data needs to be split into a training and hold out set.

In [12]:
# Split the data into training and hold out sets
train = sales_train[:-6]

val = sales_train[-6:]

In [13]:
# Separate dependent variable from data
X_train = train.drop('item_cnt_day', axis = 1)
y_train = train['item_cnt_day']

X_val = val.drop('item_cnt_day', axis=1)
y_val = val['item_cnt_day']

In [14]:
# Standardize and scale data
scaler = StandardScaler()

# Fit and transform sales_train data
X_train_scaled = scaler.fit_transform(X_train)

# Transform validation data
X_val_scaled = scaler.transform(X_val)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


In [15]:
# Transform sales_test according to scaler
X_test_scaled = scaler.transform(sales_test)

  


# Random Forest Regressor

The first model to test will be a Random Forest Regressor model 

In [16]:
from sklearn.ensemble import RandomForestRegressor

In [17]:
# Create grid search parameters to search over

# Number of trees
rfr_n_estimators = [int(x) for x in np.linspace(50, 150, 6)]

# Max depth of trees
rfr_max_depth = [int(x) for x in np.linspace(5, 20, 4)]

# Minimum number of samples required to split
rfr_min_samples_split = [2, 5]

# Create the grid search
rfr_grid = {'n_estimators':rfr_n_estimators,
            'max_depth':rfr_max_depth,
            'min_samples_split':rfr_min_samples_split}

In [18]:
# Instantiate RandomForestRegressor model
rfr = RandomForestRegressor()

rfr_grid = GridSearchCV(estimator = rfr,
                        param_grid = rfr_grid,
                        n_jobs = 4,
                        verbose = 1)

rfr_grid.fit(X_train_scaled, y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 48 candidates, totalling 144 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 15.1min
[Parallel(n_jobs=4)]: Done 144 out of 144 | elapsed: 94.7min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid={'n_estimators': [50, 70, 90, 110, 130, 150], 'max_depth': [5, 10, 15, 20], 'min_samples_split': [2, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [19]:
rfr_grid.best_params_

{'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 50}

In [20]:
rfr_grid.best_estimator_.score(X_train_scaled, y_train)

0.6613999520932542

In [21]:
rfr_grid.best_estimator_.score(X_val_scaled, y_val)

-0.5135369365155273

In [22]:
rfr_best = rfr_grid.best_estimator_

In [23]:
rfr_predictions = rfr_best.predict(X_val_scaled)

rfr_val_mse = mean_squared_error(y_val, rfr_predictions)
rfr_val_mse

5.045123121718425

# Gradient Boosting Regressor

In [25]:
from xgboost import XGBRegressor

In [26]:
# Create grid search parameters to randomize over

# Learning rate
xgb_learning_rate = [0.1, 0.01]

# Number of trees
xgb_n_estimators = [int(x) for x in np.linspace(start = 50, stop = 150, num = 3)]

# Max depth of trees
xgb_max_depth = [5, 10, 15]

# Minimum number of samples required to split
xgb_min_samples_split = [2, 5]

# Create the random grid search
xgb_random_grid = {'learning_rate':xgb_learning_rate,
                   'n_estimators':xgb_n_estimators,
                   'max_depth':xgb_max_depth,
                   'min_samples_split':xgb_min_samples_split}

In [27]:
# Instatiate GradientBoostingRegressor model
xgb_regressor = XGBRegressor()

# Instatiate RandomSearchCV
xgb_random = RandomizedSearchCV(estimator = xgb_regressor,
                               param_distributions = xgb_random_grid,
                               n_iter = 20,
                               random_state = 246,
                               n_jobs = 4,
                               verbose = 1)

xgb_random.fit(X_train_scaled, y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 25.6min
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed: 31.5min finished


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid='warn', n_iter=20, n_jobs=4,
          param_distributions={'learning_rate': [0.1, 0.01], 'n_estimators': [50, 100, 150], 'max_depth': [5, 10, 15], 'min_samples_split': [2, 5]},
          pre_dispatch='2*n_jobs', random_state=246, refit=True,
          return_train_score='warn', scoring=None, verbose=1)

In [28]:
xgb_random.best_params_

{'n_estimators': 150,
 'min_samples_split': 2,
 'max_depth': 15,
 'learning_rate': 0.01}

In [29]:
xgb_random.best_estimator_.score(X_train_scaled, y_train)

0.7464890290360257

In [30]:
xgb_random.best_estimator_.score(X_val_scaled, y_val)

-0.1963653325649446

In [31]:
xgb_best = xgb_random.best_estimator_

In [32]:
xgb_predictions = xgb_best.predict(X_val_scaled)

xgb_val_mse = mean_squared_error(y_val, xgb_predictions)
xgb_val_mse

3.9878844418831485

# Neural Network

In [33]:
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [34]:
# Set up early stopping monitor for models
early_stopping_monitor = EarlyStopping(patience=3)

In [35]:
X_train_reshaped = X_train_scaled.reshape(X_train_scaled.shape[0], 1, X_train_scaled.shape[1])
X_val_reshaped = X_val_scaled.reshape(X_val_scaled.shape[0], 1, X_val_scaled.shape[1])

In [36]:
model = Sequential()
model.add(LSTM(15, return_sequences=False, input_shape=(1, 4)))
model.add(Dense(1))

model.compile(optimizer='adam', loss = 'mean_squared_error')

In [37]:
model.fit(X_train_reshaped, y_train,
         validation_data = (X_val_reshaped, y_val),
         epochs = 100,
         callbacks = [early_stopping_monitor])

Train on 1609118 samples, validate on 6 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100


<keras.callbacks.History at 0x26fb4be7cc0>

In [38]:
model_train_pred = model.predict(X_train_reshaped)

model_train_mse = mean_squared_error(y_train, model_train_pred)
model_train_mse

74.50973043218863

In [39]:
model_val_pred = model.predict(X_val_reshaped)

model_val_mse = mean_squared_error(y_val, model_val_pred)
model_val_mse

4.016708063307381

In [None]:
# Remove outliers?