In [50]:
__author__ = 'sharmaam'

import pandas as pd
import numpy as np
import os
import glob
from sklearn import cross_validation, ensemble, preprocessing, metrics
from sklearn.grid_search import GridSearchCV
import collections
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from h2o.estimators.random_forest import H2ORandomForestEstimator

pd.set_option('display.max_columns', None)

%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')

# Prints the feature importance sorted descending for a model
def featureImp(model, df, n=10):
    index = np.argsort(model.feature_importances_)[::-1]
    print df.columns[index].values[:n] + ':' + model.feature_importances_[index].astype(str)[:n]


# Percentage Root Mean Squared Error (RMSPE)
def rmspe(pred, actual):
    sse = np.sum(pow((pred-actual)/pred, 2))
    sseMean = sse / len(pred)
    rmspe = np.sqrt(sseMean)
    return rmspe


# Print Grid Search results
def getBestModel(model):
    print("Best score: %0.3f" % model.best_score_)
    print("Best parameters set:")
    best_parameters = model.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    return model.best_estimator_

# Returns true for encoded columns
def isEncodedCol(df, col):
    encodedCols = []
    encode = (df[col].dtype != 'int64') & (df[col].dtype != 'float64')
    encode = encode | (col in encodedCols)
    return encode

# Encode columns to categorical values
def encodeCols(df):
    print "Encoding cols..."
    # Encode categorical cols
    for col in df.columns:
        if (isEncodedCol(df, col)):
            lbl = preprocessing.LabelEncoder()
            lbl.fit(df[col])
            df[col] = lbl.transform(df[col])
    return df


def preProcess(df):
    df['Month'] = df.Date.dt.month.astype(int)
    df['Day'] = df.Date.dt.day.astype(int)
    df['Year'] = df.Date.dt.year.astype(int)

    grpByCust = df.groupby(["Store", "Month"]).mean()['Customers']
    grpByCust.name = 'Avg_Cust'
    df = df.join(grpByCust, on=['Store', 'Month'])

    stores = pd.read_csv('../input/store.csv')
    df = df.merge(stores, on='Store')    
    
    
    

    # Remove columns not being handled currently
    df = df.drop(['Customers'], axis=1)

    # Impute missing values
    for impCol in df.columns:
        if (df[impCol].dtype == 'int64') or (df[impCol].dtype == 'float64'):
            #imputedVal = df[impCol].value_counts().index[0]
            #df[impCol] = df[impCol].fillna(imputedVal)
            df[impCol] = df[impCol].fillna(0)
        elif df[impCol].dtype == 'object':
            #imputedVal = df[impCol].value_counts().index[0]
            #df[impCol] = df[impCol].fillna(imputedVal)
            df[impCol] = df[impCol].fillna('NAValue')

    '''
    print "Cleaning too many categories..."
    ### Clean variables with too many categories
    for col in df.columns:
        if (df[col].dtype != 'int64') & (df[col].dtype != 'float64'):
            top30 = np.array(collections.Counter(df[col]).most_common(60))[:, 0]
            notInTop30 = [(not x in top30) for x in df[col]]
            df.ix[notInTop30, col] = 'rareValue'

    '''
    return df
  

In [51]:
forSubmission = False
gbr = False

os.chdir("/home/devel/axs/work/kaggle/rossmann/scripts")

# load trainAlling and test datasets
trainAll = pd.read_csv('../input/train.csv', parse_dates=[2, ])

# Take a random split for temp training purpose
trainAll, dummy = train_test_split(trainAll, test_size = 0.8)

trainAll = preProcess(trainAll)

trainAll.head(6)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Open,Promo,StateHoliday,SchoolHoliday,Month,Day,Year,Avg_Cust,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,351,4,2013-04-25,5708,1,1,0,0,4,25,2013,443.705882,a,a,5290,11,2012,1,5,2013,"Feb,May,Aug,Nov"
1,351,4,2015-04-16,7237,1,1,0,0,4,16,2015,443.705882,a,a,5290,11,2012,1,5,2013,"Feb,May,Aug,Nov"
2,351,1,2013-04-29,8298,1,1,0,0,4,29,2013,443.705882,a,a,5290,11,2012,1,5,2013,"Feb,May,Aug,Nov"
3,351,6,2015-03-14,5413,1,0,0,0,3,14,2015,456.714286,a,a,5290,11,2012,1,5,2013,"Feb,May,Aug,Nov"
4,351,5,2015-07-17,6896,1,1,0,1,7,17,2015,445.277778,a,a,5290,11,2012,1,5,2013,"Feb,May,Aug,Nov"
5,351,6,2013-10-05,4593,1,0,0,0,10,5,2013,431.545455,a,a,5290,11,2012,1,5,2013,"Feb,May,Aug,Nov"


In [52]:
trainAll = encodeCols(trainAll)

#Split train-test data
train, test = train_test_split(trainAll, test_size = 0.2)

#trainX = train.drop(['Sales', 'PromoInterval'], axis=1)
#testX = test.drop(['Sales', 'PromoInterval'], axis=1)

Encoding cols...


In [53]:
trainAll.head(6)

Unnamed: 0,Store,DayOfWeek,Date,Sales,Open,Promo,StateHoliday,SchoolHoliday,Month,Day,Year,Avg_Cust,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,351,4,114,5708,1,1,1,0,4,25,2013,443.705882,0,0,5290,11,2012,1,5,2013,0
1,351,4,835,7237,1,1,1,0,4,16,2015,443.705882,0,0,5290,11,2012,1,5,2013,0
2,351,1,118,8298,1,1,1,0,4,29,2013,443.705882,0,0,5290,11,2012,1,5,2013,0
3,351,6,802,5413,1,0,1,0,3,14,2015,456.714286,0,0,5290,11,2012,1,5,2013,0
4,351,5,927,6896,1,1,1,1,7,17,2015,445.277778,0,0,5290,11,2012,1,5,2013,0
5,351,6,277,4593,1,0,1,0,10,5,2013,431.545455,0,0,5290,11,2012,1,5,2013,0


In [54]:
#model = ensemble.RandomForestRegressor(n_estimators=100, criterion='mse', oob_score=True, max_features=None, max_depth=None).fit(trainX, trainY)
#print "...Completed!"

# Grid based turning
# from sklearn import linear_model
# model = linear_model.LinearRegression(fit_intercept=True, normalize=True, copy_X=True).fit(trainX, trainY)
# Alternative model
# rfModel = ensemble.RandomForestRegressor(criterion='mse')
# param_grid = dict(max_features=[40,50,60,70,80,90])
# model = GridSearchCV(rfModel, param_grid=param_grid, scoring=rmsle_scorer, verbose=10).fit(trainX, trainY)
# model = getBestModel(model)



In [55]:
#Initialize the H20 engine
import h2o
h2o.init()

0,1
H2O cluster uptime:,5 minutes 11 seconds 891 milliseconds
H2O cluster version:,3.7.0.3250
H2O cluster name:,H2O_started_from_python
H2O cluster total nodes:,1
H2O cluster total memory:,512.0 MB
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster healthy:,True
H2O Connection ip:,127.0.0.1
H2O Connection port:,54321


In [56]:
train_Hex = h2o.H2OFrame(train.to_dict("list"))
test_Hex = h2o.H2OFrame(test.to_dict("list"))


Parse Progress: [##################################################] 100%

Parse Progress: [##################################################] 100%


In [57]:
xCols = train_Hex.names
xCols.remove("Sales")
yCols = 'Sales'
# Define and fit model
model = H2ORandomForestEstimator(seed=42)
model.train(x=xCols, y=yCols, training_frame=train_Hex)
print "Training complete: \n", model


drf Model Build Progress: [##################################################] 100%
Training complete: 
Model Details
H2ORandomForestEstimator :  Distributed RF
Model Key:  DRF_model_python_1447641305609_4

Model Summary:


0,1,2,3,4,5,6,7,8
,number_of_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,50.0,17771888.0,20.0,20.0,20.0,24609.0,36810.0,30708.84




ModelMetricsRegression: drf
** Reported on train data. **

MSE: 1187708.25503
R^2: 0.919755701434
Mean Residual Deviance: 1187708.25503

Scoring History:


0,1,2,3,4,5
,timestamp,duration,number_of_trees,training_MSE,training_deviance
,2015-11-16 08:10:44,0.659 sec,1.0,2536253.1,2536253.1
,2015-11-16 08:10:44,1.339 sec,2.0,2398121.2,2398121.2
,2015-11-16 08:10:45,2.018 sec,3.0,2227522.0,2227522.0
,2015-11-16 08:10:46,2.679 sec,4.0,2084386.3,2084386.3
,2015-11-16 08:10:46,3.380 sec,5.0,1959575.3,1959575.3
,2015-11-16 08:10:47,3.974 sec,6.0,1870532.8,1870532.8
,2015-11-16 08:10:51,8.309 sec,13.0,1471577.3,1471577.3
,2015-11-16 08:10:55,12.484 sec,20.0,1326887.3,1326887.3
,2015-11-16 08:11:00,16.634 sec,27.0,1254908.4,1254908.4



Variable Importances:


0,1,2,3
variable,relative_importance,scaled_importance,percentage
Open,25703324057600.0,1.0,0.3
Avg_Cust,18802326634500.0,0.7,0.2
DayOfWeek,14380551897100.0,0.6,0.2
Promo,9801424699390.0,0.4,0.1
StateHoliday,1940356595710.0,0.1,0.0
StoreType,1725288546300.0,0.1,0.0
CompetitionOpenSinceMonth,1645626392580.0,0.1,0.0
CompetitionDistance,1604564680700.0,0.1,0.0
Store,1532669853700.0,0.1,0.0





In [58]:
preds = model.predict(test_Hex)

pred, actual = np.array(h2o.as_list(preds)), np.array(h2o.as_list(test_Hex['Sales']))
pred = pred[actual>0]
actual = actual[actual>0]

err = rmspe(pred, actual)

print "PRMSE Score: ", err

PRMSE Score:  0.162116431386
