In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.ensemble import StackingRegressor,RandomForestRegressor,BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.metrics import median_absolute_error,mean_squared_error

In [2]:
south_df = pd.read_csv('ap-south-1.csv',names=['Date','Instance_Type','OS','Region','Price'],parse_dates=['Date'])
cat_col = ['Instance_Type','OS','Region']

In [3]:
south_df.head()

Unnamed: 0,Date,Instance_Type,OS,Region,Price
0,2017-03-07 16:05:01,c4.8xlarge,Windows,ap-south-1b,1.8865
1,2017-03-07 16:05:01,m4.xlarge,Linux/UNIX,ap-south-1b,0.0366
2,2017-03-07 16:05:01,m4.large,Linux/UNIX,ap-south-1a,0.0243
3,2017-03-07 16:05:01,m4.large,Linux/UNIX,ap-south-1b,0.0346
4,2017-03-07 16:05:01,c4.8xlarge,Linux/UNIX,ap-south-1a,0.2895


In [4]:
lb,ub = np.percentile(sorted(south_df['Price']),[5,95])
south_df['Price'] = np.where(south_df['Price']>ub,ub,south_df['Price'])

In [5]:
label_encoder = LabelEncoder()
cat_df = south_df[cat_col]
for i in cat_col:
    south_df[i] = label_encoder.fit_transform(cat_df[i])

In [6]:
south_df[['year','month','day','h','m','s']] = pd.DataFrame([(x.year,x.month,x.day,x.hour,x.minute,x.second) for x in south_df['Date']])

In [7]:
south_df = south_df.sample(frac=0.10,random_state=100)

In [8]:
new_cat_col = ['month','day','h','m','s']
model_data_df = south_df.drop(['Date','year'],axis=1)

In [9]:
X = model_data_df.loc[:,model_data_df.columns != 'Price']
y = model_data_df['Price']
print(X.shape)
print(y.shape)

(203103, 8)
(203103,)


In [10]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state=100)

In [20]:
def stacking_models():
    level0 = list()
    level0.append(('ADB',AdaBoostRegressor(random_state = 100)))
    level0.append(('GB',GradientBoostingRegressor(min_samples_leaf=50,min_samples_split=100,n_estimators=100,random_state = 100)))
    level0.append(('XGB',XGBRegressor(min_samples_leaf=50,min_samples_split=100,n_estimators=100,random_state = 100)))
    level0.append(('ETB',ExtraTreesRegressor(min_samples_leaf=50,min_samples_split=100,n_estimators=100,random_state = 100)))
    level1 = RandomForestRegressor(min_samples_leaf = 50,min_samples_split= 100,n_estimators=300,random_state=100)
    
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5,verbose = 1)
    return model

In [21]:
%%time
model = stacking_models()
model.fit(X_train,y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   30.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   14.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   41.3s finished


Wall time: 3min 21s


StackingRegressor(cv=5,
                  estimators=[('ADB',
                               AdaBoostRegressor(base_estimator=None,
                                                 learning_rate=1.0,
                                                 loss='linear', n_estimators=50,
                                                 random_state=100)),
                              ('GB',
                               GradientBoostingRegressor(alpha=0.9,
                                                         ccp_alpha=0.0,
                                                         criterion='friedman_mse',
                                                         init=None,
                                                         learning_rate=0.1,
                                                         loss='ls', max_depth=3,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
           

In [22]:
y_pred = model.predict(X_test)

In [24]:
%%time
cv_res = cross_val_score(model,X_train,y_train,cv=5,scoring='r2',n_jobs=-1)
print(cv_res.mean(),' ',cv_res.std())

0.955351896343493   0.0017246860434569397
Wall time: 4min 32s


In [23]:
mean_squared_error(y_test,y_pred)

0.012978961068267956