In [11]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import math

from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, LassoLarsCV, ElasticNet
from sklearn.metrics import r2_score

from sklearn.ensemble import GradientBoostingRegressor
# feature selection (from supportive model)
from sklearn.feature_selection import SelectFromModel
from sklearn.grid_search import GridSearchCV

from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.metrics import make_scorer


from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin, RegressorMixin, clone
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import KFold
import xgboost as xgb



In [12]:
train_orig=pd.read_csv('train.csv')
test_orig=pd.read_csv('test.csv')

train=train_orig.copy()
test=test_orig.copy()

## Encode categorical variables

In [13]:
for col in train.select_dtypes(['object']).columns:
    lb=LabelEncoder()
    lb.fit(list(train[col].values.astype(str))+list(test[col].values.astype(str)))
    train[col]=lb.transform(list(train[col].astype(str)))
    test[col]=lb.transform(list(test[col].astype(str)))
    print col, 'Done'

X0 Done
X1 Done
X2 Done
X3 Done
X4 Done
X5 Done
X6 Done
X8 Done


## Add dimension reduction components

In [15]:
for index in train.index:
    train.ix[index, 'ones_count']=float(len(train.ix[index][train.ix[index]==1]))/float(len(train.ix[index]))
    train.ix[index, 'zeros count']=float(len(train.ix[index][train.ix[index]==0]))/float(len(train.ix[index]))
    
    test.ix[index, 'ones_count']=float(len(test.ix[index][test.ix[index]==1]))/float(len(test.ix[index]))
    test.ix[index, 'zeros count']=float(len(test.ix[index][test.ix[index]==0]))/float(len(test.ix[index]))


In [123]:
pca_=PCA(whiten=True)
pca_.fit(train.drop('y',1))
# np.cumsum(pca_.explained_variance_ratio_)
# plt.plot(np.cumsum(pca_.explained_variance_ratio_))
# plt.xlim([0,10])

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=True)

In [16]:
##Dimension Reduction

n_comp=10
SEED=420
train_labels=train['y']

# full_data = pd.concat((train.drop('y',1), test), keys=['train', 'test'])

# ICA
ica = FastICA(n_components=n_comp, random_state=SEED)
# ica.fit(full_data)
ica_results_train = ica.fit_transform(train.drop('y',1))
ica_results_test = ica.transform(test)

# PCA
pca = PCA(n_components=n_comp, random_state=SEED)
# pca.fit(full_data)
pca_results_train = pca.fit_transform(train.drop('y',1))
pca_results_test = pca.transform(test)

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=SEED)
# tsvd.fit(full_data)
tsvd_results_train = tsvd.fit_transform(train.drop('y',1))
tsvd_results_test = tsvd.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=SEED)
# grp.fit(full_data)
grp_results_train = grp.fit_transform(train.drop('y',1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=SEED)
# srp.fit(full_data)
srp_results_train = srp.fit_transform(train.drop('y',1))
srp_results_test = srp.transform(test)


for i in range(1, n_comp+1):
    train['tsvd_' + str(i)] = tsvd_results_train[:,i-1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i-1]
    
    train['pca_' + str(i)] = pca_results_train[:,i-1]
    test['pca_' + str(i)] = pca_results_test[:, i-1]
    
    train['ica_' + str(i)] = ica_results_train[:,i-1]
    test['ica_' + str(i)] = ica_results_test[:, i-1]
    
    train['grp_' + str(i)] = grp_results_train[:,i-1]
    test['grp_' + str(i)] = grp_results_test[:, i-1]
    
    train['srp_' + str(i)] = srp_results_train[:,i-1]
    test['srp_' + str(i)] = srp_results_test[:, i-1]



In [17]:
train.shape, test.shape

((4209, 430), (4209, 429))

## Training and Prediction

In [18]:
x_train, y_train = train.drop('y',1), train['y']
x_test = test

y_mean=np.mean(train['y'])

In [127]:
xgb_params = {
    'n_estimators': [800], 
    'learning_rate': [0.004],
    'max_depth': [10],
    'subsample': [0.95],
}

In [128]:
from sklearn.metrics import make_scorer

xgb_model = xgb.XGBRegressor(
    silent=1,
    objective = 'reg:linear',
    seed=12345
)

# create r2 scorer
r2_scorer = make_scorer(r2_score, greater_is_better = True)

# grid search
model = GridSearchCV(
    estimator = xgb_model,
    param_grid = xgb_params,
    scoring = r2_scorer,
    cv = 5,
    verbose=100
)

In [105]:
model.fit(x_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] n_estimators=800, subsample=0.95, learning_rate=0.004, max_depth=4 
[CV]  n_estimators=800, subsample=0.95, learning_rate=0.004, max_depth=4, score=0.066900 -   3.9s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.9s remaining:    0.0s
[CV] n_estimators=800, subsample=0.95, learning_rate=0.004, max_depth=4 
[CV]  n_estimators=800, subsample=0.95, learning_rate=0.004, max_depth=4, score=0.432321 -   3.7s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    7.6s remaining:    0.0s
[CV] n_estimators=800, subsample=0.95, learning_rate=0.004, max_depth=4 
[CV]  n_estimators=800, subsample=0.95, learning_rate=0.004, max_depth=4, score=0.588670 -   3.9s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   11.5s remaining:    0.0s
[CV] n_estimators=800, subsample=0.95, learning_rate=0.004, max_depth=4 
[CV]  n_estimators=800, subsample=0.95, learning_rate=0.004, max_depth=4, score=0.530099 -   3.8s
[Parallel(n_jobs=

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBRegressor(base_score=100.66931812782134, colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       nthread=-1, objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=12345, silent=1, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [800], 'subsample': [0.95], 'learning_rate': [0.004], 'max_depth': [4]},
       pre_dispatch='2*n_jobs', refit=True, scoring=make_scorer(r2_score),
       verbose=100)

In [91]:
output_xgb14 = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': model.predict(x_test)})
output_xgb14.to_csv('mercedes_xgb14.csv', index=False)

Public LB score of **0.5642**.

In [106]:
print r2_score(y_train, model.predict(x_train))

0.63244434073


## Model Stacking

In [19]:
xgb_params3 = {
    'n_trees': 520, 
    'eta': 0.0045,
    'max_depth': 4,
    'subsample': 0.93,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}

xgb_params4 = {
    'n_trees': 500, 
    'eta': 0.005,
    'max_depth': 4,
    'subsample': 0.95,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}


xgb_params5 = {
    'n_trees': 800, 
    'eta': 0.004,
    'max_depth': 4,
    'subsample': 0.95,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}

In [42]:
dtrain=xgb.DMatrix(x_train, y_train)
dtest=xgb.DMatrix(x_test)

In [43]:
num_boost_rounds = 1250
# train model
model_stacked_xgb1 = xgb.train(dict(xgb_params3, silent=1), dtrain, num_boost_round=num_boost_rounds)

In [44]:
num_boost_rounds = 1200
# train model
model_stacked_xgb2 = xgb.train(dict(xgb_params4, silent=1), dtrain, num_boost_round=num_boost_rounds)

In [45]:
num_boost_rounds = 1300
# train model
model_stacked_xgb3 = xgb.train(dict(xgb_params5, silent=1), dtrain, num_boost_round=num_boost_rounds)

In [21]:
class StackingEstimator(BaseEstimator, TransformerMixin):
    
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        self.estimator.fit(X, y, **fit_params)
        return self
    
    def transform(self, X):
        X = check_array(X)
        X_transformed = np.copy(X)
        
        # add class probabilities as a synthetic feature
        if issubclass(self.estimator.__class__, ClassifierMixin) and hasattr(self.estimator, 'predict_proba'):
            X_transformed = np.hstack((self.estimator.predict_proba(X), X))

        # add class prodiction as a synthetic feature
        X_transformed = np.hstack((np.reshape(self.estimator.predict(X), (-1, 1)), X_transformed))

        return X_transformed
    
class StackingCVRegressorRetrained(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, regressors, meta_regressor, n_folds=5, use_features_in_secondary=True):
        self.regressors = regressors
        self.meta_regressor = meta_regressor
        self.n_folds = n_folds
        self.use_features_in_secondary = use_features_in_secondary

    def fit(self, X, y):
        self.regr_ = [clone(x) for x in self.regressors]
        self.meta_regr_ = clone(self.meta_regressor)

        kfold = KFold(n_splits=self.n_folds)

        out_of_fold_predictions = np.zeros((X.shape[0], len(self.regressors)))

        # Create out-of-fold predictions for training meta-model
        for i, regr in enumerate(self.regr_):
            for train_idx, holdout_idx in kfold.split(X, y):
                instance = clone(regr)
                instance.fit(X.ix[train_idx], y[train_idx])
                out_of_fold_predictions[holdout_idx, i] = instance.predict(X.ix[holdout_idx])

        # Train meta-model
        if self.use_features_in_secondary:
            self.meta_regr_.fit(np.hstack((X, out_of_fold_predictions)), y)
        else:
            self.meta_regr_.fit(out_of_fold_predictions, y)
        
        # Retrain base models on all data
        for regr in self.regr_:
            regr.fit(X, y)

        return self

    def predict(self, X):
        meta_features = np.column_stack([
            regr.predict(X) for regr in self.regr_
        ])

        if self.use_features_in_secondary:
            return self.meta_regr_.predict(np.hstack((X, meta_features)))
        else:
            return self.meta_regr_.predict(meta_features)
        
class AveragingRegressor(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, regressors):
        self.regressors = regressors

    def fit(self, X, y):
        self.regr_ = [clone(x) for x in self.regressors]
        
        # Train base models
        for regr in self.regr_:
            regr.fit(X, y)

        return self

    def predict(self, X):
        predictions = np.column_stack([
            regr.predict(X) for regr in self.regr_
        ])
        return np.mean(predictions, axis=1)



In [22]:
stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=GradientBoostingRegressor(learning_rate=0.004, loss="huber", max_depth=5, max_features=0.55, min_samples_leaf=18, min_samples_split=14, subsample=0.9)),
    LassoLarsCV()
)

In [23]:
stacked_pipeline.fit(x_train, y_train)

  g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny))
  g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))




Pipeline(steps=[('stackingestimator-1', StackingEstimator(estimator=LassoLarsCV(copy_X=True, cv=None, eps=2.2204460492503131e-16,
      fit_intercept=True, max_iter=500, max_n_alphas=1000, n_jobs=1,
      normalize=True, positive=False, precompute='auto', verbose=False))), ('stackingestimator-2', StackingEst...x_n_alphas=1000, n_jobs=1,
      normalize=True, positive=False, precompute='auto', verbose=False))])

In [46]:
stacked_pred1=stacked_pipeline.predict(x_test)


# final_pred = 0.75*xgb_pred1 + 0.25*stacked_pred

stacked_pred1

array([  80.14897441,   94.36736093,   79.78993308, ...,   92.86456237,
        109.96506944,   91.93661774])

In [62]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

en = ElasticNet(alpha=0.001, l1_ratio=0.1)
    
rf = RandomForestRegressor(n_estimators=500, n_jobs=4, min_samples_split=25, min_samples_leaf=25, max_depth=4)

et = ExtraTreesRegressor(n_estimators=100, n_jobs=4, min_samples_split=25, min_samples_leaf=35, max_features=150)

xgbm = xgb.sklearn.XGBRegressor(max_depth=4, learning_rate=0.005, subsample=0.9, base_score=y_mean,
                                objective='reg:linear', n_estimators=1000)

ls1=LassoLarsCV(normalize=True)

gbr=GradientBoostingRegressor(learning_rate=0.004, loss="huber", max_depth=5, max_features=0.55, subsample=0.9)
                           
# stack_avg = StackingCVRegressorAveraged((en, rf, et), ElasticNet(l1_ratio=0.1, alpha=1.4))

stack_with_feats = StackingCVRegressorRetrained((en, rf, et, ls1, gbr), xgbm, use_features_in_secondary=True)

stack_retrain = StackingCVRegressorRetrained((en, rf, et, ls1, gbr), ElasticNet(l1_ratio=0.1, alpha=1.4),use_features_in_secondary=False)

averaged = AveragingRegressor((en, rf, et, xgbm, ls1, gbr))


In [63]:
stack_with_feats.fit(x_train, y_train)








StackingCVRegressorRetrained(meta_regressor=XGBRegressor(base_score=100.66931812782134, colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.005, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=1000,
       nthread=-1, objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.9),
               n_folds=5,
               regressors=(ElasticNet(alpha=0.001, copy_X=True, fit_intercept=True, l1_ratio=0.1,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False), RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
 ...rs=100, presort='auto', random_state=None,
             subsample=0.9, verbose=0, warm_start=False)),
               use_features_in_secondary=True)

In [64]:
stacked_pred2=stack_with_feats.predict(x_test)
stacked_pred2

array([  84.88342285,  111.15447998,   84.46959686, ...,   93.037323  ,
        110.33455658,   95.16850281], dtype=float32)

In [65]:
stack_retrain.fit(x_train, y_train)
stacked_pred3=stack_retrain.predict(x_test)
stacked_pred3

array([  80.41874166,   95.96779117,   80.10699609, ...,   91.83824779,
        109.92308617,   92.22802241])

In [66]:
averaged.fit(x_train, y_train)
stacked_pred4=averaged.predict(x_test)
stacked_pred4

array([  84.04132603,  100.37424685,   83.87997461, ...,   93.54765902,
        108.60308996,   93.68534541])

In [67]:
xgb_pred1=model_stacked_xgb1.predict(dtest)
xgb_pred2=model_stacked_xgb2.predict(dtest)
xgb_pred3=model_stacked_xgb3.predict(dtest)

In [98]:
stacked_df=pd.DataFrame()
stacked_df['pred1']=stacked_pred1
stacked_df['pred2']=stacked_pred2
stacked_df['pred3']=stacked_pred3
stacked_df['pred4']=stacked_pred4
stacked_df['xgb_pred1']=xgb_pred1
stacked_df['xgb_pred2']=xgb_pred2
stacked_df['xgb_pred3']=xgb_pred3

columns1=['pred1','pred2','xgb_pred1','xgb_pred2','xgb_pred3']
columns2=['pred3','pred4', 'xgb_pred1', 'xgb_pred3', 'xgb_pred2']
final_pred1=stacked_df[columns1].mean(axis=1)
final_pred2=stacked_df[columns2].mean(axis=1)
final_pred3=stacked_df.mean(axis=1)
# final_pred2=0.1112*stacked_df[columns2].mean(axis=1)

# final_pred3=final_pred2+final_pred1

final_pred=(final_pred2+final_pred1+final_pred3)/3.0


In [113]:
class StackingEstimator(BaseEstimator, TransformerMixin):
    
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        self.estimator.fit(X, y, **fit_params)
        return self
    def transform(self, X):
        X = check_array(X)
        X_transformed = np.copy(X)
        # add class probabilities as a synthetic feature
        if issubclass(self.estimator.__class__, ClassifierMixin) and hasattr(self.estimator, 'predict_proba'):
            X_transformed = np.hstack((self.estimator.predict_proba(X), X))

        # add class prodiction as a synthetic feature
        X_transformed = np.hstack((np.reshape(self.estimator.predict(X), (-1, 1)), X_transformed))

        return X_transformed


train = train_orig
test = test_orig


for c in train.columns:
    if train[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(train[c].values) + list(test[c].values))
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))



n_comp = 12

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

#save columns list before adding the decomposition components

usable_columns = list(set(train.columns) - set(['y']))

# Append decomposition components to datasets
for i in range(1, n_comp + 1):
    train['pca_' + str(i)] = pca2_results_train[:, i - 1]
    test['pca_' + str(i)] = pca2_results_test[:, i - 1]

    train['ica_' + str(i)] = ica2_results_train[:, i - 1]
    test['ica_' + str(i)] = ica2_results_test[:, i - 1]

    train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
    test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

    train['grp_' + str(i)] = grp_results_train[:, i - 1]
    test['grp_' + str(i)] = grp_results_test[:, i - 1]

    train['srp_' + str(i)] = srp_results_train[:, i - 1]
    test['srp_' + str(i)] = srp_results_test[:, i - 1]

#usable_columns = list(set(train.columns) - set(['y']))

y_train = train['y'].values
y_mean = np.mean(y_train)
id_test = test['ID'].values
#finaltrainset and finaltestset are data to be used only the stacked model (does not contain PCA, SVD... arrays) 
finaltrainset = train[usable_columns].values
finaltestset = test[usable_columns].values


'''Train the xgb model then predict the test data'''

xgb_params = {
    'n_trees': 520, 
    'eta': 0.0045,
    'max_depth': 4,
    'subsample': 0.93,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}
# NOTE: Make sure that the class is labeled 'class' in the data file

dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)

num_boost_rounds = 1250
# train model
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
y_pred = model.predict(dtest)

'''Train the stacked models then predict the test data'''

stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=GradientBoostingRegressor(learning_rate=0.001, loss="huber", max_depth=5, max_features=0.55, min_samples_leaf=18, min_samples_split=14, subsample=0.7)),
    LassoLarsCV()

)


stacked_pipeline.fit(finaltrainset, y_train)
results = stacked_pipeline.predict(finaltestset)

'''R2 Score on the entire Train data when averaging'''

print('R2 score on train data:')
print(r2_score(y_train,stacked_pipeline.predict(finaltrainset)*0.2855 + model.predict(dtrain)*0.7145))

'''Average the preditionon test data  of both models then save it on a csv file'''

sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred*0.75 + results*0.25
sub.to_csv('stacked-model.csv', index=False)



R2 score on train data:
0.655918869095


In [106]:
stacked_df['nice_pred']=results
final_pred=stacked_df[['xgb_pred1','xgb_pred2','xgb_pred3','nice_pred']].mean(axis=1)

In [107]:
output_xgb_stacked13 = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': final_pred})
output_xgb_stacked13.to_csv('mercedes_xgb_stacked13.csv', index=False)