# In [None]:
import numpy as np
import pandas as pd

# Load the raw train/test CSV files.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Stack the feature columns ('X0' through 'X385') of both sets and one-hot
# encode them together, so train and test end up with identical dummy columns.
feature_frames = [frame.loc[:, 'X0':'X385'] for frame in (train, test)]
all_data = pd.get_dummies(pd.concat(feature_frames))

# Split the encoded matrix back apart by position: the first len(train) rows
# came from train, the remainder from test.
n_train = train.shape[0]
X_train = all_data.iloc[:n_train]
X_test = all_data.iloc[n_train:]
y = train['y']

# Dimensionality reduction via L1 regularisation:
# fit a Lasso whose penalty strength is chosen by 5-fold cross-validation,
# then let SelectFromModel (default threshold) keep only the features the
# already-fitted Lasso considers important.
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

model_lasso = LassoCV(cv=5)
model_lasso.fit(X_train, y)

# prefit=True: reuse the fitted Lasso instead of fitting it again.
model = SelectFromModel(model_lasso, prefit=True)
X_train_red, X_test_red = (
    model.transform(part) for part in (X_train, X_test)
)


# Model comparison: cross-validated R2 of several regressors, all with their
# default hyper-parameters, on the Lasso-selected feature matrix.
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.model_selection import cross_val_score, KFold
from sklearn.svm import SVR
from xgboost import XGBRegressor

# (display name, estimator) pairs, evaluated in this order.
models = [
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('Elastic Net', ElasticNet()),
    ('SVR', SVR(kernel='rbf')),
    ('RF_regressor', RandomForestRegressor()),
    ('Xgboost_regressor', XGBRegressor()),
]

# 10 contiguous (unshuffled) folds, shared by every candidate.
kf = KFold(10)
# NOTE: `scores` is created here but nothing in this section appends to it.
scores = []
for label, regressor in models:
    fold_r2 = cross_val_score(
        regressor,
        X_train_red,
        y,
        cv=kf,
        scoring='r2',
        n_jobs=-1,
    )
    # Report mean and (population) standard deviation across the folds.
    print('%s R2 score is %.3f +/- %.3f' % (label, fold_r2.mean(), fold_r2.std()))
    
    
# Hyper-parameter tuning on XGBRegressor.
# GridSearchCV is imported from sklearn.model_selection (the same module the
# cross-validation helpers above come from); the legacy sklearn.grid_search
# module was deprecated in scikit-learn 0.18 and removed in 0.20.
from sklearn.model_selection import GridSearchCV

xgb = XGBRegressor()
param_grid_xgb = {
    'n_estimators': [20, 35, 50, 100],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 5, 15],
}
# Exhaustive search over the 4 * 3 * 3 = 36 combinations, each scored by R2
# with 7-fold cross-validation, using all available cores.
grid_xgb = GridSearchCV(xgb, param_grid_xgb, cv=7, scoring='r2', n_jobs=-1)
grid_xgb.fit(X_train_red, y)
print('GS best score %.3f' % grid_xgb.best_score_)
print('GS best params {}'.format(grid_xgb.best_params_))

# Predict the test set with the best configuration.
# With the default refit=True, GridSearchCV has already retrained
# best_estimator_ on the full training data, so no additional fit is needed.
xgb_gs = grid_xgb.best_estimator_
y_pred = xgb_gs.predict(X_test_red)

# Write the submission file: one row per test ID with its predicted y.
solution = pd.DataFrame({'ID': test['ID'], 'y': y_pred})
solution.to_csv('Mercedes_xgb_gs.csv', index=False)


#Public R2 Score on kaggle 0.54635
#Private R2 Score on kaggle 0.54517
#Best score on kaggle 0.55550