In [None]:
import sys
sys.path.append('../')
from sklearn.preprocessing import StandardScaler,MinMaxScaler,Imputer
from util import *
from onehot import LabelBinarizerEx
from pipeline import FeaturePipeline, DataFramePipeline
from binning import Binner
from ensemble import EnsembleStackClassifierEx
from addcols import AddColumns
from impute import GroupImputer

## explore data

In [None]:
import pandas as pd
# Column names referenced by the later cells.
id_col = 'Id'
target_col = 'SalePrice'

# Read the training set and keep a handle on the label column.
house_train = pd.read_csv('train.csv')
target = house_train[target_col]

In [None]:
# Remove the label and the row identifier from the feature frame.
# The frame is rebound rather than mutated in place, and labels that are
# already absent are ignored, so re-running this cell does not raise once the
# columns have been dropped.
drop_cols = [target_col, id_col]
house_train = house_train.drop(drop_cols, axis=1, errors='ignore')

In [None]:
# Default describe(): summary of the numeric columns. Its column index is
# reused later to decide which columns get the numeric pipeline.
num_summary = house_train.describe()
num_summary

In [None]:
# Summary restricted to object-dtype columns. Its column index is reused
# later to decide which columns get the categorical pipeline.
cat_summary = house_train.describe(include=['O'])
cat_summary

In [None]:
# List the columns in which fewer than half of the rows hold a value.
# Nothing is removed here: the cell only reports the column names.
c = house_train.count()
is_heavy_missing = c * 2 < len(house_train)
heavy_missing_cols = c[is_heavy_missing].index
heavy_missing_cols

## feature engineering

In [None]:
# One FeaturePipeline per column: categorical columns (those listed in
# cat_summary) go through the 'onehot' step, numeric columns (those listed in
# num_summary) are median-imputed and then standard-scaled.
cat_pipelines = [
    FeaturePipeline(col, '', Pipeline([('onehot', LabelBinarizerEx([col]))]))
    for col in cat_summary.columns
]
num_pipelines = [
    FeaturePipeline(col, col, Pipeline([
        ('impute', Imputer(strategy='median')),
        ('scale', StandardScaler()),
    ]))
    for col in num_summary.columns
]
full_pipeline = DataFramePipeline(cat_pipelines + num_pipelines)
prepared_house_train = full_pipeline.fit_transform(house_train)

# Remove the original categorical columns from the transformed frame.
prepared_house_train.drop(cat_summary.columns, axis=1, inplace=True)

# The preview goes last: a notebook cell only renders its final expression, so
# a head() call in the middle of the cell was silently discarded.
prepared_house_train.head()

In [None]:
# Summary statistics of the transformed training features.
prepared_house_train.describe()

## train

In [None]:
from sklearn.linear_model import SGDClassifier,LinearRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,AdaBoostRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor

# Fixed seed for the stochastic scikit-learn ensembles, so that repeated runs
# of the cross-validation and grid-search cells produce the same numbers.
RANDOM_STATE = 42

# Candidate regressors. Order matters: tuneModels pairs base_regs[i] with
# param_grid_set[i].
base_regs = [
#     LinearRegression(n_jobs=-1),
#     SVC(probability=True),
    RandomForestRegressor(n_jobs=-1, random_state=RANDOM_STATE),
#     ExtraTreesRegressor(n_jobs=-1),
    GradientBoostingRegressor(random_state=RANDOM_STATE),
    # Left at its defaults.
    XGBRegressor(),
]

In [None]:
from sklearn.model_selection import cross_val_score
from math import sqrt,log10

%matplotlib inline
import matplotlib.pyplot as plt
from pylab import *

def trainModels(train_data, target, regs=None):
    """Cross-validate each regressor and plot/print the resulting scores.

    For every regressor a 5-fold cross-validation is run with the
    'neg_mean_squared_error' scorer; the reported score is
    log10(sqrt(-mean of the fold scores)), i.e. the base-10 logarithm of the
    root of the mean squared error. Lower is better.

    Parameters
    ----------
    train_data : feature matrix passed through to cross_val_score.
    target : label vector passed through to cross_val_score.
    regs : optional sequence of regressors to evaluate. Defaults to the
        notebook-level ``base_regs`` list, which is what the original
        two-argument call evaluates.

    Returns
    -------
    None. A bar chart is shown and the (label, score) pairs are printed,
    sorted by score with the largest value first.
    """
    if regs is None:
        regs = base_regs

    scores = []
    for reg in regs:
        neg_mse = cross_val_score(reg, train_data, target,
                                  scoring='neg_mean_squared_error',
                                  cv=5, n_jobs=-1, verbose=1)
        scores.append(log10(sqrt(-neg_mse.mean())))

    # The first three letters of the class name are used as tick labels.
    labels = [reg.__class__.__name__[:3] for reg in regs]
    X = np.arange(len(regs))
    # Colours are given as an explicit list: a bare 'rgb' string is not a valid
    # colour specification in current matplotlib releases.
    bar(X, scores, tick_label=labels, color=['r', 'g', 'b'])
#     ylim(0.5,1.0)
    show()
    print(sorted(zip(labels, scores), key=lambda x: x[1], reverse=True))

In [None]:
# Cross-validate every regressor in base_regs on the prepared training data.
trainModels(prepared_house_train,target)

In [None]:
# Fit an XGBRegressor on the full prepared training set.
# NOTE(review): the following cell rebinds best_reg to a
# GradientBoostingRegressor, so when the notebook is run top to bottom this
# model is not the one used for the test-set predictions.
best_reg=XGBRegressor(objective='reg:linear')
best_reg.fit(prepared_house_train,target)

In [None]:
# Fit a GradientBoostingRegressor with default hyper-parameters on the full
# prepared training set. This is the last assignment to best_reg, so it is the
# model the test cell predicts with; the grid-search `results` computed in the
# tuning section are not consulted.
best_reg=GradientBoostingRegressor()
best_reg.fit(prepared_house_train,target)

## tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# One hyper-parameter grid per active entry of base_regs, in the same order:
# tuneModels looks grids up by position (param_grid_set[i] for base_regs[i]).
# Commented-out grids correspond to regressors that are commented out in
# base_regs and must stay disabled together with them to keep the positions
# aligned.
param_grid_set=[
#                 {'C':[0.01,0.1,0.5,1.]},
#                 {'C':[1.,10.,],'kernel':['rbf','poly'],'gamma':[0.01,0.1,1.],'coef0':[1.,10.,]},
                # grid for base_regs[0] (RandomForestRegressor)
                {'n_estimators':[50,100,200,300],'max_depth':[5,10,15]},
                # grid for base_regs[1] (GradientBoostingRegressor)
                {'learning_rate':[0.01,0.1,1.0],'n_estimators':[100,200,300],'max_depth':[3,5,8]},
#                 {'learning_rate':[0.01,0.1,1.0],'n_estimators':[100,200,300]},
#                 {'n_estimators':[50,100,200,300],'max_depth':[5,10,15]},
                # grid for base_regs[2] (XGBRegressor)
                {'learning_rate':[0.01,0.1,1.0],'n_estimators':[100,200,300],'max_depth':[5,10,15],'gamma':[0.01,0.1,0.5]},
               ]

def tuneModels(train_data,target):
    """Grid-search every regressor in ``base_regs`` against its own grid.

    The regressor at position i is searched over ``param_grid_set[i]`` with
    5-fold cross-validation and the 'neg_mean_squared_error' scorer.

    Returns a list of ``(best_estimator_, best_score_)`` tuples in the same
    order as ``base_regs``. The same tuples are also printed, sorted by score
    with the largest value first.
    """
    results = []
    for idx, reg in enumerate(base_regs):
        search = GridSearchCV(
            estimator=reg,
            param_grid=param_grid_set[idx],
            scoring='neg_mean_squared_error',
            n_jobs=-1,
            verbose=1,
            cv=5,
        )
        search.fit(train_data, target)
        results.append((search.best_estimator_, search.best_score_))

    ranked = sorted(results, key=lambda pair: pair[1], reverse=True)
    print(ranked)
    return results

In [None]:
# Run the grid search for every regressor in base_regs; `results` holds one
# (best_estimator_, best_score_) tuple per regressor.
results=tuneModels(prepared_house_train,target)

## test

In [None]:
# Read the test set and remember its identifiers for the output file.
house_test = pd.read_csv('test.csv')
test_id = house_test[id_col]

# Apply the pipeline fitted on the training data, then remove the identifier
# and the original categorical columns from the transformed frame.
prepared_house_test = full_pipeline.transform(house_test)
cols_to_drop = [id_col] + list(cat_summary.columns)
prepared_house_test.drop(cols_to_drop, axis=1, inplace=True)

# Predict with the most recently fitted best_reg and write the result file.
prices = best_reg.predict(prepared_house_test)
df = pd.DataFrame({'Id': test_id, 'SalePrice': prices})
df.to_csv('result.csv', index=False)