In [None]:
import sys
sys.path.append('../')
from sklearn.preprocessing import StandardScaler,MinMaxScaler,Imputer
from util import *
from onehot import LabelBinarizerEx
from pipeline import FeaturePipeline, DataFramePipeline
from binning import Binner
from ensemble import EnsembleStackClassifierEx
from addcols import AddColumns
from impute import GroupImputer

%matplotlib inline
from matplotlib import pyplot

## explore data

In [None]:
import pandas as pd
# Column-name constants reused throughout the notebook.
id_col='Id'
target_col='SalePrice'
# Load the training set and keep a handle on the target column.
house_train=pd.read_csv('train.csv')
target=house_train.loc[:,target_col]

In [None]:
# Summary statistics from describe() with default arguments (pandas summarises
# only the numeric columns of a mixed-dtype frame by default). The column index of
# this summary is reused further down to derive num_cols.
num_summary=house_train.describe()
num_summary

In [None]:
# Summary of the object-dtype (string) columns only. The column index of this
# summary is reused further down as the list of categorical features.
cat_summary=house_train.describe(include=['O'])
cat_summary

add unit price

In [None]:
# Price per unit of lot area; used as the reference quantity in the exploration below.
house_train['UnitPrice']=house_train[target_col].div(house_train['LotArea'])

explore features

In [None]:
# Correlation of each column with UnitPrice, kept as a one-column frame,
# displayed from the most positive to the most negative.
corrs=house_train.corr().loc[:,['UnitPrice']]
corrs.sort_values(by='UnitPrice',ascending=False)

In [None]:
# Mean UnitPrice per LotShape category.
# The column is selected before aggregating so only UnitPrice is averaged; averaging
# the whole frame first is wasteful and raises on recent pandas when string columns are present.
df=house_train.groupby('LotShape')[['UnitPrice']].mean()
df.plot(kind='bar')

LotShape is ordinal

'Reg' is max

In [None]:
# Correlation between LotFrontage and UnitPrice (single label-based lookup).
corrs.loc['LotFrontage','UnitPrice']

LotFrontage has a slight negative correlation with UnitPrice

it should be discretized

In [None]:
# Correlation between LotArea and UnitPrice (single label-based lookup).
corrs.loc['LotArea','UnitPrice']

Same as LotFrontage: LotArea should be discretized too

In [None]:
# Mean UnitPrice per Utilities category, plotted in ascending order.
# The column is selected before aggregating so only UnitPrice is averaged; averaging
# the whole frame first is wasteful and raises on recent pandas when string columns are present.
df=house_train.groupby('Utilities')[['UnitPrice']].mean()
df.sort_values('UnitPrice').plot(kind='bar')

make Utilities ordinal

In [None]:
# Mean UnitPrice per LandSlope category, plotted in ascending order.
# The column is selected before aggregating so only UnitPrice is averaged; averaging
# the whole frame first is wasteful and raises on recent pandas when string columns are present.
df=house_train.groupby('LandSlope')[['UnitPrice']].mean()
df.sort_values('UnitPrice').plot(kind='bar')

make LandSlope ordinal

Merge Condition1 and Condition2 into one column

In [None]:
# Mean UnitPrice per HouseStyle category, sorted ascending before plotting.
# The column is selected before aggregating so only UnitPrice is averaged; averaging
# the whole frame first is wasteful and raises on recent pandas when string columns are present.
df=house_train.groupby('HouseStyle')[['UnitPrice']].mean().sort_values('UnitPrice')
df.plot(kind='bar')

make HouseStyle ordinal , SLvl is max

In [None]:
# Correlation between YearRemodAdd and UnitPrice (single label-based lookup).
corrs.loc['YearRemodAdd','UnitPrice']

In [None]:
# Search for the number of quantile bins (2..9) whose banded YearBuilt correlates
# most strongly, in absolute value, with UnitPrice.
# Candidate bands are evaluated as temporary series so house_train is not cluttered
# with one throw-away column per bin count.
year_band_corrs=pd.Series({
    n_bins: house_train['UnitPrice'].corr(
        pd.qcut(house_train['YearBuilt'],n_bins,labels=False,duplicates='drop'))
    for n_bins in range(2,10)
})
# idxmax skips NaN correlations (e.g. a band that collapsed to a single bin).
best_n_bins=int(year_band_corrs.abs().idxmax())
crr_max=year_band_corrs[best_n_bins]
# Materialise only the winning band; the following cell reads the 'YearBand' column.
house_train['YearBand']=pd.qcut(house_train['YearBuilt'],best_n_bins,labels=False,duplicates='drop')
corrs=house_train.corr()
corrs['UnitPrice'][['YearBuilt','YearBand']]

discretize YearBuilt, band is 9

In [None]:
# Combine build year and remodel year into one feature, then band it into
# (at most) six quantile bins.
year_total=house_train['YearRemodAdd'].add(house_train['YearBuilt'])
house_train['YearTotal']=year_total
house_train['YearTotalBand']=pd.qcut(year_total,q=6,labels=False,duplicates='drop')
# Compare the raw and banded year features against UnitPrice.
corrs=house_train.corr()
year_feature_cols=['YearBuilt','YearBand','YearTotal','YearRemodAdd','YearTotalBand']
corrs.loc[year_feature_cols,'UnitPrice']

add YearBuilt & YearRemodAdd to make YearTotal, discretize it

Make ExterQual, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, HeatingQC, KitchenQual, FireplaceQu, GarageFinish, GarageQual, GarageCond, PoolQC and Fence ordinal

In [None]:
# Mean UnitPrice per RoofStyle category, sorted ascending before plotting.
# The column is selected before aggregating so only UnitPrice is averaged; averaging
# the whole frame first is wasteful and raises on recent pandas when string columns are present.
df=house_train.groupby('RoofStyle')[['UnitPrice']].mean().sort_values('UnitPrice')
df.plot(kind='bar')

## feature engineering

In [None]:
# Columns excluded from the numeric feature list: the row identifier and the prediction target.
drop_cols=[id_col,target_col]

In [None]:
# Numeric feature names: every column of the numeric summary except those in drop_cols.
num_cols=[col for col in num_summary.columns if col not in drop_cols]

In [None]:
# Numeric columns with more than 20 distinct values (NaN counted as a value) are
# candidates for banding; the remaining numeric columns are kept as they are.
num_band_cols=[col for col in num_cols if house_train[col].nunique(dropna=False)>20]
num_normal_cols=set(num_cols).difference(num_band_cols)

In [None]:
# Names of the columns covered by describe(include=['O']); used below to derive cat_type_cols.
cat_cols=cat_summary.columns

In [None]:
# Ordinal categorical columns mapped to their category labels.
# The notebook notes above say 'Reg' and 'SLvl' are the maxima and both appear last,
# so the lists are presumably ordered from lowest to highest rank — confirm for the others.
# NOTE(review): most label lists are lower-case ('po','ta','ex', ...) while LotShape,
# Utilities, LandSlope and HouseStyle keep mixed case; if the raw column values are
# mixed case these labels will not match unless something lower-cases them first — confirm.
# NOTE(review): HouseStyle lists 1Story/1.5*/2.5* but no '2Story' — verify whether that
# category occurs in the data and where it belongs in the order.
cat_ordinal_cols={
    'LotShape':['IR3','IR2','IR1','Reg'],
    'Utilities':['ELO','NoSeWa','NoSewr','AllPub'],
    'LandSlope':['Sev','Mod','Gtl',],
    'HouseStyle':['1Story','1.5Fin','1.5Unf','2.5Fin','2.5Unf','SFoyer','SLvl'],
    'ExterQual':['po','fa','ta','gd','ex'],
    'BsmtQual':['na','po','fa','ta','gd','ex'],
    'BsmtCond':['na','po','fa','ta','gd','ex'],
    'BsmtExposure':['na','no','mn','av','gd'],
    'BsmtFinType1':['na','unf','lwq','rec','blq','alq','glq'],
    'BsmtFinType2':['na','unf','lwq','rec','blq','alq','glq'],
    'HeatingQC':['po','fa','ta','gd','ex'],
    'KitchenQual':['po','fa','ta','gd','ex'],
    'Functional':['sal','sev','maj2','maj1','mod','min2','min1','typ'],
    'FireplaceQu':['na','po','fa','ta','gd','ex'],
    'GarageFinish':['na','unf','rfn','fin'], 
    'GarageQual':['na','po','fa','ta','gd','ex'],
    'GarageCond':['na','po','fa','ta','gd','ex'], 
    'PoolQC':['na','fa','ta','gd','ex'], 
    'Fence':['na','mnww','gdwo','mnprv','gdprv']
}

In [None]:
# Categorical columns that are not declared ordinal above.
cat_type_cols=set(cat_cols).difference(cat_ordinal_cols)

In [None]:
# Build the preprocessing pipeline: one-hot encode every categorical column and
# median-impute + standardise every numeric feature.
# num_cols is used instead of num_summary.columns so that the row id and the target do
# not get a pipeline of their own: feeding SalePrice in as a feature leaks the label,
# and neither column can be a model input for test.csv at prediction time.
cat_feature_cols=list(cat_summary.columns)
cat_pipelines=[FeaturePipeline(c,'',Pipeline([('onehot',LabelBinarizerEx([c]))])) for c in cat_feature_cols]
num_pipelines=[FeaturePipeline(c,c,Pipeline([('impute',Imputer(strategy='median')),('scale',StandardScaler())])) for c in num_cols]
full_pipeline=DataFramePipeline(cat_pipelines+num_pipelines)
# Fit on the model inputs only, so Id/SalePrice and the exploratory columns added to
# house_train earlier (UnitPrice, YearBand, ...) cannot pass through into the training matrix.
prepared_house_train=full_pipeline.fit_transform(house_train[cat_feature_cols+num_cols].copy())
# NOTE(review): assumes DataFramePipeline returns a DataFrame that still carries the raw
# categorical columns next to the encoded ones, as the original drop implies — confirm.
# Reassign instead of dropping in place so re-running the cell is safe.
prepared_house_train=prepared_house_train.drop(cat_feature_cols,axis=1)
prepared_house_train.head()

## train

In [None]:
from sklearn.linear_model import SGDClassifier,LinearRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,AdaBoostRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor

# Candidate regressors compared by trainModels and tuned by tuneModels.
# Order matters: param_grid_set is matched to this list by position.
# random_state is pinned on the stochastic scikit-learn estimators so cross-validation
# scores are reproducible from run to run.
base_regs=[
    RandomForestRegressor(n_jobs=-1,random_state=42),
    GradientBoostingRegressor(random_state=42),
    # NOTE(review): left at its defaults; confirm the installed xgboost uses a fixed default seed.
    XGBRegressor(),
]

In [None]:
from sklearn.model_selection import cross_val_score
from math import sqrt,log10

%matplotlib inline
import matplotlib.pyplot as plt
from pylab import *

def trainModels(train_data, target, regs=None):
    """Cross-validate each regressor, then plot and print the resulting scores.

    Each model is scored with 5-fold cross-validation on negative mean squared
    error; the reported value is log10(sqrt(mean MSE)), i.e. log10 of the RMSE
    (lower is better).

    Parameters
    ----------
    train_data : feature matrix passed to ``cross_val_score``.
    target : target values passed to ``cross_val_score``.
    regs : optional list of estimators; defaults to the notebook-level ``base_regs``.

    Returns
    -------
    None. Shows a bar chart and prints ``(label, score)`` pairs sorted from the
    highest (worst) to the lowest (best) score.
    """
    if regs is None:
        regs=base_regs

    scores=[]
    for reg in regs:
        neg_mse=cross_val_score(reg,train_data,target,scoring='neg_mean_squared_error',cv=5,n_jobs=-1,verbose=1)
        # The scorer returns negated MSE, so flip the sign before taking the root.
        scores.append(log10(sqrt(-neg_mse.mean())))

    # First three letters of the class name serve as the tick label.
    labels=[reg.__class__.__name__[:3] for reg in regs]
    positions=list(range(len(regs)))
    # Explicit colour list: the string 'rgb' is no longer accepted as a colour
    # sequence by current matplotlib. Colours cycle if there are more than three bars.
    bar_colors=[('r','g','b')[i%3] for i in positions]
    plt.bar(positions,scores,tick_label=labels,color=bar_colors)
    plt.show()
    print(sorted(zip(labels,scores),key=lambda x:x[1],reverse=True))

In [None]:
# Score every candidate regressor on the prepared training data.
trainModels(train_data=prepared_house_train,target=target)

In [None]:
# Fit the model used for the final submission on the full training set.
# random_state is pinned so repeated runs produce the same fitted model and predictions.
# NOTE(review): this is an untuned GradientBoostingRegressor chosen by hand; the
# estimators found in the tuning section below are not fed back into best_reg.
best_reg=GradientBoostingRegressor(random_state=42)
best_reg.fit(prepared_house_train,target)


## tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grids for GridSearchCV: one grid per entry of base_regs,
# matched by position, so the order here must follow the order of base_regs.
param_grid_set=[
    # grid for base_regs[0]
    {
        'n_estimators':[50,100,200,300],
        'max_depth':[5,10,15],
    },
    # grid for base_regs[1]
    {
        'learning_rate':[0.01,0.1,1.0],
        'n_estimators':[100,200,300],
        'max_depth':[3,5,8],
    },
    # grid for base_regs[2]
    {
        'learning_rate':[0.01,0.1,1.0],
        'n_estimators':[100,200,300],
        'max_depth':[5,10,15],
        'gamma':[0.01,0.1,0.5],
    },
]

def tuneModels(train_data,target,regs=None,param_grids=None):
    """Grid-search each regressor over its own parameter grid.

    Every estimator is tuned with a 5-fold ``GridSearchCV`` scored on negative
    mean squared error (higher is better).

    Parameters
    ----------
    train_data : feature matrix passed to ``GridSearchCV.fit``.
    target : target values passed to ``GridSearchCV.fit``.
    regs : optional list of estimators; defaults to the notebook-level ``base_regs``.
    param_grids : optional list of parameter grids, one per estimator and in the
        same order; defaults to the notebook-level ``param_grid_set``.

    Returns
    -------
    list of ``(best_estimator, best_score)`` tuples in the order of ``regs``.
    The same pairs are also printed, sorted from best to worst score.

    Raises
    ------
    ValueError
        If the number of estimators and the number of grids differ, since the
        two lists are paired by position and a mismatch would pair them wrongly.
    """
    if regs is None:
        regs=base_regs
    if param_grids is None:
        param_grids=param_grid_set
    if len(regs)!=len(param_grids):
        raise ValueError(
            'expected one parameter grid per regressor, got %d regressors and %d grids'
            % (len(regs),len(param_grids)))

    results=[]
    for reg,grid in zip(regs,param_grids):
        gs=GridSearchCV(estimator=reg,param_grid=grid,scoring='neg_mean_squared_error',n_jobs=-1,verbose=1,cv=5)
        gs.fit(train_data,target)
        results.append((gs.best_estimator_,gs.best_score_))
    print(sorted(results,key=lambda x:x[1],reverse=True))
    return results

In [None]:
# Run the grid searches on the prepared training data and keep the (estimator, score) pairs.
results=tuneModels(train_data=prepared_house_train,target=target)

## test

In [None]:
# Load the held-out test set and glance at the summary of two of its columns.
house_test=pd.read_csv('test.csv')
house_test.describe().loc[:,['YrSold',id_col]]

In [None]:
# Keep the ids for the submission file before the id column is removed from the features.
test_id=house_test[id_col]
# Apply the preprocessing fitted on the training data, then remove the id and the raw
# categorical columns. drop() is used without inplace so the frame returned by the
# pipeline is not mutated and the cell can be re-run safely.
prepared_house_test=full_pipeline.transform(house_test).drop([id_col]+list(cat_summary.columns),axis=1)
prices=best_reg.predict(prepared_house_test)
# Column names come from the shared constants so they cannot drift from the training setup.
df=pd.DataFrame({id_col:test_id,target_col:prices})
df.to_csv('result.csv',index=False)