## 1 Preprocessing
- null값이 대부분인 변수 제거
- 남은 변수의 null값 채우기
- skewed data 로그 변환
- category 변수 인코딩

In [6]:
from scipy.stats import skew

In [180]:
def preprocess(train, test, outlier_idx=None):
    """Build model-ready feature matrices and the log-scaled target.

    Steps: drop outlier rows from the training data, stack the train and
    test feature columns, remove the columns the notebook treats as mostly
    null, log-transform skewed numeric columns, one-hot encode categorical
    columns and fill the remaining nulls with the column mean.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data containing the feature columns from 'MSSubClass'
        through 'SaleCondition' plus 'SalePrice'. It is NOT modified.
    test : pandas.DataFrame
        Test data containing the same feature columns.
    outlier_idx : sequence of int, optional
        Positional row indices of ``train`` to drop before fitting.
        Defaults to the list previously hard-coded in this function.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Encoded feature matrices for the (outlier-free) train rows and the
        test rows.
    y_train : pandas.Series
        ``log1p`` of 'SalePrice' for the kept training rows.
    """
    if outlier_idx is None:
        outlier_idx = [
            4, 11, 13, 30, 53, 66, 112, 142, 151, 153, 185,
            199, 218, 224, 231, 238, 261, 271, 313, 318, 328, 377,
            410, 440, 451, 454, 457, 462, 473, 488, 496, 523, 529,
            559, 568, 581, 583, 588, 595, 607, 608, 628, 632, 666,
            681, 688, 691, 692, 714, 738, 747, 769, 774, 803, 825,
            864, 885, 898, 970, 990, 1046, 1065, 1142, 1169, 1181, 1182,
            1211, 1298, 1322, 1324, 1328, 1359, 1423, 1442, 1453,
        ]

    # Drop into a new frame instead of in place: the indices are positional,
    # so mutating the caller's frame would remove different rows on every
    # re-run and desynchronise it from anything computed on the full data.
    positions = np.asarray(outlier_idx, dtype=int)
    train = train.drop(train.index[positions])
    n_train = train.shape[0]

    all_data = pd.concat([
        train.loc[:, 'MSSubClass':'SaleCondition'],
        test.loc[:, 'MSSubClass':'SaleCondition'],
    ])

    to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    all_data = all_data.drop(to_delete, axis=1)

    # log transform: only genuinely numeric columns are candidates
    numeric_feats = all_data.select_dtypes(include=[np.number]).columns
    skewness = all_data[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewness[skewness > 0.75].index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

    # encode categorical data
    all_data = pd.get_dummies(all_data)

    # fillna
    all_data = all_data.fillna(all_data.mean())

    # Train rows were concatenated first, so a positional split recovers them.
    X_train = all_data.iloc[:n_train]
    X_test = all_data.iloc[n_train:]
    y_train = np.log1p(train.SalePrice)

    return X_train, X_test, y_train

## 2 Detect Outliers
+ 아웃라이어 제거 없이 예측 후, 실제 값과 예측값이 크게 벗어나는 데이터 제거

In [67]:
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope

In [176]:
# Load the predictions made on the training set without any outlier removal.
train_pred = pd.read_csv('./data/train_pred.csv')
# Boolean mask: True where the prediction misses the real price by more than 40000.
dif = np.abs(train_pred['SalePrice'].values - df_train['SalePrice'].values) > 40000
# Positional indices of the rows flagged by the mask.
outlier_idx = np.flatnonzero(dif)

## 3 Make models

In [82]:
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold, train_test_split
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV, Lasso
from sklearn.svm import SVR

### 3_1 make scorer
+ 모델 성능 평가 기준을 kaggle 평가 기준인 Root mean square error로 만들어준다

In [9]:
def error(real_value, prediction):
    """Return the root mean squared error of `prediction` against `real_value`."""
    mse = mean_squared_error(real_value, prediction)
    return mse ** 0.5


# Scorer built from `error`; declared with greater_is_better=False because
# it measures a loss.
RMSE = make_scorer(error, greater_is_better=False)

### 3_2 hyper-parameter tuning
+ 개별 모형들에 대해 가장 높은 점수가 나오는 hyper parameter를 찾는다
+ 개별 모형들은 밑에서 base_models로 사용된다

In [191]:
def random_forest(X_train, X_test):
    """Grid-search RandomForestRegressor settings and print the best result.

    Fits on `X_train` against the module-level `y_train`; `X_test` is
    accepted but not used. Nothing is returned.
    """
    search_space = {
        'n_estimators': [500, 600, 700],
        'max_features': [10, 15, 20, 25, 30],
        'max_depth': [5, 7, 9, 11, 13],
    }
    search = GridSearchCV(
        estimator=RandomForestRegressor(random_state=0),
        param_grid=search_space,
        n_jobs=1,
        cv=4,
        scoring=RMSE,
    )
    search.fit(X_train, y_train)

    # An empty string reproduces the blank separator line.
    report = ('best score:', search.best_score_, '',
              'best params:', search.best_params_)
    for line in report:
        print(line)
    
def gradient_boosting(X_train, X_test):
    """Grid-search GradientBoostingRegressor settings and print the best result.

    Fits on `X_train` against the module-level `y_train`; `X_test` is
    accepted but not used. Nothing is returned.
    """
    grid = dict(
        n_estimators=[500],
        max_features=[10, 15],
        max_depth=[6, 8, 10],
        learning_rate=[0.05, 0.1, 0.15],
        subsample=[0.8],
    )
    booster = GradientBoostingRegressor(random_state=0)
    tuner = GridSearchCV(estimator=booster, param_grid=grid,
                         n_jobs=1, cv=10, scoring=RMSE)
    tuner.fit(X_train, y_train)

    print('best score:')
    print(tuner.best_score_)
    print('')
    print('best params:')
    print(tuner.best_params_)

def extra_trees(X_train, X_test):
    """Grid-search ExtraTreesRegressor settings and print the best result.

    Fits on `X_train` against the module-level `y_train`; `X_test` is
    accepted but not used. Nothing is returned.
    """
    candidates = {
        'n_estimators': [500, 600, 700],
        'max_features': [10, 15, 20],
    }
    forest = ExtraTreesRegressor(random_state=0)
    searcher = GridSearchCV(estimator=forest, param_grid=candidates,
                            n_jobs=2, cv=4, scoring=RMSE)
    searcher.fit(X_train, y_train)

    best_score, best_params = searcher.best_score_, searcher.best_params_
    print('best score:')
    print(best_score)
    print('')
    print('best params:')
    print(best_params)

def elastic_net(X_train, X_test):
    """Grid-search ElasticNet `l1_ratio`/`alpha` and print the best score and params.

    Fits on `X_train` against the module-level `y_train`; `X_test` is
    accepted but not used. Nothing is returned.
    """
    l1_ratios = [0.3, 0.4, 0.5, 0.7]
    alphas = [0.0005, 0.001, 0.01, 0.1, 0.5, 1]
    tuner = GridSearchCV(
        estimator=ElasticNet(random_state=0),
        param_grid={'l1_ratio': l1_ratios, 'alpha': alphas},
        cv=5,
        scoring=RMSE,
    )
    tuner.fit(X_train, y_train)

    for item in (tuner.best_score_, tuner.best_params_):
        print(item)

def svr(X_train, X_test):
    """Grid-search the `C` parameter of a linear-kernel SVR and print the best result.

    Fits on `X_train` against the module-level `y_train`; `X_test` is
    accepted but not used. Nothing is returned.
    """
    # Pass the kernel by keyword: relying on its position in the constructor
    # is fragile and is rejected by scikit-learn releases whose estimator
    # constructors are keyword-only. The estimator also gets its own name so
    # it no longer shadows this function.
    regressor = SVR(kernel='linear')
    param_grid = {'C': [0.1, 0.5, 1, 2, 5]}
    model = GridSearchCV(estimator=regressor, param_grid=param_grid,
                         cv=5, n_jobs=1, scoring=RMSE)
    model.fit(X_train, y_train)
    print(model.best_score_)
    print(model.best_params_)
    
def lasso(X_train, X_test):
    """Grid-search the Lasso `alpha` and print the best score and params.

    Fits on `X_train` against the module-level `y_train`, like the other
    tuning helpers; `X_test` is accepted but not used. Nothing is returned.
    """
    regressor = Lasso(random_state=0)
    param_grid = {'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05]}
    model = GridSearchCV(estimator=regressor, param_grid=param_grid,
                         cv=5, scoring=RMSE)
    # Previously `moel.fit(X_train, X_test)`: a misspelled name (NameError)
    # that also passed the test features where the target belongs.
    model.fit(X_train, y_train)
    print(model.best_score_)
    print(model.best_params_)

### 3_3 stacking models
+ 5개의 모형을 사용해서 1차로 train을 시킨다
+ Ridge 모형으로 1차로 train된 데이터를 사용해서 다시 train 시켜서 최종 결과를 도출한다

In [189]:
def stacking(X_train, X_test, y_train, n_folds=5):
    """Stack five base regressors under a Ridge meta-model and predict prices.

    Each base model is trained on K-fold splits of the training data. Its
    predictions on each held-out fold become one column of the meta-model's
    training matrix, and its test-set predictions are averaged over the
    folds to form the matching column of the meta-model's test matrix. A
    Ridge regressor, with `alpha` chosen by grid search, is then fitted on
    those columns.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature matrices (their `.values` arrays are used).
    y_train : pandas.Series
        Training target; the result is passed through `expm1`, so this is
        expected to be on the `log1p` scale.
    n_folds : int, optional
        Number of folds for the base-model stage (default 5, the value
        that was previously hard-coded).

    Returns
    -------
    numpy.ndarray
        `expm1` of the meta-model predictions for `X_test`.
    """
    base_models = [
        RandomForestRegressor(n_estimators=600, max_features=80, max_depth=19, random_state=0),
        GradientBoostingRegressor(n_estimators=1500, max_features=13, max_depth=3, learning_rate=0.05, random_state=0),
        Lasso(alpha=0.0005),
        SVR(C=15, gamma=0.00005),
        ExtraTreesRegressor(max_features=160, n_estimators=1050, max_depth=30, random_state=0),
    ]
    n_models = len(base_models)

    X = X_train.values
    y = y_train.values
    T = X_test.values

    folds = KFold(len(y), n_folds=n_folds, shuffle=True, random_state=0)

    # One column per base model.
    S_train = np.zeros((X.shape[0], n_models))
    S_test = np.zeros((T.shape[0], n_models))
    for i, reg in enumerate(base_models):
        # Test-set predictions of this model, one column per fold.
        S_test_i = np.zeros((T.shape[0], n_folds))
        print('stage = {}/{}'.format(i + 1, n_models))
        for j, (train_idx, holdout_idx) in enumerate(folds):
            # Index the arrays directly so the function's own parameters
            # are not rebound inside the loop.
            reg.fit(X[train_idx], y[train_idx])
            S_train[holdout_idx, i] = reg.predict(X[holdout_idx])
            S_test_i[:, j] = reg.predict(T)
            print('  fold = {}/{}'.format(j + 1, n_folds))
        S_test[:, i] = S_test_i.mean(1)

    param_grid = {'alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 1.5, 1.7, 2, 2.3, 2.5, 2.7]}
    model = GridSearchCV(estimator=Ridge(), param_grid=param_grid, cv=5, scoring=RMSE)
    model.fit(S_train, y)
    y_pred = model.predict(S_test)

    return np.expm1(y_pred)

## 4 Run stacking model, make submission file

In [None]:
# Load the raw data and build the model inputs.
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')
X_train, X_test, y_train = preprocess(df_train, df_test)

# Run the stacked model. Without this call `y_pred` is undefined and
# building the submission below raises NameError in a fresh kernel.
y_pred = stacking(X_train, X_test, y_train)

# Write the predictions keyed by the test-set Id.
submission = pd.DataFrame({'Id': df_test.Id.values, 'SalePrice': y_pred})
submission.to_csv('submission.csv', index=False)