## Preprocessing
- null값이 많은(train 기준 600개 초과) 변수 제거
- 남은 변수의 null값 채우기
- skewed data 로그 변환
- category 변수 인코딩

In [41]:
from scipy.stats import skew

In [42]:
def preprocess(train, test):
    """Build model-ready feature matrices and a log-scaled target.

    Steps, in order:
      1. Split off ``SalePrice`` as the target and drop ``Id``.
      2. Drop every column with more than 600 nulls in ``train``.
      3. Apply ``log1p`` to numeric columns whose skewness in ``train``
         exceeds 0.75 (the same columns are transformed in ``test``).
      4. Fill nulls: most frequent value for non-numeric columns, median
         for numeric ones.  Each frame is filled from its own statistics;
         ``test`` falls back to the ``train`` statistic when one of its
         columns is entirely null.
      5. One-hot encode with ``pd.get_dummies`` and align ``test`` to the
         columns of ``train``.

    The input frames are not modified; all work happens on copies.

    Args:
        train: DataFrame containing ``Id``, ``SalePrice`` and the features.
        test: DataFrame containing ``Id`` and the same feature columns.

    Returns:
        Tuple ``(X_train, X_test, y_train)`` where ``X_test`` has exactly
        the columns of ``X_train`` in the same order and ``y_train`` is
        ``log1p(SalePrice)``.
    """
    y_train = np.log1p(train.SalePrice)

    # drop() without inplace returns new frames, leaving the caller's intact.
    train = train.drop(['Id', 'SalePrice'], axis=1)
    test = test.drop(['Id'], axis=1)

    # Remove features that are null in more than 600 training rows.
    missing = train.isnull().sum()
    sparse_cols = list(missing[missing > 600].index)
    train = train.drop(sparse_cols, axis=1)
    test = test.drop(sparse_cols, axis=1)

    # Numeric columns are detected positively; everything else is treated
    # as categorical, whatever dtype pandas chose for text columns.
    numerical = list(train.select_dtypes(include=[np.number]).columns)
    numeric_set = set(numerical)
    category = [col for col in train.columns if col not in numeric_set]

    # Log-transform strongly right-skewed numeric features.
    skewed = [col for col in numerical if skew(train[col].dropna()) > 0.75]
    if skewed:
        train[skewed] = np.log1p(train[skewed])
        test[skewed] = np.log1p(test[skewed])

    # Categorical nulls -> most frequent value.  Assign the result back
    # instead of a chained inplace fillna, which may act on a temporary.
    for col in category:
        train_mode = train[col].mode()
        test_mode = test[col].mode()
        if len(train_mode):
            train[col] = train[col].fillna(train_mode.iloc[0])
        fill = test_mode if len(test_mode) else train_mode
        if len(fill):
            test[col] = test[col].fillna(fill.iloc[0])

    # Numeric nulls -> median.
    for col in numerical:
        train_median = train[col].median()
        test_median = test[col].median()
        if pd.isnull(test_median):
            # Column is entirely null in test; use the train median.
            test_median = train_median
        train[col] = train[col].fillna(train_median)
        test[col] = test[col].fillna(test_median)

    # One-hot encode, then force test onto train's column set and order:
    # dummies absent from test become all-zero columns, dummies that only
    # exist in test are dropped.
    train = pd.get_dummies(train)
    test = pd.get_dummies(test).reindex(columns=train.columns, fill_value=0)

    return train, test, y_train

## Detect Outliers

In [43]:
from sklearn.ensemble import IsolationForest

In [44]:
def outlier_detection(X_train, y_train, contamination=0.05, random_state=None):
    """Drop the rows an IsolationForest flags as outliers.

    The forest is fitted on ``X_train`` and then used to label the same
    rows; rows predicted as ``-1`` are removed from both ``X_train`` and
    ``y_train``.  Both objects are modified in place and also returned.

    Args:
        X_train: feature DataFrame.
        y_train: target Series with one entry per row of ``X_train``.
        contamination: passed to ``IsolationForest``; defaults to the
            previously hard-coded 0.05.
        random_state: passed to ``IsolationForest``; set it to make the
            selection of outliers reproducible.

    Returns:
        Tuple ``(X_train, y_train)`` with the flagged rows removed.
    """
    forest = IsolationForest(contamination=contamination,
                             random_state=random_state)
    forest.fit(X_train)
    labels = forest.predict(X_train)

    # np.where yields row *positions*, while drop() expects index *labels*.
    # Translate through each object's own index so the right rows go away
    # even when the index is not a default 0..n-1 range.
    outlier_pos = np.where(labels == -1)[0]
    x_labels = X_train.index[outlier_pos]
    y_labels = y_train.index[outlier_pos]

    X_train.drop(x_labels, axis=0, inplace=True)
    y_train.drop(y_labels, axis=0, inplace=True)

    return X_train, y_train

In [10]:
# NOTE(review): bare list of integers with no assignment; presumably row
# positions recorded from an earlier outlier-detection run -- confirm.
[4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318, 349, 412, 423, 440, 454, 477, 478, 523, 540, 581, 585,
 588, 595, 654, 688, 691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169, 1182, 1239, 1256, 1298, 1324, 1353, 1359, 1405, 1442,1447]

[4,
 11,
 13,
 20,
 46,
 66,
 70,
 167,
 178,
 185,
 199,
 224,
 261,
 309,
 313,
 318,
 349,
 412,
 423,
 440,
 454,
 477,
 478,
 523,
 540,
 581,
 585,
 588,
 595,
 654,
 688,
 691,
 774,
 798,
 875,
 898,
 926,
 970,
 987,
 1027,
 1109,
 1169,
 1182,
 1239,
 1256,
 1298,
 1324,
 1353,
 1359,
 1405,
 1442,
 1447]

## Make models

In [45]:
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor



In [46]:
def error(real_value, prediction):
    """Return the square root of the mean squared error of the inputs."""
    mse = mean_squared_error(real_value, prediction)
    return mse ** 0.5

# Scorer built from `error`, declared with greater_is_better=False.
RMSE = make_scorer(error, greater_is_better=False)

In [79]:
def random_forest(X_train, X_test, y_train):
    """Grid-search a RandomForestRegressor and report the best result.

    The grid holds a single candidate (600 trees, 25 features per split,
    depth 11) evaluated with 4-fold cross-validation and the RMSE scorer.
    ``X_test`` is accepted but not used.

    Returns:
        Tuple ``(best_score_, best_params_, grid_scores_)`` taken from the
        fitted search object.
    """
    search = GridSearchCV(
        estimator=RandomForestRegressor(random_state=0),
        param_grid={
            'n_estimators': [600],
            'max_features': [25],
            'max_depth': [11],
        },
        n_jobs=1,
        cv=4,
        scoring=RMSE,
    )
    search.fit(X_train, y_train)

    # Single-argument parenthesised prints emit the same text as the
    # statement form.
    print('best score:')
    print(search.best_score_)
    print('')
    print('best params:')
    print(search.best_params_)

    return search.best_score_, search.best_params_, search.grid_scores_

def gradient_boosting0(X_train, X_test, y_train):
    """Grid-search a GradientBoostingRegressor and predict on ``X_test``.

    The search covers max_features, max_depth and learning_rate with
    500 estimators and subsample 0.8, using 10-fold cross-validation and
    the RMSE scorer.  The best score and parameters are printed.

    Args:
        X_train: training features.
        X_test: features to predict for.
        y_train: training target on the log1p scale, as returned by
            ``preprocess`` in this file.

    Returns:
        Predictions for ``X_test`` mapped back to the original scale.
    """
    gbr = GradientBoostingRegressor(random_state=0)
    param_grid = {
        'n_estimators': [500],
        'max_features': [10, 15],
        'max_depth': [6, 8, 10],
        'learning_rate': [0.05, 0.1, 0.15],
        'subsample': [0.8],
    }
    model = GridSearchCV(estimator=gbr, param_grid=param_grid, n_jobs=1,
                         cv=10, scoring=RMSE)
    model.fit(X_train, y_train)

    print('best score:')
    print(model.best_score_)
    print('')
    print('best params:')
    print(model.best_params_)

    y_pred = model.predict(X_test)
    # The target was transformed with log1p, so its exact inverse is expm1;
    # plain exp would leave every prediction too large by 1.
    return np.expm1(y_pred)
    
def gradient_boosting(X_train, X_test, y_train):
    """Fit a fixed-parameter GradientBoostingRegressor and predict.

    Args:
        X_train: training features.
        X_test: features to predict for.
        y_train: training target on the log1p scale, as returned by
            ``preprocess`` in this file.

    Returns:
        Predictions for ``X_test`` mapped back to the original scale.
    """
    gbr = GradientBoostingRegressor(n_estimators=1500, max_features=13,
                                    max_depth=3, learning_rate=0.05,
                                    random_state=0)
    gbr.fit(X_train, y_train)
    y_pred = gbr.predict(X_test)
    # Inverse of log1p is expm1; exp alone overshoots each value by 1.
    return np.expm1(y_pred)
    
def extra_tree(X_train, X_test, y_train):
    """Grid-search an ExtraTreesRegressor and print the best result.

    Tries 500/600/700 estimators against 10/15/20 max_features with
    4-fold cross-validation, two parallel jobs and the RMSE scorer.
    ``X_test`` is accepted but not used, and nothing is returned.
    """
    search = GridSearchCV(
        estimator=ExtraTreesRegressor(random_state=0),
        param_grid={
            'n_estimators': [500, 600, 700],
            'max_features': [10, 15, 20],
        },
        n_jobs=2,
        cv=4,
        scoring=RMSE,
    )
    search.fit(X_train, y_train)

    print('best score:')
    print(search.best_score_)
    print('')
    print('best params:')
    print(search.best_params_)


## Test

In [80]:
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
# Keep the test ids now; preprocess() returns features without 'Id'.
Id_list = test.Id.values

X_train, X_test, y_train = preprocess(train, test)
# Optional step: X_train, y_train = outlier_detection(X_train, y_train)

y_pred = gradient_boosting0(X_train, X_test, y_train)
result_gbr = pd.DataFrame({'Id': Id_list, 'SalePrice': y_pred})
# index=False keeps the file to the Id and SalePrice columns only; without
# it the row index is written as an extra unnamed first column.
result_gbr.to_csv('./data/result_gbr.csv', index=False)