# 2019 2nd ML month with KaKR

### 캐글 코리아와 함께하는 2nd ML 대회 - House Price Prediction

AIFFEL 기초다지기 node 10



In [1]:
# 주피터 NB 시각화 설정
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
import warnings
warnings.filterwarnings("ignore")

import os
from os.path import join

import pandas as pd
import numpy as np

import missingno as msno

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data_dir = "~/aiffel/kaggle_kakr_housing/data"

train_data_path = join(data_dir, 'train.csv')
sub_data_path = join(data_dir, 'test.csv')      # 테스트, 즉 submission 시 사용할 데이터 경로

In [4]:
data = pd.read_csv(train_data_path)
sub = pd.read_csv(sub_data_path)
print('train data dim : {}'.format(data.shape))
print('sub data dim : {}'.format(sub.shape))

train data dim : (15035, 21)
sub data dim : (6468, 20)


__전처리__

In [5]:
y = data['price']
del data['price']

In [6]:
train_len = len(data)
data = pd.concat((data, sub), axis=0)

In [7]:
data['date'] = data['date'].apply(lambda i: i[:6]).astype(int)
data.head

<bound method NDFrame.head of          id    date  bedrooms  bathrooms  sqft_living  sqft_lot  floors  \
0         0  201410         3       1.00         1180      5650     1.0   
1         1  201502         2       1.00          770     10000     1.0   
2         2  201502         3       2.00         1680      8080     1.0   
3         3  201406         3       2.25         1715      6819     2.0   
4         4  201501         3       1.50         1060      9711     1.0   
...     ...     ...       ...        ...          ...       ...     ...   
6463  21498  201406         3       1.75         1500     11968     1.0   
6464  21499  201501         3       2.00         1490      1126     3.0   
6465  21500  201502         3       2.50         1310      1294     2.0   
6466  21501  201406         2       0.75         1020      1350     2.0   
6467  21502  201501         3       2.50         1600      2388     2.0   

      waterfront  view  condition  grade  sqft_above  sqft_basement  

In [8]:
sub_id = data['id'][train_len:]
del data['id']

In [9]:
train = data.iloc[:train_len,:]
X_sub = data.iloc[train_len:,:]

In [10]:
# 치우친 분포의 칼럼 스케일링
skew_columns = ['bedrooms', 'sqft_living', 'sqft_lot',
'sqft_above', 'sqft_basement', 'sqft_lot15', 'sqft_living15']

for c in skew_columns:
    data[c] = np.log1p(data[c].values)

### 모델링

In [11]:
random_state=2024
gboost = GradientBoostingRegressor(random_state=random_state)
xgboost = xgb.XGBRegressor(random_state=random_state)
lightgbm = lgb.LGBMRegressor(random_state=random_state)
rdforest = RandomForestRegressor(random_state=random_state)

models = [gboost, xgboost, lightgbm, rdforest]
# models = [{'model':gboost, 'name':'GradientBoosting'},
#           {'model':xgboost, 'name':'XGBoost'},
#           {'model':lightgbm, 'name':'LightGBM'},
#           {'model':rdforest, 'name':'RandomForestRegressor'}]

__KFold Cross-validation 도입실패__
```python
def get_cv_score(models):
    kfold = KFold(n_splits=5, random_state=2019).get_n_splits(x.values)
    for m in models:
        print("Model {} CV score : {:.4f}".format(
            m['name'], np.mean(cross_val_score(m['model'], x.values, y)), 
                                             kf=kfold))

def get_cv_score(models):
    kfold = KFold(n_splits=5).get_n_splits(x.values)
    for m in models:
        CV_score = np.mean(cross_val_score(m['model'], X=x.values, y=y, cv=kfold))
        print(f"Model: {m['name']}, CV score:{CV_score:.4f}")

def get_cv_score(models):
    kfold = KFold(n_splits=5).get_n_splits(x.values)
    for m in models:
        CV_score = np.mean(cross_val_score(m['model'], X=x.values, y=y, cv=kfold))
        print(f"Model: {m['name']}, CV score:{CV_score:.4f}")
```

In [12]:
def rmse(y_test, y_pred):
    return np.sqrt(mean_squared_error(np.expm1(y_test), np.expm1(y_pred)))

__아래 `get_scores` 함수는 에러가 난다.__

In [15]:
def get_scores(models, train, y, random_state):
    # train, test 데이터셋 분리
    # random_state를 사용하여 고정하고 train과 test 셋의 비율은 8:2로 합니다.
    X_train, X_test, y_train, y_test = train_test_split(
                            train, y, test_size=0.2, random_state=random_state)

    for model in models:
        # 모델 이름 획득
        model_name = model.__class__.__name__

        # 모델 학습
        model.fit(X_train, y_train)

        # 예측
        y_preds = model.predict(X_test)

        # 예측 결과의 rmse값 저장
        print(y_test.head())
        print(y_preds.head())
        df[model_name] = rmse(y_test, y_preds)
  
    # data frame에 저장
    score_df = pd.DataFrame(df, index=['RMSE']).T.sort_values('RMSE', ascending=False)
    return score_df


# get_scores(models, train, y, random_state)

### 그리드 탐색

In [16]:
from sklearn.model_selection import GridSearchCV

In [17]:
# LMS 노드 예제 그대로
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [1, 10],
}

In [18]:
model = lgb.LGBMRegressor(random_state=random_state)

In [19]:
def my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5):

    grid_model = GridSearchCV(model, param_grid=param_grid,
                        scoring='neg_mean_squared_error',
                        cv=5, verbose=verbose, n_jobs=n_jobs)

    grid_model.fit(train, y)
    
    results = pd.DataFrame(grid_model.cv_results_['params'])
    results['score']=grid_model.cv_results_['mean_test_score']
    results['RMSLE'] = np.sqrt(-1 * results['score'])
    results.sort_values('RMSLE', inplace=True)
    
    return results

In [20]:
srch_rslt = my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5)
srch_rslt

Fitting 5 folds for each of 4 candidates, totalling 20 fits


Unnamed: 0,max_depth,n_estimators,score,RMSLE
3,10,100,-16365390000.0,127927.284712
2,10,50,-17247030000.0,131327.941095
1,1,100,-38567070000.0,196385.016448
0,1,50,-47903110000.0,218867.798643


### 에버리징 구현 실패 ㅠㅠ

### 캐글 제출 폼 만들기

In [21]:
model = lgb.LGBMRegressor(max_depth=10, n_estimators=100, random_state=random_state)

In [22]:
def save_submission(model, train, y, test, model_name, rmsle):
    model.fit(train, y)
    prediction = model.predict(test)
    prediction = np.expm1(prediction)
    
    data_dir = os.getenv('HOME')+'/aiffel/kaggle_kakr_housing/data'
    submission_path = join(data_dir, 'sample_submission.csv')
    
    submission = pd.read_csv(submission_path)
    submission['price'] = prediction
    
    submission_csv_path ='{}/submission_{}_RMSLE_{}.csv'.format(data_dir, model_name, rmsle)
    submission.to_csv(submission_csv_path, index=False)
    
    print("saved to, "+submission_csv_path)
    return

In [24]:
save_submission(
    model, train, y[:train_len], X_sub, 'LGBMRegressor', rmsle='0.164399')

saved to, /aiffel/aiffel/kaggle_kakr_housing/data/submission_LGBMRegressor_RMSLE_0.164399.csv
[CV] END .......................max_depth=1, n_estimators=50; total time=   0.6s
[CV] END ......................max_depth=1, n_estimators=100; total time=   0.8s
[CV] END ......................max_depth=10, n_estimators=50; total time=   1.3s
[CV] END .....................max_depth=10, n_estimators=100; total time=   1.6s
[CV] END .......................max_depth=1, n_estimators=50; total time=   0.4s
[CV] END ......................max_depth=1, n_estimators=100; total time=   0.6s
[CV] END ......................max_depth=10, n_estimators=50; total time=   1.4s
[CV] END .....................max_depth=10, n_estimators=100; total time=   1.6s
[CV] END .......................max_depth=1, n_estimators=50; total time=   0.7s
[CV] END ......................max_depth=10, n_estimators=50; total time=   1.0s
[CV] END .....................max_depth=10, n_estimators=100; total time=   2.0s
[CV] END ......