In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.linear_model import (
    LinearRegression,
    Ridge,
    Lasso
)
%matplotlib inline

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# Read the competition files from the local input directory.
train = pd.read_csv('./input/train.csv')  # training data
test = pd.read_csv('./input/test.csv')  # test data

# The regression target is log1p(SalePrice).
target = np.log1p(train["SalePrice"])

# Strip the label column so `train` only carries predictor columns.
# NOTE(review): "Id" is not dropped here, so it remains a column of `train`.
train = train.drop(columns="SalePrice")


In [3]:
# Inspect the log1p-transformed SalePrice series.
target

0       12.247699
1       12.109016
2       12.317171
3       11.849405
4       12.429220
5       11.870607
6       12.634606
7       12.206078
8       11.774528
9       11.678448
10      11.771444
11      12.751303
12      11.877576
13      12.540761
14      11.964007
15      11.790565
16      11.911708
17      11.407576
18      11.976666
19      11.842236
20      12.692506
21      11.845110
22      12.345839
23      11.774528
24      11.944714
25      12.454108
26      11.811555
27      12.631344
28      12.242891
29      11.134604
          ...    
1430    12.165985
1431    11.875838
1432    11.074436
1433    12.136192
1434    11.982935
1435    12.066816
1436    11.699413
1437    12.885673
1438    11.916395
1439    12.190964
1440    12.160034
1441    11.913720
1442    12.644331
1443    11.703554
1444    12.098493
1445    11.767575
1446    11.969724
1447    12.388398
1448    11.626263
1449    11.429555
1450    11.820418
1451    12.567555
1452    11.884496
1453    11.344519
1454    12

In [4]:
# Show the column labels currently present in `train`.
train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [5]:
# Labels of every numeric-dtype column of `train`, in frame order
# (iterating a DataFrame yields its column labels).
numerics = list(train.select_dtypes(include='number'))
numerics

['Id',
 'MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [6]:
# Labels of every non-numeric column of `train`, in frame order; these are the
# columns that get ordinal-encoded before modelling.
origin_categoricals = list(train.select_dtypes(exclude='number'))
origin_categoricals

['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']

In [7]:
import category_encoders

# Fit an ordinal (integer) encoder on the non-numeric training columns and
# overwrite those columns with their encoded values. The fitted encoder `ce`
# is kept so the same mapping can be applied to other frames.
ce = category_encoders.OrdinalEncoder(
    cols=origin_categoricals,
    handle_unknown='impute',
)
train[origin_categoricals] = ce.fit_transform(train[origin_categoricals])

# Candidate feature list: encoded categorical columns first, numeric columns after.
features = [*origin_categoricals, *numerics]

In [8]:
# Columns to exclude from modelling.
# NOTE(review): presumably features that scored low on importance in earlier
# runs (hence the name) -- confirm before extending the list.
low_fis = [
    'SaleCondition', 'LowQualFinSF', 'SaleType', 'Heating', 'GarageQual',
    'BsmtFinType2', 'GarageCond', 'RoofMatl', '3SsnPorch', 'PoolArea',
    'Condition2', 'Functional', 'PoolQC', 'Utilities', 'Street', 'MiscVal',
    'BldgType', 'MiscFeature', 'RoofStyle', 'Alley',
    'MSZoning', 'BsmtHalfBath', 'Foundation', 'Fence', 'KitchenAbvGr',
    'HouseStyle', 'GarageType', 'MasVnrType', 'Condition1', 'BsmtQual',
    'PavedDrive', 'LotConfig', 'LandSlope', 'ExterCond', 'Electrical',
    'BsmtFinSF2', 'BsmtCond', 'BsmtExposure', 'CentralAir', 'Exterior2nd',
]

# Validate against the full column lists, which this cell never modifies, so a
# misspelled name still fails with ValueError (as list.remove() used to) while
# re-running the cell stays safe.
_all_columns = origin_categoricals + numerics
_unknown = [name for name in low_fis if name not in _all_columns]
if _unknown:
    raise ValueError("low_fis contains unknown columns: {}".format(_unknown))

# Rebuild the working lists from the untouched originals instead of mutating
# `features` in place: the result is the same on a first run and identical on
# every re-run.
_dropped = set(low_fis)
features = [name for name in _all_columns if name not in _dropped]
categoricals = [name for name in origin_categoricals if name not in _dropped]

# Restrict the training frame to the surviving features (also fixes column order).
train = train[features]

In [9]:
# LightGBM training configuration: gradient-boosted trees on a regression
# objective, evaluated with RMSE. Row subsampling (70%, re-drawn every
# iteration), feature subsampling (90%) and L1/L2 penalties are enabled.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric={'rmse'},
    subsample=0.7,
    subsample_freq=1,
    learning_rate=0.01,
    num_leaves=30,
    min_data_in_leaf=24,
    feature_fraction=0.9,
    lambda_l1=1,
    lambda_l2=1,
    # device='gpu',
)

In [10]:

# Shared settings: the fold count (only referenced by the disabled K-fold
# splitter below) and the random seed.
folds, seed = 5, 666

# kf = StratifiedKFold(n_splits=folds, shuffle=False, random_state=seed)

In [11]:
# Hold out 20% of the rows for validation. The split uses the `seed` constant
# rather than repeating its literal value, so changing the seed in one place
# changes the split as well.
train_X, val_X, train_y, val_y = train_test_split(
    train, target, test_size=0.2, random_state=seed
)



In [12]:
# Display the validation feature frame produced by the train/validation split.
val_X

Unnamed: 0,LotShape,LandContour,Neighborhood,Exterior1st,ExterQual,BsmtFinType1,HeatingQC,KitchenQual,FireplaceQu,GarageFinish,...,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,MoSold,YrSold
214,2,1,1,4,2,1,1,2,1,3,...,0,1977.0,1,299,0,36,0,0,3,2010
999,1,1,1,1,1,1,1,1,1,1,...,0,2006.0,2,632,105,61,0,0,2,2010
545,2,1,7,4,1,1,2,1,1,2,...,0,1988.0,3,786,0,0,0,0,2,2006
254,1,1,12,2,2,4,3,2,1,1,...,0,1957.0,1,294,250,0,0,0,6,2010
402,1,1,10,3,2,3,1,2,1,2,...,0,1940.0,1,240,168,0,0,0,8,2008
267,1,2,24,3,2,7,1,1,3,2,...,1,1939.0,1,240,262,24,0,0,7,2008
212,1,1,6,1,2,1,1,1,2,1,...,1,2009.0,2,644,168,108,0,0,7,2009
540,1,1,17,1,3,1,1,3,3,3,...,1,2006.0,3,765,270,68,0,0,3,2009
1318,3,1,1,1,1,3,1,1,2,1,...,1,2001.0,3,748,198,150,0,0,8,2006
128,1,1,12,1,2,5,3,2,4,1,...,1,1966.0,2,453,188,108,0,0,7,2006


In [13]:
# Wrap both splits as LightGBM datasets, declaring the same categorical
# columns on each of them.
lgb_train = lgb.Dataset(data=train_X, label=train_y, categorical_feature=categoricals)
lgb_eval = lgb.Dataset(data=val_X, label=val_y, categorical_feature=categoricals)

In [14]:
# Train for at most 10000 boosting rounds, evaluating on both the training and
# the validation dataset, logging every 100 rounds and stopping once the
# evaluation score has not improved for 100 rounds.
# NOTE(review): the `early_stopping_rounds` / `verbose_eval` keywords were
# removed from lgb.train in LightGBM 4.0 (replaced by the early_stopping and
# log_evaluation callbacks); this call needs an older LightGBM release.
gbm = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_train, lgb_eval],
    early_stopping_rounds=100,
    verbose_eval=100,
)



Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 0.217586	valid_1's rmse: 0.214318
[200]	training's rmse: 0.147446	valid_1's rmse: 0.15889
[300]	training's rmse: 0.119777	valid_1's rmse: 0.140326
[400]	training's rmse: 0.106687	valid_1's rmse: 0.132661
[500]	training's rmse: 0.0993521	valid_1's rmse: 0.129506
[600]	training's rmse: 0.0944878	valid_1's rmse: 0.12794
[700]	training's rmse: 0.0906292	valid_1's rmse: 0.126831
[800]	training's rmse: 0.0873253	valid_1's rmse: 0.12634
[900]	training's rmse: 0.0845103	valid_1's rmse: 0.126073
[1000]	training's rmse: 0.082225	valid_1's rmse: 0.125732
[1100]	training's rmse: 0.0801284	valid_1's rmse: 0.125582
[1200]	training's rmse: 0.0783271	valid_1's rmse: 0.125365
[1300]	training's rmse: 0.0765878	valid_1's rmse: 0.125232
[1400]	training's rmse: 0.0749431	valid_1's rmse: 0.125191
[1500]	training's rmse: 0.073402	valid_1's rmse: 0.125281
Early stopping, best iteration is:
[1430]	training's rmse: 0.0744681	v

In [15]:
# Pair every training column (column 0) with the importance score reported by
# the trained booster (column 1).
fi = pd.DataFrame({0: train_X.columns, 1: gbm.feature_importance()})

# Names of the 20 features with the lowest importance, ascending by score.
# NOTE: this rebinds `low_fis`, replacing the hand-written list defined earlier
# in the notebook.
low_fis = fi.sort_values(by=1)[0].iloc[:20].tolist()
print(low_fis)

['FullBath', 'LandContour', 'EnclosedPorch', 'BedroomAbvGr', 'HalfBath', 'GarageCars', 'FireplaceQu', 'TotRmsAbvGrd', 'ScreenPorch', 'Fireplaces', 'BsmtFullBath', 'LotShape', 'YrSold', 'BsmtFinType1', 'ExterQual', 'MSSubClass', 'Exterior1st', '2ndFlrSF', 'MasVnrArea', 'KitchenQual']


In [16]:
# Encode the test set with the encoder that was fitted on the training data.
# Calling fit_transform here would re-fit the encoder on the test rows and
# build a fresh category -> integer mapping, so a given category could receive
# a different code than the one the model saw during training. transform()
# reuses the training mapping instead.
test[origin_categoricals] = ce.transform(test[origin_categoricals])

# Keep exactly the columns, in the same order, that the model was trained on.
test = test[features]

In [17]:
# Number of rows in the test frame.
len(test)

1459

In [18]:
# The model was trained on log1p(SalePrice), so predictions come back on that
# scale; expm1 inverts the transform. Applied to the whole array at once
# instead of element by element in a Python loop.
res = np.expm1(gbm.predict(test))



In [19]:
from datetime import datetime

# Submission table: one row per test record with its Id and predicted price.
sub = pd.DataFrame({'Id': test['Id'], 'SalePrice': res})

# Timestamp the file name (YYYYmmddHHMMSS) so successive runs do not overwrite
# each other's output.
tmpTime = datetime.now().strftime("%Y%m%d%H%M%S")
filename = 'submission{}.csv'.format(tmpTime)
sub.to_csv(filename, index=False)
