In [1]:
import platform
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Prevent Korean text in plots from rendering as broken glyphs by choosing
# a font per operating system; any other platform keeps the default font.
_fonts_by_os = {
    'Darwin': 'AppleGothic',     # macOS
    'Windows': 'Malgun Gothic',  # Windows
}
_current_os = platform.system()
if _current_os in _fonts_by_os:
    plt.rcParams["font.family"] = _fonts_by_os[_current_os]

# Work around the minus sign (-) showing up as a broken glyph
plt.rcParams['axes.unicode_minus'] = False

In [2]:
# List the files present in the data directory
import os
print(os.listdir("../input_2019-2nd-ml-month-with-kakr/"))

['test.csv', 'train.csv', 'sample_submission.csv']


In [3]:
# Read the train and test CSV files into pandas DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input_2019-2nd-ml-month-with-kakr/train.csv",
        "../input_2019-2nd-ml-month-with-kakr/test.csv",
    )
)

In [4]:
# Show the shape of the training frame as (rows, columns)
train.shape

(15035, 21)

In [5]:
# Show the shape of the test frame as (rows, columns)
test.shape

(6468, 20)

### Regression Model
* Decision Tree
* Support Vector Machine
* Random Forest
* kNN
* GLM

In [6]:
# Preview the first rows of the training frame
train.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,0,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,1,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
2,2,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
3,3,20140627T000000,257500.0,3,2.25,1715,6819,2.0,0,0,...,7,1715,0,1995,0,98003,47.3097,-122.327,2238,6819
4,4,20150115T000000,291850.0,3,1.5,1060,9711,1.0,0,0,...,7,1060,0,1963,0,98198,47.4095,-122.315,1650,9711


In [7]:
# Training features: every column of train except the price target.
# Test features: an independent copy of test so the original stays untouched.
X_train = train.drop(columns=["price"])
X_test = test.copy(deep=True)

In [8]:
# Remove the ID column from both feature frames
X_train = X_train.drop(columns=["id"])
X_test = X_test.drop(columns=["id"])

In [22]:
# Remove the date column from both feature frames
X_train = X_train.drop(columns=["date"])
X_test = X_test.drop(columns=["date"])

In [11]:
# Keep only the label (price) as the training target
Y_train = train.loc[:, "price"]

In [14]:
# Inspect the floors variable: the minimum is 1 and the maximum is 3.5
X_train["floors"].describe()

count    15035.000000
mean         1.498071
std          0.538522
min          1.000000
25%          1.000000
50%          1.500000
75%          2.000000
max          3.500000
Name: floors, dtype: float64

In [15]:
# Derived feature floor_attic: 0 when floors is a whole number (1, 2, 3),
# 1 when floors has a .5 part (1.5, 2.5, 3.5).
_attic_by_floors = {
    floors: int(floors % 1 != 0)
    for floors in (1, 1.5, 2, 2.5, 3, 3.5)
}
# Apply the same lookup table to both the train and test feature frames
for _features in (X_train, X_test):
    _features["floor_attic"] = _features["floors"].map(_attic_by_floors)

In [23]:
# Fit a Random Forest and build the submission data frame for upload

from sklearn.ensemble import RandomForestRegressor

# Fixed seed: the forest is built with randomness, so without random_state
# every run would produce different predictions and a different submission.
RANDOM_SEED = 42

forest_reg = RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED)
forest_reg.fit(X_train, Y_train)
Y_pred = forest_reg.predict(X_test)

# Pair each id from the test set with its predicted price
submission = pd.DataFrame({
        "id": test["id"],
        "price": Y_pred
    })
submission.shape

(6468, 2)

In [24]:
# Preview the first 10 rows of the submission frame
submission.head(10)

Unnamed: 0,id,price
0,15035,482387.5
1,15036,473020.75
2,15037,1322427.3
3,15038,279540.75
4,15039,318825.55
5,15040,323749.77
6,15041,474403.29
7,15042,714089.09
8,15043,310510.82
9,15044,621862.0


In [25]:
# Write the submission data frame to a CSV file, leaving out the index
submission.to_csv(path_or_buf='submission_HPP_rf_floor.csv', index=False)