# 라이브러리 불러오기

In [1]:
import datetime
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings

from src.pre_process.feature_add import FeatureAddition

warnings.filterwarnings('ignore')

# 랜덤 시드 설정

In [2]:
# Seed NumPy's global random number generator with a fixed value.
RANDOM_SEED = 42
np.random.seed(seed=RANDOM_SEED)

In [3]:
# Data directories, expressed relative to the current working directory.
RAW_BASE_PATH = os.path.join("..", "..", "data", "raw")
PROCESSED_BASE_PATH = os.path.join("..", "..", "data", "processed")

# Load the precomputed recent-deposit table and keep only these two columns.
recent_df = pd.read_csv(os.path.join(PROCESSED_BASE_PATH, "V1_all_apt_idx_recent_deposit.csv"))
recent_df = recent_df[["recent_deposit", "contract_ymd"]]

# reset_index(drop=False) turns the row index of the training data into a column.
train = pd.read_csv(os.path.join(RAW_BASE_PATH, 'train.csv')).reset_index(drop=False)
test = pd.read_csv(os.path.join(RAW_BASE_PATH, 'test.csv'))
recent_df.head()

Unnamed: 0,recent_deposit,contract_ymd
0,-999.0,2019-04-01
1,-999.0,2019-04-01
2,-999.0,2019-04-01
3,-999.0,2019-04-01
4,-999.0,2019-04-01


In [4]:
# Remove duplicates from the training data, then stack train and test into a
# single frame and run the feature-addition step on it.
from src.pre_process.feature_add import FeatureAddition
from src.pre_process.feature_duplicate import FeatureDuplication

# NOTE(review): `train` presumably already carries an `index` column from the
# earlier reset_index, so this second reset is what yields the `level_0`
# column visible in the output below -- confirm against FeatureDuplication.
train = FeatureDuplication(train).get_data().reset_index(drop=False)

# `_type` tags each row with its origin so the two sets can be told apart
# after concatenation.
df = pd.concat(
    [train.assign(_type="train"), test.assign(_type="test")],
    axis=0,
    ignore_index=True,
)
del train, test

df = FeatureAddition(df).get_data()
df.head()

Unnamed: 0,index,level_0,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit,_type,apt_idx,area,area_price,area_m2_price
0,0,0.0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,17000.0,train,0,25.8,658.914729,200.004471
1,1,1.0,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,23000.0,train,0,25.8,891.472868,270.594284
2,2,2.0,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,23000.0,train,0,25.8,891.472868,270.594284
3,3,3.0,59.34,201907,15,2,1,1986,36.964647,127.055847,33,5000.0,train,1,18.0,277.777778,84.260195
4,4,4.0,59.81,201904,12,2,6,1995,36.97239,127.084514,24,1800.0,train,2,18.1,99.447514,30.095302


In [6]:
# Attach the `recent_deposit` / `contract_ymd` columns to the feature frame.
#
# pd.concat(axis=1) aligns the two frames on their index labels using an outer
# join. If the frames do not describe exactly the same rows, the result would
# silently gain extra rows padded with NaN, so the row counts are verified
# before `df` is replaced.
merged_df = pd.concat([df, recent_df], axis=1)
if not (len(df) == len(recent_df) == len(merged_df)):
    raise ValueError(
        "Row mismatch while attaching recent_df: "
        f"df has {len(df)} rows, recent_df has {len(recent_df)} rows, "
        f"merged result has {len(merged_df)} rows"
    )
df = merged_df
del recent_df, merged_df
df.head()

Unnamed: 0,index,level_0,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit,_type,apt_idx,area,area_price,area_m2_price,recent_deposit,contract_ymd
0,0,0.0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,17000.0,train,0,25.8,658.914729,200.004471,-999.0,2019-04-01
1,1,1.0,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,23000.0,train,0,25.8,891.472868,270.594284,-999.0,2019-04-01
2,2,2.0,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,23000.0,train,0,25.8,891.472868,270.594284,-999.0,2019-04-01
3,3,3.0,59.34,201907,15,2,1,1986,36.964647,127.055847,33,5000.0,train,1,18.0,277.777778,84.260195,-999.0,2019-04-01
4,4,4.0,59.81,201904,12,2,6,1995,36.97239,127.084514,24,1800.0,train,2,18.1,99.447514,30.095302,-999.0,2019-04-01


In [7]:
# Drop these five columns from the feature frame, then list what remains.
columns_to_drop = ["level_0", "contract_year_month", "contract_day", "area", "area_price"]
df = df.drop(columns=columns_to_drop)
df.columns

Index(['index', 'area_m2', 'contract_type', 'floor', 'built_year', 'latitude',
       'longitude', 'age', 'deposit', '_type', 'apt_idx', 'area_m2_price',
       'recent_deposit', 'contract_ymd'],
      dtype='object')

In [8]:
# Make `apt_idx` the row index of the feature frame and preview the result.
df = df.set_index("apt_idx")
df.head()

Unnamed: 0_level_0,index,area_m2,contract_type,floor,built_year,latitude,longitude,age,deposit,_type,area_m2_price,recent_deposit,contract_ymd
apt_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,0,84.9981,2,9,2019,37.054314,127.045216,0,17000.0,train,200.004471,-999.0,2019-04-01
0,1,84.9981,2,20,2019,37.054314,127.045216,1,23000.0,train,270.594284,-999.0,2019-04-01
0,2,84.9981,2,8,2019,37.054314,127.045216,1,23000.0,train,270.594284,-999.0,2019-04-01
1,3,59.34,2,1,1986,36.964647,127.055847,33,5000.0,train,84.260195,-999.0,2019-04-01
2,4,59.81,2,6,1995,36.97239,127.084514,24,1800.0,train,30.095302,-999.0,2019-04-01


In [9]:

# Merge every feature table stored under <PROCESSED_BASE_PATH>/apt_idx into
# `df`, which is indexed by `apt_idx` at this point.
apt_idx_dir = os.path.join(PROCESSED_BASE_PATH, "apt_idx")

# os.listdir() returns entries in arbitrary order; sorting keeps the column
# order of the merged frame (and of any CSV written from it) stable across
# runs and machines.
for filename in sorted(os.listdir(apt_idx_dir)):
    print(filename)

    # Ignore anything that is not a CSV file (e.g. notebook checkpoint
    # folders) instead of handing it to read_csv.
    if not filename.endswith(".csv"):
        continue

    # Files with "onehot" in their name are not merged.
    if "onehot" in filename:
        continue

    feature_path = os.path.join(apt_idx_dir, filename)

    if "interest" in filename:
        # The interest table is keyed by month rather than by apartment, so
        # it is joined on the first day of each contract's month.
        interest_df = pd.read_csv(feature_path)
        interest_df["contract_year_month"] = pd.to_datetime(interest_df["contract_year_month"])
        interest_df.set_index("contract_year_month", inplace=True)

        # Vectorised truncation of the contract date to the start of its
        # month (replaces a per-row Python lambda).
        contract_month = pd.to_datetime(df["contract_ymd"]).dt.to_period("M").dt.to_timestamp()

        # The join key is only needed for the lookup, so it is dropped again;
        # `df` keeps its `apt_idx` index and its original row order.
        df = (
            df.assign(contract_year_month=contract_month)
            .join(interest_df, on="contract_year_month", how="left")
            .drop(columns="contract_year_month")
        )
        continue

    # Every other file is joined on `apt_idx`.
    apt_df = pd.read_csv(feature_path)
    apt_df.set_index("apt_idx", inplace=True)
    # Presumably a row index saved by an earlier to_csv call; not a feature.
    if "Unnamed: 0" in apt_df.columns:
        apt_df.drop(columns=["Unnamed: 0"], inplace=True)

    df = df.join(apt_df, on="apt_idx", how="left")
    del apt_df

apt_idx_grid_id.csv
apt_idx_park.csv
apt_idx_school.csv
apt_idx_subway.csv
apt_idx_subway_onehot.csv
year_month_interest.csv


In [10]:
# Display the merged feature frame.
df

Unnamed: 0_level_0,index,area_m2,contract_type,floor,built_year,latitude,longitude,age,deposit,_type,...,nearest_high_school_within_1km,has_high_school_within_1km,nearest_subway_distance,nearest_subway_idx,num_subway_within_1km,category_interchange_within_1km,num_subway_within_500m,category_interchange_within_500m,interest_rate,diff_interest_rate
apt_idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,84.9981,2,9,2019,37.054314,127.045216,0,17000.0,train,...,1,True,717,40,1,1,0,0,1.85,-0.09
0,1,84.9981,2,20,2019,37.054314,127.045216,1,23000.0,train,...,1,True,717,40,1,1,0,0,1.85,-0.09
0,2,84.9981,2,8,2019,37.054314,127.045216,1,23000.0,train,...,1,True,717,40,1,1,0,0,1.85,-0.09
1,3,59.3400,2,1,1986,36.964647,127.055847,33,5000.0,train,...,1,True,3897,41,0,0,0,0,1.85,-0.09
2,4,59.8100,2,6,1995,36.972390,127.084514,24,1800.0,train,...,0,False,2039,41,0,0,0,0,1.85,-0.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17869,150167,115.5101,0,17,2010,37.528394,126.659398,14,,test,...,2,True,1483,650,0,0,0,0,3.56,0.00
17869,150168,142.8738,0,4,2010,37.528394,126.659398,14,,test,...,2,True,1483,650,0,0,0,0,3.56,0.00
17869,150169,142.8738,1,13,2010,37.528394,126.659398,14,,test,...,2,True,1483,650,0,0,0,0,3.56,0.00
17869,150170,114.9285,1,2,2010,37.528394,126.659398,14,,test,...,2,True,1483,650,0,0,0,0,3.56,0.00


In [11]:
# Count the missing values in every column of the merged frame.
missing_per_column = df.isna().sum()
missing_per_column

index                                        0
area_m2                                      0
contract_type                                0
floor                                        0
built_year                                   0
latitude                                     0
longitude                                    0
age                                          0
deposit                                 150172
_type                                        0
area_m2_price                           150172
recent_deposit                               0
contract_ymd                                 0
grid_id                                      0
nearest_park_distance                        0
nearest_park_within_500.0m                   0
has_park_within_500.0m                       0
nearest_elementary_school_distance           0
nearest_elementary_school_within_1km         0
has_elementary_school_within_1km             0
nearest_middle_school_distance               0
nearest_middl

In [12]:
# Move `apt_idx` from the index back into a regular column, then write the
# feature table to the processed-data directory without the row index.
df = df.reset_index(drop=False)
df.to_csv(os.path.join(PROCESSED_BASE_PATH, "V2_features.csv"), index=False)