# 라이브러리 불러오기

In [None]:
import datetime
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings

from src.pre_process.feature_add import FeatureAddition

warnings.filterwarnings('ignore')

# 랜덤 시드 설정

In [None]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [None]:
RAW_BASE_PATH = os.path.join("..", "..", "data", "raw")
PROCESSED_BASE_PATH = os.path.join("..", "..", "data", "processed")
recent_df = pd.read_csv(os.path.join(PROCESSED_BASE_PATH, "V1_all_apt_idx_recent_deposit.csv"))[
    ["recent_deposit", "contract_ymd"]]
train = pd.read_csv(os.path.join(RAW_BASE_PATH, 'train.csv'))
train.reset_index(drop=False, inplace=True)
test = pd.read_csv(os.path.join(RAW_BASE_PATH, 'test.csv'))
recent_df.head()

In [None]:
# Train 중복 제거
from src.pre_process.feature_duplicate import FeatureDuplication

train = FeatureDuplication(train).get_data()
train.reset_index(drop=False, inplace=True)
train["_type"] = "train"
test["_type"] = "test"
df = pd.concat([train, test], axis=0, ignore_index=True)
del train, test
from src.pre_process.feature_add import FeatureAddition

df = FeatureAddition(df).get_data()
df.head()

In [None]:
from src.pre_procecss.interest.diff_interestrate import DiffInterestRateAdder

df = DiffInterestRateAdder(df, pd.read_csv(os.path.join(RAW_BASE_PATH, "interestRate.csv"))).get_data()
df.head()

In [None]:
df = pd.concat([df, recent_df], axis=1)
del recent_df
df.head()

In [None]:
df.drop(columns=["level_0", "contract_year_month", "contract_day", "area", "area_price"], inplace=True)
df.columns

In [None]:
df.set_index("apt_idx", inplace=True)
df.head()

In [None]:

for filename in os.listdir(os.path.join(PROCESSED_BASE_PATH, "apt_idx")):
    print(filename)
    if filename.find("onehot") != -1:
        continue
    if filename.find("interest") != -1:
        interest_df = pd.read_csv(os.path.join(PROCESSED_BASE_PATH, "apt_idx", str(filename)))
        interest_df["contract_year_month"] = pd.to_datetime(interest_df["contract_year_month"])
        interest_df.set_index("contract_year_month", inplace=True)
        df["contract_year_month"] = pd.to_datetime(df["contract_ymd"]).map(lambda x: x.replace(x.year, x.month, 1))
        df.reset_index(inplace=True)
        df.set_index("contract_year_month", inplace=True)
        df = df.join(interest_df, on="contract_year_month", how="left", rsuffix="_int")
        df.set_index("apt_idx", inplace=True)
        continue
    apt_df = pd.read_csv(os.path.join(PROCESSED_BASE_PATH, "apt_idx", str(filename)))
    apt_df.set_index("apt_idx", inplace=True)
    if 'Unnamed: 0' in apt_df.columns:
        apt_df.drop(columns=['Unnamed: 0'], inplace=True)

    df = df.join(apt_df, on="apt_idx", how="left")
    del apt_df

In [None]:
df.head()

In [None]:
# df.join(interest_df, on="contract_year_month", how="left")

In [24]:
df.reset_index(drop=False, inplace=True)
df.columns = [c.replace("_int", "") for c in df.columns]
df.to_csv(os.path.join(PROCESSED_BASE_PATH, "V2_features.csv"), index=False)

In [25]:
df.columns.sort_values()

Index(['_type', 'age', 'apt_idx', 'area_m2', 'area_m2_price', 'built_year',
       'category_interchange_within_1km', 'category_interchange_within_500m',
       'contract_type', 'contract_ymd', 'deposit', 'diff_interest_rate',
       'diff_interest_rate_int', 'floor', 'grid_id',
       'has_elementary_school_within_1km', 'has_high_school_within_1km',
       'has_middle_school_within_1km', 'has_park_within_500.0m', 'index',
       'interest_rate', 'interest_rate_int', 'latitude', 'level_0',
       'longitude', 'nearest_elementary_school_distance',
       'nearest_elementary_school_within_1km', 'nearest_high_school_distance',
       'nearest_high_school_within_1km', 'nearest_middle_school_distance',
       'nearest_middle_school_within_1km', 'nearest_park_distance',
       'nearest_park_within_500.0m', 'nearest_subway_distance',
       'nearest_subway_idx', 'num_subway_within_1km', 'num_subway_within_500m',
       'recent_deposit', 'year_month'],
      dtype='object')

In [28]:

df.to_csv(os.path.join(PROCESSED_BASE_PATH, "V2_features.csv"), index=False)