# 라이브러리 불러오기

In [None]:
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings

from src.pre_process.feature_add import FeatureAddition

# Install a global "ignore" filter for every warning category.
warnings.filterwarnings(action='ignore')

# 랜덤 시드 설정

In [None]:
# Fixed seed value, applied to NumPy's global random number generator.
RANDOM_SEED = 42
np.random.seed(seed=RANDOM_SEED)

In [None]:
# Directories (relative to the working directory) holding the raw input CSVs
# and the processed outputs written by the export cells.
RAW_BASE_PATH = os.path.join("..", "..", "data", "raw")
PROCESSED_BASE_PATH = os.path.join("..", "..", "data", "processed")


def _read_raw_csv(file_name):
    """Read ``file_name`` from RAW_BASE_PATH and return it as a DataFrame."""
    return pd.read_csv(os.path.join(RAW_BASE_PATH, file_name))


# Train/test tables. For train, the row labels produced by read_csv are kept
# as an extra column (reset_index with drop=False).
train = _read_raw_csv('train.csv')
train.reset_index(drop=False, inplace=True)
test = _read_raw_csv('test.csv')

# Auxiliary tables passed to the feature-addition classes in later cells.
school = _read_raw_csv('schoolinfo.csv')
subway = _read_raw_csv('subwayInfo.csv')
interest = _read_raw_csv('interestRate.csv')
park = _read_raw_csv('parkInfo.csv')
sample_submission = _read_raw_csv('sample_submission.csv')



In [None]:
# Remove duplicates from train.
from src.pre_process.feature_duplicate import FeatureDuplication

train = (
    FeatureDuplication(train)
    .get_data()
)
# NOTE(review): train already went through reset_index(drop=False) right after
# loading. If that "index" column is still present here, pandas names the
# column added by this call "level_0" -- confirm both columns are intended.
train.reset_index(inplace=True, drop=False)

In [None]:
# Tag every row with the split it came from, then stack train on top of test
# into a single frame with a fresh 0..n-1 index.
for frame, split_name in ((train, "train"), (test, "test")):
    frame["_type"] = split_name
df = pd.concat(objs=[train, test], axis=0, ignore_index=True)
df

In [None]:
from src.pre_process.feature_add import FeatureAddition

# Run the combined frame through FeatureAddition and keep its get_data()
# result; preview the first rows.
df = (
    FeatureAddition(df)
    .get_data()
)
df.head()

In [None]:

from src.pre_process.subway.subway_distance_feature_add import SubwayDistanceFeatureAddition

# Run df together with the subway table through SubwayDistanceFeatureAddition
# and keep its get_data() result.
df = (
    SubwayDistanceFeatureAddition(df, subway)
    .get_data()
)
df

In [None]:
# Write the distinct combinations of apt_idx and the subway columns to
# apt_idx_subway.csv in the processed-data directory.
# NOTE(review): index=False is not passed, so the surviving row labels are
# written as an unnamed first column (the interest-rate export does pass
# index=False) -- confirm downstream readers expect this.
subway_columns = [
    'apt_idx',
    'nearest_subway_distance',
    'nearest_subway_idx',
    'num_subway_within_1km',
    'category_interchange_within_1km',
    'num_subway_within_500m',
    'category_interchange_within_500m',
]
subway_csv_path = os.path.join(PROCESSED_BASE_PATH, 'apt_idx_subway.csv')
df[subway_columns].drop_duplicates().to_csv(subway_csv_path)

In [None]:
from src.pre_process.park.park_info_feature_add import ParkInfoFeatureAdd

# Run df together with the park table through ParkInfoFeatureAdd and keep
# its get_data() result.
df = (
    ParkInfoFeatureAdd(df, park)
    .get_data()
)
df

In [None]:
# Write the distinct combinations of apt_idx and the park columns to
# apt_idx_park.csv in the processed-data directory.
# NOTE(review): index=False is not passed, so the surviving row labels are
# written as an unnamed first column -- confirm downstream readers expect this.
park_columns = [
    'apt_idx',
    'nearest_park_distance',
    'nearest_park_within_500.0m',
    'has_park_within_500.0m',
]
park_csv_path = os.path.join(PROCESSED_BASE_PATH, 'apt_idx_park.csv')
df[park_columns].drop_duplicates().to_csv(park_csv_path)

In [None]:
from src.pre_process.school.school_info_feature_add import SchoolInfoFeatureAdd

# Run df together with the school table through SchoolInfoFeatureAdd and
# keep its get_data() result.
df = (
    SchoolInfoFeatureAdd(df, school)
    .get_data()
)
df

In [None]:
# Write the distinct combinations of apt_idx and the school columns
# (elementary / middle / high) to apt_idx_school.csv in the processed-data
# directory.
# NOTE(review): index=False is not passed, so the surviving row labels are
# written as an unnamed first column -- confirm downstream readers expect this.
school_columns = [
    'apt_idx',
    'nearest_elementary_school_distance',
    'nearest_elementary_school_within_1km',
    'has_elementary_school_within_1km',
    'nearest_middle_school_distance',
    'nearest_middle_school_within_1km',
    'has_middle_school_within_1km',
    'nearest_high_school_distance',
    'nearest_high_school_within_1km',
    'has_high_school_within_1km',
]
school_csv_path = os.path.join(PROCESSED_BASE_PATH, 'apt_idx_school.csv')
df[school_columns].drop_duplicates().to_csv(school_csv_path)

In [None]:
# The package is ``src.pre_process`` (as in every other import in this
# notebook); the previous ``src.pre_procecss`` spelling could not be imported.
from src.pre_process.interest.diff_interestrate import DiffInterestRateAdder

# Run df together with the interest-rate table through DiffInterestRateAdder,
# keep its get_data() result, and preview the first rows.
df = DiffInterestRateAdder(df, interest).get_data()
df.head()

In [None]:
# Write one row per distinct (contract_year_month, interest_rate,
# diff_interest_rate) combination, ordered by contract_year_month, to
# year_month_interest.csv in the processed-data directory (no index column).
interest_columns = ["contract_year_month", "interest_rate", "diff_interest_rate"]
interest_table = (
    df[interest_columns]
    .drop_duplicates()
    .reset_index(drop=True)
    .sort_values(by="contract_year_month")
)
interest_csv_path = os.path.join(PROCESSED_BASE_PATH, 'year_month_interest.csv')
interest_table.to_csv(interest_csv_path, index=False)

In [None]:
from src.pre_process.deposit.grid_deposit import GridFeature

# Run df through GridFeature, keep its get_data() result, and preview the
# first rows.
df = (
    GridFeature(df)
    .get_data()
)
df.head()

In [None]:
# Write the distinct (apt_idx, grid_id) pairs to apt_idx_grid_id.csv in the
# processed-data directory.
# NOTE(review): index=False is not passed, so the surviving row labels are
# written as an unnamed first column -- confirm downstream readers expect this.
grid_columns = ["apt_idx", "grid_id"]
grid_csv_path = os.path.join(PROCESSED_BASE_PATH, 'apt_idx_grid_id.csv')
df[grid_columns].drop_duplicates().to_csv(grid_csv_path)