# 라이브러리 불러오기

In [1]:
import os
import pandas as pd
import numpy as np
from data.feature_engineering import find_nearest_haversine_distance

# 데이터 불러오기

In [2]:
# 파일 경로 지정
data_path: str = "~/house/data"

# train, test data 불러오기
train_data: pd.DataFrame = pd.read_csv(os.path.join(data_path, "train.csv"))
test_data: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv"))
sample_submission: pd.DataFrame = pd.read_csv(os.path.join(data_path, "sample_submission.csv"))

# 금리, 지하철, 학교, 공원 정보 불러오기
interest_data: pd.DataFrame = pd.read_csv(os.path.join(data_path, "interestRate.csv"))
subway_data: pd.DataFrame = pd.read_csv(os.path.join(data_path, "subwayInfo.csv"))
school_data: pd.DataFrame = pd.read_csv(os.path.join(data_path, "schoolinfo.csv"))
park_data: pd.DataFrame = pd.read_csv(os.path.join(data_path, "parkInfo.csv"))

# 데이터 병합

## 금리 데이터 병합
* `interest_data`: 2018년 12월 ~ 2024년 5월까지의 금리
* 계약 연월 기준으로 `interest_data`를 `train_data`로 병합 (2019년 4월 ~ 2023년 12월)
* 계약 연월 기준으로 `interest_data`를 `test_data`로 병합 (2024년 1월 ~ 2024년 6월)

In [3]:
train_data

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit
0,0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,17000.0
1,1,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,23000.0
2,2,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,23000.0
3,3,59.3400,201907,15,2,1,1986,36.964647,127.055847,33,5000.0
4,4,59.8100,201904,12,2,6,1995,36.972390,127.084514,24,1800.0
...,...,...,...,...,...,...,...,...,...,...,...
1801223,1801223,114.8126,202311,25,0,5,2010,37.528394,126.659398,13,39000.0
1801224,1801224,101.9088,202311,28,0,6,2010,37.528394,126.659398,13,38000.0
1801225,1801225,114.7900,202312,3,0,19,2010,37.528394,126.659398,13,37000.0
1801226,1801226,101.9088,202312,4,1,15,2010,37.528394,126.659398,13,34400.0


In [4]:
test_data

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age
0,0,84.9610,202404,12,1,14,2016,36.965423,127.048779,8
1,1,59.9000,202404,13,0,4,1997,36.963105,127.040678,27
2,2,39.2700,202404,29,0,5,1990,36.957089,127.047449,34
3,3,39.2700,202405,3,0,1,1990,36.957089,127.047449,34
4,4,46.9800,202406,2,0,4,1990,36.957089,127.047449,34
...,...,...,...,...,...,...,...,...,...,...
150167,150167,115.5101,202402,27,0,17,2010,37.528394,126.659398,14
150168,150168,142.8738,202403,2,0,4,2010,37.528394,126.659398,14
150169,150169,142.8738,202403,16,1,13,2010,37.528394,126.659398,14
150170,150170,114.9285,202403,22,1,2,2010,37.528394,126.659398,14


In [5]:
interest_data

Unnamed: 0,year_month,interest_rate
0,202405,3.56
1,202404,3.54
2,202403,3.59
3,202402,3.62
4,202401,3.66
...,...,...
61,201904,1.85
62,201903,1.94
63,201902,1.92
64,201901,1.99


In [6]:
# 계약 연월 기준으로 interest_data를 train_data로 병합
train_data: pd.DataFrame = pd.merge(train_data, interest_data, left_on="contract_year_month", right_on="year_month", how="left")
train_data: pd.DataFrame = train_data.drop(columns=["year_month"])
train_data

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit,interest_rate
0,0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,17000.0,1.78
1,1,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,23000.0,1.26
2,2,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,23000.0,1.26
3,3,59.3400,201907,15,2,1,1986,36.964647,127.055847,33,5000.0,1.68
4,4,59.8100,201904,12,2,6,1995,36.972390,127.084514,24,1800.0,1.85
...,...,...,...,...,...,...,...,...,...,...,...,...
1801223,1801223,114.8126,202311,25,0,5,2010,37.528394,126.659398,13,39000.0,4.00
1801224,1801224,101.9088,202311,28,0,6,2010,37.528394,126.659398,13,38000.0,4.00
1801225,1801225,114.7900,202312,3,0,19,2010,37.528394,126.659398,13,37000.0,3.84
1801226,1801226,101.9088,202312,4,1,15,2010,37.528394,126.659398,13,34400.0,3.84


In [7]:
test_data: pd.DataFrame = pd.merge(test_data, interest_data, left_on="contract_year_month", right_on="year_month", how="left")
test_data: pd.DataFrame = test_data.drop(columns=["year_month"])
test_data

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,interest_rate
0,0,84.9610,202404,12,1,14,2016,36.965423,127.048779,8,3.54
1,1,59.9000,202404,13,0,4,1997,36.963105,127.040678,27,3.54
2,2,39.2700,202404,29,0,5,1990,36.957089,127.047449,34,3.54
3,3,39.2700,202405,3,0,1,1990,36.957089,127.047449,34,3.56
4,4,46.9800,202406,2,0,4,1990,36.957089,127.047449,34,
...,...,...,...,...,...,...,...,...,...,...,...
150167,150167,115.5101,202402,27,0,17,2010,37.528394,126.659398,14,3.62
150168,150168,142.8738,202403,2,0,4,2010,37.528394,126.659398,14,3.59
150169,150169,142.8738,202403,16,1,13,2010,37.528394,126.659398,14,3.59
150170,150170,114.9285,202403,22,1,2,2010,37.528394,126.659398,14,3.59


In [8]:
# 금리 결측치 개수 확인 (2024년 6월)
test_data["interest_rate"].isnull().sum()

11882

## 최단거리 데이터 병합

In [9]:
# train_data에서 위도, 경도 중복 행을 제외하고 추출
unique_loc_train: pd.DataFrame = train_data[["latitude", "longitude"]].drop_duplicates().reset_index(drop=True)
unique_loc_test: pd.DataFrame = test_data[["latitude", "longitude"]].drop_duplicates().reset_index(drop=True)

### subway 병합

In [10]:
# train_data에 최단거리 지하철 정보 추가
merged_df: pd.DataFrame = find_nearest_haversine_distance(unique_loc_train, subway_data)
merged_df: pd.DataFrame = pd.concat([merged_df, unique_loc_train], axis=1)
train_data: pd.DataFrame = pd.merge(train_data, merged_df, on=["latitude", "longitude"], how="left")
train_data.rename(columns={
    "nearest_distance": "nearest_subway_distance",
    "nearest_latitude": "nearest_subway_latitude",
    "nearest_longitude": "nearest_subway_longitude"
    }, 
    inplace=True
)

# test_data에 최단거리 지하철 정보 추가
merged_df: pd.DataFrame = find_nearest_haversine_distance(unique_loc_train, subway_data)
merged_df: pd.DataFrame = pd.concat([merged_df, unique_loc_train], axis=1)
test_data: pd.DataFrame = pd.merge(test_data, merged_df, on=["latitude", "longitude"], how="left")
test_data.rename(columns={
    "nearest_distance": "nearest_subway_distance",
    "nearest_latitude": "nearest_subway_latitude",
    "nearest_longitude": "nearest_subway_longitude"
    }, 
    inplace=True
)

### school 병합

In [11]:
# train_data에 최단거리 학교 정보 추가
merged_df: pd.DataFrame = find_nearest_haversine_distance(unique_loc_train, school_data)
merged_df: pd.DataFrame = pd.concat([merged_df, unique_loc_train], axis=1)
train_data: pd.DataFrame = pd.merge(train_data, merged_df, on=["latitude", "longitude"], how="left")
train_data.rename(columns={
    "nearest_distance": "nearest_school_distance",
    "nearest_latitude": "nearest_school_latitude",
    "nearest_longitude": "nearest_school_longitude"
    },
    inplace=True
)

# test_data에 최단거리 학교 정보 추가
merged_df: pd.DataFrame = find_nearest_haversine_distance(unique_loc_train, school_data)
merged_df: pd.DataFrame = pd.concat([merged_df, unique_loc_train], axis=1)
test_data: pd.DataFrame = pd.merge(test_data, merged_df, on=["latitude", "longitude"], how="left")
test_data.rename(columns={
    "nearest_distance": "nearest_school_distance",
    "nearest_latitude": "nearest_school_latitude",
    "nearest_longitude": "nearest_school_longitude"
    },
    inplace=True
)

### park 병합

In [12]:
# train_data에 최단거리 공원 정보 추가
merged_df: pd.DataFrame = find_nearest_haversine_distance(unique_loc_train, school_data)
merged_df: pd.DataFrame = pd.concat([merged_df, unique_loc_train], axis=1)
train_data: pd.DataFrame = pd.merge(train_data, merged_df, on=["latitude", "longitude"], how="left")
train_data.rename(columns={
    "nearest_distance": "nearest_park_distance",
    "nearest_latitude": "nearest_park_latitude",
    "nearest_longitude": "nearest_park_longitude"
    },
    inplace=True
)

# test_data에 최단거리 공원 정보 추가
merged_df: pd.DataFrame = find_nearest_haversine_distance(unique_loc_train, school_data)
merged_df: pd.DataFrame = pd.concat([merged_df, unique_loc_train], axis=1)
test_data: pd.DataFrame = pd.merge(test_data, merged_df, on=["latitude", "longitude"], how="left")
test_data.rename(columns={
    "nearest_distance": "nearest_park_distance",
    "nearest_latitude": "nearest_park_latitude",
    "nearest_longitude": "nearest_park_longitude"
    },
    inplace=True
)

## 병합한 데이터 확인

In [13]:
train_data

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,...,interest_rate,nearest_subway_distance,nearest_subway_latitude,nearest_subway_longitude,nearest_school_distance,nearest_school_latitude,nearest_school_longitude,nearest_park_distance,nearest_park_latitude,nearest_park_longitude
0,0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,...,1.78,716.952948,37.056496,127.052819,156.120431,37.053232,127.046337,156.120431,37.053232,127.046337
1,1,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,...,1.26,716.952948,37.056496,127.052819,156.120431,37.053232,127.046337,156.120431,37.053232,127.046337
2,2,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,...,1.26,716.952948,37.056496,127.052819,156.120431,37.053232,127.046337,156.120431,37.053232,127.046337
3,3,59.3400,201907,15,2,1,1986,36.964647,127.055847,33,...,1.68,3897.279708,36.990726,127.085159,214.559689,36.962943,127.056980,214.559689,36.962943,127.056980
4,4,59.8100,201904,12,2,6,1995,36.972390,127.084514,24,...,1.85,2039.685349,36.990726,127.085159,1708.489263,36.987746,127.085154,1708.489263,36.987746,127.085154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1801223,1801223,114.8126,202311,25,0,5,2010,37.528394,126.659398,13,...,4.00,1483.044562,37.524649,126.675539,224.754177,37.529291,126.657114,224.754177,37.529291,126.657114
1801224,1801224,101.9088,202311,28,0,6,2010,37.528394,126.659398,13,...,4.00,1483.044562,37.524649,126.675539,224.754177,37.529291,126.657114,224.754177,37.529291,126.657114
1801225,1801225,114.7900,202312,3,0,19,2010,37.528394,126.659398,13,...,3.84,1483.044562,37.524649,126.675539,224.754177,37.529291,126.657114,224.754177,37.529291,126.657114
1801226,1801226,101.9088,202312,4,1,15,2010,37.528394,126.659398,13,...,3.84,1483.044562,37.524649,126.675539,224.754177,37.529291,126.657114,224.754177,37.529291,126.657114


In [14]:
test_data

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,interest_rate,nearest_subway_distance,nearest_subway_latitude,nearest_subway_longitude,nearest_school_distance,nearest_school_latitude,nearest_school_longitude,nearest_park_distance,nearest_park_latitude,nearest_park_longitude
0,0,84.9610,202404,12,1,14,2016,36.965423,127.048779,8,3.54,4284.771362,36.990726,127.085159,779.057020,36.962943,127.056980,779.057020,36.962943,127.056980
1,1,59.9000,202404,13,0,4,1997,36.963105,127.040678,27,3.54,5004.558790,36.990726,127.085159,1448.494091,36.962943,127.056980,1448.494091,36.962943,127.056980
2,2,39.2700,202404,29,0,5,1990,36.957089,127.047449,34,3.54,5021.183886,36.990726,127.085159,808.416163,36.950687,127.051762,808.416163,36.950687,127.051762
3,3,39.2700,202405,3,0,1,1990,36.957089,127.047449,34,3.56,5021.183886,36.990726,127.085159,808.416163,36.950687,127.051762,808.416163,36.950687,127.051762
4,4,46.9800,202406,2,0,4,1990,36.957089,127.047449,34,,5021.183886,36.990726,127.085159,808.416163,36.950687,127.051762,808.416163,36.950687,127.051762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150167,150167,115.5101,202402,27,0,17,2010,37.528394,126.659398,14,3.62,1483.044562,37.524649,126.675539,224.754177,37.529291,126.657114,224.754177,37.529291,126.657114
150168,150168,142.8738,202403,2,0,4,2010,37.528394,126.659398,14,3.59,1483.044562,37.524649,126.675539,224.754177,37.529291,126.657114,224.754177,37.529291,126.657114
150169,150169,142.8738,202403,16,1,13,2010,37.528394,126.659398,14,3.59,1483.044562,37.524649,126.675539,224.754177,37.529291,126.657114,224.754177,37.529291,126.657114
150170,150170,114.9285,202403,22,1,2,2010,37.528394,126.659398,14,3.59,1483.044562,37.524649,126.675539,224.754177,37.529291,126.657114,224.754177,37.529291,126.657114
