## 0. Load Libraries & Data

In [1]:
import os
import pandas as pd
import numpy as np

# from geopy.distance import geodesic
import polars as pl

from sklearn.neighbors import NearestNeighbors # 위도, 경도 기반의 거리 계산을 위해..
import gc # 메모리 확보 필수

In [2]:
# Fix the global NumPy seed so random draws are reproducible across runs.
RANDOM_SEED = 42
np.random.seed(seed=RANDOM_SEED)

In [3]:
# Display settings: show floats with three decimals and switch off
# scientific notation in NumPy output.
np.set_printoptions(precision=3, suppress=True)
pd.options.display.float_format = '{:.3f}'.format

## 1. Polars로 변환

In [4]:
# Directory holding the input CSV files. Kept as a plain string with a
# trailing slash because paths are built with `file_path + <name>`.
file_path = '../data/'

# Load train and test, tagging every row with its origin in `_type` so the
# combined frame can be split back apart later.
train_data = pl.read_csv(file_path + 'train.csv').with_columns(
    pl.lit('train').alias('_type')
)

# The test file gets an all-null `deposit` column so both frames share the
# same columns. The placeholder is typed explicitly from the train schema:
# an untyped `pl.lit(None)` produces a Null-dtype column, which would make
# the vertical concat below depend on polars coercing Null to the numeric
# dtype of the train column.
test_data = pl.read_csv(file_path + 'test.csv').with_columns(
    pl.lit(None, dtype=train_data.schema['deposit']).alias('deposit'),
    pl.lit('test').alias('_type'),
)

# Stack train on top of test; rows keep their file order.
df = pl.concat([train_data, test_data], how="vertical")


# Auxiliary lookup tables used by the feature-engineering cells.
subwayInfo = pl.read_csv(file_path + 'subwayInfo.csv')
interestRate = pl.read_csv(file_path + 'interestRate.csv')
schoolInfo = pl.read_csv(file_path + 'schoolinfo.csv')
parkInfo = pl.read_csv(file_path + 'parkInfo.csv')

#### (1) 가장 가까운 지하철과의 거리 계산

- 거리 단위: km

In [5]:
# Distance from every apartment to its closest subway station.
# Both matrices hold (latitude, longitude) rows.
apartment_coords = df.select(['latitude', 'longitude']).to_numpy()
subway_coords = subwayInfo.select(['latitude', 'longitude']).to_numpy()

# The haversine metric operates on radians, so convert the coordinates first.
apartment_coords_rad = np.radians(apartment_coords)
subway_coords_rad = np.radians(subway_coords)

# Ball tree over the stations; fit() returns the estimator itself.
nbrs_subway = NearestNeighbors(
    metric='haversine',
    algorithm='ball_tree',
    n_neighbors=1,
).fit(subway_coords_rad)

# n_neighbors=1, so only the single closest station is returned per apartment.
distances_subway, indices_subway = nbrs_subway.kneighbors(apartment_coords_rad)

# Haversine distances come back in radians; scale by Earth's radius (km).
distances_subway_km = distances_subway * 6371

df = df.with_columns([
    pl.Series(name='nearest_subway_distance_km', values=distances_subway_km[:, 0]),
])

#### (2) 계약연월에 따른 금리 및 이전 달 금리 추가

In [6]:
# Attach the interest rate of the contract month and of the preceding row
# in the rate table.

# Build a Date key (first day of the month) from the integer `year_month`,
# sort chronologically, then take the previous row's rate.
# NOTE(review): shift(1) equals "previous month" only if the rate table has
# one row per month with no gaps -- confirm against interestRate.csv.
interestRate = (
    interestRate
    .with_columns(
        (pl.col('year_month').cast(str) + '01')
        .str.strptime(pl.Date, '%Y%m%d')
        .alias('year_month_date')
    )
    .sort('year_month_date')
    .with_columns(
        pl.col('interest_rate').shift(1).alias('prev_month_interest_rate')
    )
)

# Same Date key on the transaction frame, derived from `contract_year_month`.
df = df.with_columns(
    (pl.col('contract_year_month').cast(str) + '01')
    .str.strptime(pl.Date, '%Y%m%d')
    .alias('year_month_date')
)

# Left join so every transaction row is kept even without a matching rate.
# NOTE(review): cells further down append arrays computed from coordinates
# extracted before this join, so they rely on the join keeping the row order
# and row count of `df` -- confirm for the polars version in use.
df = df.join(
    interestRate.select(['year_month_date', 'interest_rate', 'prev_month_interest_rate']),
    how='left',
    on='year_month_date',
)

#### (3) 가장 가까운 초등학교, 중학교, 고등학교와의 거리 계산

In [7]:
# Helper for school distances: distance to the closest target point
def compute_nearest_distance(apartment_coords, target_coords):
    """Return the haversine distance in km from each query point to its nearest target.

    Args:
        apartment_coords: 2-D array of query points, converted from degrees
            to radians before the search.
        target_coords: 2-D array of candidate points, converted the same way.

    Returns:
        1-D array with one distance (km) per row of ``apartment_coords``.
    """
    query_rad = np.radians(apartment_coords)
    candidates_rad = np.radians(target_coords)

    searcher = NearestNeighbors(
        metric='haversine', algorithm='ball_tree', n_neighbors=1
    ).fit(candidates_rad)

    # kneighbors returns (distances, indices); only distances are needed here.
    nearest_rad, _ = searcher.kneighbors(query_rad)

    # Single-neighbour column, scaled from radians by Earth's radius in km.
    return nearest_rad[:, 0] * 6371

# Split the school table by school level.
elementary_df = schoolInfo.filter(pl.col('schoolLevel') == 'elementary')
middle_df = schoolInfo.filter(pl.col('schoolLevel') == 'middle')
high_df = schoolInfo.filter(pl.col('schoolLevel') == 'high')

# (latitude, longitude) matrix for each level.
elementary_coords = elementary_df.select(['latitude', 'longitude']).to_numpy()
middle_coords = middle_df.select(['latitude', 'longitude']).to_numpy()
high_coords = high_df.select(['latitude', 'longitude']).to_numpy()

# Take the apartment coordinates from the current `df` rather than reusing
# the array captured before the interest-rate join. This guarantees the
# distances below line up row-for-row with the frame they are attached to,
# without depending on the join having preserved row order and row count.
apartment_coords = df.select(['latitude', 'longitude']).to_numpy()

# Distance (km) from each apartment to the closest school of every level.
distances_elementary = compute_nearest_distance(apartment_coords, elementary_coords)
distances_middle = compute_nearest_distance(apartment_coords, middle_coords)
distances_high = compute_nearest_distance(apartment_coords, high_coords)

# Append the three distance columns to df.
df = df.with_columns([
    pl.Series(name='nearest_elementary_distance_km', values=distances_elementary),
    pl.Series(name='nearest_middle_distance_km', values=distances_middle),
    pl.Series(name='nearest_high_distance_km', values=distances_high),
])

#### (4) 가장 가까운 공원과의 거리 및 공원 면적 추가

In [8]:
# Helper for park features: nearest-target distance plus the target's row index
def compute_nearest_distance_with_indices(apartment_coords, target_coords):
    """Find the nearest target for each query point using the haversine metric.

    Args:
        apartment_coords: 2-D array of query points, converted from degrees
            to radians before the search.
        target_coords: 2-D array of candidate points, converted the same way.

    Returns:
        Tuple ``(distances_km, indices)``: the distance in km to the nearest
        target, and that target's row position in ``target_coords``, one
        entry per row of ``apartment_coords``.
    """
    query_rad = np.radians(apartment_coords)
    candidates_rad = np.radians(target_coords)

    searcher = NearestNeighbors(
        metric='haversine', algorithm='ball_tree', n_neighbors=1
    ).fit(candidates_rad)
    nearest_rad, nearest_idx = searcher.kneighbors(query_rad)

    # Both results have a single neighbour column; flatten it and scale the
    # radian distances by Earth's radius in km.
    return nearest_rad[:, 0] * 6371, nearest_idx[:, 0]

# (latitude, longitude) matrix of all parks.
park_coords = parkInfo.select(['latitude', 'longitude']).to_numpy()

# Take the apartment coordinates from the current `df` rather than reusing
# the array captured before the interest-rate join, so the park features
# below are guaranteed to line up row-for-row with the frame they are
# attached to.
apartment_coords = df.select(['latitude', 'longitude']).to_numpy()

# Distance (km) to the closest park and that park's row position in parkInfo.
distances_park, indices_park = compute_nearest_distance_with_indices(apartment_coords, park_coords)

# Look up the `area` value of each apartment's closest park by row position.
park_areas = parkInfo.select('area').to_numpy().flatten()
nearest_park_areas = park_areas[indices_park]

# Append the park distance and area columns to df.
df = df.with_columns([
    pl.Series(name='nearest_park_distance_km', values=distances_park),
    pl.Series(name='nearest_park_area', values=nearest_park_areas),
])

In [9]:
# Drop the intermediate coordinate arrays that are no longer needed, then
# run a garbage-collection pass to release their memory.
del (
    apartment_coords, subway_coords, apartment_coords_rad, subway_coords_rad,
    elementary_coords, middle_coords, high_coords,
    park_coords,
)
gc.collect()

88

In [10]:
# Destination of the feature-enriched frame. Built from `file_path` (the same
# base the read-back cell uses) instead of repeating the directory literal,
# so the write and read locations cannot drift apart.
output_file_path = file_path + 'processed_data.csv'

# Save df as CSV.
df.write_csv(output_file_path)

In [11]:
# Read the saved file back with pandas to inspect the result as the cell output.
pd.read_csv(f'{file_path}processed_data.csv')

Unnamed: 0,index,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,...,_type,nearest_subway_distance_km,year_month_date,interest_rate,prev_month_interest_rate,nearest_elementary_distance_km,nearest_middle_distance_km,nearest_high_distance_km,nearest_park_distance_km,nearest_park_area
0,0,84.998,201906,25,2,9,2019,37.054,127.045,0,...,train,0.717,2019-06-01,1.780,1.850,0.156,0.465,0.991,0.499,3898.000
1,1,84.998,202003,26,2,20,2019,37.054,127.045,1,...,train,0.717,2020-03-01,1.260,1.430,0.156,0.465,0.991,0.499,3898.000
2,2,84.998,202003,28,2,8,2019,37.054,127.045,1,...,train,0.717,2020-03-01,1.260,1.430,0.156,0.465,0.991,0.499,3898.000
3,3,59.340,201907,15,2,1,1986,36.965,127.056,33,...,train,3.897,2019-07-01,1.680,1.780,0.215,0.688,0.644,0.170,1616.000
4,4,59.810,201904,12,2,6,1995,36.972,127.085,24,...,train,2.040,2019-04-01,1.850,1.940,1.708,2.198,2.265,0.382,3986.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1951395,150167,115.510,202402,27,0,17,2010,37.528,126.659,14,...,test,1.483,2024-02-01,3.620,3.660,0.313,0.482,0.225,0.398,8963.900
1951396,150168,142.874,202403,2,0,4,2010,37.528,126.659,14,...,test,1.483,2024-03-01,3.590,3.620,0.313,0.482,0.225,0.398,8963.900
1951397,150169,142.874,202403,16,1,13,2010,37.528,126.659,14,...,test,1.483,2024-03-01,3.590,3.620,0.313,0.482,0.225,0.398,8963.900
1951398,150170,114.928,202403,22,1,2,2010,37.528,126.659,14,...,test,1.483,2024-03-01,3.590,3.620,0.313,0.482,0.225,0.398,8963.900


Unnamed: 0,변수명,데이터 타입,값의 범위,값,Uniq 값 개수,결측치
0,index,int64,0 ~ 1801227,"[0, 1, 2, 3, 4]",1801228,0
1,area_m2,float64,10.3215 ~ 317.36,"[84.9981, 59.34, 59.81, 84.9342, 39.27]",22281,0
2,contract_year_month,int64,201904 ~ 202312,"[201906, 202003, 201907, 201904, 201905]",57,0
3,contract_day,int64,1 ~ 31,"[25, 26, 28, 15, 12]",31,0
4,contract_type,int64,0 ~ 2,"[2, 1, 0]",3,0
5,floor,int64,-4 ~ 68,"[9, 20, 8, 1, 6]",73,0
6,built_year,int64,1961 ~ 2024,"[2019, 1986, 1995, 2016, 1990]",61,0
7,latitude,float64,36.9179099 ~ 38.1819371,"[37.054314, 36.964647, 36.9723899, 36.9654234,...",18436,0
8,longitude,float64,126.4787081 ~ 127.6608961,"[127.0452164, 127.0558472, 127.0845143, 127.04...",18447,0
9,age,int64,-3 ~ 62,"[0, 1, 33, 24, 3]",66,0


In [20]:
# Summary statistics of the training data, transposed so each column is a row.
train_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
index,1801228.0,900613.5,519969.88,0.0,450306.75,900613.5,1350920.25,1801227.0
area_m2,1801228.0,75.188,25.526,10.322,59.75,77.15,84.96,317.36
contract_year_month,1801228.0,202133.126,135.281,201904.0,202008.0,202111.0,202212.0,202312.0
contract_day,1801228.0,15.899,8.626,1.0,9.0,16.0,23.0,31.0
contract_type,1801228.0,1.204,0.885,0.0,0.0,2.0,2.0,2.0
floor,1801228.0,10.052,6.973,-4.0,5.0,9.0,14.0,68.0
built_year,1801228.0,2004.199,11.151,1961.0,1995.0,2004.0,2015.0,2024.0
latitude,1801228.0,37.477,0.163,36.918,37.38,37.502,37.582,38.182
longitude,1801228.0,126.966,0.17,126.479,126.842,126.998,127.089,127.661
age,1801228.0,17.066,11.128,-3.0,7.0,17.0,26.0,62.0


In [21]:
# Build the per-column description table for the test set.
# NOTE(review): `describe_data` is defined outside this excerpt; judging by
# the displayed output it reports dtype, value range, sample values, unique
# count and missing count per column -- confirm against its definition.
data_description_df = describe_data(test_data)

# Show the resulting DataFrame as the cell output
data_description_df

Unnamed: 0,변수명,데이터 타입,값의 범위,값,Uniq 값 개수,결측치
0,index,int64,0 ~ 150171,"[0, 1, 2, 3, 4]",150172,0
1,area_m2,float64,10.78 ~ 273.86,"[84.961, 59.9, 39.27, 46.98, 84.9182]",13161,0
2,contract_year_month,int64,202401 ~ 202406,"[202404, 202405, 202406, 202401, 202402]",6,0
3,contract_day,int64,1 ~ 31,"[12, 13, 29, 3, 2]",31,0
4,contract_type,int64,0 ~ 2,"[1, 0, 2]",3,0
5,floor,int64,-3 ~ 60,"[14, 4, 5, 1, 13]",62,0
6,built_year,int64,1966 ~ 2024,"[2016, 1997, 1990, 2005, 2002]",59,0
7,latitude,float64,36.9570886 ~ 38.1060333,"[36.9654234, 36.9631054, 36.9570886, 36.960033...",11872,0
8,longitude,float64,126.4787081 ~ 127.6608961,"[127.0487791, 127.0406778, 127.0474487, 127.05...",11871,0
9,age,int64,0 ~ 58,"[8, 27, 34, 19, 22]",59,0


In [30]:
# Count distinct apartment complexes, treating each unique
# (latitude, longitude) pair as one complex.
unique_complexes = train_data.loc[:, ['latitude', 'longitude']].drop_duplicates()

# Report the number of distinct coordinate pairs.
num_unique_complexes = len(unique_complexes)
print(f"고유한 아파트 단지의 개수: {num_unique_complexes}")

고유한 아파트 단지의 개수: 18491


In [31]:
# Same count, but also distinguishing rows by `built_year` in addition to
# the coordinate pair.
unique_complexes = train_data.loc[:, ['latitude', 'longitude', 'built_year']].drop_duplicates()

num_unique_complexes = len(unique_complexes)
print(f"{num_unique_complexes}")

18915
