In [1]:
import pandas as pd
import numpy as np

from sklearn.neighbors import BallTree
from scipy.spatial import cKDTree
from scipy.stats import skew, kurtosis
from prophet import Prophet
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from utils.clustering import ClusteringMethods


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_path = Path('./data')

In [3]:
train_df = pd.read_csv(data_path / 'train.csv')
test_df = pd.read_csv(data_path / 'test.csv')

In [4]:
test_df['deposit'] = 0
train_df['_type'] = 'train'
test_df['_type'] = 'test'
df = train_df.copy()

In [5]:
df = pd.concat([df, test_df], axis=0)

In [6]:
df['deposit_by_area'] = df['deposit'] / df['area_m2']


# 아파트 별 피처 추가
- 공원
- 지하철
- 학교

In [10]:
lat_lon_df = df[['latitude', 'longitude']].drop_duplicates()

In [11]:
park_df = pd.read_csv(data_path / 'parkInfo.csv')
subway_df = pd.read_csv(data_path / 'subwayInfo.csv')
school_df = pd.read_csv(data_path / 'schoolinfo.csv')
interest_df = pd.read_csv(data_path / 'interestRate.csv')
interest_df = interest_df.rename(columns={'year_month': 'contract_year_month'})


In [12]:
# 공원
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat, dlon = lat2 - lat1, lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    return 2 * R * np.arcsin(np.sqrt(a))

def create_balltree(df):
    coords = df[['latitude', 'longitude']].values
    return BallTree(np.deg2rad(coords), metric='haversine')

def nearest_park_distance(apartment_coords, park_tree):
    distances, _ = park_tree.query(np.deg2rad(apartment_coords), k=1)
    return distances.ravel() * 6371

def count_parks_in_radius(apartment_coords, park_tree, radius):
    return park_tree.query_radius(np.deg2rad(apartment_coords), r=radius/6371, count_only=True)

def weighted_park_score(apartment_coords, park_tree, park_df):
    distances, indices = park_tree.query(np.deg2rad(apartment_coords), k=10)
    distances = distances * 6371
    areas = park_df.loc[indices.ravel(), 'area'].values.reshape(distances.shape)
    return np.sum(areas / (distances + 1), axis=1)

def total_park_area_in_radius(apartment_coords, park_tree, park_df, radius):
    indices = park_tree.query_radius(np.deg2rad(apartment_coords), r=radius/6371)
    return [park_df.loc[idx, 'area'].sum() for idx in indices]

def park_distribution_stats(apartment_coords, park_tree):
    distances, _ = park_tree.query(np.deg2rad(apartment_coords), k=5)
    distances = distances * 6371
    return (
        np.mean(distances, axis=1),
        np.apply_along_axis(skew, 1, distances),
        np.apply_along_axis(kurtosis, 1, distances)
    )

def create_park_features(train_sample, park_df):
    park_tree = create_balltree(park_df)
    apartment_coords = train_sample[['latitude', 'longitude']].values

    features = pd.DataFrame(index=train_sample.index)
    
    features['nearest_park_distance'] = nearest_park_distance(apartment_coords, park_tree)
    
    for radius in [0.5, 1, 2]:
        features[f'park_count_{int(radius*1000)}m'] = count_parks_in_radius(apartment_coords, park_tree, radius)
        features[f'total_park_area_{int(radius*1000)}m'] = total_park_area_in_radius(apartment_coords, park_tree, park_df, radius)
    
    features['weighted_park_score'] = weighted_park_score(apartment_coords, park_tree, park_df)
    
    avg_dist, skewness, kurtosis = park_distribution_stats(apartment_coords, park_tree)
    features['avg_distance_5_parks'] = avg_dist
    features['park_distance_skewness'] = skewness
    features['park_distance_kurtosis'] = kurtosis
    
    return pd.concat([train_sample, features], axis=1)

lat_lon_df = create_park_features(lat_lon_df, park_df)

  buff[ind] = asanyarray(func1d(inarr_view[ind], *args, **kwargs))


In [13]:
# 대형 공원
def create_large_park_features(df, park_df, size_threshold=100000):
    # 대형 공원만 필터링
    large_parks = park_df[park_df['area'] >= size_threshold].reset_index(drop=True)
    
    # BallTree 생성
    large_park_coords = large_parks[['latitude', 'longitude']].values
    large_park_tree = BallTree(np.deg2rad(large_park_coords), metric='haversine')
    
    # 아파트 좌표
    apartment_coords = df[['latitude', 'longitude']].values
    
    # 새로운 특성 생성
    features = pd.DataFrame(index=df.index)
    
    # 가장 가까운 대형 공원까지의 거리
    distances, _ = large_park_tree.query(np.deg2rad(apartment_coords), k=1)
    features['nearest_large_park_distance'] = distances.ravel() * 6371  # km로 변환
    
    # 3km, 5km, 10km 반경 내 대형 공원의 수
    for radius in [3, 5, 10]:
        count = large_park_tree.query_radius(np.deg2rad(apartment_coords), r=radius/6371, count_only=True)
        features[f'large_park_count_{radius}km'] = count
    
    # 10km 반경 내 대형 공원의 총 면적
    indices = large_park_tree.query_radius(np.deg2rad(apartment_coords), r=10/6371)
    total_areas = [large_parks.loc[idx, 'area'].sum() if len(idx) > 0 else 0 for idx in indices]
    features['total_large_park_area_10km'] = total_areas
    
    return pd.concat([df, features], axis=1)

# 데이터 로드 (이미 전처리된 데이터를 사용한다고 가정)
# train_df = pd.read_csv('train_sample.csv')
# park_df = pd.read_csv('parkInfo.csv')

# 대형 공원 특성 생성
lat_lon_df = create_large_park_features(lat_lon_df, park_df)

In [14]:
# 지하철

# Haversine 공식 함수 정의 (두 지점 간의 거리 계산)
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # 지구 반경 (단위: km)
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)

    a = np.sin(delta_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return R * c  # 결과를 km 단위로 반환

# KD-Tree를 사용해 가장 가까운 지하철을 찾는 함수
def find_nearest_subway(lat, lon, subway_tree, subway_coordinates):
    # 주어진 좌표에 대해 가장 가까운 지하철 역 인덱스 찾기
    distance, index = subway_tree.query([lat, lon], k=1)
    
    # 가장 가까운 지하철 역의 좌표
    nearest_subway = subway_coordinates[index]
    
    # Haversine 공식을 사용하여 거리 계산
    dist_km = haversine(lat, lon, nearest_subway[0], nearest_subway[1])
    
    return dist_km

# subway 데이터의 좌표를 KD-Tree로 변환
subway_coordinates = subway_df[['latitude', 'longitude']].values
subway_tree = cKDTree(subway_coordinates)

lat_lon_df['nearest_subway_distance_km'] = lat_lon_df.apply(
lambda row: find_nearest_subway(row['latitude'], row['longitude'], subway_tree, subway_coordinates),
    axis=1
)

In [15]:
# 학교
def calculate_school_distances_with_kdtree(df, schools):
    school_locations = schools[['latitude', 'longitude']].values
    school_levels = schools['schoolLevel'].values

    # KDTree 생성
    kdtree = cKDTree(school_locations)

    nearby_school_counts = []
    closest_schools = {'elementary': [], 'middle': [], 'high': []}

    for index, row in df.iterrows():
        current_location = (row['latitude'], row['longitude'])

        # KDTree로 주변 학교들의 인덱스와 거리를 구함
        distances, indices = kdtree.query(current_location, k=len(school_locations))

        # Euclidean distance를 사용하여 1km 이내의 학교만 카운팅
        geo_distances = haversine(row['latitude'], row['longitude'], school_locations[:, 0], school_locations[:, 1])
        
        # 1km 이내의 학교만 카운팅
        within_1km = np.where(geo_distances < 1.0)[0]
        nearby_school_count = len(within_1km)

        # 가장 가까운 초등학교, 중학교, 고등학교의 거리 계산
        for level in ['elementary', 'middle', 'high']:
            level_indices = indices[school_levels[indices] == level]
            if len(level_indices) > 0:
                closest_school_distance = geo_distances[level_indices[0]]
            else:
                closest_school_distance = np.nan  # 해당 학교가 없을 경우

            closest_schools[level].append(closest_school_distance)

        nearby_school_counts.append(nearby_school_count)

    # 결과를 DataFrame에 추가
    df['school_count_within_1km'] = nearby_school_counts
    df['closest_elementary_distance'] = closest_schools['elementary']
    df['closest_middle_distance'] = closest_schools['middle']
    df['closest_high_distance'] = closest_schools['high']

    return df

lat_lon_df = calculate_school_distances_with_kdtree(lat_lon_df, school_df)

In [16]:
lat_lon_df.head()

Unnamed: 0,latitude,longitude,nearest_park_distance,park_count_500m,total_park_area_500m,park_count_1000m,total_park_area_1000m,park_count_2000m,total_park_area_2000m,weighted_park_score,...,nearest_large_park_distance,large_park_count_3km,large_park_count_5km,large_park_count_10km,total_large_park_area_10km,nearest_subway_distance_km,school_count_within_1km,closest_elementary_distance,closest_middle_distance,closest_high_distance
0,37.054314,127.045216,0.498619,1,3898.0,6,198124.0,56,1735804.3,144540.902558,...,0.940079,5,8,14,3082215.1,0.716953,4,0.15612,0.465125,0.990855
3,36.964647,127.055847,0.16984,3,3809.0,13,105702.1,23,286268.1,59250.149092,...,5.207932,0,0,7,1877924.1,3.89728,4,0.21456,0.688047,0.644366
4,36.97239,127.084514,0.382402,1,3986.0,1,3986.0,13,37137.4,14232.664397,...,3.137452,0,4,8,2199482.1,2.039685,0,1.708489,2.197946,2.264822
5,36.965423,127.048779,0.288443,3,35950.3,11,76504.3,24,384727.1,48590.986555,...,5.443829,0,0,7,1877924.1,4.284771,2,0.779057,1.313939,1.264233
6,36.957089,127.047449,0.272286,2,106172.0,12,258583.3,21,283702.1,143177.065595,...,6.299737,0,0,7,1877924.1,5.021184,1,0.808416,1.514929,1.448064


In [17]:
df = df.merge(lat_lon_df, on=['latitude', 'longitude'], how='left')

# 금리 데이터 처리 및 예측

In [18]:
# Train_df에서 contract_year_month가 같은 row들의 deposit 평균을 column으로 추가
train_df['deposit_mean'] = train_df.groupby('contract_year_month')['deposit'].transform('mean')

# deposit_mean을 interest_rate_df에 추가 하나씩만
interest_rate_df = pd.merge(interest_df, train_df[['contract_year_month', 'deposit_mean']], on='contract_year_month', how='left')
# unique한 contract_year_month만 남기기
interest_rate_df.drop_duplicates(subset='contract_year_month', keep='first', inplace=True)
# index 초기화
interest_rate_df.reset_index(drop=True, inplace=True)

# contract_year_month 202406 추가 하고 interest_rate 4칸(최대) shift
interest_rate_df = pd.concat([interest_rate_df, pd.DataFrame({'contract_year_month': [202406], 'interest_rate': [0]})])
# index는 contract_year_month로
interest_rate_df.set_index('contract_year_month', inplace=True)
# index 순서대로 정렬
interest_rate_df.sort_index(inplace=True)
interest_rate_df['interest_rate'] = interest_rate_df['interest_rate'].shift(4)

# 이전 금리와 차이값 feature 추가
interest_rate_df['interest_rate_diff'] = interest_rate_df['interest_rate'].diff()

# 2019 04 이후 데이터만 사용
interest_rate_df = interest_rate_df[interest_rate_df.index >= 201904]

# index feature로 사용하기 위해 reset_index
interest_rate_df.reset_index(inplace=True)


# interest_rate_df에서 필요한 컬럼만 선택
# contract_year_month datetime으로 변환
interest_rate_df['contract_year_month'] = pd.to_datetime(interest_rate_df['contract_year_month'], format='%Y%m')

df_prophet = interest_rate_df[['contract_year_month', 'deposit_mean', 'interest_rate']].copy()


# Prophet이 인식할 수 있도록 컬럼명 변경
df_prophet.rename(columns={'contract_year_month': 'ds', 'deposit_mean': 'y'}, inplace=True)
# 모델 정의
model = Prophet()
model.add_regressor('interest_rate')

# 모델 학습
model.fit(df_prophet.dropna())

# 미래 데이터프레임 생성
# ds 2024-01-01 ~ 2024-05-01
# 예측 수행
future = model.make_future_dataframe(periods=6, freq='MS')
future['interest_rate'] = df_prophet['interest_rate']
forecast = model.predict(future)
# interest_rate_df에 merge
interest_rate_df = pd.merge(interest_rate_df, forecast[['ds', 'trend']], left_on='contract_year_month', right_on='ds', how='left')

# trend의 2024-01-01 ~ 2024-06-01 값만 interest_rate_df의 deposit_mean에 대입
interest_rate_df.loc[interest_rate_df['contract_year_month'] >= '2024-01-01', 'deposit_mean'] = interest_rate_df.loc[interest_rate_df['contract_year_month'] >= '2024-01-01', 'trend'].values

11:40:32 - cmdstanpy - INFO - Chain [1] start processing
11:40:32 - cmdstanpy - INFO - Chain [1] done processing


In [19]:
interest_rate_df['contract_year_month'] = interest_rate_df['contract_year_month'].dt.strftime('%Y%m').astype(int)
df = pd.merge(df, interest_rate_df[['contract_year_month', 'deposit_mean', 'interest_rate', 'interest_rate_diff']], on='contract_year_month', how='left')

# 클러스터링 피처 추가

In [20]:
apartment = df[df['contract_year_month'] < 202401][['latitude', 'longitude', 'nearest_subway_distance_km', 'deposit_by_area', 'area_m2', 'built_year', 'school_count_within_1km']]

apartment_avg_deposit = apartment.groupby(['latitude', 'longitude']).agg({
    'deposit_by_area': 'mean',
    'nearest_subway_distance_km': 'first',
    'area_m2' : 'mean',
    'built_year': 'first',
    'school_count_within_1km' : 'first'
}).reset_index()

print(f"Number of apartments: {len(apartment_avg_deposit)}")

apartment_avg_deposit.head(2)


Number of apartments: 18491


Unnamed: 0,latitude,longitude,deposit_by_area,nearest_subway_distance_km,area_m2,built_year,school_count_within_1km
0,36.91791,126.908029,103.697553,16.94161,70.915882,1996,0
1,36.957089,127.047449,111.213897,5.021184,41.767397,1990,1


In [21]:
scaler = StandardScaler()
apartment_scaled = scaler.fit_transform(apartment_avg_deposit)
apartment_scaled_df = pd.DataFrame(apartment_scaled, columns=['latitude', 'longitude', 'deposit_by_area', 'nearest_subway_distance_km', 'area_m2', 'built_year','school_count_within_1km'])


In [22]:
clustering = ClusteringMethods()

## DBCAN
df_cluster = clustering.apply_dbscan(apartment_scaled_df,min_samples=150)
df_cluster.head(3)

# K-means ++
X = np.array(apartment_scaled_df.drop(['latitude', 'longitude'],axis = 1))

k = 5  # 클러스터 개수 설정
labels, centroids = clustering.k_means_plus(X, k)

In [23]:
apartment_cluster = apartment_avg_deposit.copy()
apartment_cluster['Is_Outside'] = df_cluster['cluster_0']
apartment_cluster['Is_Outside'] = apartment_cluster['Is_Outside'].replace(-1, 0)

apartment_cluster['cluster_kmeans'] = labels


In [24]:
merged_df = pd.merge(df, apartment_cluster[['latitude', 'longitude', 'Is_Outside', 'cluster_kmeans']],
                     on=['latitude', 'longitude'],
                     how='left')


In [25]:
non_null_clusters = merged_df[merged_df['cluster_kmeans'].notnull()]
null_clusters = merged_df[merged_df['cluster_kmeans'].isnull()]

non_null_coords = non_null_clusters[['latitude', 'longitude']].values
null_coords = null_clusters[['latitude', 'longitude']].values

# cKDTree를 사용하여 가장 가까운 이웃 찾기
tree = cKDTree(non_null_coords)

# 가장 가까운 이웃의 인덱스를 찾음
_, idx = tree.query(null_coords)

# 가장 가까운 비결측값 행의 cluster_kmeans 값으로 대체
merged_df.loc[merged_df['cluster_kmeans'].isnull(), 'cluster_kmeans'] = non_null_clusters.iloc[idx]['cluster_kmeans'].values
merged_df.loc[merged_df['Is_Outside'].isnull(), 'Is_Outside'] = non_null_clusters.iloc[idx]['Is_Outside'].values


# 주기성 피처 추가

In [None]:
# 월 추출 
merged_df['month'] = merged_df['contract_year_month'].astype(str).str[4:6].astype(int)

# 월을 기준으로 1년 주기성 변환
merged_df['month_sin'] = np.sin(2 * np.pi * merged_df['month'] / 12)
merged_df['month_cos'] = np.cos(2 * np.pi * merged_df['month'] / 12)

merged_df = merged_df.drop('month',axis = 1)

In [34]:
merged_df.to_csv("merged_data.csv", index=False)

In [35]:
merged_df.to_csv("merged_data.csv.gz", index=False, compression='gzip')