#### library

In [1]:
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from haversine import haversine, Unit
import sys
from datetime import datetime, timedelta
from geopy.distance import great_circle

from sklearn.neighbors import KDTree, BallTree
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler


from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

code_directory = os.path.abspath('..')
sys.path.append(code_directory)

from preprocessing import *

from utils.constant_utils import Directory
from utils import common_utils

import model
from inference import *

features_directory = os.path.abspath('../features')
sys.path.append(features_directory)

from clustering_features import *
from count_features import *
from distance_features import *
from other_features import *

#### data load

In [2]:
# Load the train and test sets as one combined frame via the project helper.
# The `type` column in the preview below holds 'train' for the training rows.
print("total data load ...")
df = common_utils.merge_data(Directory.train_data, Directory.test_data)
print(df.shape)
df.head(3)

total data load ...
(1951400, 12)


Unnamed: 0,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit,type,interest_rate
0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,17000.0,train,1.78
1,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,23000.0,train,1.26
2,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,23000.0,train,1.26


In [3]:
# Apply the cluster features.
# Each pass looks up one info table on `Directory` and hands it to `clustering`,
# using the table's name as `feat_name`; the preview below shows one new column
# per table (subway_info, school_info, park_info).
# NOTE(review): n_clusters=20 is a hard-coded magic number - confirm it is intended
# to be the same for all three tables.
print("clustering apply ...")
for info_df_name in ['subway_info', 'school_info', 'park_info']:
    info_df = getattr(Directory, info_df_name)  # attribute lookup by name
    df = clustering(df, info_df, feat_name=info_df_name, n_clusters=20)
df.head(3)

clustering apply ...


Unnamed: 0,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit,type,interest_rate,subway_info,school_info,park_info
0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,17000.0,train,1.78,6,2,6
1,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,23000.0,train,1.26,6,2,6
2,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,23000.0,train,1.26,6,2,6


In [4]:
# Run the project's age-outlier handling on the combined frame.
# `df` is overwritten with the result, so re-running this cell applies the
# handling again to the already-processed frame.
print("start to cleaning outliers...")
df = handle_age_outliers(df)
df.head(3)

start to cleaning outliers...


Unnamed: 0,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit,type,interest_rate,subway_info,school_info,park_info
0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,17000.0,train,1.78,6,2,6
1,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,23000.0,train,1.26,6,2,6
2,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,23000.0,train,1.26,6,2,6


In [5]:
# Split the combined frame into train, validation and test frames for the
# per-split preprocessing and feature-engineering steps that follow.
print("train, valid, test split for preprocessing & feature engineering ...")
train_data_, valid_data_, test_data_ = common_utils.train_valid_test_split(df)

train, valid, test split for preprocessing & feature engineering ...


In [6]:
print("start to preprocessing...")
# Turn the numeric contract_type codes into labelled categories on every split.
# The mapping literal is built anew for each call, exactly as in three separate calls.
train_data_, valid_data_, test_data_ = (
    numeric_to_categoric(split, 'contract_type', {0:'new', 1:'renew', 2:'unknown'})
    for split in (train_data_, valid_data_, test_data_)
)

# Remove duplicates from the train and validation splits only;
# the test split is deliberately left untouched here.
train_data_, valid_data_ = (
    handle_duplicates(split) for split in (train_data_, valid_data_)
)

start to preprocessing...


In [7]:
# Feature-engineering pipeline. Every helper takes the (train, valid, test)
# triple and returns the updated triple, so the order of the calls below is
# the order in which columns are added.
print("start to feature engineering...")
# clustering_feature
print("create clustering features")
train_data, valid_data, test_data = create_clustering_target(train_data_, valid_data_, test_data_)

# distance_features
print("create distance features")
train_data, valid_data, test_data = distance_gangnam(train_data, valid_data, test_data)
train_data, valid_data, test_data = create_nearest_subway_distance(train_data, valid_data, test_data)
# Called exactly once: a second, identical call used to follow
# weighted_subway_distance and only recomputed the same park columns.
train_data, valid_data, test_data = create_nearest_park_distance_and_area(train_data, valid_data, test_data)
train_data, valid_data, test_data = create_nearest_school_distance(train_data, valid_data, test_data)
# NOTE(review): the resulting frame contains both `nearest_subway_distance_x`
# and `nearest_subway_distance_y`, which looks like a merge-suffix collision
# between this helper and create_nearest_subway_distance - confirm and dedupe
# inside the helper.
train_data, valid_data, test_data = weighted_subway_distance(train_data, valid_data, test_data)

# other_features
print("create other features")
train_data, valid_data, test_data = create_temporal_feature(train_data, valid_data, test_data)
train_data, valid_data, test_data = create_sin_cos_season(train_data, valid_data, test_data)
train_data, valid_data, test_data = create_floor_area_interaction(train_data, valid_data, test_data)
train_data, valid_data, test_data = create_sum_park_area_within_radius(train_data, valid_data, test_data)
train_data, valid_data, test_data = shift_interest_rate_function(train_data, valid_data, test_data)
train_data, valid_data, test_data = categorization(train_data, valid_data, test_data, category = 'age')
train_data, valid_data, test_data = categorization(train_data, valid_data, test_data, category = 'floor')
train_data, valid_data, test_data = categorization(train_data, valid_data, test_data, category = 'area_m2')

# count_features
print("create count features")
# NOTE(review): the column this produces is literally named
# `transaction_count_last_{months}_months` - presumably a missing f-string
# prefix inside the helper; verify in count_features.
train_data, valid_data, test_data = transaction_count_function(train_data, valid_data, test_data)
train_data, valid_data, test_data = create_subway_within_radius(train_data, valid_data, test_data)
train_data, valid_data, test_data = create_school_within_radius(train_data, valid_data, test_data)
train_data, valid_data, test_data = create_school_counts_within_radius_by_school_level(train_data, valid_data, test_data)
train_data, valid_data, test_data = create_place_within_radius(train_data, valid_data, test_data)

start to feature engineering...
create clustering features
create distance features
create other features
        date  interest_rate  interest_rate_1year  interest_rate_3months  \
0 2023-07-08           3.69                 2.90                   3.44   
1 2024-04-12           3.54                 3.44                   3.66   
2 2019-06-25           1.78                 1.78                   1.78   

   interest_rate_6months  
0                   3.82  
1                   3.97  
2                   1.78  


In [11]:
# Preview the first rows of the training frame after feature engineering.
train_data.head(3)

Unnamed: 0,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit,...,interest_rate_1year,interest_rate_3months,interest_rate_6months,transaction_count_last_{months}_months,subways_within_radius,schools_within_radius,elementary_schools_within_radius,middle_schools_within_radius,high_schools_within_radius,public_facility_count
0,84.9981,201906,25,unknown,9,2019,37.054314,127.045216,0,17000.0,...,2.9,3.44,3.82,0,1,12,7,3,2,3
1,84.9981,202003,26,unknown,20,2019,37.054314,127.045216,1,23000.0,...,3.44,3.66,3.97,2,1,12,7,3,2,3
2,84.9981,202003,28,unknown,8,2019,37.054314,127.045216,1,23000.0,...,1.78,1.78,1.78,3,1,12,7,3,2,3


In [13]:
# 계약일 피처 제거
train_data_ = drop_columns(train_data, ['contract_day'])
valid_data_ = drop_columns(valid_data, ['contract_day'])
test_data_ = drop_columns(test_data, ['contract_day'])


In [14]:
# List the columns of the training frame after dropping contract_day.
train_data_.columns

Index(['area_m2', 'contract_year_month', 'contract_type', 'floor',
       'built_year', 'latitude', 'longitude', 'age', 'deposit', 'type',
       'interest_rate', 'subway_info', 'school_info', 'park_info', 'cluster',
       'density', 'distance_to_centroid', 'gangnam_5km', 'gangnam_10km',
       'gangnam_remote', 'nearest_subway_distance_x', 'nearest_park_distance',
       'nearest_park_area', 'nearest_elementary_distance',
       'nearest_middle_distance', 'nearest_high_distance',
       'nearest_subway_distance_y', 'year', 'month', 'date', 'day_of_week',
       'quarter', 'is_month_end', 'season', 'season_sin', 'season_cos',
       'floor_area_interaction', 'nearest_park_area_sum',
       'interest_rate_1year', 'interest_rate_3months', 'interest_rate_6months',
       'transaction_count_last_{months}_months', 'subways_within_radius',
       'schools_within_radius', 'elementary_schools_within_radius',
       'middle_schools_within_radius', 'high_schools_within_radius',
       'public_f

In [16]:
# Inspect the dtype of every column in the training frame.
train_data_.dtypes

area_m2                                          float64
contract_year_month                              float64
contract_type                                   category
floor                                            float64
built_year                                       float64
latitude                                         float64
longitude                                        float64
age                                              float64
deposit                                          float64
type                                              object
interest_rate                                    float64
subway_info                                     category
school_info                                     category
park_info                                       category
cluster                                         category
density                                          float64
distance_to_centroid                             float64
gangnam_5km                    

In [19]:
# Recast gangnam_5km to object dtype and re-inspect the dtypes.
# NOTE(review): this mutates train_data_ only (valid_data_/test_data_ keep their
# original dtype), so the splits diverge after this cell.
# NOTE(review): the execution counts suggest this ran AFTER standardization
# (In [15]), yet in file order it precedes it; on Restart & Run All the column
# would already be object when standardization runs. Confirm whether this cast
# is an experiment that should be removed, and whether downstream steps accept
# an object column.
train_data_['gangnam_5km'] = train_data_['gangnam_5km'].astype(object)
train_data_.dtypes

area_m2                                          float64
contract_year_month                              float64
contract_type                                   category
floor                                            float64
built_year                                       float64
latitude                                         float64
longitude                                        float64
age                                              float64
deposit                                          float64
type                                              object
interest_rate                                    float64
subway_info                                     category
school_info                                     category
park_info                                       category
cluster                                         category
density                                          float64
distance_to_centroid                             float64
gangnam_5km                    

In [21]:
# Display the gangnam_5km column to check its values and dtype.
train_data_['gangnam_5km']

0         -0.237832
1         -0.237832
2         -0.237832
3         -0.237832
4         -0.237832
             ...   
1515862   -0.237832
1515863   -0.237832
1515864   -0.237832
1515865   -0.237832
1515866   -0.237832
Name: gangnam_5km, Length: 1515867, dtype: object

In [15]:
# Scale the three splits with the project's `standardization` helper using
# the 'standard' scaling type. The inputs are overwritten by the outputs, so
# re-running this cell scales already-scaled data.
train_data_, valid_data_, test_data_ = standardization(
    train_data_,
    valid_data_,
    test_data_,
    scaling_type='standard',
)


In [23]:
# Pass the standardized splits through the project's feature_selection helper;
# presumably this keeps only the columns used for modelling - verify in the helper.
train_data_scaled, valid_data_scaled, test_data_scaled = feature_selection(train_data_, valid_data_, test_data_)


In [25]:
# Separate model inputs from the target for each split; the test split
# yields features only (no y_test is returned).
X_train, y_train, X_valid, y_valid, X_test = common_utils.split_feature_target(
    train_data_scaled,
    valid_data_scaled,
    test_data_scaled,
)


In [26]:
# Sanity-check that the train features and target have matching row counts
# and that train and test share the same number of feature columns.
X_train.shape, y_train.shape, X_test.shape

((1515867, 44), (1515867,), (150172, 44))