In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial import KDTree

### Load Data

In [41]:
# The CSVs live in the `data/` folder next to the notebook's parent directory:
# os.path.abspath('') is the current working directory, so this resolves to
# <parent of cwd>/data.
data_path = os.path.join(os.path.dirname(os.path.abspath('')), 'data')

# Contract tables (first CSV column is used as the index).
train_df = pd.read_csv(os.path.join(data_path, 'train.csv'), index_col=0)
test_df = pd.read_csv(os.path.join(data_path, 'test.csv'), index_col=0)

# Auxiliary lookup tables.
park_df = pd.read_csv(os.path.join(data_path, 'parkInfo.csv'))
school_df = pd.read_csv(os.path.join(data_path, 'schoolinfo.csv'))
subway_df = pd.read_csv(os.path.join(data_path, 'subwayInfo.csv'))
interest_df = pd.read_csv(os.path.join(data_path, 'interestRate.csv'))

# Stack train and test so features are engineered once for both. The test rows
# come last; the later positional split relies on that ordering and on the row
# count staying unchanged.
all_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

# Attach the interest rate of the contract month. The left join keeps contracts
# whose month has no entry in the rate table (their rate columns become NaN).
# validate='many_to_one' raises a MergeError if a month appears more than once
# in interest_df, instead of silently duplicating contract rows.
all_df = all_df.merge(
    interest_df,
    left_on='contract_year_month',
    right_on='year_month',
    how='left',
    validate='many_to_one',
)

# Build a full contract date from the YYYYMM month and the day of month. The
# '_' separator keeps the day digits apart from the month digits when parsing.
all_df['contract_datetime'] = all_df['contract_year_month'].astype(str) + '_' + all_df['contract_day'].astype(str)
all_df['contract_datetime'] = pd.to_datetime(all_df['contract_datetime'], format='%Y%m_%d')

In [38]:
# A "location" is one distinct (latitude, longitude, area_m2) combination.
location_cols = ['latitude', 'longitude', 'area_m2']
unique_locations = all_df[location_cols].drop_duplicates().values.tolist()

# Number the distinct locations 0..n-1 in order of first appearance.
loc_to_id = dict(zip(map(tuple, unique_locations), range(len(unique_locations))))

# Look up the id of every row from its own coordinate/area triple.
tuple_array = [all_df[col].values for col in location_cols]
row_location_ids = [loc_to_id[key] for key in zip(*tuple_array)]
all_df['location_id'] = pd.Series(row_location_ids)
all_df['location_id'] = all_df['location_id'].astype('category')

# Show every contract recorded for location 38869, ordered by contract month.
all_df[all_df['location_id'] == 38869].sort_values('contract_year_month')


Unnamed: 0,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit,year_month,interest_rate,location_id
1047443,76.79,2019-04-01,1,2,2,1979,37.497741,127.06507,40,43000.0,201904.0,1.85,38869
1047562,76.79,2019-04-01,29,2,1,1979,37.497741,127.06507,40,40000.0,201904.0,1.85,38869
1047560,76.79,2019-04-01,29,2,14,1979,37.497741,127.06507,40,40000.0,201904.0,1.85,38869
1047556,76.79,2019-04-01,28,2,10,1979,37.497741,127.06507,40,47000.0,201904.0,1.85,38869
1047546,76.79,2019-04-01,25,2,5,1980,37.497741,127.06507,39,47000.0,201904.0,1.85,38869
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1857274,76.79,2024-06-01,12,0,6,1979,37.497741,127.06507,45,,,,38869
1857275,76.79,2024-06-01,12,2,6,1979,37.497741,127.06507,45,,,,38869
1857283,76.79,2024-06-01,15,1,9,1979,37.497741,127.06507,45,,,,38869
1857287,76.79,2024-06-01,17,0,8,1979,37.497741,127.06507,45,,,,38869


In [74]:
# Undo the earlier concat: the test rows were appended last, so the final
# len(test_df) rows are the test set and everything before them is train.
# A positive split offset is used because `iloc[:-0]` would return an empty
# frame if the test set were empty.
n_test = len(test_df)
split_at = len(all_df) - n_test
train_df, test_df = all_df.iloc[:split_at], all_df.iloc[split_at:]

# Columns kept out of the feature matrices: the target itself, and the parsed
# contract date, whose datetime64 dtype LightGBM does not accept as a feature.
non_feature_cols = ['deposit', 'contract_datetime']
X_test = test_df.drop(columns=non_feature_cols)

# Hold out contracts dated 2023-07 through 2023-12 (both ends inclusive) as the
# validation set; every other training row is used for fitting.
val_start, val_end = 202307, 202312
val_idx = train_df['contract_year_month'].between(val_start, val_end)
X_train = train_df[~val_idx].drop(columns=non_feature_cols)
y_train = train_df[~val_idx]['deposit']
X_valid = train_df[val_idx].drop(columns=non_feature_cols)
y_valid = train_df[val_idx]['deposit']

In [70]:
import lightgbm as lgb

# Build each Dataset once. The training Dataset is reused inside `valid_sets`
# so the training data is not constructed (binned) a second time just to report
# the training metric; the holdout Dataset references it so both share the same
# feature bin mappers.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_holdout = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

# Fit a gradient-boosted regressor on the deposit target, logging the MAE on
# both the training data and the holdout period every 100 boosting rounds.
lgb_model = lgb.train(
    params={
        'objective': 'regression',
        'metric': 'mae',
        'num_leaves': 63,
        'seed': 42,
        'verbose': -1,
    },
    train_set=lgb_train,
    num_boost_round=2000,
    valid_names=['train', 'holdout'],
    valid_sets=[lgb_train, lgb_holdout],
    callbacks=[
        lgb.log_evaluation(period=100),
    ],
)

# Bar chart of the trained model's per-feature importance.
lgb.plot_importance(lgb_model)
plt.show()

[100]	train's l2: 5.11901e+07	holdout's l2: 6.83618e+07
[200]	train's l2: 4.64216e+07	holdout's l2: 6.17273e+07
[300]	train's l2: 4.40918e+07	holdout's l2: 6.03547e+07
[400]	train's l2: 4.23689e+07	holdout's l2: 5.98881e+07
[500]	train's l2: 4.13167e+07	holdout's l2: 5.87884e+07
[600]	train's l2: 4.05033e+07	holdout's l2: 5.83306e+07
[700]	train's l2: 3.98663e+07	holdout's l2: 5.81842e+07
[800]	train's l2: 3.92423e+07	holdout's l2: 5.80042e+07
[900]	train's l2: 3.88223e+07	holdout's l2: 5.78465e+07
[1000]	train's l2: 3.83229e+07	holdout's l2: 5.77573e+07
[1100]	train's l2: 3.79069e+07	holdout's l2: 5.76845e+07
[1200]	train's l2: 3.74752e+07	holdout's l2: 5.76527e+07
[1300]	train's l2: 3.71218e+07	holdout's l2: 5.73304e+07
[1400]	train's l2: 3.67402e+07	holdout's l2: 5.72051e+07
[1500]	train's l2: 3.64204e+07	holdout's l2: 5.70436e+07
[1600]	train's l2: 3.6085e+07	holdout's l2: 5.70599e+07
[1700]	train's l2: 3.58488e+07	holdout's l2: 5.70425e+07
[1800]	train's l2: 3.55373e+07	holdout's 

TypeError: unsupported format string passed to numpy.ndarray.__format__