In [1]:
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [3]:
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

In [4]:
columns_needed = ['area_m2', 'contract_year_month',  'floor', 'latitude', 'longitude', 'age', 'deposit']
columns_needed_test = ['area_m2', 'contract_year_month',  'floor', 'latitude','longitude','age' ]
train_data = train_data[columns_needed]
test_data = test_data[columns_needed_test]

In [5]:
train_data = train_data.drop_duplicates()
train_data.shape

(1603545, 7)

In [6]:
holdout_start = 202307
holdout_end = 202312
holdout_data = train_data[(train_data['contract_year_month'] >= holdout_start) & (train_data['contract_year_month'] <= holdout_end)]
train_data = train_data[~((train_data['contract_year_month'] >= holdout_start) & (train_data['contract_year_month'] <= holdout_end))]

In [9]:
X_train = train_data.drop(columns=['deposit'])
y_train = train_data['deposit']
X_holdout = holdout_data.drop(columns=['deposit'])
y_holdout = holdout_data['deposit']
X_test = test_data.copy()

In [10]:
# train_data 에서 retreval로 새로운 feature 추가하기
from sklearn.preprocessing import StandardScaler
import faiss

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_holdout_scaled = scaler.transform(X_holdout)
X_holdout_scaled[:, 0] = X_holdout_scaled[:, 0] * 10

In [11]:
# X, y 다시 합치기
train = pd.concat([X_train, y_train], axis=1)
holdout = pd.concat([X_holdout, y_holdout], axis=1)

In [12]:
# lgbm 모델 학습
from lightgbm import LGBMRegressor
lgbm = LGBMRegressor(random_state=RANDOM_SEED)
lgbm.fit(X_train, y_train)

lgbm_pred = lgbm.predict(X_holdout)
#MAE
mae = mean_absolute_error(y_holdout, lgbm_pred)
print(f'MAE: {mae:.5f}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003205 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 938
[LightGBM] [Info] Number of data points in the train set: 1423166, number of used features: 6
[LightGBM] [Info] Start training from score 37959.583441
MAE: 6637.35027


In [13]:
# 일단 holdout에서 위도, 경도가 일치하는 train이 있는지 확인
result_faiss = 0
cnt_faiss = 0
result_tree = 0
cnt_tree = 0
errors = 0
for i in range(len(holdout)):
    NUM = i

    holdout_data_1 = holdout.iloc[NUM]
    # print(holdout_data_1)
    holdout_data_1 = holdout_data_1.drop('deposit')

    filtered_train = train[(train['latitude'] == holdout_data_1['latitude']) & (train['longitude'] == holdout_data_1['longitude'])]
    filtered_train_X = filtered_train.drop(columns=['deposit'])
    filtered_train_y = filtered_train['deposit']

    if not filtered_train.empty:
        # print(len(filtered_train))
        # holdout_data_1과 가장 가까운 train 데이터를 찾기
        train_X_scaled = scaler.transform(filtered_train_X)
        train_X_scaled[:, 0] = train_X_scaled[:, 0] * 10 # Weight
        index = faiss.IndexFlatL2(train_X_scaled.shape[1])
        index.add(train_X_scaled)
        D, I = index.search(np.array([X_holdout_scaled[NUM]]), 10)
        out = 0
        cnt = 0
        for i in range(10):
            if D[0][i] > 3 and i != 0: break # Weight
            out += filtered_train_y.iloc[I[0][i]]
            cnt += 1
        real_deposit = holdout.iloc[NUM]['deposit']
        mae = mean_absolute_error([real_deposit], [out / cnt])
        errors += real_deposit - out / cnt
        result_faiss += mae
        cnt_faiss += 1
    else:
        lgbm_pred = lgbm.predict([holdout_data_1])
        mae = mean_absolute_error([holdout.iloc[NUM]['deposit']], lgbm_pred)
        result_tree += mae
        cnt_tree += 1
        
print(f'faiss MAE: {result_faiss / cnt_faiss:.5f}')
print(f'tree MAE: {result_tree / cnt_tree:.5f}')
print(f'all_mae: {(result_faiss + result_tree) / (cnt_faiss + cnt_tree):.5f}')
print(f'errors: {errors / cnt_faiss:.5f}')

faiss MAE: 4699.90981
tree MAE: 6710.81704
all_mae: 4718.52736
errors: 1508.95397


2.5 10 no data: 

D 3, weight 10, 10개: 
faiss MAE: 4701.91494
tree MAE: 7658.20250
all_mae: 4729.21043

D 2.5 weigth 10: 
faiss MAE: 4741.54740
tree MAE: 7658.20250
all_mae: 4768.47696

D 값 1 이하 채택: 
aiss MAE: 5332.00448 / tree MAE: 7658.20250 / all_mae: 5353.48234

D 값 0.8 이하 채택: 
faiss MAE: 5364.42212 / tree MAE: 7658.20250 / all_mae: 5385.60066

In [14]:
from lightgbm import LGBMRegressor

# lgbm regression으로 
NUM = 130
holdout_data_1 = holdout.iloc[NUM]
filtered_train = train[(train['latitude'] == holdout_data_1['latitude']) & (train['longitude'] == holdout_data_1['longitude'])]
filtered_train_X = filtered_train.drop(columns=['deposit'])
filtered_train_y = filtered_train['deposit']

if not filtered_train.empty:
    
    model = LGBMRegressor(random_state=RANDOM_SEED, verbose=-1)
    model.fit(filtered_train_X, filtered_train_y)
    pred = model.predict([X_holdout_scaled[NUM]])
    print(pred)
    
    real_deposit = y_holdout.iloc[NUM]
    print(real_deposit)

[12527.36437153]
17000.0
