In [194]:
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

In [195]:
# Single seed for the notebook: applied to NumPy's global RNG here and
# passed to the LightGBM regressor further down.
RANDOM_SEED = 42
np.random.seed(seed=RANDOM_SEED)

In [196]:
# Read the three input CSVs from ./data: the training rows, the test rows,
# and the submission template that the predictions are written into.
train_data, test_data, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv', './data/sample_submission.csv')
)
# Make the template's 'deposit' column float so non-integer predictions fit.
sample_submission = sample_submission.astype({'deposit': float})

In [197]:
# Feature columns used by both splits; the training split additionally keeps
# the 'deposit' target as its last column.
columns_needed_test = ['area_m2', 'contract_year_month', 'floor', 'latitude', 'longitude', 'age']
columns_needed = columns_needed_test + ['deposit']
test_data = test_data[columns_needed_test]
train_data = train_data[columns_needed]

In [198]:
# Remove duplicate rows, keeping the first occurrence of each.
# The original index labels are kept (the index is not reset).
is_repeated_row = train_data.duplicated()
train_data = train_data[~is_repeated_row]

In [199]:
from sklearn.preprocessing import StandardScaler
import faiss

# Separate the regression target from the feature columns.
y_train = train_data['deposit']
X_train = train_data.drop(columns=['deposit'])

# Fit the standardiser on the bare NumPy array of the training features
# (no column names attached), one mean/std per feature column.
scaler = StandardScaler()
scaler.fit(X_train.to_numpy())

In [200]:
# Train the LightGBM regressor on the unscaled feature frame; it serves as the
# fallback predictor for test rows that have no same-location training rows.
from lightgbm import LGBMRegressor
lgbm = LGBMRegressor(random_state=RANDOM_SEED)
lgbm.fit(X=X_train, y=y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003523 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 945
[LightGBM] [Info] Number of data points in the train set: 1603545, number of used features: 6
[LightGBM] [Info] Start training from score 38322.623006


In [201]:
import faiss

# Retrieval-based prediction.
#
# For every test row, look up the training rows that share its exact
# (latitude, longitude). If there are any, average the deposits of the
# nearest neighbours in scaled feature space (nearest one always included,
# further ones only while they stay within the distance threshold).
# Otherwise fall back to the LightGBM prediction for that row.

# --- Tunables (previously inline magic numbers) ---
AREA_WEIGHT = 2                # multiplier applied to scaled feature column 0 (area_m2)
MAX_NEIGHBORS = 10             # at most this many neighbours are averaged
MAX_NEIGHBOR_SQ_DISTANCE = 1   # IndexFlatL2 reports *squared* L2 distances
PROGRESS_EVERY = 1000          # print one progress line every N test rows
LOCATION_COLUMNS = ['latitude', 'longitude']

# --- One-time preparation (hoisted out of the per-row loop) ---
# Scale and weight all training features once. faiss works on C-contiguous
# float32 arrays, so cast explicitly instead of relying on an implicit cast.
train_features_scaled = scaler.transform(train_data.drop(columns=['deposit']).values)
train_features_scaled[:, 0] *= AREA_WEIGHT
train_features_scaled = np.ascontiguousarray(train_features_scaled, dtype=np.float32)
train_targets = train_data['deposit'].to_numpy()

test_features_scaled = scaler.transform(test_data.values)
test_features_scaled[:, 0] *= AREA_WEIGHT
test_features_scaled = np.ascontiguousarray(test_features_scaled, dtype=np.float32)

# Map each (latitude, longitude) to the *positional* row numbers of the
# training rows at that location. This replaces a full boolean scan of
# train_data for every single test row. Rows with NaN coordinates are not
# grouped, matching the original `==` comparison, which never matches NaN.
location_to_rows = train_data.groupby(LOCATION_COLUMNS, sort=False).indices

# LightGBM predictions for every test row in one batch; only the entries for
# rows without a same-location match are actually used.
fallback_predictions = lgbm.predict(test_data)

test_locations = test_data[LOCATION_COLUMNS].to_numpy()
predictions = np.empty(len(test_data), dtype=float)

for test_data_index in range(len(test_data)):
    latitude, longitude = test_locations[test_data_index]
    rows = location_to_rows.get((float(latitude), float(longitude)))
    if rows is not None:
        # Exact nearest-neighbour search restricted to same-location rows.
        index = faiss.IndexFlatL2(train_features_scaled.shape[1])
        index.add(train_features_scaled[rows])
        # Never request more neighbours than exist; faiss would otherwise pad
        # the result with label -1 and a sentinel distance.
        k = min(MAX_NEIGHBORS, len(rows))
        D, I = index.search(test_features_scaled[test_data_index:test_data_index + 1], k)
        distances, neighbor_ids = D[0], I[0]
        # Distances come back sorted ascending: keep the nearest neighbour
        # unconditionally, then stop at the first one beyond the threshold.
        too_far = np.flatnonzero(distances[1:] > MAX_NEIGHBOR_SQ_DISTANCE)
        n_keep = too_far[0] + 1 if too_far.size else k
        pred = float(train_targets[rows[neighbor_ids[:n_keep]]].mean())
    else:
        # No training row at this exact location: use the LightGBM model.
        # Stored as a plain float (the original kept a 1-element array here).
        pred = float(fallback_predictions[test_data_index])
    if test_data_index % PROGRESS_EVERY == 0:
        print(test_data_index, pred)
    predictions[test_data_index] = pred

# Single bulk write into the second column of the submission template
# (the same column the original filled one row at a time).
sample_submission.iloc[:len(predictions), 1] = predictions

0 22325.0
1000 19608.0
2000 11620.0
3000 16680.6
4000 28201.0
5000 21500.0
6000 42300.0
7000 20333.333333333332
8000 68100.0
9000 43000.0
10000 22173.833333333332
11000 28650.0
12000 30900.0
13000 25200.0
14000 50310.0
15000 32425.0
16000 [20456.45888756]
17000 17750.0
18000 16333.333333333334
19000 28777.777777777777
20000 10450.0
21000 55050.0
22000 43400.0
23000 48333.333333333336
24000 21550.0
25000 58500.0
26000 36480.0
27000 41450.0
28000 9000.0
29000 14510.0
30000 44000.0
31000 40000.0
32000 32540.0
33000 16517.5
34000 38000.0
35000 47000.0
36000 38666.666666666664
37000 45214.28571428572
38000 91100.0
39000 77250.0
40000 21000.0
41000 44852.0
42000 18900.0
43000 16005.0
44000 16000.0
45000 60800.0
46000 37784.375
47000 25683.3
48000 115000.0
49000 25785.0
50000 22490.0
51000 15514.0
52000 38833.333333333336
53000 95200.0
54000 44750.0
55000 60377.77777777778
56000 60000.0
57000 32895.0
58000 124270.0
59000 87810.0
60000 61000.0
61000 19315.0
62000 12330.0
63000 29150.0
64000 32

In [202]:
# Write the filled-in submission frame to disk, omitting the row index.
sample_submission.to_csv(path_or_buf='retrieval_pred.csv', index=False)