# 라이브러리 불러오기

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error

# 랜덤 시드 설정

In [2]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# 데이터 불러오기

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')
park = pd.read_csv('parkInfo.csv')
schools = pd.read_csv('schoolinfo.csv')

In [4]:
# 'index'를 제외한 모든 열에 대해 중복 확인
columns_to_check = [col for col in train.columns if col != 'index']
duplicates = train.duplicated(subset=columns_to_check, keep=False)

# 중복된 행 출력
print(train[duplicates])

# 중복된 행의 수 출력
print(f"중복된 행의 수: {duplicates.sum()}")

           index   area_m2  contract_year_month  contract_day  contract_type  \
15            15   84.9342               201907            31              2   
16            16   84.9342               201907            31              2   
28            28  146.4005               201911            21              2   
29            29  146.4005               201911            21              2   
33            33   84.9342               201912            14              2   
...          ...       ...                  ...           ...            ...   
1801197  1801197  101.9088               202308            22              2   
1801198  1801198  114.9285               202308            28              1   
1801199  1801199  114.9285               202308            28              1   
1801210  1801210  114.9285               202310            26              2   
1801211  1801211  114.9285               202310            26              2   

         floor  built_year   latitude  

In [5]:
# 중복 제거
train_no_duplicates = train.drop_duplicates(subset=columns_to_check)

print(f"원본 데이터셋 크기: {len(train)}")
print(f"중복 제거 후 데이터셋 크기: {len(train_no_duplicates)}")
print(f"제거된 행의 수: {len(train) - len(train_no_duplicates)}")

원본 데이터셋 크기: 1801228
중복 제거 후 데이터셋 크기: 1717611
제거된 행의 수: 83617


In [6]:
train_sample = train_no_duplicates.sample(frac=0.01, random_state=42)
print(f"중복제거 데이터셋 크기: {len(train_no_duplicates)}")
print(f"1/100 후 데이터셋 크기: {len(train_sample)}")
print(f"제거된 행의 수: {len(train_no_duplicates) - len(train_sample)}")

중복제거 데이터셋 크기: 1717611
1/100 후 데이터셋 크기: 17176
제거된 행의 수: 1700435


In [7]:
# 데이터 로드
train_df = train_no_duplicates
test_df = test

# 특성 선택
features = ['latitude', 'longitude', 'area_m2', 'floor', 'built_year', 'contract_year_month']

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 스케일링
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# BallTree 구축
tree = BallTree(train_scaled, leaf_size=2)

# 예측 함수
def predict_deposit(neighbors, distances):
    weights = 1 / (distances + 1e-5)
    weighted_deposits = neighbors['deposit'] * weights
    return np.sum(weighted_deposits) / np.sum(weights)

# 예측 수행 함수
def make_predictions(data_scaled, k):
    distances, indices = tree.query(data_scaled, k=k)
    predictions = []
    for i in range(len(data_scaled)):
        neighbor_indices = indices[i]
        neighbor_distances = distances[i]
        neighbors = train_df.iloc[neighbor_indices]
        pred = predict_deposit(neighbors, neighbor_distances)
        predictions.append(pred)
    return predictions

# Validation 예측
k = 5
val_predictions = make_predictions(val_scaled, k)

# Validation MAE 계산
val_mae = mean_absolute_error(val_df['deposit'], val_predictions)
print(f"Validation MAE: {val_mae}")

# Test 예측
test_predictions = make_predictions(test_scaled, k)

# 결과 저장
submission_df = pd.DataFrame({
    'index': test_df['index'],
    'deposit': test_predictions
})

submission_df.to_csv('submission.csv', index=False)

print("예측이 완료되었습니다. 'submission.csv' 파일을 확인하세요.")

Validation MAE: 5399.236570026839
예측이 완료되었습니다. 'submission.csv' 파일을 확인하세요.


In [None]:
# 데이터 로드
train_df = train_sample
test_df = test

# 특성 선택
features = ['latitude', 'longitude', 'area_m2', 'floor', 'built_year', 'contract_year_month']

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 스케일링
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# BallTree 구축
tree = BallTree(train_scaled, leaf_size=2)

# 예측 함수
def predict_deposit(neighbors, distances):
    weights = 1 / (distances + 1e-5)
    weighted_deposits = neighbors['deposit'] * weights
    return np.sum(weighted_deposits) / np.sum(weights)

# 예측 수행 함수
def make_predictions(data_scaled, k):
    distances, indices = tree.query(data_scaled, k=k)
    predictions = []
    for i in range(len(data_scaled)):
        neighbor_indices = indices[i]
        neighbor_distances = distances[i]
        neighbors = train_df.iloc[neighbor_indices]
        pred = predict_deposit(neighbors, neighbor_distances)
        predictions.append(pred)
    return predictions

# Validation 예측
k = 5
val_predictions = make_predictions(val_scaled, k)

# Validation MAE 계산
val_mae = mean_absolute_error(val_df['deposit'], val_predictions)
print(f"Validation MAE: {val_mae}")

# Test 예측
test_predictions = make_predictions(test_scaled, k)

# 결과 저장
submission_df = pd.DataFrame({
    'index': test_df['index'],
    'deposit': test_predictions
})

submission_df.to_csv('submission.csv', index=False)

print("예측이 완료되었습니다. 'submission.csv' 파일을 확인하세요.")

Validation MAE: 8816.349132933661


In [12]:
# 데이터 로드
train_df = train_sample
test_df = test

# 특성 선택
features = ['latitude', 'longitude', 'area_m2', 'floor', 'built_year', 'contract_year_month']

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 스케일링
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# 특성 가중치 조정
weights = [1, 1, 2, 0.5, 0.5, 1]  # latitude, longitude, area_m2, floor, built_year, contract_year_month 순서
train_scaled = train_scaled * weights
val_scaled = val_scaled * weights
test_scaled = test_scaled * weights

# BallTree 구축
tree = BallTree(train_scaled, leaf_size=2)

# 예측 함수
def predict_deposit(neighbors, distances):
    weights = 1 / (distances + 1e-5)
    weighted_deposits = neighbors['deposit'] * weights
    return np.sum(weighted_deposits) / np.sum(weights)

# 예측 수행 함수
def make_predictions(data_scaled, k):
    distances, indices = tree.query(data_scaled, k=k)
    predictions = []
    for i in range(len(data_scaled)):
        neighbor_indices = indices[i]
        neighbor_distances = distances[i]
        neighbors = train_df.iloc[neighbor_indices]
        pred = predict_deposit(neighbors, neighbor_distances)
        predictions.append(pred)
    return predictions

# Validation 예측
k = 5
val_predictions = make_predictions(val_scaled, k)

# Validation MAE 계산
val_mae = mean_absolute_error(val_df['deposit'], val_predictions)
print(f"Validation MAE: {val_mae}")

# Test 예측
test_predictions = make_predictions(test_scaled, k)

# 결과 저장
# submission_df = pd.DataFrame({
#     'index': test_df['index'],
#     'deposit': test_predictions
# })

# submission_df.to_csv('submission.csv', index=False)

# print("예측이 완료되었습니다. 'submission.csv' 파일을 확인하세요.")

Validation MAE: 8580.527569380753


In [13]:
# 데이터 로드
train_df = train_sample
test_df = test

# 특성 선택
features = ['latitude', 'longitude', 'area_m2', 'floor', 'built_year', 'contract_year_month']

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 스케일링
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# 특성 가중치 조정
weights = [1, 1, 5, 0.5, 0.5, 0.5]  # latitude, longitude, area_m2, floor, built_year, contract_year_month 순서
train_scaled = train_scaled * weights
val_scaled = val_scaled * weights
test_scaled = test_scaled * weights

# BallTree 구축
tree = BallTree(train_scaled, leaf_size=2)

# 예측 함수
def predict_deposit(neighbors, distances):
    weights = 1 / (distances + 1e-5)
    weighted_deposits = neighbors['deposit'] * weights
    return np.sum(weighted_deposits) / np.sum(weights)

# 예측 수행 함수
def make_predictions(data_scaled, k):
    distances, indices = tree.query(data_scaled, k=k)
    predictions = []
    for i in range(len(data_scaled)):
        neighbor_indices = indices[i]
        neighbor_distances = distances[i]
        neighbors = train_df.iloc[neighbor_indices]
        pred = predict_deposit(neighbors, neighbor_distances)
        predictions.append(pred)
    return predictions

# Validation 예측
k = 5
val_predictions = make_predictions(val_scaled, k)

# Validation MAE 계산
val_mae = mean_absolute_error(val_df['deposit'], val_predictions)
print(f"Validation MAE: {val_mae}")

# Test 예측
test_predictions = make_predictions(test_scaled, k)

# 결과 저장
# submission_df = pd.DataFrame({
#     'index': test_df['index'],
#     'deposit': test_predictions
# })

# submission_df.to_csv('submission.csv', index=False)

# print("예측이 완료되었습니다. 'submission.csv' 파일을 확인하세요.")

Validation MAE: 8537.947125057197


In [23]:
# 데이터 로드
train_df = train_no_duplicates
test_df = test

# 특성 선택
features = ['latitude', 'longitude', 'area_m2', 'floor', 'built_year', 'contract_year_month']

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 스케일링
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# 특성 가중치 조정
weights = [1, 1, 5, 0.5, 0.5, 0.5]  # latitude, longitude, area_m2, floor, built_year, contract_year_month 순서
train_scaled = train_scaled * weights
val_scaled = val_scaled * weights
test_scaled = test_scaled * weights

# BallTree 구축
tree = BallTree(train_scaled, leaf_size=2)

# 예측 함수
def predict_deposit(neighbors, distances):
    weights = 1 / (distances + 1e-5)
    weighted_deposits = neighbors['deposit'] * weights
    return np.sum(weighted_deposits) / np.sum(weights)

# 예측 수행 함수
def make_predictions(data_scaled, k):
    distances, indices = tree.query(data_scaled, k=k)
    predictions = []
    for i in range(len(data_scaled)):
        neighbor_indices = indices[i]
        neighbor_distances = distances[i]
        neighbors = train_df.iloc[neighbor_indices]
        pred = predict_deposit(neighbors, neighbor_distances)
        predictions.append(pred)
    return predictions

# Validation 예측
k = 5
val_predictions = make_predictions(val_scaled, k)

# Validation MAE 계산
val_mae = mean_absolute_error(val_df['deposit'], val_predictions)
print(f"Validation MAE: {val_mae}")

# Test 예측
test_predictions = make_predictions(test_scaled, k)

# 결과 저장
# submission_df = pd.DataFrame({
#     'index': test_df['index'],
#     'deposit': test_predictions
# })

# submission_df.to_csv('submission.csv', index=False)

# print("예측이 완료되었습니다. 'submission.csv' 파일을 확인하세요.")

Validation MAE: 5190.820037476034


KeyboardInterrupt: 

# 병렬처리 코드

In [13]:
from joblib import Parallel, delayed

# 데이터 로드
train_df = train_no_duplicates
test_df = test

# 특성 선택
features = ['latitude', 'longitude', 'area_m2', 'floor', 'built_year', 'contract_year_month']

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 스케일링
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# 특성 가중치 조정
weights = np.array([1, 1, 5, 0.5, 0.5, 0.5])  # latitude, longitude, area_m2, floor, built_year, contract_year_month 순서
train_scaled = train_scaled * weights
val_scaled = val_scaled * weights
test_scaled = test_scaled * weights

# BallTree 구축
tree = BallTree(train_scaled, leaf_size=40)  # leaf_size를 증가시켜 성능 향상

# 예측 함수
def predict_deposit(neighbors, distances):
    weights = 1 / (distances + 1e-5)
    weighted_deposits = neighbors['deposit'] * weights
    return np.sum(weighted_deposits) / np.sum(weights)

# 단일 데이터 포인트에 대한 예측 함수
def predict_single(data_point, k):
    distances, indices = tree.query([data_point], k=k)
    neighbors = train_df.iloc[indices[0]]
    return predict_deposit(neighbors, distances[0])

# 병렬 예측 수행 함수
def make_predictions_parallel(data_scaled, k):
    return Parallel(n_jobs=-1)(delayed(predict_single)(data_point, k) for data_point in data_scaled)

# Validation 예측
k = 5
val_predictions = make_predictions_parallel(val_scaled, k)

# Validation MAE 계산
val_mae = mean_absolute_error(val_df['deposit'], val_predictions)
print(f"Validation MAE: {val_mae}")

# Test 예측
test_predictions = make_predictions_parallel(test_scaled, k)

Validation MAE: 5190.968640296472


# 병렬처리 동적 코드

In [7]:
from joblib import Parallel, delayed

train_df = train_no_duplicates
test_df = test

# 특성 선택
features = ['latitude', 'longitude', 'area_m2', 'floor', 'built_year', 'contract_year_month']

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 스케일링
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# 특성 가중치 조정
weights = np.array([1, 1, 5, 0.5, 0.5, 0.5])  # latitude, longitude, area_m2, floor, built_year, contract_year_month 순서
train_scaled = train_scaled * weights
val_scaled = val_scaled * weights
test_scaled = test_scaled * weights

# BallTree 구축
tree = BallTree(train_scaled, leaf_size=40)

# 예측 함수
def predict_deposit(neighbors, distances):
    weights = 1 / (distances + 1e-5)
    weighted_deposits = neighbors['deposit'] * weights
    return np.sum(weighted_deposits) / np.sum(weights)

# 단일 예측 함수
def predict_single(data_point, min_k, max_k):
    distances, indices = tree.query([data_point], k=max_k)
    k = min(max_k, max(min_k, int(np.sum(distances < np.median(distances)))))
    neighbor_indices = indices[0][:k]
    neighbor_distances = distances[0][:k]
    neighbors = train_df.iloc[neighbor_indices]
    return predict_deposit(neighbors, neighbor_distances)

# 병렬 예측 수행 함수
def make_predictions_parallel(data_scaled, min_k, max_k):
    return Parallel(n_jobs=-1)(delayed(predict_single)(data_point, min_k, max_k) for data_point in data_scaled)

# Validation 예측
min_k = 3
max_k = 10
val_predictions = make_predictions_parallel(val_scaled, min_k, max_k)

# Validation MAE 계산
val_mae = mean_absolute_error(val_df['deposit'], val_predictions)
print(f"Validation MAE: {val_mae}")

# Test 예측
test_predictions = make_predictions_parallel(test_scaled, min_k, max_k)

Validation MAE: 5197.313491807083


# 이상치 처리 병렬 코드

In [21]:
# 데이터 로드
train_df = train_sample
test_df = test

# 특성 선택
features = ['latitude', 'longitude', 'area_m2', 'floor', 'built_year', 'contract_year_month']

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 스케일링
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# 특성 가중치 조정
weights = np.array([1, 1, 5, 0.5, 0.5, 0.5])  # latitude, longitude, area_m2, floor, built_year, contract_year_month 순서
train_scaled = train_scaled * weights
val_scaled = val_scaled * weights
test_scaled = test_scaled * weights

# BallTree 구축
tree = BallTree(train_scaled, leaf_size=40)  # leaf_size를 증가시켜 성능 향상

# 예측 함수
def predict_deposit(neighbors, distances):
    weights = 1 / (distances + 1e-5)
    weighted_deposits = neighbors['deposit'] * weights
    return np.sum(weighted_deposits) / np.sum(weights)

# 단일 데이터 포인트에 대한 예측 함수
def predict_single(data_point, k):
    distances, indices = tree.query([data_point], k=k)
    neighbors = train_df.iloc[indices[0]]
    return predict_deposit(neighbors, distances[0])

# 병렬 예측 수행 함수
def make_predictions_parallel(data_scaled, k):
    return Parallel(n_jobs=-1)(delayed(predict_single)(data_point, k) for data_point in data_scaled)

# 이상치 처리 함수
def handle_outliers_log(predictions, threshold=3):
    log_predictions = np.log1p(predictions)
    median = np.median(log_predictions)
    mad = np.median(np.abs(log_predictions - median))
    lower_bound = median - threshold * mad
    upper_bound = median + threshold * mad
    clipped_log_predictions = np.clip(log_predictions, lower_bound, upper_bound)
    return np.expm1(clipped_log_predictions)

# Validation 예측
k = 5
val_predictions = make_predictions_parallel(val_scaled, k)
val_predictions = val_predictions = handle_outliers_log(val_predictions, threshold=5)  # 임계값을 높임

# Validation MAE 계산
val_mae = mean_absolute_error(val_df['deposit'], val_predictions)
print(f"Validation MAE: {val_mae}")

# Test 예측
test_predictions = make_predictions_parallel(test_scaled, k)
test_predictions = handle_outliers_log(test_predictions)

Validation MAE: 8552.217652688742


In [23]:
from sklearn.cluster import KMeans

# 데이터 로드
train_df = train_sample
test_df = test

# 특성 선택
features = ['latitude', 'longitude', 'area_m2', 'floor', 'built_year', 'contract_year_month']

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 스케일링
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# 특성 가중치 조정
weights = np.array([1, 1, 5, 0.5, 0.5, 0.5])  # latitude, longitude, area_m2, floor, built_year, contract_year_month 순서
train_scaled = train_scaled * weights
val_scaled = val_scaled * weights
test_scaled = test_scaled * weights

# BallTree 구축
tree = BallTree(train_scaled, leaf_size=40)  # leaf_size를 증가시켜 성능 향상

# 예측 함수
def predict_deposit(neighbors, distances):
    weights = 1 / (distances + 1e-5)
    weighted_deposits = neighbors['deposit'] * weights
    return np.sum(weighted_deposits) / np.sum(weights)

# 단일 데이터 포인트에 대한 예측 함수
def predict_single(data_point, k):
    distances, indices = tree.query([data_point], k=k)
    neighbors = train_df.iloc[indices[0]]
    return predict_deposit(neighbors, distances[0])

# 병렬 예측 수행 함수
def make_predictions_parallel(data_scaled, k):
    return Parallel(n_jobs=-1)(delayed(predict_single)(data_point, k) for data_point in data_scaled)

# 지역별 이상치 처리 함수
def handle_outliers_by_region(predictions, latitudes, longitudes, threshold=3):
    # predictions를 numpy 배열로 변환
    predictions = np.array(predictions)
    
    # 간단한 지역 클러스터링
    coords = np.column_stack((latitudes, longitudes))
    kmeans = KMeans(n_clusters=10, random_state=42).fit(coords)
    regions = kmeans.predict(coords)
    
    cleaned_predictions = np.zeros_like(predictions)
    for region in np.unique(regions):
        region_mask = (regions == region)
        region_predictions = predictions[region_mask]
        region_cleaned = handle_outliers_log(region_predictions, threshold)
        cleaned_predictions[region_mask] = region_cleaned
    
    return cleaned_predictions

# Validation 예측
k = 5
val_predictions = make_predictions_parallel(val_scaled, k)

# 지역별 이상치 처리 적용
val_predictions_region = handle_outliers_by_region(val_predictions, val_df['latitude'], val_df['longitude'], threshold=5)

val_mae_region = mean_absolute_error(val_df['deposit'], val_predictions_region)

print(f"Validation MAE (Region-based): {val_mae_region}")

Validation MAE (Region-based): 8545.55447674819


In [24]:
from joblib import Parallel, delayed

train_df = train_sample
test_df = test

# 특성 선택
features = ['latitude', 'longitude', 'area_m2', 'floor', 'built_year', 'contract_year_month']

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 스케일링
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# 특성 가중치 조정
weights = np.array([1, 1, 5, 0.5, 0.5, 0.5])  # latitude, longitude, area_m2, floor, built_year, contract_year_month 순서
train_scaled = train_scaled * weights
val_scaled = val_scaled * weights
test_scaled = test_scaled * weights

# BallTree 구축
tree = BallTree(train_scaled, leaf_size=40)

# 예측 함수
def predict_deposit(neighbors, distances):
    weights = 1 / (distances + 1e-5)
    weighted_deposits = neighbors['deposit'] * weights
    return np.sum(weighted_deposits) / np.sum(weights)

# 단일 예측 함수
def predict_single(data_point, min_k, max_k):
    distances, indices = tree.query([data_point], k=max_k)
    k = min(max_k, max(min_k, int(np.sum(distances < np.median(distances)))))
    neighbor_indices = indices[0][:k]
    neighbor_distances = distances[0][:k]
    neighbors = train_df.iloc[neighbor_indices]
    return predict_deposit(neighbors, neighbor_distances)

# 병렬 예측 수행 함수
def make_predictions_parallel(data_scaled, min_k, max_k):
    return Parallel(n_jobs=-1)(delayed(predict_single)(data_point, min_k, max_k) for data_point in data_scaled)

# Validation 예측
min_k = 3
max_k = 10
val_predictions = make_predictions_parallel(val_scaled, min_k, max_k)

# Validation MAE 계산
val_mae = mean_absolute_error(val_df['deposit'], val_predictions)
print(f"Validation MAE: {val_mae}")

# Test 예측
test_predictions = make_predictions_parallel(test_scaled, min_k, max_k)

Validation MAE: 8537.869233380277


# 앙상블

In [8]:
# 데이터 로드
train_df = train_sample
test_df = test

# 특성 선택
features = ['latitude', 'longitude', 'area_m2', 'floor', 'built_year', 'contract_year_month']

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 스케일링
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# 특성 가중치 조정
weights = np.array([1, 1, 5, 0.5, 0.5, 0.5])  # latitude, longitude, area_m2, floor, built_year, contract_year_month 순서
train_scaled = train_scaled * weights
val_scaled = val_scaled * weights
test_scaled = test_scaled * weights

# BallTree 구축
tree = BallTree(train_scaled, leaf_size=40)

# 예측 함수
def predict_deposit(neighbors, distances):
    weights = 1 / (distances + 1e-5)
    weighted_deposits = neighbors['deposit'] * weights
    return np.sum(weighted_deposits) / np.sum(weights)

# 단일 예측 함수
def predict_single(data_point, k):
    distances, indices = tree.query([data_point], k=k)
    neighbors = train_df.iloc[indices[0]]
    return predict_deposit(neighbors, distances[0])

# 병렬 예측 수행 함수
def make_predictions_parallel(data_scaled, k):
    return Parallel(n_jobs=-1)(delayed(predict_single)(data_point, k) for data_point in data_scaled)

# 앙상블 예측 함수
def ensemble_predictions(data_scaled, k_values):
    all_predictions = Parallel(n_jobs=-1)(
        delayed(make_predictions_parallel)(data_scaled, k) for k in k_values
    )
    return np.mean(all_predictions, axis=0)

# k 값 리스트 정의
k_values = [3, 5, 7, 9]

# Validation 예측
val_predictions = ensemble_predictions(val_scaled, k_values)

# Validation MAE 계산
val_mae = mean_absolute_error(val_df['deposit'], val_predictions)
print(f"Validation MAE: {val_mae}")

# Test 예측
test_predictions = ensemble_predictions(test_scaled, k_values)

Validation MAE: 8405.48183286097


In [9]:
# 데이터 로드
train_df = train_sample
test_df = test

# 특성 선택
features = ['latitude', 'longitude', 'area_m2', 'floor', 'built_year', 'contract_year_month']

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 스케일링
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# 특성 가중치 조정
weights = np.array([1, 1, 5, 0.5, 0.5, 0.5])  # latitude, longitude, area_m2, floor, built_year, contract_year_month 순서
train_scaled = train_scaled * weights
val_scaled = val_scaled * weights
test_scaled = test_scaled * weights

# BallTree 구축
tree = BallTree(train_scaled, leaf_size=40)

# 예측 함수 (거리에 따른 가중치 함수 조정)
def predict_deposit(neighbors, distances):
    weights = np.exp(-distances)  # 지수 함수 사용
    weighted_deposits = neighbors['deposit'] * weights
    return np.sum(weighted_deposits) / np.sum(weights)

# 단일 예측 함수
def predict_single(data_point, k):
    distances, indices = tree.query([data_point], k=k)
    neighbors = train_df.iloc[indices[0]]
    return predict_deposit(neighbors, distances[0])

# 병렬 예측 수행 함수
def make_predictions_parallel(data_scaled, k):
    return Parallel(n_jobs=-1)(delayed(predict_single)(data_point, k) for data_point in data_scaled)

# 앙상블 예측 함수
def ensemble_predictions(data_scaled, k_values):
    all_predictions = Parallel(n_jobs=-1)(
        delayed(make_predictions_parallel)(data_scaled, k) for k in k_values
    )
    return np.mean(all_predictions, axis=0)

# k 값 리스트 정의
k_values = [3, 5, 7, 9]

# Validation 예측
val_predictions = ensemble_predictions(val_scaled, k_values)

# Validation MAE 계산
val_mae = mean_absolute_error(val_df['deposit'], val_predictions)
print(f"Validation MAE: {val_mae}")

# Test 예측
test_predictions = ensemble_predictions(test_scaled, k_values)

Validation MAE: 8486.101253090092


# StandardScaler, RobustScaler, MinMaxScaler / 로그변환 비교

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.neighbors import BallTree
from sklearn.metrics import mean_absolute_error
from joblib import Parallel, delayed

# 데이터 로드 (이전 코드에서 가정)
train_df = train_sample
test_df = test

# 특성 선택
features = ['latitude', 'longitude', 'area_m2', 'floor', 'built_year', 'contract_year_month']

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 로그 변환 함수
def log_transform(df):
    return df.apply(lambda x: np.log1p(x) if x.min() >= 0 else x)

# 스케일링 및 예측 함수
def scale_and_predict(scaler, apply_log=False):
    if apply_log:
        train_scaled = log_transform(train_df[features])
        val_scaled = log_transform(val_df[features])
        test_scaled = log_transform(test_df[features])
    else:
        train_scaled = train_df[features]
        val_scaled = val_df[features]
        test_scaled = test_df[features]
    
    train_scaled = scaler.fit_transform(train_scaled)
    val_scaled = scaler.transform(val_scaled)
    test_scaled = scaler.transform(test_scaled)
    
    # 특성 가중치 조정
    weights = np.array([1, 1, 5, 0.5, 0.5, 0.5])
    train_scaled = train_scaled * weights
    val_scaled = val_scaled * weights
    test_scaled = test_scaled * weights
    
    # BallTree 구축
    tree = BallTree(train_scaled, leaf_size=40)
    
    # 예측 함수
    def predict_deposit(neighbors, distances):
        weights = 1 / (distances + 1e-5)
        weighted_deposits = neighbors['deposit'] * weights
        return np.sum(weighted_deposits) / np.sum(weights)
    
    # 단일 예측 함수
    def predict_single(data_point, k):
        distances, indices = tree.query([data_point], k=k)
        neighbors = train_df.iloc[indices[0]]
        return predict_deposit(neighbors, distances[0])
    
    # 병렬 예측 수행 함수
    def make_predictions_parallel(data_scaled, k):
        return Parallel(n_jobs=-1)(delayed(predict_single)(data_point, k) for data_point in data_scaled)
    
    # 앙상블 예측 함수
    def ensemble_predictions(data_scaled, k_values):
        all_predictions = Parallel(n_jobs=-1)(
            delayed(make_predictions_parallel)(data_scaled, k) for k in k_values
        )
        return np.mean(all_predictions, axis=0)
    
    # k 값 리스트 정의
    k_values = [3, 5, 7, 9]
    
    # Validation 예측
    val_predictions = ensemble_predictions(val_scaled, k_values)
    
    # Validation MAE 계산
    val_mae = mean_absolute_error(val_df['deposit'], val_predictions)
    
    return val_mae

# 각 스케일러에 대해 MAE 계산
scalers = {
    'StandardScaler': StandardScaler(),
    'RobustScaler': RobustScaler(),
    'MinMaxScaler': MinMaxScaler()
}

results = {}

for name, scaler in scalers.items():
    # 로그 변환 없이
    mae_without_log = scale_and_predict(scaler, apply_log=False)
    print(f"{name} without Log Transform - Validation MAE: {mae_without_log}")
    results[f"{name} without Log"] = mae_without_log
    
    # 로그 변환 적용
    mae_with_log = scale_and_predict(scaler, apply_log=True)
    print(f"{name} with Log Transform - Validation MAE: {mae_with_log}")
    results[f"{name} with Log"] = mae_with_log

# 결과 정렬
sorted_results = sorted(results.items(), key=lambda x: x[1])

print("\nResults sorted by MAE:")
for method, mae in sorted_results:
    print(f"{method}: {mae}")

print(f"\nBest method: {sorted_results[0][0]} with MAE: {sorted_results[0][1]}")

StandardScaler without Log Transform - Validation MAE: 8405.48183286097
StandardScaler with Log Transform - Validation MAE: 8468.876950218077
RobustScaler without Log Transform - Validation MAE: 8584.839449059366
RobustScaler with Log Transform - Validation MAE: 8540.002242918164
MinMaxScaler without Log Transform - Validation MAE: 8566.245673670523
MinMaxScaler with Log Transform - Validation MAE: 8771.50220388082

Results sorted by MAE:
StandardScaler without Log: 8405.48183286097
StandardScaler with Log: 8468.876950218077
RobustScaler with Log: 8540.002242918164
MinMaxScaler without Log: 8566.245673670523
RobustScaler without Log: 8584.839449059366
MinMaxScaler with Log: 8771.50220388082

Best method: StandardScaler without Log with MAE: 8405.48183286097


# 거리 계산 방식을 변경

In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import BallTree, KDTree
from sklearn.metrics import mean_absolute_error
from joblib import Parallel, delayed

# 데이터 로드
train_df = train_sample
test_df = test

# 특성 선택
features = ['latitude', 'longitude', 'area_m2', 'floor', 'built_year', 'contract_year_month']

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 스케일링
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# 특성 가중치 조정
weights = np.array([1, 1, 5, 0.5, 0.5, 0.5])  # latitude, longitude, area_m2, floor, built_year, contract_year_month 순서
train_scaled = train_scaled * weights
val_scaled = val_scaled * weights
test_scaled = test_scaled * weights

# 예측 함수 (거리에 따른 가중치 함수 조정)
def predict_deposit(neighbors, distances):
    weights = np.exp(-distances)  # 지수 함수 사용
    weighted_deposits = neighbors['deposit'] * weights
    return np.sum(weighted_deposits) / np.sum(weights)

# 단일 예측 함수
def predict_single(tree, data_point, k):
    distances, indices = tree.query([data_point], k=k)
    neighbors = train_df.iloc[indices[0]]
    return predict_deposit(neighbors, distances[0])

# 병렬 예측 수행 함수
def make_predictions_parallel(tree, data_scaled, k):
    return Parallel(n_jobs=-1)(delayed(predict_single)(tree, data_point, k) for data_point in data_scaled)

# 앙상블 예측 함수
def ensemble_predictions(tree, data_scaled, k_values):
    all_predictions = Parallel(n_jobs=-1)(
        delayed(make_predictions_parallel)(tree, data_scaled, k) for k in k_values
    )
    return np.mean(all_predictions, axis=0)

# k 값 리스트 정의
k_values = [3, 5, 7, 9]

# 각 거리 메트릭에 대한 예측 및 MAE 계산 함수
def predict_and_calculate_mae(tree, name):
    val_predictions = ensemble_predictions(tree, val_scaled, k_values)
    val_mae = mean_absolute_error(val_df['deposit'], val_predictions)
    print(f"{name} Validation MAE: {val_mae}")
    return val_mae

# 유클리드 거리 (기본 BallTree)
euclidean_tree = BallTree(train_scaled, leaf_size=40, metric='euclidean')
euclidean_mae = predict_and_calculate_mae(euclidean_tree, "Euclidean")

# 맨해튼 거리
manhattan_tree = BallTree(train_scaled, leaf_size=40, metric='manhattan')
manhattan_mae = predict_and_calculate_mae(manhattan_tree, "Manhattan")

# 민코프스키 거리 (p=3)
minkowski_tree = BallTree(train_scaled, leaf_size=40, metric='minkowski', p=3)
minkowski_mae = predict_and_calculate_mae(minkowski_tree, "Minkowski (p=3)")

# 결과 비교
results = {
    "Euclidean": euclidean_mae,
    "Manhattan": manhattan_mae,
    "Minkowski (p=3)": minkowski_mae
}

best_metric = min(results, key=results.get)
print(f"\nBest performing distance metric: {best_metric} with MAE: {results[best_metric]}")

# 최적의 거리 메트릭을 사용하여 테스트 데이터에 대한 예측
best_tree = locals()[f"{best_metric.lower().split()[0]}_tree"]
test_predictions = ensemble_predictions(best_tree, test_scaled, k_values)

print(f"\nTest predictions using {best_metric} distance metric have been generated.")

Euclidean Validation MAE: 8486.101253090092
Manhattan Validation MAE: 8031.291888515132
Minkowski (p=3) Validation MAE: 8742.976038886232

Best performing distance metric: Manhattan with MAE: 8031.291888515132

Test predictions using Manhattan distance metric have been generated.


# 맨해튼 거리방식으로 변경

In [14]:
train_df = pd.read_csv('merged_data.csv')

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import BallTree
from sklearn.metrics import mean_absolute_error
from joblib import Parallel, delayed

# 데이터 로드
train_df = pd.read_csv('merged_data.csv')
test_df = pd.read_csv

# 특성 선택
features = train_df.columns.drop(['deposit', 'index']).tolist()

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 스케일링
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# 특성 가중치 조정
weights = np.array([1, 1, 5, 0.5, 0.5, 0.5])  # latitude, longitude, area_m2, floor, built_year, contract_year_month 순서
train_scaled = train_scaled * weights
val_scaled = val_scaled * weights
test_scaled = test_scaled * weights

# BallTree 구축 (맨해튼 거리 사용)
tree = BallTree(train_scaled, leaf_size=40, metric='manhattan')

# 예측 함수 (거리에 따른 가중치 함수 조정)
def predict_deposit(neighbors, distances):
    weights = np.exp(-distances)  # 지수 함수 사용
    weighted_deposits = neighbors['deposit'] * weights
    return np.sum(weighted_deposits) / np.sum(weights)

# 단일 예측 함수
def predict_single(data_point, k):
    distances, indices = tree.query([data_point], k=k)
    neighbors = train_df.iloc[indices[0]]
    return predict_deposit(neighbors, distances[0])

# 병렬 예측 수행 함수
def make_predictions_parallel(data_scaled, k):
    return Parallel(n_jobs=-1)(delayed(predict_single)(data_point, k) for data_point in data_scaled)

# 앙상블 예측 함수
def ensemble_predictions(data_scaled, k_values):
    all_predictions = Parallel(n_jobs=-1)(
        delayed(make_predictions_parallel)(data_scaled, k) for k in k_values
    )
    return np.mean(all_predictions, axis=0)

# k 값 리스트 정의
k_values = [3, 5, 7, 9]

# Validation 예측
val_predictions = ensemble_predictions(val_scaled, k_values)

# Validation MAE 계산
val_mae = mean_absolute_error(val_df['deposit'], val_predictions)
print(f"Validation MAE: {val_mae}")

# Test 예측
test_predictions = ensemble_predictions(test_scaled, k_values)

# 결과 저장
submission_df = pd.DataFrame({
    'index': test_df['index'],
    'deposit': test_predictions
})

submission_df.to_csv('submission.csv', index=False)

print("예측이 완료되었습니다. 'submission.csv' 파일을 확인하세요.")

NameError: name 'test' is not defined

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import BallTree
from sklearn.metrics import mean_absolute_error
from joblib import Parallel, delayed

# 데이터 로드
merged_data = pd.read_csv('merged_data_cleaned.csv')

# Train과 Test 데이터 분리
train_df = merged_data[merged_data['deposit'] != 0].copy()
test_df = merged_data[merged_data['deposit'] == 0].copy()

# 특성 선택
features = ['latitude', 'longitude', 'area_m2', 'floor', 'built_year', 'contract_year_month']

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 스케일링
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# 특성 가중치 조정
weights = np.array([1, 1, 5, 0.5, 0.5, 0.5])  # latitude, longitude, area_m2, floor, built_year, contract_year_month 순서
train_scaled = train_scaled * weights
val_scaled = val_scaled * weights
test_scaled = test_scaled * weights

# BallTree 구축 (맨해튼 거리 사용)
tree = BallTree(train_scaled, leaf_size=40, metric='manhattan')

# 예측 함수 (거리에 따른 가중치 함수 조정)
def predict_deposit(neighbors, distances):
    weights = np.exp(-distances)  # 지수 함수 사용
    weighted_deposits = neighbors['deposit'] * weights
    return np.sum(weighted_deposits) / np.sum(weights)

# 단일 예측 함수
def predict_single(data_point, k):
    distances, indices = tree.query([data_point], k=k)
    neighbors = train_df.iloc[indices[0]]
    return predict_deposit(neighbors, distances[0])

# 병렬 예측 수행 함수
def make_predictions_parallel(data_scaled, k):
    return Parallel(n_jobs=-1)(delayed(predict_single)(data_point, k) for data_point in data_scaled)

# 앙상블 예측 함수
def ensemble_predictions(data_scaled, k_values):
    all_predictions = Parallel(n_jobs=-1)(
        delayed(make_predictions_parallel)(data_scaled, k) for k in k_values
    )
    return np.mean(all_predictions, axis=0)

# k 값 리스트 정의
k_values = [3, 5, 7, 9]

# Validation 예측
val_predictions = ensemble_predictions(val_scaled, k_values)

# Validation MAE 계산
val_mae = mean_absolute_error(val_df['deposit'], val_predictions)
print(f"Validation MAE: {val_mae}")

# Test 예측
test_predictions = ensemble_predictions(test_scaled, k_values)

# 결과 저장
submission_df = pd.DataFrame({
    'index': test_df.index,
    'deposit': test_predictions
})

submission_df.to_csv('submission.csv', index=False)

print("예측이 완료되었습니다. 'submission.csv' 파일을 확인하세요.")

KeyboardInterrupt: 

In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import BallTree
from sklearn.metrics import mean_absolute_error
from joblib import Parallel, delayed
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

# 데이터 로드
merged_data = pd.read_csv('merged_data.csv')
test_data = pd.read_csv('test.csv')

# 'deposit' 열이 있는지 확인하고 있으면 train 데이터로, 없으면 test 데이터로 처리
if 'deposit' in merged_data.columns:
    train_df = merged_data[merged_data['deposit'].notna()].copy()
    test_df = test_data.copy()
else:
    print("Warning: 'deposit' column not found in merged_data. Using all merged_data as training data.")
    train_df = merged_data.copy()
    test_df = test_data.copy()

# 불필요한 열 제거
columns_to_drop = ['index'] if 'index' in train_df.columns else []
train_df = train_df.drop(columns=columns_to_drop)
test_df = test_df.drop(columns=columns_to_drop)

# 특성 선택
features = [col for col in train_df.columns if col != 'deposit']

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 스케일링
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# 특성 가중치 조정 (모든 특성에 대해 동일한 가중치 1 적용)
weights = np.ones(len(features))
train_scaled = train_scaled * weights
val_scaled = val_scaled * weights
test_scaled = test_scaled * weights

# BallTree 구축 (맨해튼 거리 사용)
tree = BallTree(train_scaled, leaf_size=40, metric='manhattan')

# 예측 함수 (거리에 따른 가중치 함수 조정)
def predict_deposit(neighbors, distances):
    weights = np.exp(-distances)  # 지수 함수 사용
    weighted_deposits = neighbors['deposit'] * weights
    return np.sum(weighted_deposits) / np.sum(weights)

# 단일 예측 함수
def predict_single(data_point, k):
    distances, indices = tree.query([data_point], k=k)
    neighbors = train_df.iloc[indices[0]]
    return predict_deposit(neighbors, distances[0])

# 병렬 예측 수행 함수
def make_predictions_parallel(data_scaled, k):
    return Parallel(n_jobs=-1)(delayed(predict_single)(data_point, k) for data_point in data_scaled)

# 앙상블 예측 함수
def ensemble_predictions(data_scaled, k_values):
    all_predictions = Parallel(n_jobs=-1)(
        delayed(make_predictions_parallel)(data_scaled, k) for k in k_values
    )
    return np.mean(all_predictions, axis=0)

# k 값 리스트 정의
k_values = [3, 5, 7, 9]

# Validation 예측
val_predictions = ensemble_predictions(val_scaled, k_values)

# Validation MAE 계산
val_mae = mean_absolute_error(val_df['deposit'], val_predictions)
print(f"Validation MAE: {val_mae}")

# Test 예측
test_predictions = ensemble_predictions(test_scaled, k_values)

# 결과 저장
submission_df = pd.DataFrame({
    'index': test_df['index'] if 'index' in test_df.columns else range(len(test_predictions)),
    'deposit': test_predictions
})

submission_df.to_csv('submission.csv', index=False)

print("예측이 완료되었습니다. 'submission.csv' 파일을 확인하세요.")

# 특성 중요도 분석
# 1. 상관관계 기반 중요도
correlation_importance = train_df[features + ['deposit']].corr()['deposit'].abs().sort_values(ascending=False)

# 2. 랜덤 포레스트 기반 중요도
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(train_scaled, train_df['deposit'])
rf_importance = pd.Series(rf_model.feature_importances_, index=features).sort_values(ascending=False)

print("\n상관관계 기반 특성 중요도:")
print(correlation_importance)

print("\n랜덤 포레스트 기반 특성 중요도:")
print(rf_importance)

# 중요도 시각화
plt.figure(figsize=(12, 6))
correlation_importance[:10].plot(kind='bar')
plt.title('Top 10 Features (Correlation-based Importance)')
plt.tight_layout()
plt.savefig('correlation_importance.png')

plt.figure(figsize=(12, 6))
rf_importance[:10].plot(kind='bar')
plt.title('Top 10 Features (Random Forest Importance)')
plt.tight_layout()
plt.savefig('rf_importance.png')

print("특성 중요도 그래프가 저장되었습니다.")

KeyError: "['nearest_park_distance', 'park_count_500m', 'total_park_area_500m', 'park_count_1000m', 'total_park_area_1000m', 'park_count_2000m', 'total_park_area_2000m', 'weighted_park_score', 'avg_distance_5_parks', 'park_distance_skewness', 'park_distance_kurtosis', 'nearest_large_park_distance', 'large_park_count_3km', 'large_park_count_5km', 'large_park_count_10km', 'total_large_park_area_10km', 'nearest_subway_distance_km', 'school_count_within_1km', 'closest_elementary_distance', 'closest_middle_distance', 'closest_high_distance', 'deposit_mean', 'interest_rate', 'interest_rate_diff'] not in index"

In [22]:
merged_data

Unnamed: 0,area_m2,contract_year_month,contract_day,contract_type,floor,built_year,latitude,longitude,age,deposit,...,large_park_count_10km,total_large_park_area_10km,nearest_subway_distance_km,school_count_within_1km,closest_elementary_distance,closest_middle_distance,closest_high_distance,deposit_mean,interest_rate,interest_rate_diff
0,84.9981,201906,25,2,9,2019,37.054314,127.045216,0,17000.0,...,14,3082215.1,0.716953,4,0.156120,0.465125,0.990855,31188.259433,1.92,-0.07
1,84.9981,202003,26,2,20,2019,37.054314,127.045216,1,23000.0,...,14,3082215.1,0.716953,4,0.156120,0.465125,0.990855,32309.834287,1.63,0.08
2,84.9981,202003,28,2,8,2019,37.054314,127.045216,1,23000.0,...,14,3082215.1,0.716953,4,0.156120,0.465125,0.990855,32309.834287,1.63,0.08
3,59.3400,201907,15,2,1,1986,36.964647,127.055847,33,5000.0,...,7,1877924.1,3.897280,4,0.214560,0.688047,0.644366,31786.283137,1.94,0.02
4,59.8100,201904,12,2,6,1995,36.972390,127.084514,24,1800.0,...,8,2199482.1,2.039685,0,1.708489,2.197946,2.264822,30459.486563,2.04,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1867778,115.5101,202402,27,0,17,2010,37.528394,126.659398,14,0.0,...,21,10819839.1,1.483045,7,0.313129,0.482436,0.224754,46357.076007,3.97,0.15
1867779,142.8738,202403,2,0,4,2010,37.528394,126.659398,14,0.0,...,21,10819839.1,1.483045,7,0.313129,0.482436,0.224754,46711.029696,4.00,0.03
1867780,142.8738,202403,16,1,13,2010,37.528394,126.659398,14,0.0,...,21,10819839.1,1.483045,7,0.313129,0.482436,0.224754,46711.029696,4.00,0.03
1867781,114.9285,202403,22,1,2,2010,37.528394,126.659398,14,0.0,...,21,10819839.1,1.483045,7,0.313129,0.482436,0.224754,46711.029696,4.00,0.03


In [24]:
# 데이터 로드
merged_data = pd.read_csv('merged_data.csv')

# Train과 Test 데이터 분리
train_df = merged_data[merged_data['deposit'] != 0].copy()
test_df = merged_data[merged_data['deposit'] == 0].copy()

print(f"Train data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")

Train data shape: (1717611, 35)
Test data shape: (150172, 35)


In [26]:
# NaN 값 확인
print("NaN 값이 있는 열:")
print(merged_data.columns[merged_data.isna().any()].tolist())


NaN 값이 있는 열:
['index', 'park_distance_skewness', 'park_distance_kurtosis', 'interest_rate_diff']


In [30]:
import pandas as pd
import numpy as np

# 데이터 로드
merged_data = pd.read_csv('merged_data_cleaned.csv')

# NaN 값이 있는 열 확인
nan_columns = merged_data.columns[merged_data.isna().any()].tolist()

print("NaN 값이 있는 열과 해당 열의 NaN 비율:")
for col in nan_columns:
    nan_percentage = (merged_data[col].isna().sum() / len(merged_data)) * 100
    print(f"{col}: {nan_percentage:.2f}%")

# 전체 데이터셋에서 NaN 값의 비율
total_nan_percentage = (merged_data.isna().sum().sum() / (merged_data.shape[0] * merged_data.shape[1])) * 100
print(f"\n전체 데이터셋에서 NaN 값의 비율: {total_nan_percentage:.2f}%")

# NaN 값이 있는 행의 비율
rows_with_nan = merged_data[merged_data.isna().any(axis=1)]
rows_with_nan_percentage = (len(rows_with_nan) / len(merged_data)) * 100
print(f"NaN 값이 하나 이상 있는 행의 비율: {rows_with_nan_percentage:.2f}%")

NaN 값이 있는 열과 해당 열의 NaN 비율:

전체 데이터셋에서 NaN 값의 비율: 0.00%
NaN 값이 하나 이상 있는 행의 비율: 0.00%


In [28]:
import pandas as pd
import numpy as np

# 데이터 로드
merged_data = pd.read_csv('merged_data.csv')

# index 열 제거
merged_data = merged_data.drop('index', axis=1)

# NaN 값을 중앙값으로 대체
columns_to_fill = ['park_distance_skewness', 'park_distance_kurtosis', 'interest_rate_diff']
for col in columns_to_fill:
    merged_data[col] = merged_data[col].fillna(merged_data[col].median())

# 결과를 새로운 데이터프레임에 저장
merged_data_cleaned = merged_data.copy()

# NaN 값이 모두 처리되었는지 확인
remaining_nan = merged_data_cleaned.isna().sum()
print("남아있는 NaN 값:")
print(remaining_nan[remaining_nan > 0])

# 정보 출력
print("\n정제된 데이터 형태:", merged_data_cleaned.shape)
print("\n정제된 데이터의 처음 몇 행:")
print(merged_data_cleaned.head())

# 필요하다면 정제된 데이터를 CSV 파일로 저장
merged_data_cleaned.to_csv('merged_data_cleaned.csv', index=False)
print("\n정제된 데이터가 'merged_data_cleaned.csv' 파일로 저장되었습니다.")

남아있는 NaN 값:
Series([], dtype: int64)

정제된 데이터 형태: (1867783, 34)

정제된 데이터의 처음 몇 행:
   area_m2  contract_year_month  contract_day  contract_type  floor  \
0  84.9981               201906            25              2      9   
1  84.9981               202003            26              2     20   
2  84.9981               202003            28              2      8   
3  59.3400               201907            15              2      1   
4  59.8100               201904            12              2      6   

   built_year   latitude   longitude  age  deposit  ...  \
0        2019  37.054314  127.045216    0  17000.0  ...   
1        2019  37.054314  127.045216    1  23000.0  ...   
2        2019  37.054314  127.045216    1  23000.0  ...   
3        1986  36.964647  127.055847   33   5000.0  ...   
4        1995  36.972390  127.084514   24   1800.0  ...   

   large_park_count_10km  total_large_park_area_10km  \
0                     14                   3082215.1   
1                     14

In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import BallTree
from sklearn.metrics import mean_absolute_error
from joblib import Parallel, delayed
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

# 데이터 로드
merged_data = pd.read_csv('merged_data_cleaned.csv')

# NaN 값 확인
print("NaN 값이 있는 열:")
print(merged_data.columns[merged_data.isna().any()].tolist())

# NaN 값 처리
for column in merged_data.columns:
    if merged_data[column].isna().any():
        if column != 'deposit':  # deposit 열은 NaN으로 둡니다 (테스트 데이터의 경우)
            merged_data[column] = merged_data[column].fillna(merged_data[column].median())

# 다시 한 번 NaN 값 확인
print("\n처리 후 NaN 값이 있는 열:")
print(merged_data.columns[merged_data.isna().any()].tolist())

# Train과 Test 데이터 분리
train_df = merged_data[merged_data['deposit'] != 0].copy()
test_df = merged_data[merged_data['deposit'] == 0].copy()

# 특성 선택
features = [col for col in merged_data.columns if col not in ['deposit', 'index']]

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 스케일링
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# NaN 값 다시 확인
print("\n스케일링 후 NaN 값 확인:")
print("train_scaled NaN:", np.isnan(train_scaled).any())
print("val_scaled NaN:", np.isnan(val_scaled).any())
print("test_scaled NaN:", np.isnan(test_scaled).any())

# 만약 스케일링 후에도 NaN 값이 있다면 0으로 대체
train_scaled = np.nan_to_num(train_scaled)
val_scaled = np.nan_to_num(val_scaled)
test_scaled = np.nan_to_num(test_scaled)

# 특성 가중치 조정 (모든 특성에 대해 동일한 가중치 1 적용)
weights = np.ones(len(features))
train_scaled = train_scaled * weights
val_scaled = val_scaled * weights
test_scaled = test_scaled * weights

# BallTree 구축 (맨해튼 거리 사용)
tree = BallTree(train_scaled, leaf_size=40, metric='manhattan')

# 예측 함수 (거리에 따른 가중치 함수 조정)
def predict_deposit(neighbors, distances):
    weights = np.exp(-distances)  # 지수 함수 사용
    weighted_deposits = neighbors['deposit'] * weights
    return np.sum(weighted_deposits) / np.sum(weights)

# 단일 예측 함수
def predict_single(data_point, k):
    distances, indices = tree.query([data_point], k=k)
    neighbors = train_df.iloc[indices[0]]
    return predict_deposit(neighbors, distances[0])

# 병렬 예측 수행 함수
def make_predictions_parallel(data_scaled, k):
    return Parallel(n_jobs=-1)(delayed(predict_single)(data_point, k) for data_point in data_scaled)

# 앙상블 예측 함수
def ensemble_predictions(data_scaled, k_values):
    all_predictions = Parallel(n_jobs=-1)(
        delayed(make_predictions_parallel)(data_scaled, k) for k in k_values
    )
    return np.mean(all_predictions, axis=0)

# k 값 리스트 정의
k_values = [3, 5, 7, 9]

# Validation 예측
val_predictions = ensemble_predictions(val_scaled, k_values)

# Validation MAE 계산
val_mae = mean_absolute_error(val_df['deposit'], val_predictions)
print(f"Validation MAE: {val_mae}")

# Test 예측
test_predictions = ensemble_predictions(test_scaled, k_values)

# 결과 저장
submission_df = pd.DataFrame({
    'index': test_df['index'] if 'index' in test_df.columns else range(len(test_predictions)),
    'deposit': test_predictions
})

submission_df.to_csv('submission.csv', index=False)

print("예측이 완료되었습니다. 'submission.csv' 파일을 확인하세요.")

# 특성 중요도 분석
# 1. 상관관계 기반 중요도
correlation_importance = train_df[features + ['deposit']].corr()['deposit'].abs().sort_values(ascending=False)

# 2. 랜덤 포레스트 기반 중요도
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(train_scaled, train_df['deposit'])
rf_importance = pd.Series(rf_model.feature_importances_, index=features).sort_values(ascending=False)

print("\n상관관계 기반 특성 중요도:")
print(correlation_importance)

print("\n랜덤 포레스트 기반 특성 중요도:")
print(rf_importance)

# 중요도 시각화
plt.figure(figsize=(12, 6))
correlation_importance[:10].plot(kind='bar')
plt.title('Top 10 Features (Correlation-based Importance)')
plt.tight_layout()
plt.savefig('correlation_importance.png')

plt.figure(figsize=(12, 6))
rf_importance[:10].plot(kind='bar')
plt.title('Top 10 Features (Random Forest Importance)')
plt.tight_layout()
plt.savefig('rf_importance.png')

print("특성 중요도 그래프가 저장되었습니다.")

NaN 값이 있는 열:
[]

처리 후 NaN 값이 있는 열:
[]

스케일링 후 NaN 값 확인:
train_scaled NaN: False
val_scaled NaN: False
test_scaled NaN: False


KeyboardInterrupt: 

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from joblib import Parallel, delayed
import matplotlib.pyplot as plt

# 데이터 로드 및 샘플링
merged_data = pd.read_csv('merged_data_cleaned.csv')

# Train과 Test 데이터 분리
train_df = merged_data[merged_data['deposit'] != 0].copy()
test_df = merged_data[merged_data['deposit'] == 0].copy()

# 특성 선택 (중요도가 높은 10개 특성만 선택)
features = ['area_m2', 'latitude', 'longitude', 'floor', 'built_year', 'contract_year_month', 
            'nearest_subway_distance_km', 'deposit_mean', 'interest_rate', 'school_count_within_1km']

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 스케일링
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# KNN 모델 생성
k = 5  # 단일 k 값 사용
knn = KNeighborsRegressor(n_neighbors=k, weights='distance', metric='euclidean', n_jobs=-1)
knn.fit(train_scaled, train_df['deposit'])

# Validation 예측
val_predictions = knn.predict(val_scaled)

# Validation MAE 계산
val_mae = mean_absolute_error(val_df['deposit'], val_predictions)
print(f"Validation MAE: {val_mae}")

# Test 예측
test_predictions = knn.predict(test_scaled)

# 결과 저장
submission_df = pd.DataFrame({
    'index': test_df['index'] if 'index' in test_df.columns else range(len(test_predictions)),
    'deposit': test_predictions
})

submission_df.to_csv('submission.csv', index=False)

print("예측이 완료되었습니다. 'submission.csv' 파일을 확인하세요.")

Validation MAE: 5591.6022472139475
예측이 완료되었습니다. 'submission.csv' 파일을 확인하세요.


# XGBoost로 변경

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

# 데이터 로드
merged_data = pd.read_csv('merged_data_cleaned.csv')

# Train과 Test 데이터 분리
train_df = merged_data[merged_data['deposit'] != 0].copy()
test_df = merged_data[merged_data['deposit'] == 0].copy()

# 타겟 변수 분리
y_train = train_df['deposit']
X_train = train_df.drop('deposit', axis=1)

# test_df에서 deposit 열 제거
X_test = test_df.drop('deposit', axis=1)

# 범주형 변수 인코딩
le = LabelEncoder()
for col in X_train.select_dtypes(include=['object']).columns:
    X_train[col] = le.fit_transform(X_train[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

# Train 데이터를 train과 validation으로 분리
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# XGBoost 모델 생성 및 학습
model = xgb.XGBRegressor(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=6, 
    subsample=0.8, 
    colsample_bytree=0.8, 
    random_state=42,
    tree_method='hist',
    n_jobs=-1
)

model.fit(
    X_train, y_train, 
    eval_set=[(X_val, y_val)], 
    verbose=100
)

# Validation 예측 및 MAE 계산
val_predictions = model.predict(X_val)
val_mae = mean_absolute_error(y_val, val_predictions)
print(f"Validation MAE: {val_mae}")

# Test 예측
test_predictions = model.predict(X_test)

# 결과 저장
submission_df = pd.DataFrame({
    'index': test_df.index,
    'deposit': test_predictions
})

submission_df.to_csv('submission.csv', index=False)

print("예측이 완료되었습니다. 'submission.csv' 파일을 확인하세요.")

# 특성 중요도 출력
feature_importance = model.feature_importances_
feature_names = X_train.columns
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)
print("\n상위 10개 중요 특성:")
print(feature_importance_df.head(10))

[0]	validation_0-rmse:25153.55059
[100]	validation_0-rmse:9157.11119
[200]	validation_0-rmse:8450.68627
[300]	validation_0-rmse:8132.15768
[400]	validation_0-rmse:7918.70562
[499]	validation_0-rmse:7769.50328
Validation MAE: 4714.424762007194
예측이 완료되었습니다. 'submission.csv' 파일을 확인하세요.

상위 10개 중요 특성:
                       feature  importance
24  total_large_park_area_10km    0.172603
0                      area_m2    0.128057
6                     latitude    0.110311
7                    longitude    0.085020
23       large_park_count_10km    0.082671
5                   built_year    0.081762
25  nearest_subway_distance_km    0.041096
30                deposit_mean    0.024595
8                          age    0.022970
21        large_park_count_3km    0.020564


# 앙상블용 knn모델 베이스

In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import BallTree
from sklearn.metrics import mean_absolute_error
from joblib import Parallel, delayed

# 데이터 로드
merged_data = pd.read_csv('merged_data_cleaned.csv')

# Train과 Test 데이터 분리
train_df = merged_data[merged_data['deposit'] != 0].copy()
test_df = merged_data[merged_data['deposit'] == 0].copy()

# 특성 선택
features = ['latitude', 'longitude', 'area_m2', 'floor', 'built_year', 'contract_year_month']

# Train 데이터를 train과 validation으로 분리
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# 스케일링
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# 특성 가중치 조정
weights = np.array([1, 1, 5, 0.5, 0.5, 0.5])  # latitude, longitude, area_m2, floor, built_year, contract_year_month 순서
train_scaled = train_scaled * weights
val_scaled = val_scaled * weights
test_scaled = test_scaled * weights

# BallTree 구축 (맨해튼 거리 사용)
tree = BallTree(train_scaled, leaf_size=40, metric='manhattan')

# 예측 함수 (거리에 따른 가중치 함수 조정)
def predict_deposit(neighbors, distances):
    weights = np.exp(-distances)  # 지수 함수 사용
    weighted_deposits = neighbors['deposit'] * weights
    return np.sum(weighted_deposits) / np.sum(weights)

# 단일 예측 함수
def predict_single(data_point, k):
    distances, indices = tree.query([data_point], k=k)
    neighbors = train_df.iloc[indices[0]]
    return predict_deposit(neighbors, distances[0])

# 병렬 예측 수행 함수
def make_predictions_parallel(data_scaled, k):
    return Parallel(n_jobs=-1)(delayed(predict_single)(data_point, k) for data_point in data_scaled)

# 앙상블 예측 함수
def ensemble_predictions(data_scaled, k_values):
    all_predictions = Parallel(n_jobs=-1)(
        delayed(make_predictions_parallel)(data_scaled, k) for k in k_values
    )
    return np.mean(all_predictions, axis=0)

# k 값 리스트 정의
k_values = [3, 5, 7, 9]

# Validation 예측
val_predictions = ensemble_predictions(val_scaled, k_values)

# Validation MAE 계산
val_mae = mean_absolute_error(val_df['deposit'], val_predictions)
print(f"Validation MAE: {val_mae}")

# Test 예측
test_predictions = ensemble_predictions(test_scaled, k_values)

# 결과 저장
submission_df = pd.DataFrame({
    'index': test_df.index,
    'deposit': test_predictions
})

submission_df.to_csv('submission.csv', index=False)

print("예측이 완료되었습니다. 'submission.csv' 파일을 확인하세요.")

Validation MAE: 5192.85860614596
예측이 완료되었습니다. 'submission.csv' 파일을 확인하세요.


# 앙상블용 knn 그리드 서치중

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import BallTree
from sklearn.metrics import mean_absolute_error
from joblib import Parallel, delayed
from itertools import product
import random

# 데이터 로드 및 전처리 (이전과 동일)
merged_data = pd.read_csv('merged_data_cleaned.csv')
train_df = merged_data[merged_data['deposit'] != 0].copy()
test_df = merged_data[merged_data['deposit'] == 0].copy()
features = ['latitude', 'longitude', 'area_m2', 'floor', 'built_year', 'contract_year_month']
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df[features])
val_scaled = scaler.transform(val_df[features])
test_scaled = scaler.transform(test_df[features])

# 예측 함수 최적화
def predict_deposit(neighbors, distances):
    weights = np.exp(-distances)
    return np.average(neighbors['deposit'], weights=weights)

# 단일 예측 함수 (변경 없음)
def predict_single(data_point, tree, k):
    distances, indices = tree.query([data_point], k=k)
    neighbors = train_df.iloc[indices[0]]
    return predict_deposit(neighbors, distances[0])

# 병렬 예측 수행 함수 (변경 없음)
def make_predictions_parallel(data_scaled, tree, k):
    return Parallel(n_jobs=-1)(delayed(predict_single)(data_point, tree, k) for data_point in data_scaled)

# 앙상블 예측 함수 (변경 없음)
def ensemble_predictions(data_scaled, tree, k_values):
    all_predictions = Parallel(n_jobs=-1)(
        delayed(make_predictions_parallel)(data_scaled, tree, k) for k in k_values
    )
    return np.mean(all_predictions, axis=0)

# 최적화된 그리드 서치 함수
def optimized_grid_search(train_scaled, val_scaled, val_df, k_values, max_iterations=100):
    best_mae = float('inf')
    best_weights = None
    
    # 가중치 범위 설정
    weight_options = [0.1, 0.5, 1, 2, 5, 10]
    
    for _ in range(max_iterations):
        # 무작위로 가중치 선택
        weights = [random.choice(weight_options) for _ in range(len(features))]
        
        # 가중치 적용
        train_weighted = train_scaled * weights
        val_weighted = val_scaled * weights
        
        # BallTree 구축
        tree = BallTree(train_weighted, leaf_size=40, metric='manhattan')
        
        # 예측 및 MAE 계산
        val_predictions = ensemble_predictions(val_weighted, tree, k_values)
        mae = mean_absolute_error(val_df['deposit'], val_predictions)
        
        # 최적의 가중치 업데이트
        if mae < best_mae:
            best_mae = mae
            best_weights = weights
        
        print(f"Iteration {_+1}/{max_iterations}, Weights: {weights}, MAE: {mae}")
    
    return best_weights, best_mae

# k 값 리스트 정의
k_values = [3, 5, 7]  # k 값 감소

# 최적화된 그리드 서치 수행
best_weights, best_mae = optimized_grid_search(train_scaled, val_scaled, val_df, k_values, max_iterations=100)

print(f"Best weights: {best_weights}")
print(f"Best Validation MAE: {best_mae}")

# 최적의 가중치 적용
train_scaled = train_scaled * best_weights
test_scaled = test_scaled * best_weights

# 최종 모델 학습
final_tree = BallTree(train_scaled, leaf_size=40, metric='manhattan')

# Test 예측
test_predictions = ensemble_predictions(test_scaled, final_tree, k_values)

# 결과 저장
submission_df = pd.DataFrame({
    'index': test_df.index,
    'deposit': test_predictions
})

submission_df.to_csv('submission.csv', index=False)

print("예측이 완료되었습니다. 'submission.csv' 파일을 확인하세요.")

Iteration 1/100, Weights: [0.5, 2, 0.5, 5, 5, 5], MAE: 6653.438921437086
Iteration 2/100, Weights: [0.1, 2, 0.5, 0.1, 10, 5], MAE: 5286.464545782188
Iteration 3/100, Weights: [2, 0.5, 0.1, 1, 5, 10], MAE: 6443.017427882524
