# 라이브러리 불러오기

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

sys.path.append(os.path.abspath(".."))
from data.load_dataset import load_dataset
from data.merge_dataset import merge_dataset
from model.inference import save_csv
from model.feature_select import select_features
from model.data_split import split_features_and_target
from model.log_transformation import apply_log_transformation
from model.model_train import cv_train, set_model, optuna_train

# [cell output] from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fixed seed, applied to NumPy's global random number generator.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# 데이터 불러오기

In [3]:
# Load the original datasets; load_dataset() returns seven objects in this order.
(
    train_data,
    test_data,
    sample_submission,
    interest_data,
    subway_data,
    school_data,
    park_data,
) = load_dataset()

In [4]:
# Merge the auxiliary tables (interest, subway, school, park) into the
# train/test frames as additional features.
train_data, test_data = merge_dataset(
    train_data,
    test_data,
    interest_data,
    subway_data,
    school_data,
    park_data,
)

# Data Preprocessing

In [5]:
# Drop rows whose exact (latitude, longitude) pair occurs only a few times.
# The inclusive bounds below are meant to be tuned as parameters.
MIN_LOCATION_COUNT = 2
MAX_LOCATION_COUNT = 5

# Per-row count of rows that share the same coordinate pair.  Counting on the
# pair itself (rather than testing latitude and longitude membership
# independently) prevents dropping a row whose latitude belongs to one small
# group and whose longitude belongs to a different one.
location_counts = train_data.groupby(["latitude", "longitude"])["index"].transform("count")
small_groups = train_data[location_counts.between(MIN_LOCATION_COUNT, MAX_LOCATION_COUNT)]
train_data.drop(small_groups.index, axis=0, inplace=True)

In [6]:
# Keep only rows with built_year strictly below 2024, then renumber the index.
# NOTE(review): the original note said "drop built_year > 2024", but this
# filter also drops built_year == 2024 -- confirm the intended cutoff.
is_built_before_2024 = train_data["built_year"] < 2024
train_data = train_data[is_built_before_2024].reset_index(drop=True)

# Feature Engineering

In [7]:
# Log transformation of the train and test frames (which columns are
# transformed is decided inside model.log_transformation).
train_data, test_data = apply_log_transformation(train_data, test_data)

In [8]:
# Split train_data into X and y (features / target, per the helper's name).
X, y = split_features_and_target(train_data)

In [9]:
# Feature selection: select_features returns the updated X and test_data.
X, test_data = select_features(X, y, test_data)

#### 클러스터링

In [10]:
# Clustering input: a single-column DataFrame holding only `deposit`.
cluster_data = train_data[["deposit"]]

In [None]:
# Elbow method: record the K-means inertia (SSE) for k = 1..14.
k_values = range(1, 15)
sse = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42).fit(cluster_data)
    sse.append(kmeans.inertia_)

# Plot SSE against the number of clusters.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, sse, marker="o")
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("SSE")
ax.set_title("Elbow Method")
ax.set_xticks(k_values)
ax.grid()
plt.show()

In [11]:
# Fit K-means with 3 clusters on cluster_data and store each row's cluster
# label in the "region" column.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(cluster_data)
train_data["region"] = kmeans.labels_

In [None]:
# Scatter plot of the coordinates; colour and marker size both encode "region".
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=train_data,
    x="longitude",
    y="latitude",
    hue="region",
    size="region",
    sizes=(20, 200),
    palette="YlOrBr",
    alpha=0.2,
    ax=ax,
)
ax.set_title("Location Clustering with Price per Area")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.legend(title="Cluster")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Distinct cluster labels, in order of first appearance.
regions = train_data['region'].unique()

fig, ax = plt.subplots(figsize=(12, 8))

# Overlay one filled 2-D KDE per cluster label on the same axes.
for region in regions:
    region_rows = train_data.loc[train_data['region'] == region]
    sns.kdeplot(
        data=region_rows,
        x="longitude",
        y="latitude",
        fill=True,
        alpha=0.5,
        label=region,
        ax=ax,
    )

ax.set_title("KDE Density by Region")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.legend(title="Region")
plt.show()

In [None]:
# 3D scatter plot: coordinates on the x/y plane, cluster label on the z axis.
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(111, projection='3d')

# Pass the cluster label as the z coordinate.  Without it, Axes3D.scatter
# draws every point at z=0 even though the z axis is labelled "region".
scatter = ax.scatter(train_data["longitude"], train_data["latitude"], train_data["region"],
                     c=train_data["region"], cmap='tab10', alpha=0.2, s=20)

# Axis labels.
ax.set_title("3D Location Clustering with Price per Area")
ax.set_xlabel("longitude")
ax.set_ylabel("latitude")
ax.set_zlabel("region")

# Colour bar keyed to the cluster label; attached explicitly to the 3D axes.
cbar = fig.colorbar(scatter, ax=ax)
cbar.set_label('Cluster')

plt.show()

In [14]:
import folium
import pandas as pd
import matplotlib.cm as cm
import matplotlib.colors as colors

def visualize_by_region(train_data):
    """Build a folium map with one circle marker per row, coloured by region.

    Args:
        train_data: DataFrame with ``latitude``, ``longitude`` and ``region``
            columns.

    Returns:
        The ``folium.Map`` holding the markers, initially centred on the mean
        latitude/longitude of ``train_data``.
    """
    # Centre the initial view on the mean coordinates of the data.
    center_lat = train_data['latitude'].mean()
    center_lon = train_data['longitude'].mean()
    map_folium = folium.Map(location=[center_lat, center_lon], zoom_start=12)

    # Distinct region labels, in order of first appearance.
    regions = train_data['region'].unique()

    # One colour per region, taken from 'Set1' resampled to len(regions)
    # entries.  plt.get_cmap is used because matplotlib.cm.get_cmap is
    # deprecated since Matplotlib 3.7 and removed in 3.9.
    cmap = plt.get_cmap('Set1', len(regions))
    region_colors = {region: colors.to_hex(cmap(i)) for i, region in enumerate(regions)}

    # Iterate the three needed columns directly: this is much cheaper than
    # iterrows() and keeps each column's own dtype (iterrows() builds a Series
    # per row, which can upcast an integer region label to float).
    marker_rows = zip(train_data['latitude'], train_data['longitude'], train_data['region'])
    for lat, lon, region in marker_rows:
        marker_color = region_colors[region]
        folium.CircleMarker(
            location=[lat, lon],
            radius=5,
            color=marker_color,
            fill=True,
            fill_color=marker_color,
            fill_opacity=0.7,
            popup=f"Region: {region}\nLatitude: {lat}\nLongitude: {lon}"
        ).add_to(map_folium)

    return map_folium

In [None]:
# Build the folium map (one marker per row of train_data, coloured by region).
map_folium = visualize_by_region(train_data)

# Modeling

# Inference