In [25]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr, kendalltau
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

In [26]:
# Directory holding the input data; it keeps its trailing slash because the
# file name is appended to it directly.
path = './data/'

# Load the preprocessed dataset that the following cells analyse.
df = pd.read_csv(f'{path}Coordinates_Preprocessed.csv')

In [27]:
# Bin JS_Price into 5 equal-width intervals and replace the raw column with
# the bin index (labels=False yields 0-4 instead of interval labels).
raw_price = df.pop('JS_Price')
df['JS_Price_Category'] = pd.cut(raw_price, bins=5, labels=False)

# One-hot encoding helper for categorical (object-dtype) columns
def oh_encoding(df):
    """Return a copy of ``df`` with every object-dtype column one-hot encoded.

    Each object column is replaced by indicator columns named
    ``<column>_<value>``. The indicator columns are placed after the
    untouched columns, grouped in the order the encoded columns appear in
    ``df``. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns should be expanded.

    Returns
    -------
    pandas.DataFrame
        New frame with indicator columns in place of the object columns.
    """
    object_columns = [column for column in df.columns if df[column].dtype == object]
    if not object_columns:
        # Nothing to encode: still return a copy so the caller never aliases the input.
        return df.copy()
    # One call encodes all columns at once instead of rebuilding the whole
    # frame once per categorical column.
    return pd.get_dummies(df, columns=object_columns)

# Apply one-hot encoding to the categorical columns
df_encoded = oh_encoding(df)

# Thresholds of the correlation filter: a feature is kept only when its
# correlation with the target is both strong enough and significant.
MIN_ABS_PCC = 0.1
MAX_P_VALUE = 0.05

# Pearson correlation (PCC) and p-value of every column against
# 'JS_Price_Category'. Rows are collected in a list and converted to a frame
# once, because DataFrame.append was removed in pandas 2.0 (and growing a
# frame row by row is quadratic).
target = df_encoded['JS_Price_Category']
correlation_records = []
for column in df_encoded.columns:
    if column == 'JS_Price_Category':
        continue
    # Cast to float so bool/uint8 dummy columns are treated as plain numbers.
    correlation, p_value = pearsonr(target, df_encoded[column].astype(float))
    correlation_records.append({'Column_Name': column, 'PCC': correlation, 'p-value': p_value})

result_df = pd.DataFrame(
    correlation_records, columns=['Column_Name', 'PCC', 'p-value']
).astype({'PCC': float, 'p-value': float})

# Columns to discard: weak correlation, not significant, or undefined
# correlation (NaN, e.g. for a constant column). NaN fails both comparisons,
# so without the explicit check such a column would silently be kept.
is_rejected = (
    (result_df['PCC'].abs() < MIN_ABS_PCC)
    | (result_df['p-value'] > MAX_P_VALUE)
    | result_df['PCC'].isna()
)
delete_columns = result_df.loc[is_rejected, 'Column_Name'].tolist()
delete_columns.append('JS_Price_Category')

# Selected feature columns, kept in df_encoded's column order so the list is
# identical on every run (a set difference yields an arbitrary order).
rejected_names = set(delete_columns)
selected_features = [column for column in df_encoded.columns if column not in rejected_names]

In [28]:
# Show the correlation table ordered from the highest PCC to the lowest.
result_df.sort_values(by='PCC', ascending=False)

Unnamed: 0,Column_Name,PCC,p-value
9,Sell_Price,0.586392,0.0
1,JS_BA,0.388496,7.817819e-180
4,LC_index,0.21772,1.0183610000000001e-54
6,TC_index,0.214449,4.207082e-53
5,CA_index,0.212032,6.3199170000000004e-52
12,YearMonth,0.211557,1.0728419999999999e-51
8,HSP_index,0.209573,9.626457e-51
17,Region_Name_강남구,0.19469,6.664436e-44
31,Region_Name_서초구,0.154363,4.863025e-28
42,Building_Use_아파트,0.104752,1.121975e-13


In [29]:
# Display the feature columns that were not rejected by the PCC / p-value filter.
selected_features

['IR',
 'Building_Use_아파트',
 'Region_Name_서초구',
 'HSP_index',
 'YearMonth',
 'LC_index',
 'TC_index',
 'CA_index',
 'JS_BA',
 'Sell_Price',
 'Region_Name_강남구']