In [9]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr, kendalltau
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import random
from imblearn.over_sampling import RandomOverSampler

In [10]:
# Load the source CSV, decoding it as EUC-KR (the file contains Korean text).
df = pd.read_csv(
    './data/Before_Encoding_5000.csv',
    encoding='euc-kr',
)

In [11]:
# Show the loaded frame using the notebook's rich DataFrame display.
df

Unnamed: 0,Building_Age,JS_Price,JS_BA,Population,UR,LC_index,CA_index,TC_index,SDT_index,HSP_index,Sell_Price,Crime_Rates,IR,Region_Name,Building_Use,YearMonth,Shortest_Distance_to_Subway,Shortest_Distance_to_School,Shortest_Distance_to_Univ
0,14,22500,84.70,433809,4.1,90.4,95.3,91.0,107.634598,91.7,39900.00,0.967620,1.25,강동구,아파트,201703,218.546661,342.320637,2080.047982
1,0,16000,17.45,662019,3.4,98.0,101.1,99.1,112.039216,131.7,18000.00,0.834577,1.25,송파구,오피스텔,201912,365.167081,428.396368,2078.432085
2,30,42000,108.47,553927,2.7,78.0,84.3,81.7,120.439963,74.7,135000.00,1.537764,2.50,강남구,아파트,201310,698.127221,334.807784,1514.222790
3,4,48000,84.95,674828,2.9,72.9,80.0,77.1,114.366829,79.4,91646.15,1.145652,3.25,송파구,아파트,201110,536.947700,24.176463,3817.518298
4,0,70000,84.99,302243,2.1,109.0,109.0,110.3,87.677816,167.9,108000.00,0.725826,2.50,서대문구,아파트,202208,1173.890039,335.949816,1165.416466
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,133000,84.86,530126,3.4,102.6,101.3,100.8,128.819696,158.1,193333.33,1.352069,0.50,강남구,아파트,202011,1341.605321,298.254673,3567.318940
4996,10,49000,84.91,427540,3.1,74.5,81.5,78.5,102.461258,74.2,81850.00,1.221012,3.00,서초구,아파트,201207,440.715060,269.506677,1053.568719
4997,0,23000,30.00,425539,4.5,93.9,98.0,94.1,94.786910,106.1,27038.00,0.907344,1.50,강동구,연립다세대,201803,364.897534,391.843327,1835.115994
4998,0,71000,84.65,571614,3.8,106.9,103.8,103.4,117.233889,177.7,110000.00,0.671993,0.50,강서구,아파트,202106,0.000000,809.669099,2549.064034


In [12]:
# Target (mean) encoding of categorical variables
def target_encoding(df, categorical_columns, target_column):
    """Replace categorical columns with the per-category mean of a target column.

    For every name in ``categorical_columns`` a new column
    ``<name>_encoded`` is appended holding the mean of ``target_column``
    over all rows that share the same category value; the original
    categorical column is then removed.

    The input frame is NOT modified: the encoding is applied to a copy, so
    the function can be called repeatedly on the same ``df``.

    Note that the means are computed over every row of ``df``, including
    each row's own target value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both the categorical columns and ``target_column``.
    categorical_columns : iterable of str
        Names of the columns to encode.
    target_column : str
        Name of the numeric column whose group means become the encoding.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``*_encoded`` columns appended (in the order
        given) and the original categorical columns dropped.

    Raises
    ------
    KeyError
        If a categorical column or ``target_column`` is missing from ``df``.
    """
    # Work on a copy so the caller's frame keeps its original columns.
    encoded = df.copy()
    for categorical_column in categorical_columns:
        encoding_map = encoded.groupby(categorical_column)[target_column].mean().to_dict()
        encoded[categorical_column + '_encoded'] = encoded[categorical_column].map(encoding_map)
    # Drop the raw categorical columns only after all encodings are built.
    return encoded.drop(columns=list(categorical_columns))

# Categorical columns to target-encode, and the column whose mean encodes them.
categorical_columns = ['Region_Name', 'Building_Use']
target_column = 'JS_Price'

# Tunable parameters of the feature-selection step.
N_PRICE_BINS = 5     # number of equal-width JS_Price bins
MIN_ABS_PCC = 0.1    # keep a feature only if |PCC| is at least this
MAX_P_VALUE = 0.05   # ...and its p-value is at most this

# Apply target encoding to a copy so `df` keeps its loaded columns and this
# cell can be re-run without a KeyError on the already-dropped columns.
df_encoded = target_encoding(df.copy(), categorical_columns, target_column)

# Bin JS_Price into N_PRICE_BINS equal-width categories (integer labels) and
# drop the raw price so it cannot be used as a feature.
df_encoded['JS_Price_Category'] = pd.cut(
    df_encoded['JS_Price'], bins=N_PRICE_BINS, labels=False
)
df_encoded = df_encoded.drop(columns='JS_Price')

# Split into features (X) and the binned target (y) for oversampling.
X = df_encoded.drop(columns='JS_Price_Category')
y = df_encoded['JS_Price_Category']

# Randomly oversample the minority classes with a fixed seed.
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(X, y)

# Recombine the resampled features and labels into a single frame.
df_encoded = pd.concat([X_resampled, y_resampled], axis=1)

# Check the class distribution after oversampling.
print(df_encoded['JS_Price_Category'].value_counts())

# Pearson correlation (PCC) and p-value of every feature against the target.
# NOTE(review): these are computed on the oversampled frame; if the sampler
# duplicates rows the p-values will be optimistic - confirm this is intended.
# Records are collected in a list and turned into a frame once, instead of
# growing a DataFrame row by row (DataFrame.append was removed in pandas 2.0).
correlation_records = []
for column in df_encoded.columns:
    if column == 'JS_Price_Category':
        continue
    correlation, p_value = pearsonr(df_encoded['JS_Price_Category'], df_encoded[column])
    correlation_records.append(
        {'Column_Name': column, 'PCC': correlation, 'p-value': p_value}
    )
result_df = pd.DataFrame(correlation_records, columns=['Column_Name', 'PCC', 'p-value'])

# A feature is kept only when it passes BOTH thresholds. Expressing the rule
# as a positive test means a NaN correlation (e.g. a constant column) is
# rejected rather than slipping through both comparisons.
is_informative = (result_df['PCC'].abs() >= MIN_ABS_PCC) & (result_df['p-value'] <= MAX_P_VALUE)
delete_columns = result_df.loc[~is_informative, 'Column_Name'].tolist()
delete_columns.append('JS_Price_Category')

# Selected feature columns, kept in the frame's column order so the result is
# reproducible across runs (a set difference has no stable ordering).
columns_to_drop = set(delete_columns)
selected_features = [
    column for column in df_encoded.columns if column not in columns_to_drop
]

0    4501
1    4501
3    4501
2    4501
4    4501
Name: JS_Price_Category, dtype: int64


In [13]:
# Rank the features from the highest to the lowest correlation with the target.
(
    result_df
    .sort_values(by='PCC', ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,Column_Name,PCC,p-value
0,Sell_Price,0.88731,0.0
1,Region_Name_encoded,0.691956,0.0
2,JS_BA,0.648755,0.0
3,HSP_index,0.473909,0.0
4,CA_index,0.455294,0.0
5,LC_index,0.454219,0.0
6,YearMonth,0.44786,0.0
7,TC_index,0.4221,0.0
8,Building_Use_encoded,0.229331,2.33256e-266
9,SDT_index,-0.024821,0.0001961556


In [14]:
# Show the feature names that passed the correlation / p-value filter.
selected_features

['LC_index',
 'IR',
 'Population',
 'Shortest_Distance_to_Univ',
 'Shortest_Distance_to_School',
 'Building_Age',
 'Sell_Price',
 'YearMonth',
 'JS_BA',
 'Shortest_Distance_to_Subway',
 'Region_Name_encoded',
 'TC_index',
 'Building_Use_encoded',
 'HSP_index',
 'CA_index']