## 라이브러리

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, spearmanr, kendalltau
from scipy import stats
from sklearn.ensemble import RandomForestRegressor

## 데이터 불러오기

In [2]:
# Directory that holds the input CSV files. The trailing slash matters because
# the file name is appended to it directly.
path = './data/'

# Load the apartment target-feature table into a DataFrame.
df = pd.read_csv(f'{path}Total_APT_for_Target_Features.csv')

In [3]:
# Display the loaded DataFrame (rendered as the cell output below).
df

Unnamed: 0,Sell_Price,Sell_Count,JS_Price,JS_Count,CR,UR,LC_index,CA_index,TC_index,SDT_index,IR,Crime_Rates,Total_Pop,Univ_Counts,Park_Counts,School_Counts,Subway_Counts
0,82549.28,133,38076.56,750,46.13,3.1,74.5,81.5,78.5,102.461258,3.0,1.548846,554870,1.0,7,77,21
1,44177.42,116,21442.84,490,48.54,3.1,74.5,81.5,78.5,102.461258,3.0,1.548846,484742,0.0,7,60,14
2,28957.76,58,18593.14,217,64.21,3.1,74.5,81.5,78.5,102.461258,3.0,1.548846,338041,1.0,4,34,3
3,29813.92,96,19282.70,548,64.68,3.1,74.5,81.5,78.5,102.461258,3.0,1.548846,561431,2.0,9,80,9
4,27317.47,99,21386.54,260,78.29,3.1,74.5,81.5,78.5,102.461258,3.0,1.548846,517095,1.0,2,55,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3045,132585.00,30,73065.05,261,55.11,2.1,109.0,109.0,110.3,87.677816,2.5,0.865139,215891,2.0,2,34,10
3046,54418.49,119,40223.93,453,73.92,2.1,109.0,109.0,110.3,87.677816,2.5,0.865139,465727,1.0,7,66,13
3047,23635.22,23,50643.70,64,214.27,2.1,109.0,109.0,110.3,87.677816,2.5,0.865139,140477,6.0,12,36,15
3048,99441.18,34,56290.27,135,56.61,2.1,109.0,109.0,110.3,87.677816,2.5,0.865139,119206,2.0,4,31,23


In [4]:
# Print the column names, non-null counts and dtypes of the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3050 entries, 0 to 3049
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Sell_Price     3050 non-null   float64
 1   Sell_Count     3050 non-null   int64  
 2   JS_Price       3050 non-null   float64
 3   JS_Count       3050 non-null   int64  
 4   CR             3050 non-null   float64
 5   UR             3050 non-null   float64
 6   LC_index       3050 non-null   float64
 7   CA_index       3050 non-null   float64
 8   TC_index       3050 non-null   float64
 9   SDT_index      3050 non-null   float64
 10  IR             3050 non-null   float64
 11  Crime_Rates    3050 non-null   float64
 12  Total_Pop      3050 non-null   int64  
 13  Univ_Counts    3050 non-null   float64
 14  Park_Counts    3050 non-null   int64  
 15  School_Counts  3050 non-null   int64  
 16  Subway_Counts  3050 non-null   int64  
dtypes: float64(11), int64(6)
memory usage: 405.2 KB


## Categorical:

- Nominal(variables that have two or more categories, but which do not have an intrinsic order.)

    - Region_Name : 자치구 명
    - Building_Use : 건물 용도
    
- Ordinal(variables that have two or more categories just like nominal variables. Only the categories can also be ordered or ranked.)

    
## Numeric:

- Discrete
    - Year : 년
    - Month : 월
    - Region_Code : 자치구 코드
    - JS_Count : 전세 거래량
    - Sell_Count : 매매 거래량
    - School_Counts : 자치구 내 초중고 수
    - Subway_Counts : 자치구 내 지하철역 수
    - Univ_Counts : 자치구 내 대학교 수
    - Park_Counts : 자치구 내 공원 수
   
- Continuous
    - Sell : 매매
    - Sell_Price : 매매 가격
    - Sell_BA = Sell_building Area : 매매 건물 면적
    - Sell_PPA = Sell_Price Per Area : 면적 당 매매 가격
    - Sell_PPP = Sell_Price Per Pyeong : 평 당 매매 가격
    - JS : 전세
    - JS_Price : 전세 가격
    - JS_BA = JS_Building Area : 임대 면적
    - JS_PPA = JS_Price Per Area : 임대 면적 당 전세 가격
    - JS_PPP = JS_Price Per Pyeong : 평 당 전세 가격
    - CR = Charter Rate : 전세가율
    - CR_PPA  = Charter_Rate_Price Per Area : 면적 당 전세가율
    - CR_PPP = Charter Rate_Price Per Pyeong : 평 당 전세가율
    - IR = Interest Rate : 금리
    - UR = Unemployment Rate : 실업률
    - LC_index = Leading Composite index : 선행종합 지수
    - CA_index = Comprehensive Accompany index : 동행종합 지수
    - TC_index = Trailing Composite index : 후행종합 지수
    - SDT_index = Supply and Demand Trend index : 전세수급동향 지수
    

## PCC

In [5]:
# Compute the Pearson correlation coefficient (PCC) and its p-value between
# 'JS_Price' and every other non-object column of df.
#
# The rows are gathered in a plain list and the DataFrame is built once at the
# end: DataFrame.append was removed in pandas 2.0, and calling it inside a loop
# copied the whole frame on every iteration.
pcc_rows = []
for column in df.columns:
    # Skip the target itself and object-dtype columns (the original filter).
    if column == 'JS_Price' or df[column].dtype == object:
        continue
    correlation, p_value = pearsonr(df['JS_Price'], df[column])
    pcc_rows.append({'Column_Name': column, 'PCC': correlation, 'p-value': p_value})

# Passing columns= keeps the three expected columns even if no row was collected.
result_df = pd.DataFrame(pcc_rows, columns=['Column_Name', 'PCC', 'p-value'])

In [6]:
# Show the features ordered by PCC, largest first, with the row index renumbered.
result_df.sort_values(by='PCC', ascending=False).reset_index(drop=True)

Unnamed: 0,Column_Name,PCC,p-value
0,Sell_Price,0.917783,0.0
1,LC_index,0.567855,5.6409609999999996e-260
2,TC_index,0.561043,1.680917e-252
3,CA_index,0.55736,1.5680110000000002e-248
4,Subway_Counts,0.474524,4.279994e-171
5,JS_Count,0.221859,2.535643e-35
6,Park_Counts,0.134422,9.021008e-14
7,UR,0.084473,2.98882e-06
8,School_Counts,-0.014272,0.430733
9,Univ_Counts,-0.02689,0.1376253


## ANOVA (continuous vs discrete)

In [7]:
# NOTE(review): this one-way ANOVA helper is commented out; the output shown
# under the next cell presumably comes from an earlier run when it was active.
# def anova(df):
#     for column in df.columns:
#         if np.issubdtype(df[column].dtype, np.integer):
#             # Run the analysis of variance only for integer-typed columns.
#             f_statistic, p_value = stats.f_oneway(*[group for name, group in df.groupby(column)['JS_Price']])
#             print(column)
#             print("F-statistic:", f_statistic, "p-value:", p_value)

In [8]:
# anova(df)

Sell_Count
F-statistic: 1.0385625203674753 p-value: 0.2591115149646185
JS_Count
F-statistic: 1.8327368845634977 p-value: 3.948771745173836e-29
Total_Pop
F-statistic: 330.3594044134726 p-value: 0.0
Park_Counts
F-statistic: 57.49163390146516 p-value: 1.088451007839592e-77
School_Counts
F-statistic: 175.95297731052418 p-value: 0.0
Subway_Counts
F-statistic: 209.20903896900464 p-value: 0.0


## KCC (continuous vs categorical)

- 범주형 변수 더미화

In [9]:
# One-hot (dummy) encoding helper for categorical variables; intended for cases
# where the relationship between the category levels matters.
def oh_encoding(df):
    """Return a copy of ``df`` with every object-dtype column one-hot encoded.

    Each object-dtype column is replaced by dummy columns named
    ``<column>_<level>`` and its name is printed as it is encoded. Columns of
    any other dtype are kept as they are, and the input frame is not modified.
    """
    # Work on a copy so the caller's DataFrame stays untouched.
    encoded_df = df.copy()
    for name in df.columns:
        # Only object-dtype columns are treated as categorical.
        if df[name].dtype != object:
            continue
        encoded_df = pd.get_dummies(encoded_df, columns=[name], prefix=name)
        print(name)
    return encoded_df

# Build the encoded frame: object-dtype columns of df become dummy columns.
df_encoded = oh_encoding(df)

In [10]:
# Compute Kendall's tau (KCC) and its p-value between 'JS_Price' and every
# dummy column of df_encoded whose name starts with 'Region' or 'Building'.
#
# The rows are gathered in a plain list and the DataFrame is built once at the
# end: DataFrame.append was removed in pandas 2.0, and calling it inside a loop
# copied the whole frame on every iteration.
kcc_rows = []
for column in df_encoded.columns:
    # The prefix filter also excludes the target column 'JS_Price' itself.
    if not column.startswith(('Region', 'Building')):
        continue
    kendall_corr, p_value = kendalltau(df_encoded['JS_Price'], df_encoded[column])
    kcc_rows.append({'Column_Name': column, 'KCC': kendall_corr, 'p-value': p_value})

# Passing columns= keeps the three expected columns even when no column matched
# the prefixes, so the later sort on 'KCC' still works on an empty result.
result_df = pd.DataFrame(kcc_rows, columns=['Column_Name', 'KCC', 'p-value'])

In [11]:
# Show the features ordered by KCC, largest first, with the row index renumbered.
result_df.sort_values(by='KCC', ascending=False).reset_index(drop=True)

Unnamed: 0,Column_Name,KCC,p-value


- p-value값 고려
    - PCC 결과, 유의수준 α=0.05에서 School_Counts, Univ_Counts, Total_Pop은 상관관계가 통계적으로 유의하지 않으므로(p-value > 0.05) 변수에서 제외한다.
- correlation값 고려
    - PCC 결과 상관계수의 절댓값이 0.1 이하인 변수 제외 -> UR, School_Counts, Univ_Counts, Total_Pop, Crime_Rates를 변수에서 제외한다.