## 부정청약, 위장전입, 불법매매에 해당할 확률의 순위를 기반으로 조사대상을 반환하는 코드

### 1-1. 환경설정 및 데이터 로딩

In [1]:
# 라이브러리 로드 및 환경설정 (아래 라이브러리가 모두 설치되어야함)
import numpy as np
import pandas as pd
import sklearn
from sklearn import preprocessing
from sklearn.ensemble  import RandomForestClassifier
from sklearn import metrics 
import xgboost as xgb   
from xgboost import XGBClassifier
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

import warnings
import warnings
warnings.filterwarnings(action='ignore')

# 자료 읽기 
DF=pd.read_csv(r'21data_add.csv', encoding='CP949', sep=",")
DF_test=pd.read_csv(r'22data_add.csv', encoding='CP949', sep=",")

#DF_add = DF[DF['부정청약판정'] == 0].sample(n=400, random_state=1004)

# 학습 세트의 경우, 실제 조사를 한 경우만 로딩
cond=DF['검사여부']==1
DF_train=DF[cond]
#DF_test = pd.concat([DF_test, DF_add])

### 1-2. 위험순위 및 사전설정 함수 정의

In [2]:
# 위험순위에 대한 quantile points 설정 (1 기준. 예시: 0.5 = 상위 50% 추출)

measurements=[0.5, 0.25, 0.1]
#measurements=[0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]


# 확률과 quantile point를 입력값으로 조사 결정을 변환하는 함수

def convert(pred_prob,m): 
    e=[]
    for h in range(0,len(pred_prob)):
        e.append(pred_prob[h][1])
    e_temp=sorted(e)
    e_temp.reverse()
    e_ind=[]
    for i in e:
        e_ind.append(e_temp.index(i)+1)    
    last_ind=len(e_ind)
    init=0
    for x in range(0,last_ind):
        dd=e[x]
        if dd == 0:
            e_ind[x]=last_ind
    point_m=int(round(len(e_ind)*m))
    pred=[]
    
    for x in range(0,last_ind):
        hh=e_ind[x]
    
        if hh<=point_m:
            pred.append(1)
        else:
            pred.append(0)

    return pred

 # 조사 시 검거 확률과 미스 확률을 반환하는 함수 (성능 테스트용)

def performance(X, Y):
    detect=[]
    miss=[]
    last_ind=len(Y)
    for i in range(0,last_ind):
        x_i=X[i]
        y_i=Y[i]

    
        if (x_i==1) & (y_i==1):
            detect.append(1)
        elif (x_i==1) & (y_i==0):
            detect.append(0)
        elif (x_i==0) & (y_i==1):
            miss.append(1)
        elif (x_i==0) & (y_i==0):
            miss.append(0)    
    prob_detect=detect.count(1)/len(detect)
    prob_miss=miss.count(1)/len(miss)       
    return [prob_detect,prob_miss]

# 자료 인코딩용 함수 정의 : 아래 함수에서 dataDF는 데이터프레임, list는 라벨화하려는 범주변수의 목록
def encode_labels(list, dataDF):                               
    for x in list:
        temp=preprocessing.LabelEncoder()
        dataDF[x]=temp.fit_transform(dataDF[x])
    return dataDF

### 1-3. 시행할 내용의 설정 (시행 시=1, 시행 안할 시=0)

In [3]:
#------------- 각 flag는 어떤 test를 할 것인지 설정.  RF는 random forest, XGB는 XGBoost, Y1,Y2,Y3는 부정청약,위장전입,부정매매 test
flag_RF=1
flag_XGB=1

flag_Y1=1
flag_Y2=1
flag_Y3=1

### 2-1. random forest 및 XGBoost의 하이퍼 파라메터 설정

In [4]:
#------- RF parameters----------

max_depth_Y1=10
max_depth_Y2=7
max_depth_Y3=14

min_samples_leaf_Y1=5
min_samples_leaf_Y2=3
min_samples_leaf_Y3=3

max_leaf_nodes_Y1=None
max_leaf_nodes_Y2=None
max_leaf_nodes_Y3=None

max_features_Y1=10
max_features_Y2=10
max_features_Y3=6

min_samples_split_Y1=4
min_samples_split_Y2=4
min_samples_split_Y3=4

bootstrap_Y1=True
bootstrap_Y2=True
bootstrap_Y3=True

warm_start_Y1=False
warm_start_Y2=False
warm_start_Y3=False

class_weight_Y1={0:1, 1:1}
class_weight_Y2={0:1, 1:1}
class_weight_Y3={0:1, 1:1}

#-------XGBoost parameters-------

reg_alpha_Y1 = 0.75
reg_alpha_Y2 = 0.75
reg_alpha_Y3 = 0.75

reg_lambda_Y1 = 0.5
reg_lambda_Y2 = 0.5
reg_lambda_Y3 = 0.5

gamma_Y1 = 0
gamma_Y2 = 0
gamma_Y3 = 0

booster_Y1 = 'gbtree'
booster_Y2 = 'gbtree'
booster_Y3 = 'gbtree'
#  'gbtree', 'gblinear' 'dart'

max_depth_Y1 = 10
max_depth_Y2 = 10
max_depth_Y3 = 10 

objective_Y1 = 'binary:logistic'
objective_Y2 = 'binary:logistic'
objective_Y3 = 'binary:logistic'
#'binary:logistic', 'binary:logitraw', 'binary:hinge'

learning_rate_Y1=0.75
learning_rate_Y2=0.75
learning_rate_Y3=0.75

min_child_weight_Y1=1
min_child_weight_Y2=1
min_child_weight_Y3=1

colsample_bytree_Y1=1
colsample_bytree_Y2=1
colsample_bytree_Y3=1

scale_pos_weight_Y1=1
scale_pos_weight_Y2=1
scale_pos_weight_Y3=1

subsample_Y1=1
subsample_Y2=1
subsample_Y3=1

In [5]:
DF_train.describe()

Unnamed: 0.1,Unnamed: 0,주택관리번호,동,호수,공급유형,특별공급종류,공급위치_시군구코드,크기,일반공급,특별공급,공급금액,생년,연령,변경일자,주소일치여부_부동산원,주소중복횟수,특일동시여부,2년청약건수,세대원수,분리세대원수,폰중복횟수_부동산원,IP중복당첨횟수_부동산원,IP중복신청횟수_부동산원,폰번호,IP_1,IP_2,IP_3,IP_4,ad_IP중복_2자리,ad_IP중복_3자리,ad_IP중복_4자리,접수일자,가점신청여부,가점당첨여부,가점합계,ad_신혼미공특소득구분,청약_재당첨제한여부,청약_혼인및한부모가족,세대주여부,과거5년이내당첨,무주택세대구성원,소득기준,소득(소득세 5개년도 납부)기준,세대구성,무주택세대주,자산기준,노부모부양,3명이상미성년자녀,혼인기간7년이내,ad_주소일치여부,검사여부,부정청약판정,부정_거주지유지,부정_주소지유지,부정_위장전입,부정_자격위조,부정_입주자저축증서매매,부정_위장이혼,부정_기타,모집공고일,당첨자발표일,해당전입제한일,1순위요건,총공급가구수,분양가상한제여부,과밀성장권역여부,1순위인터넷접수일,2순위인터넷접수일시,ad_변경일자,ad_총공급,ad_성명생년중복,ad_성명생년전화중복,ad_행정변경시점,ad_접수시간,ad_신청당첨거주일치여부,ad_부양가족수,ad_저축가입기간,ad_무주택기간,ad_청약납부회차,ad_청약납부금액,ad_청약경과기간,ad_다자녀영유아자녀수,ad_다자녀무주택기간,ad_다자녀해당지역거주기간,ad_다자녀입주자저축가입기간,ad_다자녀총점,ad_다자녀미성년자녀수,ad_신혼미공특미성년자녀수,ad_신혼미공특태아수,ad_미성년자녀수,ad_총점,ad_변경시점2
count,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0,4443.0
mean,14120.227099,2020418000.0,293.144272,1428.487508,14.863831,1.511141,39051.458474,83.405189,127.59982,113.342336,44068.29372,751888.540626,44.087328,20128670.0,0.626604,8.040288,0.256358,0.757146,3.180959,0.417961,0.763673,1.208643,5.656763,1057886000.0,139.098132,127.09138,123.684447,121.056043,407.462075,48.020032,0.358542,20205600.0,0.450596,0.279091,21.40063,0.039388,0.094981,0.003151,0.022957,0.03061,0.272339,0.15485,0.059869,0.059869,0.019806,0.007427,0.019806,0.063471,0.146298,0.560882,1.0,0.044114,0.000675,0.002026,0.030385,0.001801,0.011029,0.000225,0.00045,20205370.0,20206630.0,20199370.0,10.853477,848.238578,0.294396,0.165654,20205610.0,20205610.0,20128670.0,240.942156,1.00045,0.0,2825.463876,486.154175,0.751744,1.633356,3.764123,5.102183,6.533423,4539441.0,24.967589,0.158677,0.225523,0.2298,0.086203,4.682647,0.09363,0.313752,0.188386,0.614675,4.727211,2814.21472
std,7517.973395,492684.0,440.919901,959.937744,4.998708,1.871502,9780.268355,12.988401,101.266414,99.201776,14403.778015,151292.324043,13.74668,100061.7,0.48376,15.077391,0.436671,1.53108,1.842792,0.962471,0.760023,1.309714,30.365222,36588360.0,69.54954,75.971313,75.474667,74.115491,883.112044,134.591239,1.310595,4656.537,0.497609,0.448603,26.886235,0.220569,0.293222,0.056052,0.149785,0.172278,0.445213,0.361803,0.237272,0.237272,0.13935,0.085871,0.13935,0.243835,0.353444,0.496335,0.0,0.205372,0.025979,0.044967,0.171663,0.0424,0.104448,0.015002,0.021214,4645.555,4555.832,7452.364,7.988787,379.745574,0.455822,0.371811,4656.649,4656.057,100061.7,186.937449,0.021214,0.0,3660.878856,211.323147,0.43205,2.148774,5.102612,7.326372,27.5712,4903766.0,45.895383,0.645549,0.870501,0.882617,0.336168,17.449048,0.376316,0.819702,0.475733,1.314067,17.450847,3660.799225
min,2.0,2020001000.0,101.0,101.0,10.0,0.0,11740.0,39.929,1.0,0.0,22160.0,103.0,18.0,19680820.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,103992300.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20200630.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20200620.0,20200710.0,20181020.0,6.0,164.0,0.0,0.0,20200630.0,20200700.0,19680820.0,1.0,1.0,0.0,-632.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-643.0
25%,8213.5,2020001000.0,104.0,606.0,10.0,0.0,31110.0,84.03,54.0,43.0,34885.0,690327.5,34.0,20090720.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,1.0,1.0,1037618000.0,106.0,51.0,55.0,58.0,29.0,1.0,0.0,20201100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20201020.0,20201110.0,20200120.0,6.0,592.0,0.0,0.0,20201100.0,20201100.0,20090720.0,105.0,1.0,0.0,158.0,284.5,1.0,0.0,0.0,0.0,0.0,2100000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,147.5
50%,14193.0,2020001000.0,108.0,1301.0,10.0,0.0,44200.0,84.906,98.0,89.0,39500.0,780823.0,42.0,20170400.0,1.0,2.0,0.0,0.0,3.0,0.0,1.0,1.0,1.0,1054908000.0,121.0,120.0,129.0,122.0,123.0,6.0,0.0,20201230.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20201220.0,20210110.0,20200810.0,6.0,780.0,0.0,0.0,20201230.0,20201230.0,20170400.0,192.0,1.0,0.0,1376.0,554.0,1.0,0.0,0.0,0.0,0.0,3170000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1365.0
75%,20331.0,2021000000.0,305.0,2004.0,20.0,3.0,46150.0,84.9869,181.0,147.0,49170.0,860206.5,51.0,20200620.0,1.0,9.0,1.0,1.0,4.0,0.0,1.0,1.0,2.0,1082719000.0,211.0,199.0,190.0,183.0,419.0,27.0,0.0,20210400.0,1.0,1.0,48.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20210320.0,20210410.0,20201020.0,24.0,1144.0,1.0,0.0,20210410.0,20210410.0,20200620.0,313.0,1.0,0.0,4190.5,640.5,1.0,3.0,6.5,14.0,0.0,5320000.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4180.0
max,27300.0,2021000000.0,3103.0,4906.0,20.0,5.0,48250.0,163.0429,429.0,471.0,108660.0,991216.0,96.0,20220920.0,1.0,148.0,1.0,21.0,13.0,10.0,13.0,22.0,913.0,1099998000.0,223.0,255.0,255.0,254.0,11583.0,1385.0,21.0,20210610.0,1.0,1.0,82.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,20210530.0,20210620.0,20210530.0,24.0,1655.0,1.0,1.0,20210610.0,20210610.0,20220920.0,816.0,2.0,0.0,19217.0,779.0,1.0,7.0,17.0,17.0,391.0,60850000.0,389.0,4.0,4.0,4.0,2.0,95.0,4.0,6.0,4.0,7.0,95.0,19205.0


In [6]:
print(DF_test.columns)
DF_test.describe()

Index(['Unnamed: 0', '80', '민영국민', '주택명', '동', '호수', '공급유형', '특별공급',
       '공급위치_시군구코드', '크기',
       ...
       'ad_다자녀무주택기간', 'ad_다자녀해당지역거주기간', 'ad_다자녀입주자저축가입기간', 'ad_다자녀총점',
       'ad_다자녀미성년자녀수', 'ad_신혼미공특미성년자녀수', 'ad_신혼미공특태아수', 'ad_미성년자녀수', 'ad_총점',
       'ad_변경시점2'],
      dtype='object', length=151)


Unnamed: 0.1,Unnamed: 0,80,동,호수,공급유형,특별공급,공급위치_시군구코드,크기,일반공급,특별공급.1,공급금액,생년,연령,변경일자,배우자,주소일치여부_부동산원,주소중복횟수,특일동시여부,2년청약건수,세대원수,분리세대원수,폰중복횟수_부동산원,IP중복당첨횟수_부동산원,IP중복신청횟수_부동산원,폰번호,ad_IP중복_2자리,ad_IP중복_3자리,ad_IP중복_4자리,접수일자,가점신청여부,가점당첨여부,가점합계,ad_신혼미공특소득구분,신혼공특_혼인기간,청약_재당첨제한여부,청약_혼인및한부모가족,세대주여부,과거5년이내당첨,무주택세대구성원,소득기준,소득(소득세 5개년도 납부)기준,세대구성,무주택세대주,자산기준,노부모부양,3명이상미성년자녀,혼인기간7년이내,ad_주소일치여부,검사여부,부정청약판정,부정_거주지유지,부정_주소지유지,부정_위장전입,부정_자격위조,부정_입주자저축증서매매,부정_위장이혼,부정_기타,모집공고일,당첨자발표일,해당전입제한일,1순위요건,총공급가구수,분양가상한제여부,과밀성장권역여부,1순위인터넷접수일,2순위인터넷접수일시,ad_변경일자,ad_총공급,ad_성명생년중복,ad_성명생년전화중복,ad_행정변경시점,ad_접수시간,ad_신청당첨거주일치여부,ad_부양가족수,ad_저축가입기간,ad_무주택기간,ad_청약납부회차,ad_청약납부금액,ad_청약경과기간,ad_다자녀영유아자녀수,ad_다자녀무주택기간,ad_다자녀해당지역거주기간,ad_다자녀입주자저축가입기간,ad_다자녀총점,ad_다자녀미성년자녀수,ad_신혼미공특미성년자녀수,ad_신혼미공특태아수,ad_미성년자녀수,ad_총점,ad_변경시점2
count,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0,41282.0
mean,20640.5,2021315000.0,250.91575,1247.79095,14.453515,1.17986,38038.166828,82.511554,94.311104,96.697713,48898.490965,789390.257788,41.849935,20130910.0,0.626883,0.702413,0.0,0.296885,0.0,2.978756,0.109854,0.307543,0.050967,1.554237,1058853000.0,507.71685,45.096628,0.050967,20215370.0,0.506201,0.330095,22.780582,0.0,0.0,0.0,0.024248,0.119519,0.0,0.445351,0.242139,0.130372,0.073349,0.01468,0.165375,0.018265,0.055424,0.182065,0.914248,0.214161,0.007752,0.0,0.0,0.005475,0.000266,0.00155,0.00046,0.000266,20215320.0,20215400.0,20206570.0,19.259096,717.554237,0.341868,0.294511,20215370.0,20215370.0,20131160.0,95.490965,1.102175,0.0,3074.990747,492.224577,0.834165,1.477569,5.122257,5.569498,12.643816,6369201.0,34.033283,0.127126,0.166634,0.188702,0.068529,3.602175,0.061334,0.386827,0.216269,0.614432,3.8479,3063.470302
std,11917.231243,463932.0,403.126129,784.658439,4.970106,1.529025,8079.16504,15.675227,70.150101,76.640584,15707.292005,114235.510275,10.835255,89170.24,0.545364,0.457203,0.0,0.458532,0.0,1.350948,0.492742,0.478238,0.406423,13.13833,26356710.0,1486.222795,211.25548,0.406423,4728.48,0.499968,0.470253,25.457077,0.0,0.0,0.0,0.15382,0.324403,0.0,0.497011,0.428383,0.336716,0.260712,0.120268,0.371523,0.133908,0.228808,0.385903,0.28,0.410244,0.087702,0.0,0.0,0.073788,0.016322,0.039344,0.021449,0.016322,4722.822,4729.172,7896.735,7.571029,383.930218,0.474341,0.455828,4728.19,4726.679,89001.95,70.011926,0.302882,0.0,3242.883725,215.593617,0.371937,1.730811,5.892351,7.220541,34.183449,5396999.0,51.229556,0.587633,0.736181,0.810322,0.300074,15.092846,0.26681,0.886661,0.491076,1.224769,15.117304,3242.782106
min,0.0,2021000000.0,1.0,1.0,10.0,0.0,11305.0,30.6774,1.0,0.0,14840.0,103.0,19.0,19681020.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,102439500.0,0.0,0.0,0.0,20210510.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20210430.0,20210520.0,20190430.0,6.0,28.0,0.0,0.0,20210510.0,20210510.0,19681020.0,1.0,1.0,0.0,-225.0,60.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-235.0
25%,10320.25,2021001000.0,103.0,602.0,10.0,0.0,30140.0,74.98,40.0,34.0,41000.0,730516.0,34.0,20090710.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1037659000.0,17.0,1.0,0.0,20210910.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20210900.0,20210930.0,20200720.0,12.0,448.0,0.0,0.0,20210910.0,20210920.0,20090730.0,41.0,1.0,0.0,630.0,278.0,1.0,0.0,0.0,0.0,0.0,3020000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,619.0
50%,20640.5,2021001000.0,106.0,1105.0,10.0,0.0,41461.0,84.828,77.0,73.0,45700.0,810527.0,40.0,20160910.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1054703000.0,86.0,4.0,0.0,20211120.0,1.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20211110.0,20211200.0,20210120.0,24.0,706.0,0.0,0.0,20211120.0,20211120.0,20160920.0,79.0,1.0,0.0,1919.0,571.0,1.0,1.0,3.0,0.0,0.0,5000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1908.0
75%,30960.75,2022000000.0,203.0,1802.0,20.0,2.0,43113.0,84.986,139.0,137.0,52900.0,870918.0,48.0,20200330.0,1.0,1.0,0.0,1.0,0.0,4.0,0.0,1.0,0.0,1.0,1084232000.0,464.0,27.0,0.0,20220400.0,1.0,1.0,49.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20220320.0,20220410.0,20210630.0,24.0,856.0,1.0,1.0,20220400.0,20220410.0,20200330.0,139.0,1.0,0.0,4508.0,646.0,1.0,3.0,9.0,14.0,0.0,8150000.0,62.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4497.0
max,41281.0,2022000000.0,2010.0,4904.0,20.0,5.0,50110.0,279.2201,384.0,278.0,269580.0,991229.0,96.0,20220730.0,3.0,1.0,0.0,2.0,0.0,41.0,7.0,6.0,12.0,466.0,1945865000.0,26430.0,5066.0,12.0,20220720.0,1.0,1.0,84.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,20220630.0,20220730.0,20220620.0,24.0,1734.0,1.0,1.0,20220720.0,20220720.0,20220730.0,388.0,2.0,0.0,19609.0,779.0,1.0,7.0,17.0,17.0,388.0,77990000.0,398.0,4.0,4.0,4.0,2.0,90.0,4.0,5.0,4.0,7.0,90.0,19597.0


### 2-2. 변수처리

In [7]:
# 사용할 변수 리스트

X_list=['크기', 'ad_총공급', 'ad_성명생년전화중복', 'ad_행정변경시점',\
        '배우자', '2년청약건수', '세대원수', '분리세대원수', '폰중복횟수_부동산원', 'IP중복신청횟수_부동산원',\
        'ad_IP중복_3자리', 'ad_IP중복_4자리', 'ad_접수시간', 'ad_신청당첨거주일치여부', 'ad_부양가족수', 'ad_저축가입기간', 'ad_무주택기간',\
        'ad_청약납부회차', 'ad_청약경과기간', 'ad_총점', 'ad_주소일치여부', 'ad_변경시점2',\
        '공급금액', '연령', '세대주관계', '특일동시여부', '주소중복횟수', '가점합계', 'ad_신청유형', '기관추천종류',\
        '접수매체', '주택소유구분', '장기복무군인', '분양가상한제여부'
        ]
# 범주형 변수 선언
str_list = ['배우자','세대주관계', 'ad_신청유형', '기관추천종류', '접수매체', '주택소유구분', '장기복무군인']


# 결측치 제거 위해 테스트 대상 특정
#DF_train_selected=DF_train[X_list+['부정청약판정', '부정_위장전입', '부정_입주자저축증서매매']]
#DF_test_selected=DF_test[X_list]

# 결측치있는 표본 제거
DF_train_selected=DF_train.dropna()
DF_test_selected=DF_test.dropna()
print(len(DF_train)-len(DF_train_selected))
print(len(DF_test)-len(DF_test_selected))

# 설명변수 설정
X_train=DF_train_selected[X_list]
X_test=DF_test_selected[X_list]

# 예측변수 설정
Y1=DF_train_selected['부정청약판정']
Y2=DF_train_selected['부정_위장전입']
Y3=DF_train_selected['부정_입주자저축증서매매']


Y1_test=DF_test_selected['부정청약판정']
Y2_test=DF_test_selected['부정_위장전입']
Y3_test=DF_test_selected['부정_입주자저축증서매매']


# 범주형 변수 라벨화

X_train[ str_list ]=X_train[ str_list ].astype('str')
X_test[ str_list ]=X_test[ str_list ].astype('str')

tmp = pd.DataFrame()
for x in str_list:
        temp=preprocessing.LabelEncoder()
        tmp[x]=temp.fit_transform(X_test[x])
        print(x)
        mapping = dict(zip(temp.classes_, range(1, len(temp.classes_)+1)))
        print(mapping)
        

X_train=encode_labels(str_list, X_train)
X_test=encode_labels(str_list, X_test)



56
158
배우자
{'0': 1, '1': 2, '2': 3, '3': 4}
세대주관계
{'고모': 1, '고종': 2, '누나': 3, '누이': 4, '동거인': 5, '매': 6, '매부': 7, '며느리': 8, '모': 9, '배우자': 10, '배우자의자녀': 11, '백부': 12, '본인': 13, '부': 14, '사위': 15, '손': 16, '시모': 17, '시부': 18, '언니': 19, '오빠': 20, '올케': 21, '외가친척': 22, '외손': 23, '외조모': 24, '이모': 25, '자녀': 26, '장모': 27, '장인': 28, '제': 29, '제수': 30, '조모': 31, '조카': 32, '처가친척': 33, '처남': 34, '처제': 35, '처형': 36, '친척': 37, '형': 38, '형부': 39, '형수': 40}
ad_신청유형
{'기관추천': 1, '다자녀': 2, '무': 3, '신혼공특미적용': 4}
기관추천종류
{'10년이상복무군인': 1, '경제자유구역종사자(제38조)': 2, '국가유공자 등(35조, 36조)': 3, '국가유공자 등(45조)': 4, '기타(그밖의법령·국가시책)': 5, '납북피해자': 6, '다문화가족': 7, '도시재생사업 토지·건축물 소유자(35조,36조)': 8, '무': 9, '북한이탈주민': 10, '시책추진대상(투자촉진,전통문화)': 11, '우수선수·기능인': 12, '의상자,의사자유족': 13, '장기복무제대군인': 14, '장애인': 15, '중소기업근로자': 16, '철거주택(35조,36조)': 17, '철거주택세입자(35조)': 18}
접수매체
{'모바일앱': 1, '인터넷': 2, '창구': 3}
주택소유구분
{'-': 1, '1주택(처분서약)': 2, '1주택이상(처분미서약)': 3, '무주택자': 4}
장기복무군인
{'N': 1, 'T': 2, 'Y': 3}


### 3. 기계학습예측기의 학습

In [8]:
if flag_RF==1:
    if flag_Y1==1:
        rf1=RandomForestClassifier(n_estimators=100, random_state=11, max_depth=max_depth_Y1, min_samples_leaf=min_samples_leaf_Y1,\
                                   max_features=max_features_Y1, max_leaf_nodes=max_leaf_nodes_Y1, min_samples_split=min_samples_split_Y1,\
                                  bootstrap=bootstrap_Y1, warm_start=warm_start_Y1, class_weight=class_weight_Y1)
        rf1.fit(X_train,Y1)
        rf1_pred = rf1.predict_proba(X_test)
        #rf1_prd = rf1.predict(X_test)
        threshold = 0.1
        rf1_prd = (rf1.predict_proba(X_test)[:, 1] > threshold).astype('float')
    if flag_Y2==1:
        rf2=RandomForestClassifier(n_estimators=100, random_state=11, max_depth=max_depth_Y2, min_samples_leaf=min_samples_leaf_Y2,\
                                   max_features=max_features_Y2, max_leaf_nodes=max_leaf_nodes_Y2, min_samples_split=min_samples_split_Y2,\
                                   bootstrap=bootstrap_Y2, warm_start=warm_start_Y2, class_weight=class_weight_Y1)
        rf2.fit(X_train,Y2)
        rf2_pred=rf2.predict_proba(X_test)
        #rf2_prd = rf2.predict(X_test)
        rf2_prd = (rf2.predict_proba(X_test)[:, 1] > threshold).astype('float')
    if flag_Y3==1:
        rf3=RandomForestClassifier(n_estimators=100, random_state=11, max_depth=max_depth_Y3, min_samples_leaf=min_samples_leaf_Y3,\
                                   max_features=max_features_Y3, max_leaf_nodes=max_leaf_nodes_Y3, min_samples_split=min_samples_split_Y3,\
                                   bootstrap=bootstrap_Y3, warm_start=warm_start_Y3, class_weight=class_weight_Y1)
        rf3.fit(X_train,Y3)
        rf3_pred=rf3.predict_proba(X_test)
        #rf3_prd = rf3.predict(X_test)
        rf3_prd = (rf3.predict_proba(X_test)[:, 1] > threshold).astype('float')
            
#-------------------------------XGB: training-----------------------------------
if flag_XGB==1:
    if flag_Y1==1:
        xgb1=XGBClassifier(reg_alpha=reg_alpha_Y1, reg_lambda=reg_lambda_Y1, gamma=gamma_Y1, booster=booster_Y1, max_depth=max_depth_Y1,\
                           objective=objective_Y1, learning_rate=learning_rate_Y1, min_child_weight=min_child_weight_Y1,\
                           colsample_bytree=colsample_bytree_Y1, scale_pos_weight=scale_pos_weight_Y1, subsample_=subsample_Y1,\
                           verbosity = 0)
        xgb1.fit(X_train,Y1)
        xgb1_pred=xgb1.predict_proba(X_test)
        xgb1_prd = xgb1.predict(X_test)
    if flag_Y2==1:
        xgb2=XGBClassifier(reg_alpha=reg_alpha_Y2,reg_lambda=reg_lambda_Y2, gamma=gamma_Y2, booster=booster_Y2, max_depth=max_depth_Y2,\
                           objective=objective_Y2, learning_rate=learning_rate_Y2, min_child_weight=min_child_weight_Y2,\
                           colsample_bytree=colsample_bytree_Y2, scale_pos_weight=scale_pos_weight_Y2, subsample_=subsample_Y2,\
                           verbosity = 0)
        xgb2.fit(X_train,Y2)
        xgb2_pred=xgb2.predict_proba(X_test)
        xgb2_prd = xgb2.predict(X_test)
    if flag_Y3==1:
        xgb3=XGBClassifier(reg_alpha=reg_alpha_Y3,reg_lambda=reg_lambda_Y3, gamma=gamma_Y3, booster=booster_Y3, max_depth=max_depth_Y3,\
                           objective=objective_Y3, learning_rate=learning_rate_Y3, min_child_weight=min_child_weight_Y3,\
                           colsample_bytree=colsample_bytree_Y3, scale_pos_weight=scale_pos_weight_Y3, subsample_=subsample_Y3,\
                           verbosity = 0)
        xgb3.fit(X_train,Y3)
        xgb3_pred=xgb3.predict_proba(X_test)
        xgb3_prd = xgb3.predict(X_test)

### 4. 결과 반환

In [9]:
    
if flag_Y1==1:
    
    if flag_RF==1:
        DF_test_selected['부정청약_rf_prob']=rf1_pred[:,1]
        for m in measurements:
            DF_test_selected['부정청약_rf_'+str(m)]=convert(rf1_pred,m)
            print('RF-부정청약여부/quantile=',m)
                
    if flag_XGB==1:
        DF_test_selected['부정청약_xgb_prob']=xgb1_pred[:,1]
        for m in measurements:
            DF_test_selected['부정청약_xgb_'+str(m)]=convert(xgb1_pred,m)
            print('XGB-부정청약여부/quantile=',m)
        
if flag_Y2==1:
    print('-------------------------------------위장전입여부----------------------------------')
    
    if flag_RF==1:
        DF_test_selected['위장전입_rf_prob']=rf2_pred[:,1]
        for m in measurements:
            DF_test_selected['위장전입_rf_'+str(m)]=convert(rf2_pred,m)
            print('RF-위장전입/quantile=',m)
                
    if flag_XGB==1:
        DF_test_selected['위장전입_xgb_prob']=xgb2_pred[:,1]
        for m in measurements:
            DF_test_selected['위장전입_xgb_'+str(m)]=convert(xgb2_pred,m)
            print('XGB-위장전입/quantile=',m)

    
if flag_Y3==1:
    print('-------------------------------------매매여부-------------------------------------')    
    
    if flag_RF==1:
        DF_test_selected['매매_rf_prob']=rf3_pred[:,1]
        for m in measurements:
            DF_test_selected['매매_rf_'+str(m)]=convert(rf3_pred,m)
            print('RF-불법매매/quantile=',m)
                
    if flag_XGB==1:
        DF_test_selected['매매_xgb_prob']=xgb3_pred[:,1]
        for m in measurements:
            DF_test_selected['매매_xgb_'+str(m)]=convert(xgb3_pred,m)
            print('XGB-불법매매/quantile=',m)

DF_test_selected.to_csv('Result.csv',encoding='CP949',sep=",")

RF-부정청약여부/quantile= 0.5
RF-부정청약여부/quantile= 0.25
RF-부정청약여부/quantile= 0.1
XGB-부정청약여부/quantile= 0.5
XGB-부정청약여부/quantile= 0.25
XGB-부정청약여부/quantile= 0.1
-------------------------------------위장전입여부----------------------------------
RF-위장전입/quantile= 0.5
RF-위장전입/quantile= 0.25
RF-위장전입/quantile= 0.1
XGB-위장전입/quantile= 0.5
XGB-위장전입/quantile= 0.25
XGB-위장전입/quantile= 0.1
-------------------------------------매매여부-------------------------------------
RF-불법매매/quantile= 0.5
RF-불법매매/quantile= 0.25
RF-불법매매/quantile= 0.1
XGB-불법매매/quantile= 0.5
XGB-불법매매/quantile= 0.25
XGB-불법매매/quantile= 0.1
