In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
# IterativeImputer 사용을 명시적으로 활성화
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from datetime import datetime
import zipfile

In [28]:
# Load the source data. To save disk space, the CSV is read directly from
# inside the zip archive instead of extracting the archive first.

# Explicit dtypes for the columns that must not be type-inferred by pandas
# (code-like columns are kept as strings so their textual form is preserved).
# Keys use the raw CSV header names, i.e. before the column renaming below.
SOURCE_DTYPES = {
    '사업자등록번호': 'string', '등록국세청코드': 'string', '납세자유형코드': 'string',
    '사업자유형코드': 'string', '산업분류코드': 'string', '시도': 'string',
    '국세청상호명': 'string', '국세청상호명존재여부': 'string', '영업일수': 'int',
    '영업일수(100일단위)': 'int', '개업일': 'string', '폐업일': 'string', '통신판매사업자여부': 'string',
    '통신판매사업자전화번호': 'string', '통신판매사업자전자우편': 'string',
    '나라장터조달업체제조구분코드': 'string', '고용보험 업종코드': 'string', '나라장터조달업체업무구분코드': 'string',
    '사업장 우편번호': 'string',
}

# Context managers close both the archive and the member stream once the
# frame has been read, so no file handle is left open in the kernel.
with zipfile.ZipFile('20241112.zip') as zf, zf.open('20241112.csv') as csv_file:
    df_source = pd.read_csv(csv_file, dtype=SOURCE_DTYPES)

# Normalize header names: trim surrounding whitespace and replace inner
# spaces with underscores (e.g. '사업장 우편번호' -> '사업장_우편번호').
df_source.columns = df_source.columns.str.strip().str.replace(' ', '_')
display(df_source)

Unnamed: 0,사업자등록번호,등록국세청코드,사업자유형코드,납세자유형코드,산업분류코드,시도,국세청상호명,국세청상호명존재여부,영업일수,영업일수(100일단위),...,사업장_우편번호,사업장_주소,고용보험_업종코드,고용보험_업종명,산재보험_성립일자,고용보험_성립일자,산재보험_상시근로자수,고용보험_상시근로자수,산재보험_사업구분,고용보험_사업구분
0,1010109091,101,01,01,56114,서울,김밥천국삼청점,Y,4766,48,...,,,,,,,,,,
1,1010109107,101,01,01,56122,서울,명송 하나,Y,5791,58,...,,,,,,,,,,
2,1010112688,101,01,01,47312,서울,가인전자,Y,2586,26,...,,,,,,,,,,
3,1010112733,101,01,01,20400,서울,켐스펙교역,Y,2371,24,...,,,,,,,,,,
4,1010112806,101,01,01,46596,서울,동광전업사,Y,2477,25,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595328,8999100276,899,91,04,03112,경북,제2007대성호,Y,2146,21,...,,,,,,,,,,
1595329,8999300310,899,93,04,46312,부산,경북농산,Y,2023,20,...,,,,,,,,,,
1595330,8999601213,899,96,04,90212,대구,글나루독서실,Y,868,9,...,,,,,,,,,,
1595331,8999700981,899,97,04,96921,전북,길묘,Y,363,4,...,,,,,,,,,,


In [30]:
# Keep only the individual-business-owner records by excluding the
# business-type codes 81 through 88.
business_type = df_source['사업자유형코드']
# The column holds strings, so the bounds are compared as string literals.
keep_mask = (business_type <= '80') | (business_type >= '89')
df_stage1 = df_source[keep_mask]
display(df_stage1)

Unnamed: 0,사업자등록번호,등록국세청코드,사업자유형코드,납세자유형코드,산업분류코드,시도,국세청상호명,국세청상호명존재여부,영업일수,영업일수(100일단위),...,사업장_우편번호,사업장_주소,고용보험_업종코드,고용보험_업종명,산재보험_성립일자,고용보험_성립일자,산재보험_상시근로자수,고용보험_상시근로자수,산재보험_사업구분,고용보험_사업구분
0,1010109091,101,01,01,56114,서울,김밥천국삼청점,Y,4766,48,...,,,,,,,,,,
1,1010109107,101,01,01,56122,서울,명송 하나,Y,5791,58,...,,,,,,,,,,
2,1010112688,101,01,01,47312,서울,가인전자,Y,2586,26,...,,,,,,,,,,
3,1010112733,101,01,01,20400,서울,켐스펙교역,Y,2371,24,...,,,,,,,,,,
4,1010112806,101,01,01,46596,서울,동광전업사,Y,2477,25,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595327,8999001055,899,90,04,85709,경기,아트앤하트 고양탄현에듀포레푸르지오,Y,690,7,...,,,,,,,,,,
1595328,8999100276,899,91,04,03112,경북,제2007대성호,Y,2146,21,...,,,,,,,,,,
1595329,8999300310,899,93,04,46312,부산,경북농산,Y,2023,20,...,,,,,,,,,,
1595330,8999601213,899,96,04,90212,대구,글나루독서실,Y,868,9,...,,,,,,,,,,


In [None]:
# Drop duplicates: remove the columns listed below, then discard rows that
# are exact duplicates of one another on the remaining columns.
DROPPED_COLUMNS = ['사업자등록번호', '영업일수', '국세청상호명']
df_deduplicated = df_stage1.drop(columns=DROPPED_COLUMNS).drop_duplicates()

# Data type correction: these columns are converted to the nullable integer
# dtype 'Int64'. errors='coerce' turns values that cannot be parsed as numbers
# into missing values instead of raising.
COUNT_COLUMNS = ['나라장터조달업체종업원수', '산재보험_상시근로자수', '고용보험_상시근로자수']

# assign() returns a new frame with the converted columns in their original
# positions, rather than writing columns onto the filtered frame in place.
df_stage2 = df_deduplicated.assign(**{
    column: pd.to_numeric(df_deduplicated[column], errors='coerce').astype('Int64')
    for column in COUNT_COLUMNS
})

display(df_stage2)

In [15]:
# Define a separate DataFrame for data exploration; it is a deep copy, so
# changes made to it do not propagate back to df_stage2.
df_exploration = df_stage2.copy(deep=True)
df_exploration.head(n=5)

Unnamed: 0,등록국세청코드,사업자유형코드,납세자유형코드,산업분류코드,시도,국세청상호명존재여부,영업일수(100일단위),개업일,폐업일,통신판매사업자여부,...,사업장_우편번호,사업장_주소,고용보험_업종코드,고용보험_업종명,산재보험_성립일자,고용보험_성립일자,산재보험_상시근로자수,고용보험_상시근로자수,산재보험_사업구분,고용보험_사업구분
0,101,1,1,56114,서울,Y,48,19941001,20071019,N,...,,,,,,,,,,
1,101,1,1,56122,서울,Y,58,19940523,20100331,N,...,,,,,,,,,,
2,101,1,1,47312,서울,Y,26,19970101,20040131,N,...,,,,,,,,,,
3,101,1,1,20400,서울,Y,24,19970101,20030630,N,...,,,,,,,,,,
4,101,1,1,46596,서울,Y,25,19970101,20031014,N,...,,,,,,,,,,


In [16]:
# Print a summary of the exploration frame: index range, column names,
# non-null counts and dtypes.
df_exploration.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1114204 entries, 0 to 1595331
Data columns (total 41 columns):
 #   Column            Non-Null Count    Dtype 
---  ------            --------------    ----- 
 0   등록국세청코드           1114204 non-null  string
 1   사업자유형코드           1114204 non-null  string
 2   납세자유형코드           1114204 non-null  string
 3   산업분류코드            1114204 non-null  string
 4   시도                1114204 non-null  string
 5   국세청상호명존재여부        1114204 non-null  string
 6   영업일수(100일단위)      1114204 non-null  int64 
 7   개업일               1114204 non-null  string
 8   폐업일               1114204 non-null  string
 9   통신판매사업자여부         1114204 non-null  string
 10  통신판매사업자전화번호       10922 non-null    string
 11  통신판매사업자전자우편       10650 non-null    string
 12  통신판매사업자사업장소재지     10931 non-null    object
 13  통신판매사업자도로명사업장소재지  10617 non-null    object
 14  통신판매사업자판매방식       39717 non-null    object
 15  통신판매사업자취급품목       39717 non-null    object
 16  통신판매사업자인터넷도메인     38386

In [None]:
# Print a summary of df_stage2: index range, column names, non-null counts
# and dtypes.
df_stage2.info()