In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as spst
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings(action='ignore')

### **1. KNNImputer를 이용하여 결측치 처리**

In [9]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)

In [10]:
data = pd.read_csv("data-comp/test_dataset_v01.csv")
data = data.loc[:,"url_len":]

In [11]:
cols = data.columns

In [12]:
data_filled = imputer.fit_transform(data)
data_filled

array([[ 97. ,   0. ,  80. , ...,   1. ,   3. ,   0. ],
       [ 37. ,   0. ,  22. , ...,   2. , 182.4,   0. ],
       [ 44. ,   0. ,  24.4, ...,   0. , 136. ,   0. ],
       ...,
       [ 45. ,   1. ,  23. , ...,   2. ,  84.4,   0. ],
       [ 24. ,   0. ,  10. , ...,   2. ,  25. ,   0. ],
       [ 33. ,   2. ,  14.2, ...,   4. , 213. ,   0. ]])

In [13]:
data = pd.DataFrame(data_filled, columns=cols)
data

Unnamed: 0,url_len,url_num_hyphens_dom,url_path_len,url_domain_len,url_hostname_len,url_num_dots,url_num_underscores,url_query_len,url_num_query_para,url_ip_present,...,html_num_tags('iframe'),html_num_tags('script'),html_num_tags('embed'),html_num_tags('object'),html_num_tags('div'),html_num_tags('head'),html_num_tags('body'),html_num_tags('form'),html_num_tags('a'),html_num_tags('applet')
0,97.0,0.0,80.0,17.0,17.0,4.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,12.0,1.0,1.0,1.0,3.0,0.0
1,37.0,0.0,22.0,15.0,15.0,3.0,1.0,0.0,0.0,0.0,...,1.0,27.0,0.0,0.0,158.0,1.0,1.0,2.0,182.4,0.0
2,44.0,0.0,24.4,20.0,20.0,3.0,3.0,0.0,0.0,0.0,...,0.0,6.0,3.0,3.0,34.0,1.0,1.0,0.0,136.0,0.0
3,40.0,0.0,14.0,26.0,26.0,2.0,0.0,0.0,0.0,0.0,...,0.0,1.2,0.0,0.0,29.0,1.0,1.0,1.0,2.0,0.0
4,43.0,2.0,17.2,26.0,26.0,2.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,41.0,1.0,1.0,1.0,16.4,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2436,63.0,0.0,31.0,17.0,30.8,4.0,2.0,0.0,0.0,0.0,...,1.0,18.0,0.0,0.0,79.0,1.0,1.0,1.0,119.0,0.0
2437,36.0,0.0,1.0,35.0,35.0,2.0,0.0,0.0,0.0,0.0,...,0.0,27.0,0.0,0.0,57.0,1.0,1.0,1.0,11.4,0.0
2438,45.0,1.0,23.0,22.0,17.8,2.0,0.0,0.0,0.0,0.0,...,0.0,8.0,0.0,0.0,199.0,1.0,1.0,2.0,84.4,0.0
2439,24.0,0.0,10.0,14.0,15.0,3.0,0.0,0.0,0.0,1.0,...,0.0,3.0,0.0,0.0,90.0,1.0,1.0,2.0,25.0,0.0


In [14]:
data.to_csv("data-comp/test_data_imputer.csv", index=False)

### **2. 변수 간 상관관계에 따른 결측치 처리**

In [15]:
data = pd.read_csv("data-comp/test_dataset_v01.csv")
data = data.loc[:,"url_len":]

In [16]:
data[(data['url_domain_len'].isnull())&(data['url_hostname_len'].isnull())]

Unnamed: 0,url_len,url_num_hyphens_dom,url_path_len,url_domain_len,url_hostname_len,url_num_dots,url_num_underscores,url_query_len,url_num_query_para,url_ip_present,...,html_num_tags('iframe'),html_num_tags('script'),html_num_tags('embed'),html_num_tags('object'),html_num_tags('div'),html_num_tags('head'),html_num_tags('body'),html_num_tags('form'),html_num_tags('a'),html_num_tags('applet')
12,17,0,0.0,,,1,0,0,0,0,...,0,45.0,0,0,85,1,1,3.0,67.0,0
21,22,1,,,,2,0,0,0,0,...,1,5.0,0,0,28,1,1,0.0,31.0,0
41,22,0,7.0,,,2,0,0,0,0,...,1,32.0,0,0,56,1,1,,7.0,0
66,17,0,0.0,,,2,0,0,0,0,...,0,15.0,0,0,67,1,1,,51.0,0
67,56,1,30.0,,,3,0,0,0,0,...,0,2.0,0,0,1,1,1,,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2238,60,0,48.0,,,3,1,0,0,0,...,0,0.0,0,0,4,1,1,0.0,3.0,0
2248,61,0,33.0,,,2,0,0,0,0,...,0,3.0,0,0,3,1,1,1.0,0.0,0
2280,28,0,11.0,,,3,0,0,0,0,...,0,0.0,0,0,0,1,0,0.0,0.0,0
2327,39,0,22.0,,,3,0,0,0,0,...,0,17.0,0,0,32,1,1,,29.0,0


In [17]:
data.isna().sum()

url_len                      0
url_num_hyphens_dom          0
url_path_len               466
url_domain_len             466
url_hostname_len           463
url_num_dots                 0
url_num_underscores          0
url_query_len                0
url_num_query_para           0
url_ip_present               0
url_entropy                  0
url_chinese_present          0
url_port                     0
html_num_tags('iframe')      0
html_num_tags('script')    467
html_num_tags('embed')       0
html_num_tags('object')      0
html_num_tags('div')         0
html_num_tags('head')        0
html_num_tags('body')        0
html_num_tags('form')      467
html_num_tags('a')         461
html_num_tags('applet')      0
dtype: int64

다른 변수들과 상관관계를 보이지 않는 변수들은 중앙값으로 채움

In [18]:
data["html_num_tags('script')"] = data["html_num_tags('script')"].fillna(data["html_num_tags('script')"].median())
data["html_num_tags('a')"] = data["html_num_tags('a')"].fillna(data["html_num_tags('a')"].median())
data["html_num_tags('form')"] = data["html_num_tags('form')"].fillna(data["html_num_tags('form')"].median())

In [19]:
data['idx'] = list(range(0, 2441))

`url_len`과 `url_path_len`은 높은 상관관계를 가짐 (상관계수=0.943, pvalue=0.0)

In [20]:
data = data.sort_values(by=['url_len'])
data['url_path_len'] = data['url_path_len'].interpolate(method='linear')
data['url_path_len'] = data['url_path_len'].fillna(method='bfill')

In [21]:
data[data.url_path_len.isna()]

Unnamed: 0,url_len,url_num_hyphens_dom,url_path_len,url_domain_len,url_hostname_len,url_num_dots,url_num_underscores,url_query_len,url_num_query_para,url_ip_present,...,html_num_tags('script'),html_num_tags('embed'),html_num_tags('object'),html_num_tags('div'),html_num_tags('head'),html_num_tags('body'),html_num_tags('form'),html_num_tags('a'),html_num_tags('applet'),idx


`url_domain_len`과 `url_hostname_len` 높은 상관관계를 가짐 (상관계수=0.999, pvalue=0.0)

In [22]:
data = data.sort_values(by=['url_hostname_len'])
data['url_domain_len'] = data['url_domain_len'].interpolate(method='linear')

In [23]:
data[data["url_domain_len"].isna()]

Unnamed: 0,url_len,url_num_hyphens_dom,url_path_len,url_domain_len,url_hostname_len,url_num_dots,url_num_underscores,url_query_len,url_num_query_para,url_ip_present,...,html_num_tags('script'),html_num_tags('embed'),html_num_tags('object'),html_num_tags('div'),html_num_tags('head'),html_num_tags('body'),html_num_tags('form'),html_num_tags('a'),html_num_tags('applet'),idx


In [25]:
data = data.sort_values(by=['url_domain_len'])
data['url_hostname_len'] = data['url_hostname_len'].interpolate(method='linear')

In [26]:
data[data["url_hostname_len"].isna()]

Unnamed: 0,url_len,url_num_hyphens_dom,url_path_len,url_domain_len,url_hostname_len,url_num_dots,url_num_underscores,url_query_len,url_num_query_para,url_ip_present,...,html_num_tags('script'),html_num_tags('embed'),html_num_tags('object'),html_num_tags('div'),html_num_tags('head'),html_num_tags('body'),html_num_tags('form'),html_num_tags('a'),html_num_tags('applet'),idx


`html_num_tags('div')`과 `html_num_tags('a')` 높은 상관관계를 가짐 (상관계수=0.835, pvalue=0.0)

In [27]:
data = data.sort_values(by=["html_num_tags('div')"])
data["html_num_tags('a')"] = data["html_num_tags('a')"].interpolate(method='linear')

In [28]:
data[data["html_num_tags('a')"].isna()]

Unnamed: 0,url_len,url_num_hyphens_dom,url_path_len,url_domain_len,url_hostname_len,url_num_dots,url_num_underscores,url_query_len,url_num_query_para,url_ip_present,...,html_num_tags('script'),html_num_tags('embed'),html_num_tags('object'),html_num_tags('div'),html_num_tags('head'),html_num_tags('body'),html_num_tags('form'),html_num_tags('a'),html_num_tags('applet'),idx


In [29]:
data.isna().sum()

url_len                    0
url_num_hyphens_dom        0
url_path_len               0
url_domain_len             0
url_hostname_len           0
url_num_dots               0
url_num_underscores        0
url_query_len              0
url_num_query_para         0
url_ip_present             0
url_entropy                0
url_chinese_present        0
url_port                   0
html_num_tags('iframe')    0
html_num_tags('script')    0
html_num_tags('embed')     0
html_num_tags('object')    0
html_num_tags('div')       0
html_num_tags('head')      0
html_num_tags('body')      0
html_num_tags('form')      0
html_num_tags('a')         0
html_num_tags('applet')    0
idx                        0
dtype: int64

In [30]:
data = data.sort_values(by=['idx'])
data.head()

Unnamed: 0,url_len,url_num_hyphens_dom,url_path_len,url_domain_len,url_hostname_len,url_num_dots,url_num_underscores,url_query_len,url_num_query_para,url_ip_present,...,html_num_tags('script'),html_num_tags('embed'),html_num_tags('object'),html_num_tags('div'),html_num_tags('head'),html_num_tags('body'),html_num_tags('form'),html_num_tags('a'),html_num_tags('applet'),idx
0,97,0,80.0,17.0,17.0,4,0,0,0,0,...,5.0,0,0,12,1,1,1.0,3.0,0,0
1,37,0,22.0,15.0,15.0,3,1,0,0,0,...,4.0,0,0,158,1,1,1.0,15.5,0,1
2,44,0,1.0,20.0,20.0,3,3,0,0,0,...,6.0,3,3,34,1,1,0.0,136.0,0,2
3,40,0,14.0,26.0,26.0,2,0,0,0,0,...,4.0,0,0,29,1,1,1.0,2.0,0,3
4,43,2,9.5,26.0,26.0,2,0,0,0,0,...,4.0,0,0,41,1,1,1.0,15.5,0,4


In [31]:
data.drop(columns=['idx'], inplace=True)

In [32]:
data.to_csv("data-comp/test_data_corr.csv", index=False)