In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as spst
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

import warnings
# Silence every warning for the rest of the session. NOTE(review): this also
# hides pandas deprecation / chained-assignment warnings - confirm that is intended.
warnings.filterwarnings(action='ignore')

In [2]:
# Load the training set and keep only the columns from "url_len" through the
# last one (label-based slice), dropping anything to the left of "url_len".
data = pd.read_csv("data-comp/train_dataset.csv").loc[:, "url_len":]

In [3]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,url_len,url_num_hyphens_dom,url_path_len,url_domain_len,url_hostname_len,url_num_dots,url_num_underscores,url_query_len,url_num_query_para,url_ip_present,...,html_num_tags('script'),html_num_tags('embed'),html_num_tags('object'),html_num_tags('div'),html_num_tags('head'),html_num_tags('body'),html_num_tags('form'),html_num_tags('a'),html_num_tags('applet'),Result_v1
0,23.0,0.0,8.0,15.0,15.0,2.0,0.0,0.0,0.0,0.0,...,7.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,malicious
1,75.0,0.0,58.0,17.0,17.0,6.0,0.0,0.0,0.0,0.0,...,18.0,0.0,0.0,20.0,1.0,1.0,0.0,21.0,0.0,benign
2,20.0,0.0,4.0,16.0,16.0,2.0,0.0,0.0,0.0,0.0,...,33.0,0.0,0.0,101.0,1.0,1.0,3.0,70.0,0.0,benign
3,27.0,0.0,13.0,14.0,14.0,3.0,0.0,0.0,0.0,0.0,...,15.0,0.0,0.0,151.0,1.0,1.0,1.0,55.0,0.0,benign
4,39.0,2.0,12.0,27.0,27.0,2.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,332.0,1.0,1.0,0.0,321.0,0.0,benign


Target 정의
- 악성 URL이면 1 아니면 0

In [4]:
# Binary label: 1 when Result_v1 is exactly 'malicious', 0 for anything else
# (missing values compare unequal, so they also map to 0). Uses a vectorised
# comparison instead of calling a Python lambda once per row.
data['target'] = data['Result_v1'].eq('malicious').astype('int64')

결측치 및 중복 행 처리
- 결측치는 2건밖에 없으므로 해당 행을 Drop
- 완전히 중복된 행은 첫 번째 행만 남기고 제거

In [5]:
# Discard every row containing a missing value, then collapse exact duplicate
# rows, retaining the first occurrence. The index is not reset afterwards.
data = data.dropna().drop_duplicates(keep='first')

In [6]:
# Print the column dtypes and non-null counts of the current frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3231 entries, 0 to 3663
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   url_len                  3231 non-null   float64
 1   url_num_hyphens_dom      3231 non-null   float64
 2   url_path_len             3231 non-null   float64
 3   url_domain_len           3231 non-null   float64
 4   url_hostname_len         3231 non-null   float64
 5   url_num_dots             3231 non-null   float64
 6   url_num_underscores      3231 non-null   float64
 7   url_query_len            3231 non-null   float64
 8   url_num_query_para       3231 non-null   float64
 9   url_ip_present           3231 non-null   float64
 10  url_entropy              3231 non-null   float64
 11  url_chinese_present      3231 non-null   float64
 12  url_port                 3231 non-null   float64
 13  html_num_tags('iframe')  3231 non-null   float64
 14  html_num_tags('script') 

새로운 Feature 추가 : `html_num_tags`

행별 HTML 총 Tag 개수 (`html_num_tags('...')` 형태의 Tag별 개수 컬럼 10개의 합)

In [7]:
# Per-tag count columns that are combined into a single total per row.
html_tags = [
    "html_num_tags('iframe')",
    "html_num_tags('script')",
    "html_num_tags('embed')",
    "html_num_tags('object')",
    "html_num_tags('div')",
    "html_num_tags('head')",
    "html_num_tags('body')",
    "html_num_tags('form')",
    "html_num_tags('a')",
    "html_num_tags('applet')",
]

# Row-wise total over the tag columns, computed in one vectorised pass instead
# of a Python-level iterrows() loop. skipna=False matches the loop's behaviour
# of producing NaN when any of the tag counts is missing.
# The result is kept as a plain list so the assignment below stays positional.
html_num_tags = data[html_tags].sum(axis=1, skipna=False).tolist()
data['html_num_tags'] = html_num_tags

In [8]:
# Column groups used to build the feature subsets below.
drop_cols = ["Result_v1", 'url_chinese_present', "html_num_tags('applet')"]
less_data = ["html_num_tags('object')", "html_num_tags('embed')"]

less_weak_data = ['url_num_query_para', "html_num_tags('head')" ,"html_num_tags('body')", 'url_port']

weak_relations = ['url_len', 'url_query_len', 'url_num_dots','url_entropy']


def _columns_excluding(frame, *excluded_groups):
    """Return the columns of ``frame``, in order, that are in none of the groups.

    Args:
        frame: DataFrame whose columns are filtered.
        *excluded_groups: Iterables of column names to leave out. Names that
            are not present in ``frame`` are ignored.

    Returns:
        list: Remaining column names in their original order.
    """
    excluded = set().union(*excluded_groups)
    return [col for col in frame.columns if col not in excluded]


# Strictest subset: exclude every group (columns judged unevenly distributed
# or weakly correlated, plus the meaningless ones).
cols = _columns_excluding(data, drop_cols, less_data, less_weak_data, weak_relations)
data_1 = data[cols]

# Exclude the weakly correlated columns; less_data is kept.
cols = _columns_excluding(data, drop_cols, less_weak_data, weak_relations)
data_2 = data[cols]

# Exclude the unevenly distributed columns; weak_relations is kept.
cols = _columns_excluding(data, drop_cols, less_data, less_weak_data)
data_3 = data[cols]

# Exclude only drop_cols and less_weak_data; both less_data and
# weak_relations are kept.
cols = _columns_excluding(data, drop_cols, less_weak_data)
data_4 = data[cols]

# Exclude only the columns judged meaningless (drop_cols).
cols = _columns_excluding(data, drop_cols)
data_5 = data[cols]

In [10]:
# Write each feature subset to its own CSV file, omitting the index column.
subset_outputs = {
    "data-comp/data_1.csv": data_1,
    "data-comp/data_2.csv": data_2,
    "data-comp/data_3.csv": data_3,
    "data-comp/data_4.csv": data_4,
    "data-comp/data_5.csv": data_5,
}
for out_path, subset in subset_outputs.items():
    subset.to_csv(out_path, index=False)