In [1]:
import pandas as pd
import re
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pickle

In [2]:
df = pd.read_csv("urldata.csv")
df = df.drop(columns=['result']) # 불필요한 column 제거

In [7]:
df

Unnamed: 0.1,Unnamed: 0,url,label
0,0,https://www.google.com,benign
1,1,https://www.youtube.com,benign
2,2,https://www.facebook.com,benign
3,3,https://www.baidu.com,benign
4,4,https://www.wikipedia.org,benign
...,...,...,...
450171,450171,http://ecct-it.com/docmmmnn/aptgd/index.php,malicious
450172,450172,http://faboleena.com/js/infortis/jquery/plugin...,malicious
450173,450173,http://faboleena.com/js/infortis/jquery/plugin...,malicious
450174,450174,http://atualizapj.com/,malicious


In [9]:
# 특질 추출 함수
def extract_features(url):
    # Feature 1: URL 길이
    url_length = len(url)

    # Feature 2: 특정 특수문자 개수 (!, #, @, %, - 등)
    special_char_count = len(re.findall(r'[!#@%\-_]', url))

    # Feature 3: 특정 단어 포함 여부 (login, admin, confirm, .exe, 000webhost 등)
    specific_words = ['login', 'admin', 'confirm', '.exe', '000webhost', 'secure', 'account', 'update', 'verify']
    contains_specific_word = any(word in url for word in specific_words)

    # Feature 4: 숫자 구성 비율
    num_digits = sum(c.isdigit() for c in url)
    digit_ratio = num_digits / len(url)

    # Feature 5: @ 기호 이후 도메인 여부
    contains_at_symbol_domain = int('@' in url and re.search(r'@[\w.-]+', url) is not None)

    # Feature 6: Path 및 Query String 길이
    path_length = len(re.findall(r'/[\w.-]+', url))  # 경로 요소 길이
    query_length = len(re.findall(r'\?.+', url))     # 쿼리 스트링 길이

    # Feature 7: N-gram 특질 (bigram 예시)
    ngrams = re.findall(r'..', url)  # Bigram
    ngram_count = len(ngrams)

    return (
        url_length, special_char_count, int(contains_specific_word), digit_ratio,
        contains_at_symbol_domain, path_length, query_length, ngram_count
    )

In [10]:
# 각 특질 열을 추가
df[['url_length', 'special_char_count', 'contains_specific_word', 'digit_ratio',
    'contains_at_symbol_domain', 'path_length', 'query_length', 'ngram_count']] = df['url'].apply(lambda x: pd.Series(extract_features(x)))

# Feature와 Label 정의
X = df[['url_length', 'special_char_count', 'contains_specific_word', 'digit_ratio',
        'contains_at_symbol_domain', 'path_length', 'query_length', 'ngram_count']]
y = df['label']  # 클래스 컬럼 (benign, malicious)

In [11]:
# 학습 및 테스트 데이터 분할
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 학습
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# 예측
y_pred = model.predict(X_test)

In [12]:
# 결과 평가
from sklearn.metrics import accuracy_score, classification_report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8806366342351948

Classification Report:
               precision    recall  f1-score   support

      benign       0.89      0.96      0.92     68921
   malicious       0.83      0.62      0.71     21115

    accuracy                           0.88     90036
   macro avg       0.86      0.79      0.82     90036
weighted avg       0.88      0.88      0.87     90036



In [13]:
# 모델 저장
with open("model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)