In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Kaggle input locations of the competition's train and test CSV files.
train_dir = '/kaggle/input/dsco/train.csv'
test_dir = '/kaggle/input/dsco/test.csv'

# Read both tables in one pass, using the "URL" column as the row index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col="URL") for csv_path in (train_dir, test_dir)
)

In [3]:
# Preview the first five rows of the training table.
df_train.head(n=5)

Unnamed: 0_level_0,url_length,has_ip_address,dot_count,https_flag,url_entropy,token_count,subdomain_count,query_param_count,tld_length,path_length,has_hyphen_in_domain,number_of_digits,tld_popularity,suspicious_file_extension,domain_name_length,percentage_numeric_chars,ClassLabel
URL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
https://www.womensweekly.com.sg,31,0,3,1,3.46132,6.0,2,1,2,0.0,0,0,0,0,3,0.0,1.0
http://116.53.34.145:34075/i,28,1,3,0,3.645593,7.0,2,1,9,2.0,0,15,0,0,2,53.571429,0.0
http://58.23.215.31:8765/wzoptup.exe,36,1,4,0,4.086049,8.0,2,1,7,11.44075,0,13,0,1,3,36.111111,0.0
https://www.dudpro.co.il,24,0,3,1,3.772055,6.0,2,1,2,0.0,0,0,0,0,2,0.0,1.0
http://117.201.113.115:53518/i,30,1,3,0,3.819549,11.2,2,1,9,2.0,0,17,0,0,3,0.371737,0.0


In [4]:
# Preview the first five rows of the test table.
df_test.head(n=5)


Unnamed: 0_level_0,ID,url_length,has_ip_address,dot_count,https_flag,url_entropy,token_count,subdomain_count,query_param_count,tld_length,path_length,has_hyphen_in_domain,number_of_digits,tld_popularity,suspicious_file_extension,domain_name_length,percentage_numeric_chars
URL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
https://www.fastpost.com,1,24,0,2,1,3.522055,5.917567,2,1,3,0.0,0,0,1,0,3,0.0
http://proxy.amazonscouts.com/revada/66df1acad4359_res_out.exe,2,62,0,3,0,3.8125,8.0,1,1,3,33.0,0,7,1,1,0,7.194527
http://42.179.239.133:58058/bin.sh,3,34,1,4,0,3.695947,8.0,2,1,9,6.673771,0,16,0,0,3,47.058824
https://www.saddhamma.org,4,25,0,2,1,3.703465,5.917567,1,1,3,0.0,0,0,0,0,9,0.0
https://www.changeip.com,5,24,0,2,1,3.7238,5.0,1,1,3,0.0,0,0,1,0,8,0.0


In [5]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from skopt import BayesSearchCV
from skopt.space import Integer, Real

In [6]:
# Stack the train and test tables row-wise (axis 0 is pd.concat's default).
df_coba = pd.concat([df_train, df_test])

In [7]:
# Separate the feature columns from the target label.
X = df_train.drop(columns='ClassLabel')
y = df_train['ClassLabel']

# Hold out 20% of the labelled rows, stratified on the label so both parts
# keep the same class ratio. The split is seeded so that a fresh
# Restart & Run All reproduces the same partition and the same scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Base estimator that the hyperparameter search tunes.
model = RandomForestClassifier(random_state=42)

In [8]:
# (rows, columns) of the training portion of the split.
X_train.shape

(64779, 16)

In [9]:
# Ranges the Bayesian optimiser may sample for each forest hyperparameter.
search_spaces = {
    'n_estimators': Integer(100, 1000),
    'max_depth': Integer(3, 30),
    'min_samples_split': Integer(2, 20),
    'min_samples_leaf': Integer(1, 10),
    'max_features': Real(0.1, 1.0),  # feature ratio
}

# Searcher around the base forest; the estimator and its search space are
# passed positionally, everything else by keyword.
bayes_opt = BayesSearchCV(
    model,
    search_spaces,
    n_iter=15,          # number of search iterations
    cv=5,               # 5-fold cross-validation
    scoring='roc_auc',  # evaluation metric
    n_jobs=-1,
    random_state=42,
)

# Run the search on the training portion only.
bayes_opt.fit(X_train, y_train)

# Report the best cross-validated score and the configuration that produced it.
print("Best score (ROC AUC):", bayes_opt.best_score_)
print("Best hyperparameters:", bayes_opt.best_params_)

Best score (ROC AUC): 0.9928888492475941
Best hyperparameters: OrderedDict([('max_depth', 15), ('max_features', 0.9268502695024393), ('min_samples_leaf', 2), ('min_samples_split', 10), ('n_estimators', 269)])


In [10]:
# Score the tuned model on the holdout rows that the search never saw, so the
# cross-validated ROC AUC can be compared with an independent estimate.
# Column 1 of predict_proba is the probability of the second entry in the
# fitted model's classes_.
holdout_auc = roc_auc_score(y_test, bayes_opt.predict_proba(X_test)[:, 1])
print("Holdout ROC AUC:", holdout_auc)

# Build the submission from the competition test set, not from the holdout
# split. Selecting X_train's columns drops the "ID" column and guarantees the
# same feature order the model was fitted on.
y_pred = bayes_opt.predict(df_test[X_train.columns])

# Reuse the IDs shipped with the test file so every prediction is tied to the
# row it was made for, instead of a freshly generated 1..n counter.
output = pd.DataFrame({
    "ID": df_test["ID"].to_numpy(),
    "ClassLabel": y_pred
})

# NOTE(review): the ';' delimiter is kept from the original; confirm it is
# the separator the competition's submission format expects.
output.to_csv("submission.csv",  index=False, sep=';', encoding = 'utf-8')