In [384]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.base import BaseEstimator, TransformerMixin
from scipy import stats
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [385]:
df = pd.read_csv('https://drive.google.com/uc?id=1w8DnxXGwaF1dLKlMxmXWVQFOH4vcvfTp')
df.head()

Unnamed: 0,id,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,CharContinuationRate,TLDLegitimateProb,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,1,,https://www.northcm.ac.th,24.0,www.northcm.ac.th,17.0,0.0,,0.8,,...,0.0,0.0,1.0,,3.0,,69.0,,,1
1,4,8135291.txt,http://uqr.to/1il1z,,,,,to,1.0,0.000896,...,,0.0,0.0,,,,,,1.0,0
2,5,586561.txt,https://www.woolworthsrewards.com.au,35.0,www.woolworthsrewards.com.au,28.0,0.0,au,0.857143,,...,1.0,0.0,1.0,33.0,7.0,8.0,15.0,,2.0,1
3,6,,,31.0,,,,com,0.5625,0.522907,...,1.0,0.0,1.0,24.0,5.0,14.0,,,,1
4,11,412632.txt,,,www.nyprowrestling.com,22.0,0.0,,1.0,,...,0.0,0.0,1.0,,,14.0,,0.0,,1


In [386]:
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

#Categorical bersifat numeric diambil dari metadata yang ada di artikel https://www.sciencedirect.com/science/article/pii/S0167404823004558#se0060
numerical_categorical_columns = [
    "IsDomainIP",         #1/0
    "HasObfuscation",     #1/0
    "IsHTTPS",            #1/0
    "HasTitle",           #1/0
    "HasFavicon",         #1/0
    "IsResponsive",       #1/0
    "HasExternalFormSubmit",  #1/0
    "HasSocialNet",       #1/0
    "HasSubmitButton",    #1/0
    "HasHiddenFields",    #1/0
    "HasPasswordField",   #1/0
    "Bank",               #1/0
    "Pay",                #1/0
    "Crypto",             #1/0
    "HasCopyrightInfo",   #1/0
    "Robots",             #1/0, asumsi bahwa 1/0 merupakan ada atau tidaknya robot karena tidak ada di metadata
    "HasDescription",     #1/0
]

# Gabungkan semua kolom kategori
categorical_columns += numerical_categorical_columns

nonCategorical_columns = [col for col in df.columns if col not in categorical_columns]
nonCategorical_df = pd.DataFrame(nonCategorical_columns)

categorical_df = pd.DataFrame(categorical_columns)

print("Jumlah Categorical Features:", len(categorical_columns))
print("Categorical Features:", categorical_columns)
print("Jumlah Non-Categorical Features:", len(nonCategorical_columns))
print("Non-Categorical Features:", nonCategorical_columns)

Jumlah Categorical Features: 22
Categorical Features: ['FILENAME', 'URL', 'Domain', 'TLD', 'Title', 'IsDomainIP', 'HasObfuscation', 'IsHTTPS', 'HasTitle', 'HasFavicon', 'IsResponsive', 'HasExternalFormSubmit', 'HasSocialNet', 'HasSubmitButton', 'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay', 'Crypto', 'HasCopyrightInfo', 'Robots', 'HasDescription']
Jumlah Non-Categorical Features: 34
Non-Categorical Features: ['id', 'URLLength', 'DomainLength', 'CharContinuationRate', 'TLDLegitimateProb', 'URLCharProb', 'TLDLength', 'NoOfSubDomain', 'NoOfObfuscatedChar', 'ObfuscationRatio', 'NoOfLettersInURL', 'LetterRatioInURL', 'NoOfDegitsInURL', 'DegitRatioInURL', 'NoOfEqualsInURL', 'NoOfQMarkInURL', 'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL', 'SpacialCharRatioInURL', 'LineOfCode', 'LargestLineLength', 'DomainTitleMatchScore', 'URLTitleMatchScore', 'NoOfURLRedirect', 'NoOfSelfRedirect', 'NoOfPopup', 'NoOfiFrame', 'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef', 'NoOfEmptyRef', 'NoOfExt

# 1. Data Cleaning

## I. Handling Missing Data

In [387]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, categorical_columns, numerical_columns, missing_threshold=70, correlation_threshold=0.1):
        """
        Parameters:
        -----------
        categorical_columns : list
            List nama kolom kategorikal
        numerical_columns : list
            List nama kolom numerik
        missing_threshold : float, default=70
            Persentase missing values yang diperbolehkan (kolom akan dihapus jika melebihi)
        correlation_threshold : float, default=0.1
            Minimum korelasi dengan target yang dibutuhkan (kolom akan dihapus jika kurang dari ini)
        """
        self.categorical_columns = categorical_columns
        self.numerical_columns = numerical_columns
        self.missing_threshold = missing_threshold
        self.correlation_threshold = correlation_threshold
        self.columns_to_drop = None
        self.feature_scores = None
    
    def calculate_correlation_with_label(self, X, column, y):
        """Menghitung korelasi antara fitur dan label"""
        try:
            if column in self.categorical_columns:
                # Spearman correlation untuk kolom kategorikal
                try:
                    col_data = pd.to_numeric(X[column], errors='coerce')
                    correlation, _ = stats.spearmanr(col_data, y, nan_policy='omit')
                    return abs(correlation) if not np.isnan(correlation) else 0
                except:
                    return 0
            else:
                # Pearson correlation untuk kolom numerik
                correlation = abs(X[column].corr(y))
                return correlation if not np.isnan(correlation) else 0
        except Exception as e:
            print(f"Warning: Could not calculate correlation for {column}: {str(e)}")
            return 0

    def analyze_features(self, X, y):
        """Menganalisis fitur berdasarkan missing values dan korelasi dengan target"""
        # Hitung persentase missing values
        missing_percentages = (X.isnull().sum() / len(X)) * 100
        
        results = []
        for column in X.columns:
            if column == 'label':
                continue
                
            correlation = self.calculate_correlation_with_label(X, column, y)
            missing_pct = missing_percentages[column]
            unique_count = X[column].nunique()
            
            results.append({
                'column': column,
                'missing_percentage': missing_pct,
                'correlation_with_label': correlation,
                'unique_values': unique_count,
                'is_categorical': column in self.categorical_columns,
                'score': missing_pct - (correlation * 100)  # Score untuk penentuan drop
            })
        
        return pd.DataFrame(results)

    def fit(self, X, y=None):
        """
        Menganalisis fitur dan menentukan kolom mana yang akan dihapus.
        
        Parameters:
        -----------
        X : pandas DataFrame
            Data input
        y : pandas Series atau numpy array
            Target variable
        """
        if y is None:
            raise ValueError("Target variable (y) harus disediakan untuk feature selection")
        
        # Analisis fitur
        self.feature_scores = self.analyze_features(X, y)
        
        # Identifikasi kolom yang akan dihapus
        self.columns_to_drop = self.feature_scores[
            (self.feature_scores['missing_percentage'] > self.missing_threshold) |
            (self.feature_scores['correlation_with_label'] < self.correlation_threshold)
        ]['column'].tolist()

        self._print_summary(X)
        
        return self
    
    def transform(self, X):
        """
        Menghapus kolom yang telah diidentifikasi untuk dihapus.
        
        Parameters:
        -----------
        X : pandas DataFrame
            Data input
            
        Returns:
        --------
        pandas DataFrame
            Data dengan kolom yang sudah dihapus
        """
        if self.columns_to_drop is None:
            raise ValueError("Transformer belum di-fit. Panggil fit() terlebih dahulu.")
        
        return X.drop(columns=self.columns_to_drop, errors='ignore')
    
    def _print_summary(self, X):
        """Mencetak ringkasan hasil feature selection"""
        print("\nFeature Selection Summary:")
        print("-" * 50)
        print(f"Total features awal: {X.shape[1]}")
        print(f"Features yang akan dihapus: {len(self.columns_to_drop)}")
        print(f"Features yang dipertahankan: {X.shape[1] - len(self.columns_to_drop)}")
        
        if len(self.columns_to_drop) > 0:
            print("\nFeatures yang dihapus:")
            for col in self.columns_to_drop:
                feat_info = self.feature_scores[self.feature_scores['column'] == col].iloc[0]
                print(f"\n- {col}")
                print(f"  Missing values: {feat_info['missing_percentage']:.2f}%")
                print(f"  Correlation with target: {feat_info['correlation_with_label']:.4f}")
        
        # Top features berdasarkan korelasi
        print("\nTop 5 features berdasarkan korelasi dengan target:")
        top_features = self.feature_scores.nsmallest(5, 'score')
        for _, feat in top_features.iterrows():
            print(f"- {feat['column']}: correlation = {feat['correlation_with_label']:.4f}, "
                  f"missing = {feat['missing_percentage']:.2f}%")

In [389]:
class MissingValueHandler(BaseEstimator, TransformerMixin):
    def __init__(self, categorical_columns, numerical_columns, strategy='advanced'):
        """
        Parameters:
        -----------
        categorical_columns : list
            List nama kolom kategorikal
        numerical_columns : list
            List nama kolom numerik
        strategy : str, default='advanced'
            Strategi untuk mengisi missing values:
            - 'simple': mean untuk numerik, mode untuk kategorikal
            - 'advanced': median untuk numerik yang skewed, mean untuk yang normal,
                         mode untuk kategorikal dengan kardinalitas rendah,
                         'MISSING' untuk kategorikal dengan kardinalitas tinggi
        """
        self.categorical_columns = categorical_columns
        self.numerical_columns = numerical_columns
        self.strategy = strategy
        self.imputer_dict = {}
        
    def analyze_missing(self, X):
        """Menganalisis pola missing values pada dataset"""
        # Hitung persentase missing values per kolom
        missing_percentages = (X.isnull().sum() / len(X)) * 100
        
        missing_info = []
        for col in X.columns:
            if missing_percentages[col] > 0:
                info = {
                    'column': col,
                    'missing_percentage': missing_percentages[col],
                    'data_type': X[col].dtype,
                }
                
                if col in self.numerical_columns:
                    non_null_data = X[col].dropna()
                    info.update({
                        'mean': non_null_data.mean(),
                        'median': non_null_data.median(),
                        'skewness': non_null_data.skew()
                    })
                elif col in self.categorical_columns:
                    info.update({
                        'mode': X[col].mode().iloc[0] if not X[col].mode().empty else 'No mode',
                        'unique_values': X[col].nunique(),
                        'cardinality_ratio': X[col].nunique() / len(X)
                    })
                
                missing_info.append(info)
        
        return pd.DataFrame(missing_info)
    
    def fit(self, X, y=None):
        """
        Mempelajari parameter imputation dari data training
        
        Parameters:
        -----------
        X : pandas DataFrame
            Data input
        y : pandas Series atau numpy array, optional
            Target variable (tidak digunakan)
        """
        # Analisis missing values
        self.missing_analysis = self.analyze_missing(X)

        self._print_initial_summary(X)
        
        # Imputation berdasarkan strategi
        if self.strategy == 'simple':
            self._fit_simple(X)
        else:
            self._fit_advanced(X)
            
        self._print_imputation_summary()
        
        return self
    
    def _fit_simple(self, X):
        """Implementasi simple imputation strategy"""
        for col in X.columns:
            if col in self.numerical_columns:
                # Mean untuk kolom numerik
                self.imputer_dict[col] = {
                    'method': 'mean',
                    'value': X[col].mean()
                }
            elif col in self.categorical_columns:
                # mode untuk kolom kategorikal
                mode_val = X[col].mode().iloc[0] if not X[col].mode().empty else "MISSING"
                self.imputer_dict[col] = {
                    'method': 'mode',
                    'value': mode_val
                }
    
    def _fit_advanced(self, X):
        """Implementasi advanced imputation strategy"""
        for col in X.columns:
            if col in self.numerical_columns:
                non_null_data = X[col].dropna()
                if abs(non_null_data.skew()) > 1:  # Data skewed
                    self.imputer_dict[col] = {
                        'method': 'median',
                        'value': non_null_data.median()
                    }
                else:  # Distribusi normal
                    self.imputer_dict[col] = {
                        'method': 'mean',
                        'value': non_null_data.mean()
                    }
                    
            elif col in self.categorical_columns:
                cardinality_ratio = X[col].nunique() / len(X)
                if cardinality_ratio < 0.05:  # Kardinalitas rendah
                    self.imputer_dict[col] = {
                        'method': 'mode',
                        'value': X[col].mode().iloc[0] if not X[col].mode().empty else "MISSING"
                    }
                else:  # Kardinalitas tinggi
                    self.imputer_dict[col] = {
                        'method': 'constant',
                        'value': "MISSING"
                    }
    
    def transform(self, X):
        """
        Mengaplikasikan imputation pada data
        
        Parameters:
        -----------
        X : pandas DataFrame
            Data input
            
        Returns:
        --------
        pandas DataFrame
            Data dengan missing values yang sudah diisi
        """
        if not self.imputer_dict:
            raise ValueError("Transformer belum di-fit. Panggil fit() terlebih dahulu.")
            
        X_imputed = X.copy()
        
        for col, imputer_info in self.imputer_dict.items():
            if col in X.columns:
                X_imputed[col].fillna(imputer_info['value'], inplace=True)
        
        # Verifikasi hasil
        remaining_missing = X_imputed.isnull().sum()
        if remaining_missing.sum() > 0:
            print("\nWarning: Masih terdapat missing values:")
            print(remaining_missing[remaining_missing > 0])
            
        return X_imputed
    
    def _print_initial_summary(self, X):
        """Mencetak ringkasan awal missing values"""
        print("\nMissing Value Analysis:")
        print("-" * 50)
        
        total_missing = X.isnull().sum().sum()
        total_cells = X.size
        
        print(f"Total missing values: {total_missing:,}")
        print(f"Total cells: {total_cells:,}")
        print(f"Overall missing percentage: {(total_missing/total_cells)*100:.2f}%")
        
        if not self.missing_analysis.empty:
            print("\nKolom dengan missing values:")
            for _, row in self.missing_analysis.iterrows():
                print(f"\n- {row['column']}")
                print(f"  Missing: {row['missing_percentage']:.2f}%")
                if 'mean' in row:
                    print(f"  Mean: {row['mean']:.2f}")
                    print(f"  Median: {row['median']:.2f}")
                    print(f"  Skewness: {row['skewness']:.2f}")
                if 'unique_values' in row:
                    print(f"  Unique values: {row['unique_values']}")
                    print(f"  Cardinality ratio: {row['cardinality_ratio']:.4f}")
    
    def _print_imputation_summary(self):
        """Mencetak ringkasan metode imputation yang digunakan"""
        print("\nImputation Methods:")
        print("-" * 50)
        
        for col, info in self.imputer_dict.items():
            print(f"\n{col}:")
            print(f"  Method: {info['method']}")
            if isinstance(info['value'], (int, float)):
                print(f"  Value: {info['value']:.4f}")
            else:
                print(f"  Value: {info['value']}")

### II. Dealing with Outliers

In [390]:
class OutlierHandler(BaseEstimator, TransformerMixin):
    def __init__(self, numerical_columns, method='zscore', threshold=3, strategy='clip'):
        """
        Parameters:
        -----------
        numerical_columns : list
            List nama kolom numerik yang akan dihandle outliernya
        method : str, default='zscore'
            Metode deteksi outlier ('zscore', 'iqr')
        threshold : float, default=3
            Threshold untuk menentukan outlier:
            - Untuk zscore: jumlah standar deviasi dari mean
            - Untuk IQR: multiplier untuk IQR (biasanya 1.5)
        strategy : str, default='clip'
            Strategi handling outlier:
            - 'clip': clip nilai ke batas atas/bawah
            - 'remove': hapus baris dengan outlier (hanya untuk fit_transform)
        """
        self.numerical_columns = numerical_columns
        self.method = method
        self.threshold = threshold
        self.strategy = strategy
        self.bounds = {}
        
    def detect_outliers(self, X, column):
        """
        Mendeteksi outlier dalam satu kolom
        
        Returns:
        --------
        numpy array
            Boolean mask dimana True menandakan outlier
        """
        if self.method == 'zscore':
            z_scores = np.abs(stats.zscore(X[column].dropna()))
            return z_scores > self.threshold
            
        elif self.method == 'iqr':
            Q1 = X[column].quantile(0.25)
            Q3 = X[column].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - (self.threshold * IQR)
            upper_bound = Q3 + (self.threshold * IQR)
            return (X[column] < lower_bound) | (X[column] > upper_bound)
    
    def fit(self, X, y=None):
        """
        Menghitung batas untuk setiap kolom numerik
        """
        self.outlier_stats = {}
        
        print("\nOutlier Analysis:")
        print("-" * 50)
        
        for column in self.numerical_columns:
            if column in X.columns:
                # Hitung statistik dasar
                stats_dict = {
                    'mean': X[column].mean(),
                    'std': X[column].std(),
                    'min': X[column].min(),
                    'max': X[column].max(),
                    'Q1': X[column].quantile(0.25),
                    'Q3': X[column].quantile(0.75)
                }
                stats_dict['IQR'] = stats_dict['Q3'] - stats_dict['Q1']
                
                # Deteksi outlier
                outlier_mask = self.detect_outliers(X, column)
                non_null_data = X[column].dropna()
                n_outliers = outlier_mask.sum()
                
                stats_dict.update({
                    'n_outliers': n_outliers,
                    'outlier_percentage': (n_outliers / len(non_null_data)) * 100
                })
                
                # Tentukan bounds
                if self.method == 'zscore':
                    self.bounds[column] = {
                        'lower': stats_dict['mean'] - (self.threshold * stats_dict['std']),
                        'upper': stats_dict['mean'] + (self.threshold * stats_dict['std'])
                    }
                else:  # IQR method
                    self.bounds[column] = {
                        'lower': stats_dict['Q1'] - (self.threshold * stats_dict['IQR']),
                        'upper': stats_dict['Q3'] + (self.threshold * stats_dict['IQR'])
                    }
                
                stats_dict.update({
                    'lower_bound': self.bounds[column]['lower'],
                    'upper_bound': self.bounds[column]['upper']
                })
                
                self.outlier_stats[column] = stats_dict
                
                self._print_column_summary(column, stats_dict)
        
        return self
    
    def transform(self, X):
        """
        Menangani outlier sesuai strategy yang dipilih
        """
        if not self.bounds:
            raise ValueError("Transformer belum di-fit. Panggil fit() terlebih dahulu.")
        
        X_transformed = X.copy()
        
        for column, bounds in self.bounds.items():
            if column in X.columns:
                if self.strategy == 'clip':
                    X_transformed[column] = X_transformed[column].clip(
                        lower=bounds['lower'],
                        upper=bounds['upper']
                    )
                elif self.strategy == 'remove' and hasattr(self, 'outlier_stats'):
                    # Mask untuk baris yang akan dipertahankan
                    keep_mask = ~(
                        (X_transformed[column] < bounds['lower']) |
                        (X_transformed[column] > bounds['upper'])
                    )
                    X_transformed = X_transformed[keep_mask]
        
        if self.strategy == 'remove':
            print(f"\nShape after removing outliers: {X_transformed.shape}")
            
        return X_transformed
    
    def _print_column_summary(self, column, stats):
        """Mencetak ringkasan statistik dan outlier untuk satu kolom"""
        print(f"\nColumn: {column}")
        print(f"Original range: [{stats['min']:.2f}, {stats['max']:.2f}]")
        print(f"Mean: {stats['mean']:.2f}")
        print(f"Std: {stats['std']:.2f}")
        print(f"Q1: {stats['Q1']:.2f}")
        print(f"Q3: {stats['Q3']:.2f}")
        print(f"IQR: {stats['IQR']:.2f}")
        print(f"Outlier bounds: [{stats['lower_bound']:.2f}, {stats['upper_bound']:.2f}]")
        print(f"Number of outliers: {stats['n_outliers']}")
        print(f"Outlier percentage: {stats['outlier_percentage']:.2f}%")
    
    def get_outlier_summary(self):
        """Mengembalikan ringkasan outlier dalam bentuk DataFrame"""
        if not hasattr(self, 'outlier_stats'):
            raise ValueError("No outlier statistics available. Run fit() first.")
        
        summary_data = []
        for column, stats in self.outlier_stats.items():
            summary_data.append({
                'column': column,
                'outlier_count': stats['n_outliers'],
                'outlier_percentage': stats['outlier_percentage'],
                'lower_bound': stats['lower_bound'],
                'upper_bound': stats['upper_bound']
            })
        
        return pd.DataFrame(summary_data)

### III. Remove Duplicates

In [391]:
class DuplicateHandler(BaseEstimator, TransformerMixin):
    def __init__(self, subset=None, keep='first', verbose=True):
        """
        Parameters:
        -----------
        subset : list atau None, default=None
            Kolom yang digunakan untuk identifikasi duplikat.
            None berarti menggunakan semua kolom.
        keep : {'first', 'last', False}, default='first'
            - 'first' : Pertahankan kemunculan pertama dari duplikat
            - 'last' : Pertahankan kemunculan terakhir dari duplikat
            - False : Hapus semua duplikat
        verbose : bool, default=True
            Jika True, print informasi tentang duplikat yang ditemukan
        """
        self.subset = subset
        self.keep = keep
        self.verbose = verbose
        self.duplicate_info = None
        
    def analyze_duplicates(self, X):
        """
        Menganalisis duplikat dalam dataset
        
        Parameters:
        -----------
        X : pandas DataFrame
            Data yang akan dianalisis
            
        Returns:
        --------
        dict
            Dictionary berisi informasi tentang duplikat
        """
        # Identifikasi duplikat
        duplicates = X.duplicated(subset=self.subset, keep=False)
        
        # Hitung statistik duplikat
        n_duplicates = duplicates.sum()
        duplicate_rows = X[duplicates].index.tolist()
        
        duplicate_combinations = None
        if self.subset is not None and n_duplicates > 0:
            duplicate_combinations = (X[duplicates][self.subset]
                                   .value_counts()
                                   .reset_index()
                                   .rename(columns={0: 'count'})
                                   .query('count > 1'))
        
        return {
            'total_rows': len(X),
            'n_duplicates': n_duplicates,
            'duplicate_percentage': (n_duplicates / len(X)) * 100,
            'duplicate_rows': duplicate_rows,
            'duplicate_combinations': duplicate_combinations
        }
    
    def fit(self, X, y=None):
        """
        Menganalisis duplikat dalam data training
        
        Parameters:
        -----------
        X : pandas DataFrame
            Data input
        y : pandas Series atau numpy array, optional
            Target variable (tidak digunakan)
        """
        # Analisis duplikat
        self.duplicate_info = self.analyze_duplicates(X)
        
        if self.verbose:
            self._print_duplicate_summary()
        
        return self
    
    def transform(self, X):
        """
        Menghapus duplikat dari data
        
        Parameters:
        -----------
        X : pandas DataFrame
            Data input
            
        Returns:
        --------
        pandas DataFrame
            Data tanpa duplikat
        """

        initial_shape = X.shape
        
        # Hapus duplikat
        X_no_duplicates = X.drop_duplicates(subset=self.subset, keep=self.keep)
        
        if self.verbose:
            rows_removed = initial_shape[0] - X_no_duplicates.shape[0]
            if rows_removed > 0:
                print(f"\nRemoved {rows_removed:,} duplicate rows")
                print(f"Final shape: {X_no_duplicates.shape}")
        
        return X_no_duplicates
    
    def _print_duplicate_summary(self):
        """Mencetak ringkasan analisis duplikat"""
        if self.duplicate_info is None:
            raise ValueError("No duplicate information available. Run fit() first.")
            
        print("\nDuplicate Analysis Summary:")
        print("-" * 50)
        print(f"Total rows: {self.duplicate_info['total_rows']:,}")
        print(f"Duplicate rows: {self.duplicate_info['n_duplicates']:,}")
        print(f"Duplicate percentage: {self.duplicate_info['duplicate_percentage']:.2f}%")
        
        if (self.subset is not None and 
            self.duplicate_info['duplicate_combinations'] is not None and 
            not self.duplicate_info['duplicate_combinations'].empty):
            
            print("\nDuplicate value combinations:")
            print(self.duplicate_info['duplicate_combinations'].to_string(index=False))
    
    def get_duplicate_summary(self):
        """
        Mengembalikan ringkasan duplikat dalam bentuk dictionary
        
        Returns:
        --------
        dict
            Dictionary berisi informasi tentang duplikat
        """
        if self.duplicate_info is None:
            raise ValueError("No duplicate information available. Run fit() first.")
        
        return self.duplicate_info

### IV. Feature Engineering

**Feature engineering** involves creating new features (input variables) or transforming existing ones to improve the performance of machine learning models. Feature engineering aims to enhance the model's ability to learn patterns and make accurate predictions from the data. It's often said that "good features make good models."

1. **Feature Selection:** Feature engineering can involve selecting the most relevant and informative features from the dataset. Removing irrelevant or redundant features not only simplifies the model but also reduces the risk of overfitting.

2. **Creating New Features:** Sometimes, the existing features may not capture the underlying patterns effectively. In such cases, engineers create new features that provide additional information. For example:
   
   - **Polynomial Features:** Engineers may create new features by taking the square, cube, or other higher-order terms of existing numerical features. This can help capture nonlinear relationships.
   
   - **Interaction Features:** Interaction features are created by combining two or more existing features. For example, if you have features "length" and "width," you can create an "area" feature by multiplying them.

3. **Binning or Discretization:** Continuous numerical features can be divided into bins or categories. For instance, age values can be grouped into bins like "child," "adult," and "senior."

4. **Domain-Specific Feature Engineering:** Depending on the domain and problem, engineers may create domain-specific features. For example, in fraud detection, features related to transaction history and user behavior may be engineered to identify anomalies.

Feature engineering is both a creative and iterative process. It requires a deep understanding of the data, domain knowledge, and experimentation to determine which features will enhance the model's predictive power.

In [392]:
class URLWebsiteFeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
        
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        X_new = X.copy()
        
        # 1. URL Complexity Score
        X_new['URLComplexityScore'] = (
            X_new['URLLength'] * 0.3 +
            X_new['NoOfOtherSpecialCharsInURL'] * 0.3 +
            X_new['NoOfQMarkInURL'] * 0.2 +
            X_new['NoOfEqualsInURL'] * 0.2
        )
        
        # 2. Character Distribution Features
        X_new['SpecialCharToLengthRatio'] = (
            (X_new['NoOfEqualsInURL'] + X_new['NoOfQMarkInURL'] + 
             X_new['NoOfOtherSpecialCharsInURL']) / X_new['URLLength']
        ).fillna(0)
        
        # 3. Domain Risk Score
        X_new['DomainRiskScore'] = (
            (X_new['DomainLength'] / X_new['URLLength']) * 0.4 +
            (1 - X_new['DomainTitleMatchScore']) * 0.3 +
            (1 - X_new['URLTitleMatchScore']) * 0.3
        )
        
        # 4. Security Features Composite
        security_features = ['IsHTTPS', 'HasFavicon', 'Robots', 'HasCopyrightInfo']
        X_new['SecurityScore'] = X_new[security_features].sum(axis=1) / len(security_features)
        
        # 5. Content Richness Score
        X_new['ContentRichnessScore'] = (
            X_new['NoOfImage'] * 0.2 +
            X_new['NoOfJS'] * 0.2 +
            X_new['LineOfCode'] * 0.2 +
            X_new['NoOfSelfRef'] * 0.2 +
            X_new['NoOfExternalRef'] * 0.2
        ) / X_new[['NoOfImage', 'NoOfJS', 'LineOfCode', 'NoOfSelfRef', 'NoOfExternalRef']].max().max()
        
        # 6. Interaction Elements Score
        interaction_features = ['HasSubmitButton', 'HasHiddenFields', 'Pay', 'IsResponsive']
        X_new['InteractionScore'] = X_new[interaction_features].sum(axis=1) / len(interaction_features)
        
        # 7. SEO Elements Score
        seo_features = ['HasTitle', 'HasDescription', 'HasSocialNet']
        X_new['SEOScore'] = X_new[seo_features].sum(axis=1) / len(seo_features)
        
        # 8. Numeric to Character Ratio
        X_new['NumericToCharRatio'] = (
            X_new['NoOfDegitsInURL'] / X_new['NoOfLettersInURL']
        ).fillna(0)
        
        # 9. URL Entropy Score
        X_new['URLEntropyScore'] = (
            X_new['CharContinuationRate'] * 0.5 +
            X_new['URLCharProb'] * 0.5
        )
        
        # New features
        new_features = [
            'URLComplexityScore', 'SpecialCharToLengthRatio', 'DomainRiskScore',
            'SecurityScore', 'ContentRichnessScore', 'InteractionScore',
            'SEOScore', 'NumericToCharRatio', 'URLEntropyScore'
        ]
            
        return X_new

In [393]:
def handle_infinite_values(df, columns):
    """
    Handle infinite values in specified columns by replacing them with NaN
    and then filling with column mean
    """
    df = df.copy()
    for col in columns:
        mask = np.isinf(df[col])
        if mask.any():
            df.loc[mask, col] = np.nan

        if df[col].isna().any():
            df[col] = df[col].fillna(df[col].mean())
    
    return df

# Data Preprosesing

In [394]:
class FeatureScaler:
    def __init__(self, method='standard'):
        self.method = method
        self.scaler = None
        self.numeric_columns = None

    def fit(self, X):
        # Store numeric columns
        self.numeric_columns = X.select_dtypes(include=['float64', 'int64']).columns
        
        if self.method == 'standard':
            from sklearn.preprocessing import StandardScaler
            self.scaler = StandardScaler()
        elif self.method == 'minmax':
            from sklearn.preprocessing import MinMaxScaler
            self.scaler = MinMaxScaler()
        else:
            raise ValueError("Method must be either 'standard' or 'minmax'")
        
        # Fit scaler untuk kolom numerical
        if len(self.numeric_columns) > 0:
            self.scaler.fit(X[self.numeric_columns])
        
        return self

    def transform(self, X):
        X_scaled = X.copy()
        
        # Transform untuk kolom numerical
        if len(self.numeric_columns) > 0:
            # Memastikan tidak ada infinite value
            X_temp = X[self.numeric_columns].replace([np.inf, -np.inf], np.nan)
            X_temp = X_temp.fillna(X_temp.mean())
            
            X_scaled[self.numeric_columns] = self.scaler.transform(X_temp)
        
        return X_scaled

    def fit_transform(self, X):
        return self.fit(X).transform(X)

In [395]:
class FeatureEncoder(BaseEstimator, TransformerMixin):
    """
    A sklearn-compatible transformer for feature encoding.
    """
    def __init__(self, method='label'):
        """
        Parameters:
        -----------
        method : str, default='label'
            Encoding method to use ('label' or 'onehot')
        """
        self.method = method
        self.encoders = {}
        self.categorical_columns = None
        self.feature_names_out_ = None
    
    def fit(self, X, y=None):
        self.categorical_columns = X.select_dtypes(include=['object']).columns
        
        if self.method == 'label':
            for col in self.categorical_columns:
                self.encoders[col] = LabelEncoder()
                self.encoders[col].fit(X[col].astype(str))
        else:  # onehot
            for col in self.categorical_columns:
                self.encoders[col] = OneHotEncoder(sparse=False, handle_unknown='ignore')
                self.encoders[col].fit(X[[col]])
        
        if self.method == 'onehot':
            self.feature_names_out_ = []
            for col in X.columns:
                if col in self.categorical_columns:
                    self.feature_names_out_.extend(
                        [f"{col}_{cat}" for cat in self.encoders[col].get_feature_names_out([col])]
                    )
                else:
                    self.feature_names_out_.append(col)
        else:
            self.feature_names_out_ = list(X.columns)
        
        return self
    
    def transform(self, X):
        X_encoded = X.copy()
        
        if self.method == 'label':
            # Transform menggunakan label encoder
            for col in self.categorical_columns:
                X_encoded[col] = self.encoders[col].transform(X[col].astype(str))
        else:  # onehot
            # Transform menggunakan one-hot encoder
            for col in self.categorical_columns:
                # Get encoded column names
                encoded_features = [f"{col}_{cat}" for cat in 
                                 self.encoders[col].get_feature_names_out([col])]
                
                # Transform dan add new columns
                encoded_cols = self.encoders[col].transform(X[[col]])
                encoded_df = pd.DataFrame(encoded_cols, columns=encoded_features, 
                                      index=X.index)
                
                # Drop original column dan add hasil encoded 
                X_encoded = X_encoded.drop(col, axis=1)
                X_encoded = pd.concat([X_encoded, encoded_df], axis=1)
        
        return X_encoded
    
    def get_feature_names_out(self, feature_names_in=None):
        """Return feature names for output features."""
        return self.feature_names_out_

In [396]:
class ImbalanceHandler(BaseEstimator, TransformerMixin):
    """
    A sklearn-compatible transformer for handling imbalanced datasets.
    Note: This transformer modifies both X and y, so it should typically
    be used in a separate pipeline for preprocessing.
    """
    def __init__(self, method='smote', sampling_ratio=1.0):
        """
        Parameters:
        -----------
        method : str, default='smote'
            Resampling method to use ('smote', 'oversample', or 'undersample')
        sampling_ratio : float, default=1.0
            Ratio of samples in minority class to majority class
        """
        self.method = method
        self.sampling_ratio = sampling_ratio
        
        # Initialize resampler berdasarkan method
        if method == 'smote':
            self.resampler = SMOTE(sampling_strategy=sampling_ratio, random_state=42)
        elif method == 'oversample':
            self.resampler = RandomOverSampler(sampling_strategy=sampling_ratio, random_state=42)
        elif method == 'undersample':
            self.resampler = RandomUnderSampler(sampling_strategy=sampling_ratio, random_state=42)
        else:
            raise ValueError("Method must be 'smote', 'oversample', or 'undersample'")
    
    def fit(self, X, y):
        self.feature_names_ = X.columns if isinstance(X, pd.DataFrame) else None
        
        print("Class distribution before resampling:")
        print(Counter(y))
        
        self.resampler.fit(X, y)
        return self
    
    def transform(self, X, y=None):

        if y is None:
            return X
            
        # Perform resampling
        X_resampled, y_resampled = self.resampler.fit_resample(X, y)
        
        print("\nClass distribution after resampling:")
        print(Counter(y_resampled))
        
        if isinstance(X, pd.DataFrame):
            X_resampled = pd.DataFrame(X_resampled, columns=self.feature_names_)
        
        return X_resampled, y_resampled
    
    def fit_transform(self, X, y):
        return self.fit(X, y).transform(X, y)

Pipeline

In [397]:
# Pipeline untuk data cleaning
cleaning_pipe = Pipeline([
    ("feature_selector", FeatureSelector(
        categorical_columns=categorical_columns,
        numerical_columns=None, 
        missing_threshold=70,
        correlation_threshold=0.1
    )),
    ("missing_handler", MissingValueHandler(
        categorical_columns=categorical_columns,
        numerical_columns=None, 
        strategy='advanced'
    )),
    ("outlier_handler", OutlierHandler(
        numerical_columns=None, 
        method='zscore',
        threshold=3,
        strategy='clip'
    )),
    ("duplicate_handler", DuplicateHandler(
        subset=None, 
        keep='first'
    ))
])

# Pipeline untuk preprocessing fitur
feature_pipe = Pipeline([
    ("scaler", FeatureScaler(method='standard')),
    ("encoder", FeatureEncoder(method='label'))
])

# Pipeline untuk handling imbalanced data
imbalance_pipe = Pipeline([
    ("imbalance", ImbalanceHandler(method='smote', sampling_ratio=1.0))
])

def apply_complete_preprocessing(train_set, val_set, y_train=None):
    """
    Menerapkan complete preprocessing pipeline pada data.
    
    Parameters:
    -----------
    train_set : pandas.DataFrame
        Dataset training
    val_set : pandas.DataFrame
        Dataset validasi
    y_train : pandas.Series, optional
        Label training untuk imbalance handling
        
    Returns:
    --------
    tuple
        (processed_train, processed_val, y_resampled)
        y_resampled hanya direturn jika y_train disediakan
    """
    try:
        print("Step 1: Applying data cleaning pipeline...")
        cleaned_train = cleaning_pipe.fit_transform(train_set)
        cleaned_val = cleaning_pipe.transform(val_set)
        
        print("\nStep 2: Applying feature preprocessing pipeline...")
        processed_train = feature_pipe.fit_transform(cleaned_train)
        processed_val = feature_pipe.transform(cleaned_val)
        
        # Jika y_train disediakan, lakukan imbalance handling
        if y_train is not None:
            print("\nStep 3: Applying imbalance handling...")
            processed_train, y_resampled = imbalance_pipe.fit_transform(processed_train, y_train)
            print("\nAll preprocessing steps completed successfully!")
            return processed_train, processed_val, y_resampled
        
        print("\nPreprocessing steps completed successfully!")
        return processed_train, processed_val
    
    except Exception as e:
        print(f"\nError during preprocessing: {str(e)}")
        raise


KNN

In [398]:
class KNNFromScratch:
    """
    K-Nearest Neighbors classifier implementation from scratch.
    """
    def __init__(self, k=3, metric='euclidean'):
        """
        Parameters:
        -----------
        k : int, default=3
            Number of neighbors
        metric : str, default='euclidean'
            Distance metric to use ('euclidean', 'manhattan', or 'minkowski')
        """
        self.k = k
        self.metric = metric.lower()
        
    def fit(self, X, y):
        """
        Fit the model using X as training data and y as target values.

        Parameters:
        -----------
        X : numpy array of shape (n_samples, n_features)
            Training data
        y : numpy array of shape (n_samples,)
            Target values
        """
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)
        return self
    
    def _calculate_distance(self, x1, x2):
        """Calculate distance between two points using specified metric."""
        if self.metric == 'euclidean':
            return np.sqrt(np.sum((x1 - x2) ** 2))
        elif self.metric == 'manhattan':
            return np.sum(np.abs(x1 - x2))
        elif self.metric == 'minkowski':
            # Using p=3 for Minkowski distance
            return np.power(np.sum(np.abs(x1 - x2) ** 3), 1/3)
        else:
            raise ValueError("Metric must be 'euclidean', 'manhattan', or 'minkowski'")
    
    def _predict_single(self, x):
        """Predict class for a single sample."""
        distances = [self._calculate_distance(x, x_train) for x_train in self.X_train]
        
        k_indices = np.argsort(distances)[:self.k]
        
        k_nearest_labels = self.y_train[k_indices]
        
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]
    
    def predict(self, X):
        """
        Predict class labels for samples in X.

        Parameters:
        -----------
        X : numpy array of shape (n_samples, n_features)
            Samples to predict

        Returns:
        --------
        numpy array of shape (n_samples,)
            Predicted class labels
        """
        X = np.asarray(X)
        y_pred = np.array([self._predict_single(x) for x in X])
        return y_pred

NB

In [399]:
class GaussianNaiveBayesFromScratch:
    """
    Gaussian Naive Bayes classifier implementation from scratch.
    """
    def __init__(self):
        self.classes = None
        self.priors = None
        self.means = None
        self.vars = None
    
    def fit(self, X, y):
        """
        Fit Gaussian Naive Bayes according to X, y.

        Parameters:
        -----------
        X : numpy array of shape (n_samples, n_features)
            Training data
        y : numpy array of shape (n_samples,)
            Target values
        """
        n_samples, n_features = X.shape
        self.classes = np.unique(y)
        n_classes = len(self.classes)
        
        self.means = np.zeros((n_classes, n_features))
        self.vars = np.zeros((n_classes, n_features))
        self.priors = np.zeros(n_classes)
        
        for idx, c in enumerate(self.classes):
            X_c = X[y == c]
            self.means[idx, :] = X_c.mean(axis=0)
            self.vars[idx, :] = X_c.var(axis=0)
            self.priors[idx] = X_c.shape[0] / n_samples
        
        return self
    
    def _calculate_likelihood(self, x, mean, var):
        """Calculate Gaussian likelihood P(x|class)."""
        eps = 1e-10
        exponent = -0.5 * np.log(2 * np.pi * (var + eps)) - ((x - mean) ** 2) / (2 * (var + eps))
        return np.sum(exponent)
    
    def _predict_single(self, x):
        """Predict class for a single sample."""
        posteriors = []
        
        for idx, _ in enumerate(self.classes):
            prior = np.log(self.priors[idx])
            likelihood = self._calculate_likelihood(x, self.means[idx, :], self.vars[idx, :])
            posterior = prior + likelihood
            posteriors.append(posterior)
        
        return self.classes[np.argmax(posteriors)]
    
    def predict(self, X):
        """
        Predict class labels for samples in X.

        Parameters:
        -----------
        X : numpy array of shape (n_samples, n_features)
            Samples to predict

        Returns:
        --------
        numpy array of shape (n_samples,)
            Predicted class labels
        """
        X = np.asarray(X)
        y_pred = np.array([self._predict_single(x) for x in X])
        return y_pred

    def predict_proba(self, X):
        """
        Return probability estimates for samples in X.

        Parameters:
        -----------
        X : numpy array of shape (n_samples, n_features)
            Samples to predict

        Returns:
        --------
        numpy array of shape (n_samples, n_classes)
            Probability estimates for each class
        """
        X = np.asarray(X)
        probas = []
        
        for x in X:
            posteriors = []
            for idx, _ in enumerate(self.classes):
                prior = np.log(self.priors[idx])
                likelihood = self._calculate_likelihood(x, self.means[idx, :], self.vars[idx, :])
                posteriors.append(prior + likelihood)
            
            posteriors = np.array(posteriors)
            posteriors = np.exp(posteriors - np.max(posteriors))
            posteriors = posteriors / np.sum(posteriors)
            probas.append(posteriors)
            
        return np.array(probas)

# 2. Split Training Set and Validation Set

Splitting the training and validation set works as an early diagnostic towards the performance of the model we train. This is done before the preprocessing steps to **avoid data leakage inbetween the sets**. If you want to use k-fold cross-validation, split the data later and do the cleaning and preprocessing separately for each split.

Note: For training, you should use the data contained in the `train` folder given by the TA. The `test` data is only used for kaggle submission.

In [400]:
# train_set, val_set = train_test_split(df_sample, test_size=0.2, random_state=42)

In [401]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import pickle
from sklearn.pipeline import Pipeline

# Load dataset
print("Loading dataset...")

# Mengambil 30% sampel dari data
sample_size = int(len(df) * 0.3)
df_sample = df.sample(n=sample_size, random_state=42)

# Memisahkan fitur dan target dari sampel
X = df_sample.drop('label', axis=1)
y = df_sample['label']

print(f"Ukuran dataset original: {len(df)} sampel")
print(f"Ukuran dataset sampel: {len(df_sample)} sampel")
print(f"Distribusi kelas dalam sampel:\n{y.value_counts(normalize=True).round(3)}")

# Split data sampel menjadi training dan validation set
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nUkuran data training:", len(X_train))
print("Ukuran data validasi:", len(X_val))

# Identifikasi kolom kategorikal dan numerikal
categorical_columns = []
numerical_columns = []

for column in X.columns:
    if X[column].dtype == 'object':
        categorical_columns.append(column)
    else:
        numerical_columns.append(column)

# kolom kategorikal numerik
numerical_categorical_columns = [
    "IsDomainIP", "HasObfuscation", "IsHTTPS", "HasTitle", "HasFavicon",
    "IsResponsive", "HasExternalFormSubmit", "HasSocialNet", "HasSubmitButton",
    "HasHiddenFields", "HasPasswordField", "Bank", "Pay", "Crypto",
    "HasCopyrightInfo", "Robots", "HasDescription"
]

# Update categorical_columns
categorical_columns.extend(numerical_categorical_columns)
numerical_columns = [col for col in numerical_columns if col not in numerical_categorical_columns]

print("\nJumlah kolom kategorikal:", len(categorical_columns))
print("Jumlah kolom numerikal:", len(numerical_columns))

# 1. Missing Value Handling
print("\nHandling missing values...")
missing_handler = MissingValueHandler(
    categorical_columns=categorical_columns,
    numerical_columns=numerical_columns,
    strategy='advanced'
)
X_train_clean = missing_handler.fit_transform(X_train)
X_val_clean = missing_handler.transform(X_val)

# 2. Outlier Handling
print("\nHandling outliers...")
outlier_handler = OutlierHandler(
    numerical_columns=numerical_columns,
    method='zscore',
    threshold=3,
    strategy='clip'
)
X_train_clean = outlier_handler.fit_transform(X_train_clean)
X_val_clean = outlier_handler.transform(X_val_clean)

# 3. Feature Engineering
print("\nApplying feature engineering...")
feature_engineer = URLWebsiteFeatureEngineer()
X_train_featured = feature_engineer.fit_transform(X_train_clean)
X_val_featured = feature_engineer.transform(X_val_clean)

# Setelah Outlier Handling dan sebelum Feature Scaling
print("\nHandling infinite values...")
X_train_featured = handle_infinite_values(X_train_featured, numerical_columns)
X_val_featured = handle_infinite_values(X_val_featured, numerical_columns)

# 4. Feature Scaling
print("\nScaling features...")
scaler = FeatureScaler(method='standard')
X_train_scaled = scaler.fit_transform(X_train_featured)
X_val_scaled = scaler.transform(X_val_featured)

# 5. Feature Encoding
print("\nEncoding categorical features...")
encoder = FeatureEncoder(method='label')
X_train_encoded = encoder.fit_transform(X_train_scaled)
X_val_encoded = encoder.transform(X_val_scaled)

# 6. Handle Imbalanced Data
print("\nHandling class imbalance...")
imbalance_handler = ImbalanceHandler(method='smote', sampling_ratio=1.0)
X_train_balanced, y_resampled = imbalance_handler.fit_resample(X_train_encoded, y_train)

# Training dan evaluasi model
print("\n=== Evaluasi Model dengan 30% Dataset ===")

# 1. KNN from scratch
print("\nTraining KNN from scratch...")
knn = KNNFromScratch(k=5, metric='euclidean')
knn.fit(X_train_balanced, y_resampled)
knn_pred = knn.predict(X_val_encoded)

print("\nKNN From Scratch Results:")
print("-----------------------")
print(classification_report(y_val, knn_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, knn_pred))

# 2. Naive Bayes from scratch
print("\nTraining Naive Bayes from scratch...")
nb = GaussianNaiveBayesFromScratch()
nb.fit(X_train_balanced, y_resampled)
nb_pred = nb.predict(X_val_encoded)

print("\nNaive Bayes From Scratch Results:")
print("-------------------------------")
print(classification_report(y_val, nb_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, nb_pred))

# 3. Perbandingan dengan implementasi scikit-learn
print("\n=== Perbandingan dengan Scikit-learn ===")

# KNN scikit-learn
print("\nTraining scikit-learn KNN...")
sklearn_knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
sklearn_knn.fit(X_train_balanced, y_resampled)
sklearn_knn_pred = sklearn_knn.predict(X_val_encoded)

print("\nScikit-learn KNN Results:")
print("-----------------------")
print(classification_report(y_val, sklearn_knn_pred))

# Naive Bayes scikit-learn
print("\nTraining scikit-learn Naive Bayes...")
sklearn_nb = GaussianNB()
sklearn_nb.fit(X_train_balanced, y_resampled)
sklearn_nb_pred = sklearn_nb.predict(X_val_encoded)

print("\nScikit-learn Naive Bayes Results:")
print("------------------------------")
print(classification_report(y_val, sklearn_nb_pred))

# Menyimpan model terbaik
accuracies = {
    'KNN_scratch': accuracy_score(y_val, knn_pred),
    'NB_scratch': accuracy_score(y_val, nb_pred),
    'KNN_sklearn': accuracy_score(y_val, sklearn_knn_pred),
    'NB_sklearn': accuracy_score(y_val, sklearn_nb_pred)
}

best_model_name = max(accuracies, key=accuracies.get)
print(f"\nModel terbaik: {best_model_name} dengan akurasi: {accuracies[best_model_name]:.4f}")

# Memilih model terbaik untuk disimpan
best_model = None
if best_model_name == 'KNN_scratch':
    best_model = knn
elif best_model_name == 'NB_scratch':
    best_model = nb
elif best_model_name == 'KNN_sklearn':
    best_model = sklearn_knn
else:
    best_model = sklearn_nb

# Menyimpan model terbaik
filename = f'best_model_{best_model_name}.pkl'
with open(filename, 'wb') as f:
    pickle.dump(best_model, f)
print(f"\nModel terbaik telah disimpan sebagai '{filename}'")

Loading dataset...
Ukuran dataset original: 140404 sampel
Ukuran dataset sampel: 42121 sampel
Distribusi kelas dalam sampel:
label
1    0.924
0    0.076
Name: proportion, dtype: float64

Ukuran data training: 33696
Ukuran data validasi: 8425

Jumlah kolom kategorikal: 22
Jumlah kolom numerikal: 33

Handling missing values...

Missing Value Analysis:
--------------------------------------------------
Total missing values: 715,469
Total cells: 1,853,280
Overall missing percentage: 38.61%

Kolom dengan missing values:

- FILENAME
  Missing: 41.25%
  Mean: nan
  Median: nan
  Skewness: nan
  Unique values: 19798.0
  Cardinality ratio: 0.5875

- URL
  Missing: 31.39%
  Mean: nan
  Median: nan
  Skewness: nan
  Unique values: 23120.0
  Cardinality ratio: 0.6861

- URLLength
  Missing: 43.14%
  Mean: 27.92
  Median: 26.00
  Skewness: 101.30
  Unique values: nan
  Cardinality ratio: nan

- Domain
  Missing: 50.34%
  Mean: nan
  Median: nan
  Skewness: nan
  Unique values: 16672.0
  Cardinality

ValueError: Input X contains infinity or a value too large for dtype('float64').

In [352]:
print("Checking for infinity values after cleaning...")
if np.isinf(X_train_featured[scaler.numeric_columns]).any().any():
    print("Infinity values found in X_train_featured after cleaning.")
else:
    print("No infinity values found in X_train_featured after cleaning.")

Checking for infinity values after cleaning...
Infinity values found in X_train_featured after cleaning.
