### Benign predictions

In [12]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.feature_selection import SelectKBest, mutual_info_classif 
from sklearn.preprocessing import MaxAbsScaler
import numpy as np

# Load the training features and labels, plus the benign-only feature matrix
# that the models are later evaluated on.
X_train = pd.read_csv("../datasets/feature_updated_dataset_X_reduced_further.csv")
# Flatten the label frame into a 1-D array, the shape the estimators expect.
y_train = pd.read_csv("../datasets/feature_updated_dataset_y.csv").values.ravel()
X_test = pd.read_csv("../datasets/benign_dataset_X_reduced_further.csv")

# Fit the scaler on the training data only, then apply the same scaling to
# both matrices so the benign set is transformed consistently with training.
scaler = MaxAbsScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Human-readable names used when reporting features.
# NOTE(review): presumably listed in the same order as the CSV columns -- confirm.
column_names = [
    'www',
    'url_length',
    'digit_count',
    'percentage_count',
    'bs_count',
    'equals_count',
    'digit_letter_ratio',
    'pd_at_count',
    'pd_hyphen_count',
    'pd_in_alex_top_1m',
]
# Sweep k = 1..n_features: keep the k best features by mutual information,
# train a random forest on them, and score its predictions on the benign-only
# set (where every sample's true label is class 0).
feature_set = set()

# Best-so-far tracking lives outside the loop; resetting it per iteration
# would make every comparison run against 0 and discard earlier results.
previous_f1 = 0
features_with_improvement = []

# Every benign sample is class 0. Size the label vector from the data rather
# than a hard-coded row count, and build it once since it never changes.
y_test = np.zeros(X_test.shape[0])

for i in range(1, X_train.shape[1] + 1):
    model = RandomForestClassifier(n_estimators=50, max_depth=20, random_state=69)
    select_feature_model = SelectKBest(mutual_info_classif, k=i)
    X_train_partition = select_feature_model.fit_transform(X_train, y_train)
    model.fit(X_train_partition, y_train)
    X_test_partition = select_feature_model.transform(X_test)
    benign_pred = model.predict(X_test_partition)

    # Map the selector's choice back to ORIGINAL column positions. Indexing
    # column_names with positions inside the reduced matrix (0..k-1) would
    # always report the first k names, whatever was actually selected.
    # NOTE(review): assumes column_names is aligned with X_train's columns.
    selected_columns = select_feature_model.get_support(indices=True)
    curr_feature_set = {column_names[j] for j in selected_columns}
    new_feature = curr_feature_set - feature_set
    feature_set = curr_feature_set

    # Verifying model fit
    accuracy = accuracy_score(y_test, benign_pred)
    f1 = f1_score(y_test, benign_pred, average='weighted')
    if f1 > previous_f1:
        previous_f1 = f1
        features_with_improvement.append(new_feature)

    print(f"Accuracy: {accuracy} | F1 score: {f1} | {model.__class__.__name__} | Number of features: {i} | Added: {new_feature} | Features: {curr_feature_set}")
    # Fraction of benign samples assigned to each of the class labels 0..3.
    for label in range(4):
        print(np.count_nonzero(benign_pred == label) / len(benign_pred))

print(f"Features with improvement: {features_with_improvement}")


KeyboardInterrupt: 