In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import cupy as cp
import warnings
from sklearnex import patch_sklearn 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

def optimize_dtypes(df):
    """Downcast 64-bit numeric columns of ``df`` to narrower dtypes.

    Columns whose dtype is ``float64`` go through
    ``pd.to_numeric(..., downcast='float')`` and columns whose dtype is
    ``int64`` through ``pd.to_numeric(..., downcast='integer')``. All other
    columns are left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    # (dtype name to match, `downcast` argument to use); first match wins.
    downcast_rules = (('float64', 'float'), ('int64', 'integer'))
    for column in df.columns:
        for dtype_name, downcast_mode in downcast_rules:
            if df[column].dtype == dtype_name:
                df[column] = pd.to_numeric(df[column], downcast=downcast_mode)
                break
    return df

def mainBareBones():
    """Train and evaluate a random forest on the Active Wiretap CSV data.

    Reads ``Active_Wiretap_dataset.csv`` (features, no header) and
    ``Active_Wiretap_labels.csv`` (one ``label`` column) from the working
    directory, truncates both to the shorter length, drops rows containing
    NaN, makes a 70/30 train/test split, keeps the 30 best features by
    ``f_classif`` (fitted on the training split only), fits a
    ``RandomForestClassifier`` and prints a classification report, a
    confusion matrix and summary counts.

    Returns:
        None. All results are printed to stdout.

    Raises:
        SystemExit: with status 1 if loading either CSV fails.
    """
    # Load dataset
    print("Loading dataset...")
    try:
        df = pd.read_csv('Active_Wiretap_dataset.csv', header=None, low_memory=False)
        labels = pd.read_csv('Active_Wiretap_labels.csv', names=['label'], header=None, low_memory=False)

        # Truncate both frames to a common length so rows pair up positionally.
        min_rows = min(len(df), len(labels))
        df = df.iloc[:min_rows]
        # Copy the slice: the column assignment below must target an
        # independent frame, not a slice of the frame returned by read_csv.
        labels = labels.iloc[:min_rows].copy()
        # Non-numeric label entries become NaN and are dropped during cleaning.
        labels['label'] = pd.to_numeric(labels['label'], errors='coerce')

        print(f"Dataset shape: {df.shape}")
        print(f"Labels shape: {labels.shape}")
        print(f"Label values found: {labels['label'].unique()}")
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        # Abort with a failing status. The `exit()` helper is meant for
        # interactive use, reports status 0, and inside IPython does not stop
        # execution at this point, which would leave `df` unbound below.
        raise SystemExit(1) from e

    # Preprocess and clean data
    print("\nPreprocessing data...")
    df.columns = [f"feature_{i}" for i in range(df.shape[1])]
    df['label'] = labels['label']
    df = optimize_dtypes(df)
    # dropna() removes every row with a NaN in any column, `label` included,
    # so no separate filter on the label column is needed.
    # NOTE(review): the recorded run shows one NaN label and exactly one row
    # dropped here -- presumably a header line in the labels CSV; confirm that
    # features and labels remain row-aligned.
    df = df.dropna()
    print(f"Shape after cleaning: {df.shape}")

    # Split features and labels
    X = df.drop('label', axis=1)
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Feature selection (fitted on the training split only)
    print("Performing feature selection...")
    selector = SelectKBest(score_func=f_classif, k=min(30, X.shape[1]))
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Train model
    print("\nTraining the model...")
    model = RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
        max_depth=10,
        class_weight='balanced'
    )
    model.fit(X_train_selected, y_train)

    # Predictions and evaluation
    print("Making predictions...")
    y_pred = model.predict(X_test_selected)

    print("\nModel Evaluation:")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nProcess completed!")
    print(f"Total samples: {len(df)}")
    print(f"Number of features: {X.shape[1]}")
    print(f"Number of selected features: {X_train_selected.shape[1]}")
    print(f"Label distribution:\n{y.value_counts()}")

# Run the baseline pipeline when this cell is executed as the main module.
if __name__ == "__main__":
    mainBareBones()


Loading dataset...
Dataset shape: (2278689, 115)
Labels shape: (2278689, 1)
Label values found: [nan  0.  1.]

Preprocessing data...
Shape after cleaning: (2278688, 116)
Performing feature selection...

Training the model...
Making predictions...

Model Evaluation:

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.96      0.98    406644
         1.0       0.94      1.00      0.97    276963

    accuracy                           0.98    683607
   macro avg       0.97      0.98      0.97    683607
weighted avg       0.98      0.98      0.98    683607


Confusion Matrix:
[[390148  16496]
 [   229 276734]]

Process completed!
Total samples: 2278688
Number of features: 115
Number of selected features: 30
Label distribution:
label
0.0    1355473
1.0     923215
Name: count, dtype: int64


In [2]:
import pandas as pd
import cupy as cp
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

def optimize_dtypes(df):
    """Shrink ``float64`` / ``int64`` columns of ``df`` via ``pd.to_numeric``.

    Args:
        df: DataFrame whose columns are downcast in place.

    Returns:
        The same DataFrame object, after the in-place downcast.
    """
    for name in df.columns:
        current_dtype = df[name].dtype
        if current_dtype == 'float64':
            mode = 'float'
        elif current_dtype == 'int64':
            mode = 'integer'
        else:
            # Anything that is not a 64-bit float or int stays as it is.
            continue
        df[name] = pd.to_numeric(df[name], downcast=mode)
    return df

def maincuPY():
    """Run the Active Wiretap random-forest pipeline with CuPy array hand-offs.

    Same steps as the baseline pipeline: load the two CSV files, clean,
    70/30 split, ``SelectKBest`` with ``f_classif``, ``RandomForestClassifier``
    and a printed report. The difference is that features and labels are
    extracted as float32 arrays and the selected feature matrices are copied
    into CuPy arrays. Feature selection, training and prediction still run in
    scikit-learn on arrays converted back with ``cp.asnumpy``.

    Returns:
        None. All results are printed to stdout.

    Raises:
        SystemExit: with status 1 if loading either CSV fails.
    """
    # Load dataset
    print("Loading dataset...")
    try:
        df = pd.read_csv('Active_Wiretap_dataset.csv', header=None, low_memory=False)
        labels = pd.read_csv('Active_Wiretap_labels.csv', names=['label'], header=None, low_memory=False)

        # Ensure datasets have the same number of rows
        min_rows = min(len(df), len(labels))
        df = df.iloc[:min_rows]
        # Copy the slice: the column assignment below must target an
        # independent frame, not a slice of the frame returned by read_csv.
        labels = labels.iloc[:min_rows].copy()
        # Non-numeric label entries become NaN and are dropped during cleaning.
        labels['label'] = pd.to_numeric(labels['label'], errors='coerce')

        print(f"Dataset shape: {df.shape}")
        print(f"Labels shape: {labels.shape}")
        print(f"Label values found: {labels['label'].unique()}")
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        # Abort with a failing status. The `exit()` helper is meant for
        # interactive use, reports status 0, and inside IPython does not stop
        # execution at this point, which would leave `df` unbound below.
        raise SystemExit(1) from e

    # Preprocess and clean data
    print("\nPreprocessing data...")
    df.columns = [f"feature_{i}" for i in range(df.shape[1])]
    df['label'] = labels['label']
    df = optimize_dtypes(df)
    # dropna() removes every row with a NaN in any column, `label` included,
    # so no separate filter on the label column is needed.
    df = df.dropna()
    print(f"Shape after cleaning: {df.shape}")

    # Split features and labels
    # `DataFrame.to_numpy` returns host NumPy arrays; `cp.float32` is only
    # used here as the dtype argument.
    X = df.drop('label', axis=1).to_numpy(dtype=cp.float32)
    y = df['label'].to_numpy(dtype=cp.float32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Feature selection (fitted on the training split only)
    print("Performing feature selection...")
    X_train_cpu = cp.asnumpy(X_train)  # Convert to NumPy for sklearn
    y_train_cpu = cp.asnumpy(y_train)
    selector = SelectKBest(score_func=f_classif, k=min(30, X.shape[1]))
    X_train_selected_cpu = selector.fit_transform(X_train_cpu, y_train_cpu)
    X_test_selected_cpu = selector.transform(cp.asnumpy(X_test))

    # Convert back to CuPy
    # NOTE(review): these CuPy arrays are only ever read back through
    # cp.asnumpy below, so the round trip appears to add copies without moving
    # any computation off the CPU -- confirm this is intended for the comparison.
    X_train_selected = cp.array(X_train_selected_cpu, dtype=cp.float32)
    X_test_selected = cp.array(X_test_selected_cpu, dtype=cp.float32)

    # Train model
    print("\nTraining the model...")
    model = RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
        max_depth=10,
        class_weight='balanced'
    )
    model.fit(cp.asnumpy(X_train_selected), cp.asnumpy(y_train))  # Convert CuPy to NumPy for sklearn

    # Predictions and evaluation
    print("Making predictions...")
    y_pred = model.predict(cp.asnumpy(X_test_selected))  # Predictions on NumPy arrays

    # Evaluate model
    print("\nModel Evaluation:")
    print("\nClassification Report:")
    print(classification_report(cp.asnumpy(y_test), y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(cp.asnumpy(y_test), y_pred))

    # Summary information
    print("\nProcess completed!")
    print(f"Total samples: {len(df)}")
    print(f"Number of features: {X.shape[1]}")
    print(f"Number of selected features: {X_train_selected.shape[1]}")
    # Labels are cast to int32 so bincount can tally each class value.
    print(f"Label distribution:\n{cp.bincount(cp.array(y, dtype=cp.int32))}")

# Run the CuPy variant of the pipeline when this cell is executed as the main module.
if __name__ == "__main__":
    maincuPY()


Loading dataset...
Dataset shape: (2278689, 115)
Labels shape: (2278689, 1)
Label values found: [nan  0.  1.]

Preprocessing data...
Shape after cleaning: (2278688, 116)
Performing feature selection...


  square_of_sums_alldata = sum(sums_args) ** 2
  square_of_sums_args = [s**2 for s in sums_args]
  ssbn -= square_of_sums_alldata / float(n_samples)
  sswn = sstot - ssbn



Training the model...
Making predictions...

Model Evaluation:

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.96      0.98    406644
         1.0       0.94      1.00      0.97    276963

    accuracy                           0.98    683607
   macro avg       0.97      0.98      0.97    683607
weighted avg       0.98      0.98      0.98    683607


Confusion Matrix:
[[390167  16477]
 [   251 276712]]

Process completed!
Total samples: 2278688
Number of features: 115
Number of selected features: 30
Label distribution:
[1355473  923215]


In [None]:
# `patch_sklearn` is imported from `sklearnex` in the notebook's first cell.
# NOTE(review): per the sklearnex documentation, patching only affects sklearn
# imports made after this call -- confirm the estimators used afterwards are
# imported (or re-imported) once this cell has run.
patch_sklearn() # Apply Intel optimizations

In [3]:
def optimize_dtypes(df):
    """Reduce memory use by downcasting 64-bit numeric columns in place.

    Args:
        df: DataFrame to process; its ``float64`` and ``int64`` columns are
            replaced by the result of ``pd.to_numeric`` with the matching
            ``downcast`` option.

    Returns:
        The input DataFrame itself (not a copy).
    """
    def _downcast_mode(series):
        # Pick the `downcast` argument for a column, or None to leave it alone.
        if series.dtype == 'float64':
            return 'float'
        if series.dtype == 'int64':
            return 'integer'
        return None

    for label in df.columns:
        chosen = _downcast_mode(df[label])
        if chosen is not None:
            df[label] = pd.to_numeric(df[label], downcast=chosen)
    return df

def mainIntelex():
    """Run the Active Wiretap random-forest pipeline for the Intel-patched run.

    Same steps as the baseline pipeline: load the two CSV files, clean,
    70/30 split, ``SelectKBest`` with ``f_classif``, ``RandomForestClassifier``
    and a printed report. ``FutureWarning`` messages are silenced first. The
    function is meant to be called after ``patch_sklearn()`` so that the
    scikit-learn names it imports resolve to the patched implementations.

    Returns:
        None. All results are printed to stdout.

    Raises:
        SystemExit: with status 1 if loading either CSV fails.
    """
    # Imported locally on purpose. `warnings` is otherwise only provided by a
    # cell that has no execution count. The sklearn names are re-imported so
    # this function does not reuse bindings created before `patch_sklearn()`
    # ran (patching only affects imports made after it). Without patching,
    # these imports return the stock scikit-learn objects.
    import warnings
    from sklearn.model_selection import train_test_split
    from sklearn.feature_selection import SelectKBest, f_classif
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import classification_report, confusion_matrix

    warnings.filterwarnings("ignore", category=FutureWarning)
    # Load dataset
    print("Loading dataset...")
    try:
        df = pd.read_csv('Active_Wiretap_dataset.csv', header=None, low_memory=False)
        labels = pd.read_csv('Active_Wiretap_labels.csv', names=['label'], header=None, low_memory=False)

        # Truncate both frames to a common length so rows pair up positionally.
        min_rows = min(len(df), len(labels))
        df = df.iloc[:min_rows]
        # Copy the slice: the column assignment below must target an
        # independent frame, not a slice of the frame returned by read_csv.
        labels = labels.iloc[:min_rows].copy()
        # Non-numeric label entries become NaN and are dropped during cleaning.
        labels['label'] = pd.to_numeric(labels['label'], errors='coerce')

        print(f"Dataset shape: {df.shape}")
        print(f"Labels shape: {labels.shape}")
        print(f"Label values found: {labels['label'].unique()}")
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        # Abort with a failing status. The `exit()` helper is meant for
        # interactive use, reports status 0, and inside IPython does not stop
        # execution at this point, which would leave `df` unbound below.
        raise SystemExit(1) from e

    # Preprocess and clean data
    print("\nPreprocessing data...")
    df.columns = [f"feature_{i}" for i in range(df.shape[1])]
    df['label'] = labels['label']
    df = optimize_dtypes(df)
    # dropna() removes every row with a NaN in any column, `label` included,
    # so no separate filter on the label column is needed.
    df = df.dropna()
    print(f"Shape after cleaning: {df.shape}")

    # Split features and labels
    X = df.drop('label', axis=1)
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Feature selection (fitted on the training split only)
    print("Performing feature selection...")
    selector = SelectKBest(score_func=f_classif, k=min(30, X.shape[1]))
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Train model
    print("\nTraining the model...")
    model = RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
        max_depth=10,
        class_weight='balanced'
    )
    model.fit(X_train_selected, y_train)

    # Predictions and evaluation
    print("Making predictions...")
    y_pred = model.predict(X_test_selected)

    print("\nModel Evaluation:")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nProcess completed!")
    print(f"Total samples: {len(df)}")
    print(f"Number of features: {X.shape[1]}")
    print(f"Number of selected features: {X_train_selected.shape[1]}")
    print(f"Label distribution:\n{y.value_counts()}")

# Run the Intel-extension variant of the pipeline when this cell is executed as the main module.
if __name__ == "__main__":
    mainIntelex()


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


Loading dataset...
Dataset shape: (2278689, 115)
Labels shape: (2278689, 1)
Label values found: [nan  0.  1.]

Preprocessing data...
Shape after cleaning: (2278688, 116)
Performing feature selection...

Training the model...
Making predictions...

Model Evaluation:

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.96      0.98    406644
         1.0       0.94      1.00      0.97    276963

    accuracy                           0.97    683607
   macro avg       0.97      0.98      0.97    683607
weighted avg       0.98      0.97      0.98    683607


Confusion Matrix:
[[389847  16797]
 [   355 276608]]

Process completed!
Total samples: 2278688
Number of features: 115
Number of selected features: 30
Label distribution:
label
0.0    1355473
1.0     923215
Name: count, dtype: int64
