In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearnex import patch_sklearn
import cupy as cp 
import warnings

In [2]:
def optimize_dtypes(df):
    """Shrink 64-bit numeric columns of ``df`` to smaller dtypes, in place.

    Columns whose dtype compares equal to ``'float64'`` are passed through
    ``pd.to_numeric(..., downcast='float')``; columns equal to ``'int64'``
    use ``downcast='integer'``. Every other column is left untouched.

    Args:
        df: DataFrame to modify. It is mutated, not copied.

    Returns:
        The same DataFrame object that was passed in.
    """
    # (source dtype, pd.to_numeric downcast mode), checked in this order.
    downcast_rules = (('float64', 'float'), ('int64', 'integer'))

    for name in df.columns:
        current = df[name].dtype
        mode = next((m for src, m in downcast_rules if current == src), None)
        if mode is not None:
            df[name] = pd.to_numeric(df[name], downcast=mode)
    return df

def mainBareBones():
    """Train and evaluate a logistic-regression classifier with plain scikit-learn.

    Steps: read the feature and label CSVs, truncate both to the shorter
    length, drop rows containing missing values, split 70/30, standardize the
    features (scaler fitted on the training split only), keep the
    ``min(30, n_features)`` best features by ``f_classif``, fit a
    class-balanced liblinear ``LogisticRegression`` and print the
    classification report, confusion matrix and a short summary.

    Returns:
        None. All results are printed.

    Raises:
        Exception: any error raised while loading the CSV files is re-raised
            after the error message has been printed.
    """
    # Load dataset
    print("Loading dataset...")
    try:
        df = pd.read_csv('Active_Wiretap_dataset.csv', header=None, low_memory=False)
        labels = pd.read_csv('Active_Wiretap_labels.csv', names=['label'], header=None, low_memory=False)

        # Features and labels are paired by row position, so cut both to the
        # shorter length.
        min_rows = min(len(df), len(labels))
        df = df.iloc[:min_rows]
        labels = labels.iloc[:min_rows]
        # Non-numeric entries become NaN and are dropped during cleaning.
        # NOTE(review): if the labels file starts with a header row, that row
        # becomes NaN at position 0 and every label is shifted by one row
        # relative to the features - confirm the file layout.
        labels['label'] = pd.to_numeric(labels['label'], errors='coerce')

        print(f"Dataset shape: {df.shape}")
        print(f"Labels shape: {labels.shape}")
        print(f"Label values found: {labels['label'].unique()}")
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        # Re-raise instead of calling exit(): exit() is the interactive-shell
        # helper, and the rest of this function must never run without `df`.
        # Propagating also keeps the original traceback visible.
        raise

    # Preprocess and clean data
    print("\nPreprocessing data...")
    df.columns = [f"feature_{i}" for i in range(df.shape[1])]
    df['label'] = labels['label']
    df = optimize_dtypes(df)
    # dropna() removes rows with a missing value in any column, the label
    # included, so no separate label filter is needed.
    df = df.dropna()
    print(f"Shape after cleaning: {df.shape}")

    # Split features and labels
    X = df.drop('label', axis=1)
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Feature scaling (statistics come from the training split only)
    print("Scaling features...")
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Feature selection (scores computed on the training split only)
    print("Performing feature selection...")
    selector = SelectKBest(score_func=f_classif, k=min(30, X.shape[1]))
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Train Logistic Regression model
    print("\nTraining the Logistic Regression model...")
    model = LogisticRegression(
        solver='liblinear',
        class_weight='balanced',
        random_state=42
    )
    model.fit(X_train_selected, y_train)

    # Predictions and evaluation
    print("Making predictions...")
    y_pred = model.predict(X_test_selected)

    print("\nModel Evaluation:")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nProcess completed!")
    print(f"Total samples: {len(df)}")
    print(f"Number of features: {X.shape[1]}")
    print(f"Number of selected features: {X_train_selected.shape[1]}")
    print(f"Label distribution:\n{y.value_counts()}")

# Entry point: run the baseline pipeline when this cell (or the file as a
# script) executes as __main__.
if __name__ == "__main__":
    mainBareBones()


Loading dataset...
Dataset shape: (2278689, 115)
Labels shape: (2278689, 1)
Label values found: [nan  0.  1.]

Preprocessing data...
Shape after cleaning: (2278688, 116)
Scaling features...
Performing feature selection...

Training the Logistic Regression model...
Making predictions...

Model Evaluation:

Classification Report:
              precision    recall  f1-score   support

         0.0       0.97      0.95      0.96    406644
         1.0       0.92      0.95      0.94    276963

    accuracy                           0.95    683607
   macro avg       0.95      0.95      0.95    683607
weighted avg       0.95      0.95      0.95    683607


Confusion Matrix:
[[385293  21351]
 [ 13689 263274]]

Process completed!
Total samples: 2278688
Number of features: 115
Number of selected features: 30
Label distribution:
label
0.0    1355473
1.0     923215
Name: count, dtype: int64


In [3]:
import pandas as pd
import cupy as cp  # Import CuPy for GPU-based computation
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

def optimize_dtypes(df):
    """Shrink 64-bit numeric columns of ``df`` to smaller dtypes, in place.

    Columns whose dtype compares equal to ``'float64'`` are passed through
    ``pd.to_numeric(..., downcast='float')``; columns equal to ``'int64'``
    use ``downcast='integer'``. Every other column is left untouched.

    Args:
        df: DataFrame to modify. It is mutated, not copied.

    Returns:
        The same DataFrame object that was passed in.
    """
    # (source dtype, pd.to_numeric downcast mode), checked in this order.
    downcast_rules = (('float64', 'float'), ('int64', 'integer'))

    for name in df.columns:
        current = df[name].dtype
        mode = next((m for src, m in downcast_rules if current == src), None)
        if mode is not None:
            df[name] = pd.to_numeric(df[name], downcast=mode)
    return df

def mainWithCuPy():
    """Run the logistic-regression pipeline with the data staged through CuPy.

    The cleaned feature matrix and label vector are copied into CuPy arrays
    and split 70/30 in that form. Each split is then copied back to NumPy
    once, because every scikit-learn step in this function (scaling, feature
    selection, training, prediction, metrics) is fed NumPy arrays.

    Returns:
        None. All results are printed.

    Raises:
        Exception: any error raised while loading the CSV files is re-raised
            after the error message has been printed.
    """
    # Load dataset
    print("Loading dataset...")
    try:
        df = pd.read_csv('Active_Wiretap_dataset.csv', header=None, low_memory=False)
        labels = pd.read_csv('Active_Wiretap_labels.csv', names=['label'], header=None, low_memory=False)

        # Features and labels are paired by row position, so cut both to the
        # shorter length.
        min_rows = min(len(df), len(labels))
        df = df.iloc[:min_rows]
        labels = labels.iloc[:min_rows]
        # Non-numeric entries become NaN and are dropped during cleaning.
        # NOTE(review): if the labels file starts with a header row, that row
        # becomes NaN at position 0 and every label is shifted by one row
        # relative to the features - confirm the file layout.
        labels['label'] = pd.to_numeric(labels['label'], errors='coerce')

        print(f"Dataset shape: {df.shape}")
        print(f"Labels shape: {labels.shape}")
        print(f"Label values found: {labels['label'].unique()}")
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        # Re-raise instead of calling exit(): exit() is the interactive-shell
        # helper, and the rest of this function must never run without `df`.
        # Propagating also keeps the original traceback visible.
        raise

    # Preprocess and clean data
    print("\nPreprocessing data...")
    df.columns = [f"feature_{i}" for i in range(df.shape[1])]
    df['label'] = labels['label']
    df = optimize_dtypes(df)
    # dropna() removes rows with a missing value in any column, the label
    # included, so no separate label filter is needed.
    df = df.dropna()
    print(f"Shape after cleaning: {df.shape}")

    # Transfer data to GPU using CuPy
    print("\nTransferring data to GPU...")
    X = cp.array(df.drop('label', axis=1).values)
    y = cp.array(df['label'].values)

    # Split features and labels (performed on the CuPy arrays)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Copy each split back to host memory exactly once. The scikit-learn
    # steps below all take NumPy arrays, so bouncing the data between device
    # and host around every step only adds transfer time without changing
    # any value.
    X_train, X_test = cp.asnumpy(X_train), cp.asnumpy(X_test)
    y_train, y_test = cp.asnumpy(y_train), cp.asnumpy(y_test)

    # Feature scaling (scikit-learn, on the host arrays)
    print("Scaling features...")
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Feature selection (scikit-learn, on the host arrays)
    print("Performing feature selection...")
    selector = SelectKBest(score_func=f_classif, k=min(30, X.shape[1]))
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Train Logistic Regression model
    print("\nTraining the Logistic Regression model")
    model = LogisticRegression(
        solver='liblinear',
        class_weight='balanced',
        random_state=42
    )
    model.fit(X_train_selected, y_train)

    # Predictions and evaluation
    print("Making predictions...")
    y_pred = model.predict(X_test_selected)

    print("\nModel Evaluation:")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nProcess completed!")
    print(f"Total samples: {len(df)}")
    print(f"Number of features: {X.shape[1]}")
    print(f"Number of selected features: {X_train_selected.shape[1]}")
    print(f"Label distribution:\n{df['label'].value_counts()}")

# Entry point: run the CuPy variant of the pipeline when this cell (or the
# file as a script) executes as __main__.
if __name__ == "__main__":
    mainWithCuPy()


Loading dataset...
Dataset shape: (2278689, 115)
Labels shape: (2278689, 1)
Label values found: [nan  0.  1.]

Preprocessing data...
Shape after cleaning: (2278688, 116)

Transferring data to GPU...
Scaling features on GPU...
Performing feature selection on GPU...

Training the Logistic Regression model
Making predictions...

Model Evaluation:

Classification Report:
              precision    recall  f1-score   support

         0.0       0.97      0.95      0.96    406644
         1.0       0.93      0.95      0.94    276963

    accuracy                           0.95    683607
   macro avg       0.95      0.95      0.95    683607
weighted avg       0.95      0.95      0.95    683607


Confusion Matrix:
[[385365  21279]
 [ 13750 263213]]

Process completed!
Total samples: 2278688
Number of features: 115
Number of selected features: 30
Label distribution:
label
0.0    1355473
1.0     923215
Name: count, dtype: int64


In [4]:
# Apply Intel optimizations.
# Patching only affects scikit-learn names imported *after* this call. The
# names below were already imported in earlier cells, so they are re-imported
# here to rebind them to the patched implementations; without this the
# following cells keep using the stock classes.
patch_sklearn()

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [5]:
# Suppress FutureWarning messages attributed to sklearn modules so they do not
# clutter the cell output of the run below.
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")

def optimize_dtypes(df):
    """Shrink 64-bit numeric columns of ``df`` to smaller dtypes, in place.

    Columns whose dtype compares equal to ``'float64'`` are passed through
    ``pd.to_numeric(..., downcast='float')``; columns equal to ``'int64'``
    use ``downcast='integer'``. Every other column is left untouched.

    Args:
        df: DataFrame to modify. It is mutated, not copied.

    Returns:
        The same DataFrame object that was passed in.
    """
    # (source dtype, pd.to_numeric downcast mode), checked in this order.
    downcast_rules = (('float64', 'float'), ('int64', 'integer'))

    for name in df.columns:
        current = df[name].dtype
        mode = next((m for src, m in downcast_rules if current == src), None)
        if mode is not None:
            df[name] = pd.to_numeric(df[name], downcast=mode)
    return df

def mainIntelex():
    """Run the logistic-regression pipeline after ``patch_sklearn()`` was applied.

    The steps are the same as the baseline run: read the feature and label
    CSVs, truncate both to the shorter length, drop rows containing missing
    values, split 70/30, standardize, keep the ``min(30, n_features)`` best
    features by ``f_classif``, fit a class-balanced liblinear
    ``LogisticRegression`` and print the evaluation.

    Whether the Intel-optimized code paths are used depends on the sklearn
    names in scope having been imported after ``patch_sklearn()`` ran.

    Returns:
        None. All results are printed.

    Raises:
        Exception: any error raised while loading the CSV files is re-raised
            after the error message has been printed.
    """
    # Load dataset
    print("Loading dataset...")
    try:
        df = pd.read_csv('Active_Wiretap_dataset.csv', header=None, low_memory=False)
        labels = pd.read_csv('Active_Wiretap_labels.csv', names=['label'], header=None, low_memory=False)

        # Features and labels are paired by row position, so cut both to the
        # shorter length.
        min_rows = min(len(df), len(labels))
        df = df.iloc[:min_rows]
        labels = labels.iloc[:min_rows]
        # Non-numeric entries become NaN and are dropped during cleaning.
        # NOTE(review): if the labels file starts with a header row, that row
        # becomes NaN at position 0 and every label is shifted by one row
        # relative to the features - confirm the file layout.
        labels['label'] = pd.to_numeric(labels['label'], errors='coerce')

        print(f"Dataset shape: {df.shape}")
        print(f"Labels shape: {labels.shape}")
        print(f"Label values found: {labels['label'].unique()}")
    except Exception as e:
        print(f"Error loading data: {str(e)}")
        # Re-raise instead of calling exit(): exit() is the interactive-shell
        # helper, and the rest of this function must never run without `df`.
        # Propagating also keeps the original traceback visible.
        raise

    # Preprocess and clean data
    print("\nPreprocessing data...")
    df.columns = [f"feature_{i}" for i in range(df.shape[1])]
    df['label'] = labels['label']
    df = optimize_dtypes(df)
    # dropna() removes rows with a missing value in any column, the label
    # included, so no separate label filter is needed.
    df = df.dropna()
    print(f"Shape after cleaning: {df.shape}")

    # Split features and labels
    X = df.drop('label', axis=1)
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Feature scaling (statistics come from the training split only)
    print("Scaling features...")
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Feature selection (scores computed on the training split only)
    print("Performing feature selection...")
    selector = SelectKBest(score_func=f_classif, k=min(30, X.shape[1]))
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Train Logistic Regression model
    # NOTE(review): the sklearnex supported-algorithms table reportedly limits
    # accelerated LogisticRegression to the lbfgs/newton-cg solvers without
    # class_weight; with liblinear + class_weight this fit may fall back to
    # stock scikit-learn - confirm against the installed sklearnex version.
    print("\nTraining the Logistic Regression model...")
    model = LogisticRegression(
        solver='liblinear',
        class_weight='balanced',
        random_state=42
    )
    model.fit(X_train_selected, y_train)

    # Predictions and evaluation
    print("Making predictions...")
    y_pred = model.predict(X_test_selected)

    print("\nModel Evaluation:")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    print("\nProcess completed!")
    print(f"Total samples: {len(df)}")
    print(f"Number of features: {X.shape[1]}")
    print(f"Number of selected features: {X_train_selected.shape[1]}")
    print(f"Label distribution:\n{y.value_counts()}")

# Entry point: run the Intel-extension variant of the pipeline when this cell
# (or the file as a script) executes as __main__.
if __name__ == "__main__":
    mainIntelex()


Loading dataset...
Dataset shape: (2278689, 115)
Labels shape: (2278689, 1)
Label values found: [nan  0.  1.]

Preprocessing data...
Shape after cleaning: (2278688, 116)
Scaling features...
Performing feature selection...

Training the Logistic Regression model...
Making predictions...

Model Evaluation:

Classification Report:
              precision    recall  f1-score   support

         0.0       0.97      0.95      0.96    406644
         1.0       0.92      0.95      0.94    276963

    accuracy                           0.95    683607
   macro avg       0.95      0.95      0.95    683607
weighted avg       0.95      0.95      0.95    683607


Confusion Matrix:
[[385293  21351]
 [ 13689 263274]]

Process completed!
Total samples: 2278688
Number of features: 115
Number of selected features: 30
Label distribution:
label
0.0    1355473
1.0     923215
Name: count, dtype: int64
