In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb

def optimize_dtypes(df):
    """Downcast 64-bit numeric columns of ``df`` in place and return it.

    Columns whose dtype compares equal to ``'float64'`` are passed through
    ``pd.to_numeric(..., downcast='float')``; columns whose dtype compares
    equal to ``'int64'`` through ``pd.to_numeric(..., downcast='integer')``.
    All other columns are left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    for name in df.columns:
        current = df[name].dtype
        if current == 'float64':
            target = 'float'
        elif current == 'int64':
            target = 'integer'
        else:
            # Not a 64-bit numeric column: nothing to downcast.
            continue
        df[name] = pd.to_numeric(df[name], downcast=target)
    return df

def mainBareBones():
    """Run the baseline XGBoost pipeline end to end and print an evaluation.

    Steps: read ``Active_Wiretap_dataset.csv`` and
    ``Active_Wiretap_labels.csv`` from the working directory, truncate both to
    a common row count, attach the numeric label column, downcast dtypes, drop
    rows containing NaN, split 70/30 (``random_state=42``), keep the
    ``min(30, n_features)`` best features by ANOVA F-score, train an XGBoost
    binary classifier and print the classification report, confusion matrix
    and some dataset statistics.

    Returns:
        None. If anything in the loading step raises, the error is printed and
        the function returns early instead of propagating the exception.
    """
    print("Loading dataset...")
    try:
        # Load main dataset
        df = pd.read_csv('Active_Wiretap_dataset.csv',
                         header=None,
                         low_memory=False)

        # Load labels and handle mixed types
        labels = pd.read_csv('Active_Wiretap_labels.csv',
                             names=['label'],
                             header=None,
                             low_memory=False)

        # Ensure datasets have the same number of rows
        min_rows = min(len(df), len(labels))
        df = df.iloc[:min_rows]
        labels = labels.iloc[:min_rows]

        # Convert labels to numeric; non-numeric entries become NaN.
        # NOTE(review): if the labels file carries a header line, the
        # truncation above would shift every label by one row relative to the
        # features -- confirm the CSV layout.
        labels['label'] = pd.to_numeric(labels['label'], errors='coerce')

        print(f"Dataset shape: {df.shape}")
        print(f"Labels shape: {labels.shape}")
        print(f"Label values found: {labels['label'].unique()}")

    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return

    # Step 2: Preprocess the data
    print("\nPreprocessing data...")
    n_features = df.shape[1]
    column_names = [f"feature_{i}" for i in range(n_features)]
    df.columns = column_names
    df['label'] = labels['label']

    # Step 3: Optimize memory usage
    print("Optimizing memory usage...")
    df = optimize_dtypes(df)

    # Step 4: Clean data
    print("Cleaning data...")
    # dropna() removes every row with a missing value in any column, which
    # includes rows whose label is NaN, so no separate label filter is needed.
    df = df.dropna()

    print(f"Shape after cleaning: {df.shape}")

    # Step 5: Split features and labels
    X = df.drop('label', axis=1)
    y = df['label']

    # Step 6: Split the data
    print("\nSplitting data into train and test sets...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Step 7: Feature selection (fitted on the training split only)
    print("Performing feature selection...")
    selector = SelectKBest(score_func=f_classif, k=min(30, X.shape[1]))
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Step 8: Train model
    print("\nTraining the model...")
    dtrain = xgb.DMatrix(X_train_selected, label=y_train)
    dtest = xgb.DMatrix(X_test_selected, label=y_test)
    params = {
        # Without an explicit objective XGBoost trains a squared-error
        # regressor, whose outputs are not probabilities and can round to
        # values other than 0/1.
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'tree_method': 'hist',
        'device': 'cuda:0'  # first CUDA device
    }
    model = xgb.train(params, dtrain)

    # Step 9: Predictions
    print("Making predictions...")
    # predict() yields P(label == 1); round to the nearest class in one
    # vectorised call instead of a per-element Python loop.
    y_pred = np.round(model.predict(dtest)).astype(int)

    # Step 10: Evaluation
    print("\nModel Evaluation:")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Additional information
    print("\nProcess completed!")

    print("\nDataset Information:")
    print(f"Total samples: {len(df)}")
    print(f"Number of features: {X.shape[1]}")
    print(f"Number of selected features: {X_train_selected.shape[1]}")
    print(f"Label distribution:\n{y.value_counts()}")

# Entry point: run the baseline pipeline when this cell/module is executed as
# the main program (a notebook kernel also runs with __name__ == "__main__").
if __name__ == "__main__":
    mainBareBones()


In [1]:
import cupy as cp
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb
import numpy as np

def optimize_dtypes(df):
    """Reduce the memory footprint of ``df`` by downcasting 64-bit columns.

    ``float64`` columns are downcast with ``downcast='float'`` and ``int64``
    columns with ``downcast='integer'`` via ``pd.to_numeric``. Columns of any
    other dtype are not modified.

    Args:
        df: DataFrame whose columns are rewritten in place.

    Returns:
        The DataFrame that was passed in (same object).
    """
    # (source dtype, pd.to_numeric downcast mode), checked in this order.
    rules = (('float64', 'float'), ('int64', 'integer'))
    for column in df.columns:
        for dtype_name, downcast_mode in rules:
            if df[column].dtype == dtype_name:
                df[column] = pd.to_numeric(df[column], downcast=downcast_mode)
                break
    return df

def main_cupy():
    """Run the XGBoost pipeline with features and labels held in CuPy arrays.

    Steps: read ``Active_Wiretap_dataset.csv`` and
    ``Active_Wiretap_labels.csv`` from the working directory, truncate both to
    a common row count, attach the numeric label column, downcast dtypes, drop
    rows containing NaN, convert to CuPy arrays, split 70/30
    (``random_state=42``), keep the ``min(30, n_features)`` features with the
    highest ANOVA F-score, train an XGBoost binary classifier and print the
    classification report, confusion matrix and some dataset statistics.

    Returns:
        None. If anything in the loading step raises, the error is printed and
        the function returns early instead of propagating the exception.
    """
    print("Loading dataset...")
    try:
        # Load main dataset
        df = pd.read_csv('Active_Wiretap_dataset.csv',
                         header=None,
                         low_memory=False)

        # Load labels and handle mixed types
        labels = pd.read_csv('Active_Wiretap_labels.csv',
                             names=['label'],
                             header=None,
                             low_memory=False)

        # Ensure datasets have the same number of rows
        min_rows = min(len(df), len(labels))
        df = df.iloc[:min_rows]
        labels = labels.iloc[:min_rows]

        # Convert labels to numeric; non-numeric entries become NaN.
        # NOTE(review): if the labels file carries a header line, the
        # truncation above would shift every label by one row relative to the
        # features -- confirm the CSV layout.
        labels['label'] = pd.to_numeric(labels['label'], errors='coerce')

        print(f"Dataset shape: {df.shape}")
        print(f"Labels shape: {labels.shape}")
        print(f"Label values found: {labels['label'].unique()}")

    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return

    # Step 2: Preprocess the data
    print("\nPreprocessing data...")
    n_features = df.shape[1]
    column_names = [f"feature_{i}" for i in range(n_features)]
    df.columns = column_names
    df['label'] = labels['label']

    # Step 3: Optimize memory usage
    print("Optimizing memory usage...")
    df = optimize_dtypes(df)

    # Step 4: Clean data
    print("Cleaning data...")
    # dropna() removes every row with a missing value in any column, which
    # includes rows whose label is NaN, so no separate label filter is needed.
    df = df.dropna()

    print(f"Shape after cleaning: {df.shape}")

    # Step 5: Split features and labels
    X = df.drop('label', axis=1)
    y = df['label']

    # Convert features and labels to CuPy arrays
    X = cp.array(X.to_numpy())
    y = cp.array(y.to_numpy())

    # Step 6: Split the data
    print("\nSplitting data into train and test sets...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Step 7: Feature selection
    print("Performing feature selection...")
    # f_classif is a scikit-learn routine operating on NumPy data, so the
    # training split is copied back to host memory for scoring.
    f_scores = f_classif(cp.asnumpy(X_train), cp.asnumpy(y_train))[0]
    # Constant features score NaN, and NaN does not order reliably in argsort
    # (NumPy sorts it last, i.e. into the "best" slice). Rank such features
    # lowest instead, as SelectKBest does.
    f_scores = np.where(np.isnan(f_scores), -np.inf, f_scores)
    scores = cp.array(f_scores)
    n_selected = min(30, X.shape[1])
    # Indices of the n_selected highest scores, best first.
    indices = cp.argsort(scores)[-n_selected:][::-1]
    X_train_selected = X_train[:, indices]
    X_test_selected = X_test[:, indices]

    # Verify shapes before training
    print(f"\nX_train_selected shape: {X_train_selected.shape}")
    print(f"y_train shape: {y_train.shape}")

    # Step 8: Train model
    print("\nTraining the model...")
    dtrain = xgb.DMatrix(cp.asnumpy(X_train_selected), label=cp.asnumpy(y_train))
    dtest = xgb.DMatrix(cp.asnumpy(X_test_selected), label=cp.asnumpy(y_test))
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'tree_method': 'hist',
        'device': 'cuda'
    }
    model = xgb.train(params, dtrain)

    # Step 9: Predictions
    print("Making predictions...")
    y_pred = model.predict(dtest)
    y_pred = np.round(y_pred)  # Convert probabilities to binary labels

    # Step 10: Evaluation (scikit-learn metrics need host arrays)
    print("\nModel Evaluation:")
    print("\nClassification Report:")
    print(classification_report(cp.asnumpy(y_test), y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(cp.asnumpy(y_test), y_pred))

    # Additional information
    print("\nProcess completed!")
    print("\nDataset Information:")
    print(f"Total samples: {len(df)}")
    print(f"Number of features: {X.shape[1]}")
    print(f"Number of selected features: {X_train_selected.shape[1]}")
    y_series = pd.Series(cp.asnumpy(y))
    print(f"Label distribution:\n{y_series.value_counts()}")

# Entry point: run the CuPy pipeline when this cell/module is executed as the
# main program (a notebook kernel also runs with __name__ == "__main__").
if __name__ == "__main__":
    main_cupy()


Loading dataset...
Dataset shape: (2278689, 115)
Labels shape: (2278689, 1)
Label values found: [nan  0.  1.]

Preprocessing data...
Optimizing memory usage...
Cleaning data...
Shape after cleaning: (2278688, 116)

Splitting data into train and test sets...
Performing feature selection...

X_train_selected shape: (1595081, 30)
y_train shape: (1595081,)

Training the model...
Making predictions...

Model Evaluation:

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.96      0.98    406644
         1.0       0.94      1.00      0.97    276963

    accuracy                           0.97    683607
   macro avg       0.97      0.98      0.97    683607
weighted avg       0.98      0.97      0.97    683607


Confusion Matrix:
[[389872  16772]
 [   616 276347]]

Process completed!

Dataset Information:
Total samples: 2278688
Number of features: 115
Number of selected features: 30
Label distribution:
0.0    1355473
1.0     923215
Name:

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb
from sklearnex import patch_sklearn

# Apply scikit-learn-intelex patches
patch_sklearn()

def optimize_dtypes(df):
    """Downcast ``float64`` / ``int64`` columns of ``df`` to smaller dtypes.

    Each ``float64`` column is replaced by
    ``pd.to_numeric(column, downcast='float')`` and each ``int64`` column by
    ``pd.to_numeric(column, downcast='integer')``. Other columns are skipped.

    Args:
        df: DataFrame to process; its columns are reassigned in place.

    Returns:
        The same DataFrame object, after downcasting.
    """
    def _downcast_mode(dtype):
        """Map a column dtype to a ``pd.to_numeric`` downcast mode, or None."""
        if dtype == 'float64':
            return 'float'
        if dtype == 'int64':
            return 'integer'
        return None

    for label in df.columns:
        mode = _downcast_mode(df[label].dtype)
        if mode is not None:
            df[label] = pd.to_numeric(df[label], downcast=mode)
    return df

def main_intelex():
    """Run the XGBoost pipeline with scikit-learn-intelex patching applied.

    Steps: read ``Active_Wiretap_dataset.csv`` and
    ``Active_Wiretap_labels.csv`` from the working directory, truncate both to
    a common row count, attach the numeric label column, downcast dtypes, drop
    rows containing NaN, split 70/30 (``random_state=42``), keep the
    ``min(30, n_features)`` best features by ANOVA F-score, train an XGBoost
    binary classifier and print the classification report, confusion matrix
    and some dataset statistics.

    Returns:
        None. If anything in the loading step raises, the error is printed and
        the function returns early instead of propagating the exception.
    """
    print("Loading dataset...")
    try:
        # Load main dataset
        df = pd.read_csv('Active_Wiretap_dataset.csv',
                         header=None,
                         low_memory=False)

        # Load labels and handle mixed types
        labels = pd.read_csv('Active_Wiretap_labels.csv',
                             names=['label'],
                             header=None,
                             low_memory=False)

        # Ensure datasets have the same number of rows
        min_rows = min(len(df), len(labels))
        df = df.iloc[:min_rows]
        labels = labels.iloc[:min_rows]

        # Convert labels to numeric; non-numeric entries become NaN.
        # NOTE(review): if the labels file carries a header line, the
        # truncation above would shift every label by one row relative to the
        # features -- confirm the CSV layout.
        labels['label'] = pd.to_numeric(labels['label'], errors='coerce')

        print(f"Dataset shape: {df.shape}")
        print(f"Labels shape: {labels.shape}")
        print(f"Label values found: {labels['label'].unique()}")

    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return

    # Step 2: Preprocess the data
    print("\nPreprocessing data...")
    n_features = df.shape[1]
    column_names = [f"feature_{i}" for i in range(n_features)]
    df.columns = column_names
    df['label'] = labels['label']

    # Step 3: Optimize memory usage
    print("Optimizing memory usage...")
    df = optimize_dtypes(df)

    # Step 4: Clean data
    print("Cleaning data...")
    # dropna() removes every row with a missing value in any column, which
    # includes rows whose label is NaN, so no separate label filter is needed.
    df = df.dropna()

    print(f"Shape after cleaning: {df.shape}")

    # Step 5: Split features and labels
    X = df.drop('label', axis=1)
    y = df['label']

    # Step 6: Split the data
    print("\nSplitting data into train and test sets...")
    # The module-level `from sklearn.model_selection import train_test_split`
    # ran before patch_sklearn() was called, so that name is still bound to
    # the stock implementation. Re-import here so that any patched version is
    # the one actually used.
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Step 7: Feature selection (fitted on the training split only)
    print("Performing feature selection...")
    selector = SelectKBest(score_func=f_classif, k=min(30, X.shape[1]))
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Step 8: Train model
    print("\nTraining the model...")
    dtrain = xgb.DMatrix(X_train_selected, label=y_train)
    dtest = xgb.DMatrix(X_test_selected, label=y_test)
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'tree_method': 'hist',
        'device': 'cuda'
    }
    model = xgb.train(params, dtrain)

    # Step 9: Predictions
    print("Making predictions...")
    y_pred = model.predict(dtest)
    y_pred = np.round(y_pred)  # Convert probabilities to binary labels

    # Step 10: Evaluation
    print("\nModel Evaluation:")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Additional information
    print("\nProcess completed!")

    print("\nDataset Information:")
    print(f"Total samples: {len(df)}")
    print(f"Number of features: {X.shape[1]}")
    print(f"Number of selected features: {X_train_selected.shape[1]}")
    # y is already a pandas Series, so it can be counted directly.
    print(f"Label distribution:\n{y.value_counts()}")

# Entry point: run the intelex pipeline when this cell/module is executed as
# the main program (a notebook kernel also runs with __name__ == "__main__").
if __name__ == "__main__":
    main_intelex()