In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, confusion_matrix
from lightgbm import LGBMClassifier
import warnings
import cupy as cp
from sklearnex import patch_sklearn

In [2]:
# Suppress FutureWarning messages so they do not clutter the cell output.
warnings.filterwarnings("ignore", category=FutureWarning)
def optimize_dtypes(df):
    """Downcast the 64-bit numeric columns of ``df`` to save memory.

    ``float64`` columns go through ``pd.to_numeric(..., downcast='float')`` and
    ``int64`` columns through ``pd.to_numeric(..., downcast='integer')``; all
    other columns are left as they are.

    Args:
        df: DataFrame to compact. Its columns are reassigned on the object
            that was passed in.

    Returns:
        The same DataFrame object, with the eligible columns downcast.
    """
    # (source dtype, pd.to_numeric downcast mode) pairs, checked in order.
    downcast_rules = (('float64', 'float'), ('int64', 'integer'))

    for name in df.columns:
        series = df[name]
        for dtype_name, mode in downcast_rules:
            if series.dtype == dtype_name:
                df[name] = pd.to_numeric(series, downcast=mode)
                break
    return df

def mainBareBones():
    """Run the baseline Active Wiretap classification pipeline and print the results.

    Steps: read the feature and label CSVs from the working directory, align
    them row-for-row, downcast numeric columns, drop incomplete rows, make a
    70/30 train/test split, keep the 30 highest-scoring features (ANOVA F-test,
    fitted on the training split only), train a LightGBM binary classifier and
    print a classification report and confusion matrix.

    Returns:
        None. Everything is reported via ``print``. If loading the CSVs fails,
        the error is printed and the function returns early.
    """
    print("Loading dataset...")
    try:
        # Load main dataset
        df = pd.read_csv('Active_Wiretap_dataset.csv',
                         header=None,
                         low_memory=False)

        # Load labels and handle mixed types
        labels = pd.read_csv('Active_Wiretap_labels.csv',
                            names=['label'],
                            header=None,
                            low_memory=False)

        # Convert labels to numeric; non-numeric entries become NaN
        labels['label'] = pd.to_numeric(labels['label'], errors='coerce')

        # The labels file is read with header=None. If it carries a header row
        # (exactly one extra row whose first entry is non-numeric), truncating
        # to the common length would pair feature row i with label i-1. Drop
        # that row *before* aligning so features and labels stay in step.
        # NOTE(review): inferred from the recorded run, where NaN is the first
        # label value seen and exactly one row is dropped - confirm against the file.
        if len(labels) == len(df) + 1 and pd.isna(labels['label'].iloc[0]):
            print("Dropping non-numeric header row from labels file")
            labels = labels.iloc[1:].reset_index(drop=True)

        # Ensure datasets have the same number of rows (and say so when they do not,
        # because truncation can hide a misalignment between the two files)
        if len(df) != len(labels):
            print(f"Warning: row count mismatch (features: {len(df)}, labels: {len(labels)}); "
                  "truncating to the shorter one")
        min_rows = min(len(df), len(labels))
        df = df.iloc[:min_rows]
        labels = labels.iloc[:min_rows]

        print(f"Dataset shape: {df.shape}")
        print(f"Labels shape: {labels.shape}")
        print(f"Label values found: {labels['label'].unique()}")

    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return

    # Step 2: Preprocess the data
    print("\nPreprocessing data...")
    n_features = df.shape[1]
    column_names = [f"feature_{i}" for i in range(n_features)]
    df.columns = column_names
    df['label'] = labels['label']

    # Step 3: Optimize memory usage
    print("Optimizing memory usage...")
    df = optimize_dtypes(df)

    # Step 4: Clean data
    print("Cleaning data...")
    # Remove any rows with missing values. This covers the 'label' column too,
    # so no separate NaN-label filter is needed afterwards.
    df = df.dropna()

    print(f"Shape after cleaning: {df.shape}")

    # Step 5: Split features and labels
    X = df.drop('label', axis=1)
    y = df['label']

    # Step 6: Split the data
    print("\nSplitting data into train and test sets...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Step 7: Feature selection (fitted on the training split only, then applied to the test split)
    print("Performing feature selection...")
    selector = SelectKBest(score_func=f_classif, k=min(30, X.shape[1]))
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Step 8: Train model using LightGBM
    print("\nTraining the model...")
    model = LGBMClassifier(
        objective='binary',
        random_state=42,
        force_col_wise=True,
    )
    model.fit(X_train_selected, y_train)

    # Step 9: Predictions
    print("Making predictions...")
    y_pred = model.predict(X_test_selected)

    # Step 10: Evaluation
    print("\nModel Evaluation:")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Additional information
    print("\nProcess completed!")

    print("\nDataset Information:")
    print(f"Total samples: {len(df)}")
    print(f"Number of features: {X.shape[1]}")
    print(f"Number of selected features: {X_train_selected.shape[1]}")
    print(f"Label distribution:\n{y.value_counts()}")

# Run the baseline pipeline when this code executes as the main module
# (the recorded output below this cell shows it runs when the cell is executed).
if __name__ == "__main__":
    mainBareBones()

Loading dataset...
Dataset shape: (2278689, 115)
Labels shape: (2278689, 1)
Label values found: [nan  0.  1.]

Preprocessing data...
Optimizing memory usage...
Cleaning data...
Shape after cleaning: (2278688, 116)

Splitting data into train and test sets...
Performing feature selection...

Training the model...
[LightGBM] [Info] Number of positive: 646252, number of negative: 948829
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 1595081, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.405153 -> initscore=-0.384039
[LightGBM] [Info] Start training from score -0.384039
Making predictions...

Model Evaluation:

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.96      0.98    406644
         1.0       0.94      1.00      0.97    276963

    accuracy                           0.98    683607
   macro avg       0.97      0.98      0.97    683607
weighted avg  

In [3]:
import cupy as cp
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, confusion_matrix
from lightgbm import LGBMClassifier

def optimize_dtypes(df):
    """Reduce the memory footprint of ``df`` by downcasting 64-bit numeric columns.

    Columns whose dtype is ``float64`` are downcast with
    ``pd.to_numeric(..., downcast='float')``; columns whose dtype is ``int64``
    with ``pd.to_numeric(..., downcast='integer')``. Other columns are skipped.

    Args:
        df: DataFrame whose columns are reassigned on the passed-in object.

    Returns:
        The same DataFrame object that was passed in.
    """
    # (source dtype, pd.to_numeric downcast mode) pairs, checked in order.
    downcast_rules = (('float64', 'float'), ('int64', 'integer'))

    for name in df.columns:
        series = df[name]
        for dtype_name, mode in downcast_rules:
            if series.dtype == dtype_name:
                df[name] = pd.to_numeric(series, downcast=mode)
                break
    return df

def mainCuPY():
    """Run the Active Wiretap classification pipeline with the split done on CuPy arrays.

    Same stages as the baseline pipeline: load and align the CSVs, downcast,
    clean, 70/30 split, select the 30 best features, train LightGBM, report.
    Features and labels are converted to CuPy arrays before the train/test
    split. Feature selection, LightGBM training and the metrics are all called
    with host (NumPy) arrays, so each split is copied back to the host exactly
    once instead of being re-uploaded and re-downloaded around every step.

    Returns:
        None. Everything is reported via ``print``. If loading the CSVs fails,
        the error is printed and the function returns early.
    """
    print("Loading dataset...")
    try:
        # Load main dataset
        df = pd.read_csv('Active_Wiretap_dataset.csv',
                         header=None,
                         low_memory=False)

        # Load labels and handle mixed types
        labels = pd.read_csv('Active_Wiretap_labels.csv',
                            names=['label'],
                            header=None,
                            low_memory=False)

        # Convert labels to numeric; non-numeric entries become NaN
        labels['label'] = pd.to_numeric(labels['label'], errors='coerce')

        # The labels file is read with header=None. If it carries a header row
        # (exactly one extra row whose first entry is non-numeric), truncating
        # to the common length would pair feature row i with label i-1. Drop
        # that row *before* aligning so features and labels stay in step.
        # NOTE(review): inferred from the recorded run, where NaN is the first
        # label value seen and exactly one row is dropped - confirm against the file.
        if len(labels) == len(df) + 1 and pd.isna(labels['label'].iloc[0]):
            print("Dropping non-numeric header row from labels file")
            labels = labels.iloc[1:].reset_index(drop=True)

        # Ensure datasets have the same number of rows (and say so when they do not,
        # because truncation can hide a misalignment between the two files)
        if len(df) != len(labels):
            print(f"Warning: row count mismatch (features: {len(df)}, labels: {len(labels)}); "
                  "truncating to the shorter one")
        min_rows = min(len(df), len(labels))
        df = df.iloc[:min_rows]
        labels = labels.iloc[:min_rows]

        print(f"Dataset shape: {df.shape}")
        print(f"Labels shape: {labels.shape}")
        print(f"Label values found: {labels['label'].unique()}")

    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return

    # Step 2: Preprocess the data
    print("\nPreprocessing data...")
    n_features = df.shape[1]
    column_names = [f"feature_{i}" for i in range(n_features)]
    df.columns = column_names
    df['label'] = labels['label']

    # Step 3: Optimize memory usage
    print("Optimizing memory usage...")
    df = optimize_dtypes(df)

    # Step 4: Clean data
    print("Cleaning data...")
    # Remove any rows with missing values. This covers the 'label' column too,
    # so no separate NaN-label filter is needed afterwards.
    df = df.dropna()

    print(f"Shape after cleaning: {df.shape}")

    # Step 5: Split features and labels
    X = df.drop('label', axis=1)
    y = df['label']

    # Convert features and labels to CuPy arrays
    X = cp.array(X)
    y = cp.array(y)

    # Step 6: Split the data
    print("\nSplitting data into train and test sets...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Everything downstream is fed host arrays, so bring each split back to
    # the host once here rather than converting at every call site.
    X_train_np = cp.asnumpy(X_train)
    X_test_np = cp.asnumpy(X_test)
    y_train_np = cp.asnumpy(y_train)
    y_test_np = cp.asnumpy(y_test)

    # Step 7: Feature selection (fitted on the training split only, then applied to the test split)
    print("Performing feature selection...")
    selector = SelectKBest(score_func=f_classif, k=min(30, X.shape[1]))
    X_train_selected = selector.fit_transform(X_train_np, y_train_np)
    X_test_selected = selector.transform(X_test_np)

    # Step 8: Train model using LightGBM
    print("\nTraining the model...")
    model = LGBMClassifier(
        objective='binary',
        random_state=42,
        force_col_wise=True,
    )
    model.fit(X_train_selected, y_train_np)

    # Step 9: Predictions
    print("Making predictions...")
    y_pred = model.predict(X_test_selected)

    # Step 10: Evaluation
    print("\nModel Evaluation:")
    print("\nClassification Report:")
    print(classification_report(y_test_np, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test_np, y_pred))

    # Additional information
    print("\nProcess completed!")

    print("\nDataset Information:")
    print(f"Total samples: {len(df)}")
    print(f"Number of features: {X.shape[1]}")
    print(f"Number of selected features: {X_train_selected.shape[1]}")
    print(f"Label distribution:\n{pd.Series(cp.asnumpy(y)).value_counts()}")

# Run the CuPy variant of the pipeline when this code executes as the main module
# (the recorded output below this cell shows it runs when the cell is executed).
if __name__ == "__main__":
    mainCuPY()

Loading dataset...
Dataset shape: (2278689, 115)
Labels shape: (2278689, 1)
Label values found: [nan  0.  1.]

Preprocessing data...
Optimizing memory usage...
Cleaning data...
Shape after cleaning: (2278688, 116)

Splitting data into train and test sets...
Performing feature selection...

Training the model...
[LightGBM] [Info] Number of positive: 646252, number of negative: 948829
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 1595081, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.405153 -> initscore=-0.384039
[LightGBM] [Info] Start training from score -0.384039
Making predictions...

Model Evaluation:

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.96      0.98    406644
         1.0       0.94      1.00      0.97    276963

    accuracy                           0.98    683607
   macro avg       0.97      0.98      0.97    683607
weighted avg  

In [4]:
# NOTE(review): the scikit-learn names used by later cells were imported in earlier cells,
# i.e. before this call. The Intel extension's documentation says to patch before importing
# scikit-learn - confirm the patched implementations are actually the ones being used.
patch_sklearn() # Apply Intel optimizations

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [5]:
def optimize_dtypes(df):
    """Downcast ``float64`` and ``int64`` columns of ``df`` to smaller dtypes.

    ``float64`` columns are converted with ``pd.to_numeric(..., downcast='float')``
    and ``int64`` columns with ``pd.to_numeric(..., downcast='integer')``.
    Columns of any other dtype are not touched.

    Args:
        df: DataFrame to compact; columns are reassigned on this object.

    Returns:
        The same DataFrame object that was passed in.
    """
    # (source dtype, pd.to_numeric downcast mode) pairs, checked in order.
    downcast_rules = (('float64', 'float'), ('int64', 'integer'))

    for name in df.columns:
        series = df[name]
        for dtype_name, mode in downcast_rules:
            if series.dtype == dtype_name:
                df[name] = pd.to_numeric(series, downcast=mode)
                break
    return df

def mainIntelex():
    """Run the Active Wiretap classification pipeline using Intel-patched scikit-learn.

    Same stages as the baseline pipeline: load and align the CSVs, downcast,
    clean, 70/30 split, select the 30 best features, train LightGBM, report.
    This function expects ``patch_sklearn()`` to have been called already and
    re-imports ``train_test_split`` locally so that the patched implementation
    is the one actually used.

    Returns:
        None. Everything is reported via ``print``. If loading the CSVs fails,
        the error is printed and the function returns early.
    """
    # The module-level `train_test_split` was bound by a `from ... import` in an
    # earlier cell, before patch_sklearn() ran. Per the Intel Extension for
    # Scikit-learn documentation, patching only affects imports made after the
    # call, so re-import here to pick up the patched function.
    # NOTE(review): the accelerated split may differ from stock scikit-learn's
    # for the same random_state - confirm before comparing runs.
    from sklearn.model_selection import train_test_split

    print("Loading dataset...")
    try:
        # Load main dataset
        df = pd.read_csv('Active_Wiretap_dataset.csv',
                         header=None,
                         low_memory=False)

        # Load labels and handle mixed types
        labels = pd.read_csv('Active_Wiretap_labels.csv',
                            names=['label'],
                            header=None,
                            low_memory=False)

        # Convert labels to numeric; non-numeric entries become NaN
        labels['label'] = pd.to_numeric(labels['label'], errors='coerce')

        # The labels file is read with header=None. If it carries a header row
        # (exactly one extra row whose first entry is non-numeric), truncating
        # to the common length would pair feature row i with label i-1. Drop
        # that row *before* aligning so features and labels stay in step.
        # NOTE(review): inferred from the recorded run, where NaN is the first
        # label value seen and exactly one row is dropped - confirm against the file.
        if len(labels) == len(df) + 1 and pd.isna(labels['label'].iloc[0]):
            print("Dropping non-numeric header row from labels file")
            labels = labels.iloc[1:].reset_index(drop=True)

        # Ensure datasets have the same number of rows (and say so when they do not,
        # because truncation can hide a misalignment between the two files)
        if len(df) != len(labels):
            print(f"Warning: row count mismatch (features: {len(df)}, labels: {len(labels)}); "
                  "truncating to the shorter one")
        min_rows = min(len(df), len(labels))
        df = df.iloc[:min_rows]
        labels = labels.iloc[:min_rows]

        print(f"Dataset shape: {df.shape}")
        print(f"Labels shape: {labels.shape}")
        print(f"Label values found: {labels['label'].unique()}")

    except Exception as e:
        print(f"Error loading data: {str(e)}")
        return

    # Step 2: Preprocess the data
    print("\nPreprocessing data...")
    n_features = df.shape[1]
    column_names = [f"feature_{i}" for i in range(n_features)]
    df.columns = column_names
    df['label'] = labels['label']

    # Step 3: Optimize memory usage
    print("Optimizing memory usage...")
    df = optimize_dtypes(df)

    # Step 4: Clean data
    print("Cleaning data...")
    # Remove any rows with missing values. This covers the 'label' column too,
    # so no separate NaN-label filter is needed afterwards.
    df = df.dropna()

    print(f"Shape after cleaning: {df.shape}")

    # Step 5: Split features and labels
    X = df.drop('label', axis=1)
    y = df['label']

    # Step 6: Split the data
    print("\nSplitting data into train and test sets...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Step 7: Feature selection (fitted on the training split only, then applied to the test split)
    print("Performing feature selection...")
    selector = SelectKBest(score_func=f_classif, k=min(30, X.shape[1]))
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)

    # Step 8: Train model using LightGBM
    print("\nTraining the model...")
    model = LGBMClassifier(
        objective='binary',
        random_state=42,
        force_col_wise=True,
    )
    model.fit(X_train_selected, y_train)

    # Step 9: Predictions
    print("Making predictions...")
    y_pred = model.predict(X_test_selected)

    # Step 10: Evaluation
    print("\nModel Evaluation:")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Additional information
    print("\nProcess completed!")

    print("\nDataset Information:")
    print(f"Total samples: {len(df)}")
    print(f"Number of features: {X.shape[1]}")
    print(f"Number of selected features: {X_train_selected.shape[1]}")
    print(f"Label distribution:\n{y.value_counts()}")

# Run the Intel-extension variant of the pipeline when this code executes as the main module
# (the recorded output below this cell shows it runs when the cell is executed).
if __name__ == "__main__":
    mainIntelex()

Loading dataset...
Dataset shape: (2278689, 115)
Labels shape: (2278689, 1)
Label values found: [nan  0.  1.]

Preprocessing data...
Optimizing memory usage...
Cleaning data...
Shape after cleaning: (2278688, 116)

Splitting data into train and test sets...
Performing feature selection...

Training the model...
[LightGBM] [Info] Number of positive: 646252, number of negative: 948829
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 1595081, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.405153 -> initscore=-0.384039
[LightGBM] [Info] Start training from score -0.384039
Making predictions...

Model Evaluation:

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.96      0.98    406644
         1.0       0.94      1.00      0.97    276963

    accuracy                           0.98    683607
   macro avg       0.97      0.98      0.97    683607
weighted avg  