In [2]:
import os
import thrember
import numpy as np
import pandas as pd
import polars as pl
import altair as alt
import lightgbm as lgb
import matplotlib.pylab as plt
from sklearn.metrics import roc_auc_score, roc_curve
# Select Altair's 'default' renderer for chart display. Because this is the last
# expression in the cell, the object returned by enable() is echoed as the cell output.
alt.renderers.enable('default')

  from .autonotebook import tqdm as notebook_tqdm


RendererRegistry.enable('default')

In [3]:
# Directory holding the unzipped dataset download. Set the EMBER2024_DATA_DIR
# environment variable to point somewhere else without editing the notebook;
# the literal below is only the fallback used when that variable is unset.
data_dir = os.environ.get(
    "EMBER2024_DATA_DIR",
    "/Users/zheng/VisualStudioCodeProjects/EMBER2024/database/",  # change this to where you unzipped the download
)

In [4]:
# Load the metadata for the three dataset splits from data_dir. read_metadata
# returns three objects, unpacked here as train / test / challenge.
# NOTE(review): presumably polars DataFrames (they are passed to pl.concat later) — confirm.
train_df, test_df, challenge_df = thrember.read_metadata(data_dir)

In [5]:
# Combine the train and test metadata and add a 'week' column: the number of
# whole 7-day periods between start_date and each sample's first submission.
plotdf = pl.concat([train_df, test_df])
start_date = pd.Timestamp("2023-09-24")
plotdf = plotdf.with_columns(
    # first_submission_date is interpreted as epoch seconds
    pl.from_epoch("first_submission_date", time_unit="s").alias("first_submission_dt")
)
plotdf = plotdf.with_columns(
    (
        (pl.col("first_submission_dt") - pl.lit(start_date)).dt.total_days() // 7
    ).cast(pl.Int64).alias("week")
)

print(plotdf.shape)

# Plot the number of files first seen in each week, colored by file type.
# The color domain is pinned so every file type always maps to the same color
# and the legend order matches the scale, regardless of which types are present
# in the data (without a domain the colors follow the inferred category order).
file_type_order = ["Win32", "Win64", "Dot_Net", "APK", "ELF", "PDF"]
file_type_colors = ["#4c78a8", "#54a24b", "#f58518", "#88d27a", "#9ecae9", "#ffbf79"]

gbdf = plotdf.group_by(["file_type", "week"]).agg(pl.len().alias("count"))
alt.Chart(gbdf).mark_bar().encode(
    alt.X('week:O', axis=alt.Axis(title='Week First Seen')),
    # The y channel is the per-week file count, so it is labeled as a count
    # (it was previously titled 'File Type', which is what the color encodes).
    alt.Y('count:Q', axis=alt.Axis(title='Number of Files')),
    alt.Color(
        'file_type:N',
        scale=alt.Scale(domain=file_type_order, range=file_type_colors),
        legend=alt.Legend(values=file_type_order),
    ),
)

(672000, 16)


In [6]:
# Get number of occurrences of each family label, most frequent first.
# Rows with a null family are dropped before counting. value_counts() yields a
# struct column named "family", which unnest() expands into the "family" and
# "count" columns shown in the output.
family_counts = (
    plotdf.filter(pl.col("family").is_not_null())
          .select(pl.col("family").value_counts())
          .unnest("family")
          .sort("count", descending=True)
)
family_counts.head(10)

family,count
str,u32
"""wacatac""",20603
"""jiagu""",9515
"""mirai""",9246
"""agenttesla""",9075
"""xworm""",8114
"""spymax""",7181
"""wapron""",6963
"""heracles""",6274
"""njrat""",5622
"""mobidash""",4038


In [7]:
# Count how often each behavior tag occurs. Samples whose "behavior" list is
# empty are dropped first; explode() then yields one row per (sample, tag) pair.
has_behavior = pl.col("behavior").list.len() > 0
plotdf_explode = plotdf.filter(has_behavior).explode("behavior")
behavior_counts = (
    plotdf_explode
    .group_by("behavior")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)
behavior_counts.head(10)

behavior,count
str,u32
"""backdoor""",30317
"""phishing""",21778
"""spyware""",18817
"""dropper""",10230
"""adware""",9798
"""downloader""",7560
"""banker""",6155
"""pua""",4905
"""riskware""",4266
"""webshell""",3866


In [8]:
# Database path for all file types (APK + .NET + PDF + ELF).
# Reuse the dataset location configured in data_dir rather than repeating the
# path literal, so the notebook has a single place to change it.
data_path = data_dir

print("=== Processing Multi-File-Type Dataset ===")
print("Dataset contains: APK, .NET, PDF, ELF files")

# 1. Vectorise features for all file types.
# This is the slow step: the recorded run below spent roughly 28 minutes here.
print("Creating vectorized features for all file types...")
thrember.create_vectorized_features(data_path)

# 2. Read vectorised data: one (features, labels) pair per split
print("Reading vectorized data...")
X_train, y_train = thrember.read_vectorized_features(data_path, subset="train")
X_test, y_test = thrember.read_vectorized_features(data_path, subset="test")
X_challenge, y_challenge = thrember.read_vectorized_features(data_path, subset="challenge")

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
print(f"Challenge set shape: {X_challenge.shape}")
print("Multi-file-type dataset ready for training!")

=== Processing Multi-File-Type Dataset ===
Dataset contains: APK, .NET, PDF, ELF files
Creating vectorized features for all file types...
Preparing to vectorize raw features
Vectorizing training set


100%|██████████| 546000/546000 [22:56<00:00, 396.78it/s]


Vectorizing test set


100%|██████████| 126000/126000 [05:07<00:00, 409.50it/s]


Vectorizing challenge set


100%|██████████| 6315/6315 [00:15<00:00, 396.22it/s]


Reading vectorized data...
Training set shape: (546000, 2568)
Test set shape: (126000, 2568)
Challenge set shape: (6315, 2568)
Multi-file-type dataset ready for training!


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
import time

print("--- Extended Experiment A: Model Comparison on Multi-File-Type Dataset ---")

# 1. Candidate classifiers, keyed by the display name used in the log lines and
#    in the final results table.
# NOTE(review): Logistic Regression uses max_iter=100 here but max_iter=1000 in
# the per-file-type experiment further down — confirm which one is intended.
models = {
    "Logistic Regression": LogisticRegression(
        solver='liblinear', max_iter=100, random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=100, n_jobs=-1, random_state=42
    ),
    "LightGBM": lgb.LGBMClassifier(
        n_estimators=500, num_leaves=64, min_child_samples=100, random_state=42
    ),
    "MLP Classifier": MLPClassifier(max_iter=300, random_state=42),
}

# 2. Fit every model on the full training set and score it on the test set
results = []
for model_name, model in models.items():
    print(f"\n--- Processing Model: {model_name} ---")

    # Only the fit() call is timed; prediction time is not included
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time

    # ROC AUC is computed from the second predict_proba column
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    record = {
        "Model": model_name,
        "ROC AUC": roc_auc,
        "Training Time (s)": training_time
    }
    results.append(record)

    print(f"Completed. ROC AUC: {roc_auc:.4f}, Training Time: {training_time:.2f} seconds")

# 3. Summarise all models in one table, best ROC AUC first
results_df = pd.DataFrame(results)
ranked = results_df.sort_values(by="ROC AUC", ascending=False)
print("\n\n--- Multi-File-Type Dataset Performance Comparison ---")
print(ranked.to_markdown(index=False))

--- Extended Experiment A: Model Comparison on Multi-File-Type Dataset ---

--- Processing Model: Logistic Regression ---
Completed. ROC AUC: 0.6604, Training Time: 62.33 seconds

--- Processing Model: Random Forest ---
Completed. ROC AUC: 0.9882, Training Time: 151.05 seconds

--- Processing Model: LightGBM ---
[LightGBM] [Info] Number of positive: 273000, number of negative: 273000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.671125 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 188027
[LightGBM] [Info] Number of data points in the train set: 546000, number of used features: 1040
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




Completed. ROC AUC: 0.9923, Training Time: 130.23 seconds

--- Processing Model: MLP Classifier ---
Completed. ROC AUC: 0.6781, Training Time: 90.76 seconds


--- Multi-File-Type Dataset Performance Comparison ---
| Model               |   ROC AUC |   Training Time (s) |
|:--------------------|----------:|--------------------:|
| LightGBM            |  0.992314 |            130.228  |
| Random Forest       |  0.988226 |            151.055  |
| MLP Classifier      |  0.67812  |             90.7587 |
| Logistic Regression |  0.660355 |             62.3295 |


In [12]:
print("--- Extended Experiment B: Performance Analysis on Individual File Types ---")

# Step 1: Read metadata to get file type information
print("Reading metadata to identify file types...")
# Reuse the dataset location configured in data_dir instead of repeating the literal
database_path = data_dir
train_df, test_df, challenge_df = thrember.read_metadata(database_path)

print("Metadata loaded:")
print(f"  Training samples: {len(train_df)}")
print(f"  Test samples: {len(test_df)}")

# Step 2: Check file type distribution
print("\nFile type distribution in training set:")
print(train_df['file_type'].value_counts())

print("\nFile type distribution in test set:")
print(test_df['file_type'].value_counts())

# Step 3: Create file type mapping for vectorized data
# We need to match the order of samples in vectorized data with metadata
print("\nCreating file type labels for vectorized data...")

# Get file types for training and test sets as NumPy arrays so they can be
# compared element-wise to build boolean row masks
train_file_types = train_df['file_type'].to_numpy()
test_file_types = test_df['file_type'].to_numpy()

print(f"Training file types shape: {train_file_types.shape}")
print(f"Test file types shape: {test_file_types.shape}")
print(f"Vectorized training data shape: {X_train.shape}")
print(f"Vectorized test data shape: {X_test.shape}")

# Verify data alignment.
# NOTE(review): this only compares row counts; it assumes the metadata rows are
# in the same order as the vectorized rows — confirm against thrember.
if len(train_file_types) == X_train.shape[0] and len(test_file_types) == X_test.shape[0]:
    print("Data alignment verified!")
else:
    print("Data alignment issue detected!")
    print(f"Metadata train samples: {len(train_file_types)}, Vectorized train samples: {X_train.shape[0]}")
    print(f"Metadata test samples: {len(test_file_types)}, Vectorized test samples: {X_test.shape[0]}")
    # Fail fast: masks built from misaligned metadata cannot index the feature
    # matrices correctly, so continuing would break the per-file-type analysis.
    raise ValueError(
        "Metadata and vectorized features have different numbers of rows; "
        "per-file-type masks cannot be applied."
    )

--- Extended Experiment B: Performance Analysis on Individual File Types ---
Reading metadata to identify file types...
Metadata loaded:
  Training samples: 546000
  Test samples: 126000

File type distribution in training set:
shape: (4, 2)
┌───────────┬────────┐
│ file_type ┆ count  │
│ ---       ┆ ---    │
│ str       ┆ u32    │
╞═══════════╪════════╡
│ ELF       ┆ 26000  │
│ APK       ┆ 208000 │
│ PDF       ┆ 52000  │
│ Dot_Net   ┆ 260000 │
└───────────┴────────┘

File type distribution in test set:
shape: (4, 2)
┌───────────┬───────┐
│ file_type ┆ count │
│ ---       ┆ ---   │
│ str       ┆ u32   │
╞═══════════╪═══════╡
│ Dot_Net   ┆ 60000 │
│ APK       ┆ 48000 │
│ PDF       ┆ 12000 │
│ ELF       ┆ 6000  │
└───────────┴───────┘

Creating file type labels for vectorized data...
Training file types shape: (546000,)
Test file types shape: (126000,)
Vectorized training data shape: (546000, 2568)
Vectorized test data shape: (126000, 2568)
Data alignment verified!


In [14]:
 # Step 4: Perform model comparison on each file type
print("--- Starting Individual File Type Analysis ---")

# Define file types to analyze
file_types_to_analyze = ['APK', 'Dot_Net', 'PDF', 'ELF']

# Define models (keyed by the display name used in logs and results).
# The same estimator objects are re-fitted for every file type.
# NOTE(review): Logistic Regression uses max_iter=1000 here but max_iter=100 in
# the earlier whole-dataset experiment — confirm which one is intended.
models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000, solver='liblinear'),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    "LightGBM": lgb.LGBMClassifier(n_estimators=500, num_leaves=64, min_child_samples=100, random_state=42),
    "MLP Classifier": MLPClassifier(random_state=42, max_iter=300)
}

# Store all results: one record per (file type, model) pair
all_results = []

# Outer loop: iterate through file types
for file_type in file_types_to_analyze:
    print(f"\n{'='*60}")
    print(f"ANALYZING FILE TYPE: {file_type}")
    print(f"{'='*60}")

    # Create boolean row masks for current file type
    train_mask = train_file_types == file_type
    test_mask = test_file_types == file_type

    # Extract data for current file type
    X_train_subset = X_train[train_mask]
    y_train_subset = y_train[train_mask]
    X_test_subset = X_test[test_mask]
    y_test_subset = y_test[test_mask]

    print(f"Training samples: {X_train_subset.shape[0]}")
    print(f"Test samples: {X_test_subset.shape[0]}")

    # Skip file types that cannot be trained or scored instead of failing
    # part-way through the run: fitting needs samples, and both
    # predict_proba(...)[:, 1] and ROC AUC need two classes to be present.
    if X_train_subset.shape[0] == 0 or X_test_subset.shape[0] == 0:
        print(f"Skipping {file_type}: no samples in the train or test split.")
        continue
    if np.unique(y_train_subset).size < 2 or np.unique(y_test_subset).size < 2:
        print(f"Skipping {file_type}: train or test split contains a single class.")
        continue

    print(f"Positive ratio (train): {y_train_subset.mean():.3f}")
    print(f"Positive ratio (test): {y_test_subset.mean():.3f}")

    # Inner loop: iterate through models
    for model_name, model in models.items():
        print(f"\n--- Processing {model_name} on {file_type} ---")
        # Training (only the fit() call is timed)
        start_time = time.time()
        model.fit(X_train_subset, y_train_subset)
        training_time = time.time() - start_time

        # Prediction: second predict_proba column is used as the score
        y_pred_proba = model.predict_proba(X_test_subset)[:, 1]

        # Evaluation
        roc_auc = roc_auc_score(y_test_subset, y_pred_proba)

        # Store results
        all_results.append({
            "File_Type": file_type,
            "Model": model_name,
            "ROC_AUC": roc_auc,
            "Training_Time": training_time,
            "Train_Samples": X_train_subset.shape[0],
            "Test_Samples": X_test_subset.shape[0]
        })

        print(f"ROC AUC: {roc_auc:.4f}, Training Time: {training_time:.2f}s")

print(f"\n{'='*60}")
print("ALL FILE TYPE ANALYSIS COMPLETED!")
print(f"{'='*60}")

--- Starting Individual File Type Analysis ---

ANALYZING FILE TYPE: APK
Training samples: 208000
Test samples: 48000
Positive ratio (train): 0.500
Positive ratio (test): 0.500

--- Processing Logistic Regression on APK ---
ROC AUC: 0.6239, Training Time: 11.36s

--- Processing Random Forest on APK ---
ROC AUC: 0.9776, Training Time: 52.26s

--- Processing LightGBM on APK ---
[LightGBM] [Info] Number of positive: 104000, number of negative: 104000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.267727 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 171614
[LightGBM] [Info] Number of data points in the train set: 208000, number of used features: 686
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




ROC AUC: 0.9871, Training Time: 60.23s

--- Processing MLP Classifier on APK ---
ROC AUC: 0.5598, Training Time: 107.06s

ANALYZING FILE TYPE: Dot_Net
Training samples: 260000
Test samples: 60000
Positive ratio (train): 0.500
Positive ratio (test): 0.500

--- Processing Logistic Regression on Dot_Net ---
ROC AUC: 0.6842, Training Time: 25.72s

--- Processing Random Forest on Dot_Net ---
ROC AUC: 0.9951, Training Time: 63.86s

--- Processing LightGBM on Dot_Net ---
[LightGBM] [Info] Number of positive: 130000, number of negative: 130000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.874984 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 191471
[LightGBM] [Info] Number of data points in the train set: 260000, number of used features: 1039
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




ROC AUC: 0.9981, Training Time: 75.94s

--- Processing MLP Classifier on Dot_Net ---
ROC AUC: 0.8592, Training Time: 34.30s

ANALYZING FILE TYPE: PDF
Training samples: 52000
Test samples: 12000
Positive ratio (train): 0.500
Positive ratio (test): 0.500

--- Processing Logistic Regression on PDF ---
ROC AUC: 0.7713, Training Time: 3.45s

--- Processing Random Forest on PDF ---
ROC AUC: 0.9890, Training Time: 9.65s

--- Processing LightGBM on PDF ---
[LightGBM] [Info] Number of positive: 26000, number of negative: 26000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.097170 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 159800
[LightGBM] [Info] Number of data points in the train set: 52000, number of used features: 664
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




ROC AUC: 0.9912, Training Time: 24.74s

--- Processing MLP Classifier on PDF ---
ROC AUC: 0.8144, Training Time: 10.43s

ANALYZING FILE TYPE: ELF
Training samples: 26000
Test samples: 6000
Positive ratio (train): 0.500
Positive ratio (test): 0.500

--- Processing Logistic Regression on ELF ---
ROC AUC: 0.6635, Training Time: 1.09s

--- Processing Random Forest on ELF ---
ROC AUC: 0.9903, Training Time: 3.91s

--- Processing LightGBM on ELF ---
[LightGBM] [Info] Number of positive: 13000, number of negative: 13000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.054045 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 166948
[LightGBM] [Info] Number of data points in the train set: 26000, number of used features: 683
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000




ROC AUC: 0.9931, Training Time: 18.44s

--- Processing MLP Classifier on ELF ---
ROC AUC: 0.8692, Training Time: 16.92s

ALL FILE TYPE ANALYSIS COMPLETED!
