**Importing Libraries**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import re
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

**Loading the Dataset**

In [None]:
# Locations of the three source CSV files on the mounted Google Drive.
# Each file gets its own name: assigning all three to one variable would keep
# only the last path and silently discard the other two.
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks"
ALEXA_PATH = f"{DATA_DIR}/Alexa.csv"      # Alexa domain list
DGA_PATH = f"{DATA_DIR}/dga.csv"          # main DGA domain list
EXTRA_DGA_PATH = f"{DATA_DIR}/extra.csv"  # additional DGA domains

# Backward compatibility: `dataset_path` keeps the value it previously ended
# up with (the extra DGA file).
dataset_path = EXTRA_DGA_PATH

**Loading the datasets and checking their initial shape**

In [None]:
import pandas as pd

# Alexa file: read with no header row, so pandas numbers the columns itself.
alexa_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/Alexa.csv',
    header=None,
)

# DGA file: read with the explicit column names 'rank' and 'domain'.
dga_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/dga.csv',
    names=['rank', 'domain'],
)

# Preview the first rows of both frames to check how they were parsed.
alexa_preview = alexa_df.head()
dga_preview = dga_df.head()
print("First few rows of Alexa dataset:\n", alexa_preview)
print("\nFirst few rows of DGA dataset:\n", dga_preview)

First few rows of Alexa dataset:
    0              1
0  1     google.com
1  2    youtube.com
2  3   facebook.com
3  4      baidu.com
4  5  wikipedia.org

First few rows of DGA dataset:
              rank  domain
0          domain     NaN
1  mzvbfkkoij.com     NaN
2  dxczoqvzpc.com     NaN
3  gezufojmci.com     NaN
4  vgtsavhzfg.com     NaN


In [None]:
# Read the DGA file again, header-less, into the columns 'rank' and 'domain'.
dga_raw_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/dga.csv',
    header=None,
    names=['rank', 'domain'],
)

# The domain strings are parsed into 'rank', so copy them across to 'domain'.
dga_raw_df['domain'] = dga_raw_df['rank']

# Keep only 'domain' and discard rows without a value. The filter runs on a
# temporary frame so no extra reference to the unfiltered data is kept around.
dga_cleaned_df = (
    dga_raw_df
    .drop(columns=['rank'])
    .loc[lambda frame: frame['domain'].notna()]
)

# Normalise case so the same domain is always spelled the same way.
dga_cleaned_df['domain'] = dga_cleaned_df['domain'].str.lower()

# Report the size and a sample of the cleaned data.
cleaned_shape = dga_cleaned_df.shape
print("Cleaned DGA dataset shape:", cleaned_shape)
print("First few rows of cleaned DGA dataset:\n", dga_cleaned_df.head())


Cleaned DGA dataset shape: (499002, 1)
First few rows of cleaned DGA dataset:
            domain
0          domain
1  mzvbfkkoij.com
2  dxczoqvzpc.com
3  gezufojmci.com
4  vgtsavhzfg.com


In [None]:
# Keep only the domain column of the Alexa data and call it 'domain'.
# The guard makes this cell safe to re-run: once the integer column 1 has been
# replaced by 'domain', selecting [[1]] again would raise a KeyError.
if 1 in alexa_df.columns:
    alexa_df = alexa_df[[1]].rename(columns={1: "domain"})

# Remove the leftover header row (the literal value 'domain') from the DGA data.
# Filtering is naturally idempotent: a second run finds nothing to remove.
dga_cleaned_df = dga_cleaned_df[dga_cleaned_df['domain'] != "domain"]

# Verify both cleaned datasets
print("Cleaned DGA dataset shape:", dga_cleaned_df.shape)
print("Cleaned Alexa dataset shape:", alexa_df.shape)
# Print first few rows of Alexa dataset
print("\nFirst few rows of Alexa dataset:\n", alexa_df.head())
# Print first few rows of Cleaned DGA dataset
print("First few rows of cleaned DGA dataset:\n", dga_cleaned_df.head())


Cleaned DGA dataset shape: (499001, 1)
Cleaned Alexa dataset shape: (1000000, 1)

First few rows of Alexa dataset:
           domain
0     google.com
1    youtube.com
2   facebook.com
3      baidu.com
4  wikipedia.org
First few rows of cleaned DGA dataset:
            domain
1  mzvbfkkoij.com
2  dxczoqvzpc.com
3  gezufojmci.com
4  vgtsavhzfg.com
5  fvbvfmwlcn.com


In [None]:
# Requires dga_cleaned_df and alexa_df from the earlier cells.
# Drop any row whose value is the literal header text 'domain'.
is_real_domain = dga_cleaned_df['domain'].ne("domain")
dga_cleaned_df = dga_cleaned_df.loc[is_real_domain]

# Sizes of both cleaned frames
dga_shape = dga_cleaned_df.shape
alexa_shape = alexa_df.shape
print("Cleaned DGA dataset shape:", dga_shape)
print("Cleaned Alexa dataset shape:", alexa_shape)

# Sample rows from the Alexa data
print("\nFirst few rows of Alexa dataset:\n", alexa_df.head())

# Sample rows from the cleaned DGA data
print("First few rows of cleaned DGA dataset:\n", dga_cleaned_df.head())

Cleaned DGA dataset shape: (499001, 1)
Cleaned Alexa dataset shape: (1000000, 1)

First few rows of Alexa dataset:
           domain
0     google.com
1    youtube.com
2   facebook.com
3      baidu.com
4  wikipedia.org
First few rows of cleaned DGA dataset:
            domain
1  mzvbfkkoij.com
2  dxczoqvzpc.com
3  gezufojmci.com
4  vgtsavhzfg.com
5  fvbvfmwlcn.com


**Adding extra DGA data from 360**

In [None]:
# Read the supplementary DGA file; its first row is used as the header.
extra_dga_path = "/content/drive/MyDrive/Colab Notebooks/extra.csv"
extra_dga_df = pd.read_csv(extra_dga_path)

# Report its size and show a sample of rows.
extra_shape = extra_dga_df.shape
extra_preview = extra_dga_df.head()
print("Additional DGA dataset shape:", extra_shape)
print("First few rows of additional DGA dataset:\n", extra_preview)

Additional DGA dataset shape: (10000, 1)
First few rows of additional DGA dataset:
                              Domain
0               suggestmoredue.link
1    admin.leaveacceptablerock.link
2              meanhardpositive.net
3  server.killcontentexternal.click
4            hearconsciousowner.net


**Combine the Additional DGA Data with the Existing DGA Data**

In [None]:
# Make sure the extra data exposes its domains under the column name 'domain'.
# rename() returns a new frame, so the loaded frame is not modified in place.
if 'domain' not in extra_dga_df.columns:
    extra_dga_df = extra_dga_df.rename(columns={extra_dga_df.columns[0]: 'domain'})

# Give the extra rows the same clean-up the main DGA rows received
# (drop missing values, lower-case the domain) and keep only the 'domain'
# column, so both sources are normalised identically before they are merged.
extra_dga_clean = extra_dga_df[['domain']].dropna(subset=['domain'])
extra_dga_clean = extra_dga_clean.assign(domain=extra_dga_clean['domain'].str.lower())

# Combine the additional DGA data with the existing DGA data
dga_combined_df = pd.concat([dga_cleaned_df, extra_dga_clean], ignore_index=True)

# Verify the combined DGA dataset
print("Combined DGA dataset shape:", dga_combined_df.shape)

Combined DGA dataset shape: (509001, 1)


In [None]:
# Mark every row of the combined DGA data as the positive class (label 1).
dga_combined_df = dga_combined_df.assign(label=1)

# Show the resulting size and a sample of rows.
labeled_shape = dga_combined_df.shape
print("Labeled DGA dataset shape:", labeled_shape)
print("First few rows of labeled DGA dataset:\n", dga_combined_df.head())

Labeled DGA dataset shape: (509001, 2)
First few rows of labeled DGA dataset:
            domain  label
0  mzvbfkkoij.com      1
1  dxczoqvzpc.com      1
2  gezufojmci.com      1
3  vgtsavhzfg.com      1
4  fvbvfmwlcn.com      1


In [None]:
# Row/column count of the Alexa data before it is labeled and merged.
alexa_shape = alexa_df.shape
print("Cleaned Alexa dataset shape:", alexa_shape)

Cleaned Alexa dataset shape: (1000000, 1)


In [None]:
# Class labels: 1 for DGA domains, 0 for Alexa domains.
dga_combined_df = dga_combined_df.assign(label=1)
alexa_df = alexa_df.assign(label=0)

# Stack both classes into one frame with a fresh 0..n-1 index (DGA rows first).
labeled_frames = [dga_combined_df, alexa_df]
combined_df = pd.concat(labeled_frames, ignore_index=True)

# Show the merged size and a sample of rows.
print("Combined dataset shape:", combined_df.shape)
print("First few rows of combined dataset:\n", combined_df.head())

Combined dataset shape: (1509001, 2)
First few rows of combined dataset:
            domain  label
0  mzvbfkkoij.com      1
1  dxczoqvzpc.com      1
2  gezufojmci.com      1
3  vgtsavhzfg.com      1
4  fvbvfmwlcn.com      1


**Balancing the Datasets to 500k each**

In [None]:
# Separate the datasets by class
dga_df = combined_df[combined_df['label'] == 1]
alexa_df = combined_df[combined_df['label'] == 0]

# Target size per class. It is capped at the size of the smaller class so the
# result is always balanced and sampling without replacement can never ask for
# more rows than exist.
TARGET_PER_CLASS = 500000
n_per_class = min(TARGET_PER_CLASS, len(dga_df), len(alexa_df))
assert n_per_class > 0, "both classes need at least one row to build a balanced set"


def _downsample(frame, n_samples, seed=42):
    """Return `frame` reduced to `n_samples` rows, sampled without replacement.

    Frames that already have `n_samples` rows or fewer are returned unchanged.
    """
    if len(frame) > n_samples:
        return resample(frame, replace=False, n_samples=n_samples, random_state=seed)
    return frame


# Downsample each class to the common size
dga_df = _downsample(dga_df, n_per_class)
alexa_df = _downsample(alexa_df, n_per_class)

# Combine the balanced datasets
balanced_df = pd.concat([dga_df, alexa_df], ignore_index=True)

# Shuffle so the two classes are interleaved, then renumber the rows
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Verify the balanced dataset
print("Balanced dataset shape:", balanced_df.shape)
print("Class distribution:\n", balanced_df['label'].value_counts())

Balanced dataset shape: (1000000, 2)
Class distribution:
 label
0    500000
1    500000
Name: count, dtype: int64


**Save the Balanced Dataset**

In [None]:
# Write the balanced data to Google Drive as CSV, leaving out the row index.
output_path = "/content/drive/MyDrive/Colab Notebooks/balanced_dataset.csv"
balanced_df.to_csv(output_path, index=False)

# Confirm where the file was written.
saved_message = f"Dataset saved to {output_path}"
print(saved_message)

Dataset saved to /content/drive/MyDrive/Colab Notebooks/balanced_dataset.csv


**Split the Data into Training and Testing Sets**

Split the balanced dataset into training (80%) and testing (20%) sets.

In [None]:
# Features are the domain strings; targets are the 0/1 labels.
X, y = balanced_df['domain'], balanced_df['label']

# Hold out 20% of the rows for testing, with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the size of each part.
train_shape = X_train.shape
test_shape = X_test.shape
print("Training data shape:", train_shape)
print("Testing data shape:", test_shape)

Training data shape: (800000,)
Testing data shape: (200000,)


**Feature Extraction for Each Model Component**

**1. FastText Embeddings**

In [None]:
!pip install fasttext
import fasttext
import numpy as np

# Save the training data to a temporary file
with open("train_data.txt", "w") as f:
    for domain in X_train.tolist():
        f.write(domain + "\n")

# Train FastText model on the temporary file
fasttext_model = fasttext.train_unsupervised("train_data.txt", model='skipgram', dim=100)

# Look up one FastText vector per input text
def get_fasttext_embeddings(texts, model):
    """Return an array with one row per text, using `model.get_word_vector`.

    Each text is passed to the model as a single token.
    """
    vectors = list(map(model.get_word_vector, texts))
    return np.array(vectors)

# Embed the training and the testing domains with the trained model.
X_train_fasttext, X_test_fasttext = (
    get_fasttext_embeddings(split.tolist(), fasttext_model)
    for split in (X_train, X_test)
)

# Report the shape of both embedding matrices.
print("Training FastText embeddings shape:", X_train_fasttext.shape)
print("Testing FastText embeddings shape:", X_test_fasttext.shape)

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313473 sha256=013d270b20429eddc4a47882b21d2e2e9491df8e0436d6f1a68af5beeb4fb304
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a51

**2. Graph CNN**

Convert domain names into character-level graphs.

In [None]:
import networkx as nx
import numpy as np

# Build a character graph from a domain name
def domain_to_graph(domain):
    """Return an undirected graph linking each character to the one after it.

    Nodes are the distinct characters of `domain`; every pair of adjacent
    characters contributes one edge. A domain shorter than two characters
    yields an empty graph.
    """
    graph = nx.Graph()
    # zip pairs character i with character i + 1
    graph.add_edges_from(zip(domain, domain[1:]))
    return graph

# One character graph per domain, for the training and the testing split
X_train_graphs = list(map(domain_to_graph, X_train))
X_test_graphs = list(map(domain_to_graph, X_test))

**3. LSTM (LLN)**

Use character-level sequences for the LSTM.

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Every sequence is padded (at the end) or cut to this fixed length.
max_len = 50

# Character-level tokenizer; the character vocabulary comes from the training domains only.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(X_train)

# Encode each split as character indices, then pad to max_len.
X_train_seq, X_test_seq = (
    pad_sequences(tokenizer.texts_to_sequences(split), maxlen=max_len, padding='post')
    for split in (X_train, X_test)
)

**4. SVM (Traditional Features)**

Extract traditional features like domain length, entropy, and n-grams. The cell below computes only the character n-gram (TF-IDF) features.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over character n-grams of length 3 to 5.
tfidf_options = {'analyzer': 'char', 'ngram_range': (3, 5)}
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary and weights are learned from the training domains only;
# the test domains are transformed with that fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

**Step 3: Build the Hybrid Model**

Combine the outputs of FastText, Graph CNN, LSTM, and SVM using a meta-classifier.

In [None]:
# Import necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

# Seed Python, NumPy and TensorFlow so the LSTM training run is repeatable,
# in line with the random_state=42 used for sampling and for the train/test split.
tf.keras.utils.set_random_seed(42)

# Define individual models
# Linear-kernel SVM; probability=True makes the fitted model expose predict_proba.
svm_model = SVC(kernel='linear', probability=True)

# Define LSTM model: character embedding -> LSTM -> sigmoid output.
# input_dim is the character vocabulary size plus one, because the tokenizer's
# indices start at 1 and 0 is the value used for padding.
# `input_length` is not passed: recent Keras releases deprecate that argument
# and take the sequence length from the data instead.
lstm_model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64),
    LSTM(64),
    Dense(1, activation='sigmoid')
])

# Compile the LSTM model for binary classification
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the LSTM model on the padded character sequences; 20% of the training
# rows are held back by Keras for validation.
lstm_model.fit(X_train_seq, y_train, epochs=5, batch_size=64, validation_split=0.2)

# Combine models using a stacking classifier
# A logistic regression is the final estimator that combines the base models' outputs.
meta_classifier = LogisticRegression()
# NOTE(review): lstm_model is a Keras Sequential model, not a scikit-learn
# estimator. StackingClassifier is expected to reject it (or fail to clone it)
# when fit() runs. No output is recorded for this fit or for any later cell,
# so confirm this cell has ever run to completion.
hybrid_model = StackingClassifier(
    estimators=[
        ('svm', svm_model),
        ('lstm', lstm_model)
    ],
    final_estimator=meta_classifier
)

# Train the hybrid model
# NOTE(review): the stack is fitted on the TF-IDF matrix only, whereas
# lstm_model was trained on X_train_seq (padded character indices). The two
# base models expect different inputs, so they cannot share one X here.
# NOTE(review): a linear-kernel SVC with probability=True on the full training
# set may be too slow to finish; verify the runtime or consider a linear solver.
hybrid_model.fit(X_train_tfidf, y_train)



Epoch 1/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m455s[0m 45ms/step - accuracy: 0.8475 - loss: 0.3278 - val_accuracy: 0.9302 - val_loss: 0.1704
Epoch 2/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m508s[0m 46ms/step - accuracy: 0.9340 - loss: 0.1624 - val_accuracy: 0.9441 - val_loss: 0.1415
Epoch 3/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m447s[0m 40ms/step - accuracy: 0.9482 - loss: 0.1322 - val_accuracy: 0.9527 - val_loss: 0.1244
Epoch 4/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m451s[0m 41ms/step - accuracy: 0.9565 - loss: 0.1145 - val_accuracy: 0.9556 - val_loss: 0.1153
Epoch 5/5
[1m10000/10000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m405s[0m 41ms/step - accuracy: 0.9608 - loss: 0.1033 - val_accuracy: 0.9596 - val_loss: 0.1050


**1: Predict on Test Data**

In [None]:
# Hard class predictions for the test TF-IDF features.
y_pred = hybrid_model.predict(X_test_tfidf)

# Per-class probabilities; the second column is kept as the score used for
# the ROC and precision-recall curves.
class_probabilities = hybrid_model.predict_proba(X_test_tfidf)
y_pred_proba = class_probabilities[:, 1]

**2: Calculate Evaluation Metrics**

Compute accuracy, precision, recall, F1-score, and AUC-ROC.

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Metrics based on the hard predictions, computed in a fixed order.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
# AUC-ROC is based on the predicted probabilities instead.
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print each metric on its own line.
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-Score:", f1),
    ("AUC-ROC:", roc_auc),
):
    print(metric_label, metric_value)

**3: Generate a Classification Report**

In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 table; label 0 is shown as "Legitimate", label 1 as "DGA".
class_names = ["Legitimate", "DGA"]
class_report = classification_report(y_test, y_pred, target_names=class_names)
print("Classification Report:\n", class_report)

**4: Plot the ROC Curve**

In [None]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# False/true positive rates at every score threshold.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

# Draw the ROC curve on an explicit figure/axes pair, with the diagonal of a
# random classifier for reference.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
roc_ax.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate (FPR)')
roc_ax.set_ylabel('True Positive Rate (TPR)')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
plt.show()

**5: Plot the Precision-Recall Curve**

In [None]:
from sklearn.metrics import precision_recall_curve

# Calculate precision-recall curve.
# The curve arrays get their own names: `precision` and `recall` already hold
# the scalar test metrics that are written to the JSON report later, and
# rebinding them to arrays here would break that serialisation.
pr_precision, pr_recall, _ = precision_recall_curve(y_test, y_pred_proba)

# Plot precision-recall curve
plt.figure(figsize=(8, 6))
plt.plot(pr_recall, pr_precision, label='Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()

**6: Plot the Confusion Matrix**

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Counts of actual (rows) versus predicted (columns) classes.
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap of the counts on an explicit figure/axes pair.
tick_names = ["Legitimate", "DGA"]
cm_fig, cm_ax = plt.subplots(figsize=(6, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=tick_names,
    yticklabels=tick_names,
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix')
plt.show()

**Feature Importances**

In [None]:
import numpy as np

# Get feature importances from the fitted SVM.
# The stacking ensemble fits its own copies of the base models, so the fitted
# SVM is read from the ensemble; the `svm_model` object handed to the ensemble
# is never fitted itself and has no `coef_`.
fitted_svm = hybrid_model.named_estimators_['svm']
svm_coef = fitted_svm.coef_
# A linear SVM trained on sparse TF-IDF input stores its weights as a sparse
# matrix; convert to a dense array so it can be indexed, measured and plotted.
if hasattr(svm_coef, "toarray"):
    svm_coef = svm_coef.toarray()
feature_importances = np.asarray(svm_coef)[0]

# Plot feature importances
# NOTE(review): this draws one bar per TF-IDF feature; with a large n-gram
# vocabulary the plot may be slow and unreadable. Consider plotting a top-N subset.
plt.figure(figsize=(10, 6))
plt.bar(range(len(feature_importances)), feature_importances)
plt.xlabel('Feature Index')
plt.ylabel('Importance')
plt.title('Feature Importances from SVM')
plt.show()

**Evaluation Results**

In [None]:
import json

# Collect the scalar metrics under their report names and write them as JSON.
metric_names = ["Accuracy", "Precision", "Recall", "F1-Score", "AUC-ROC"]
metric_values = [accuracy, precision, recall, f1, roc_auc]
evaluation_metrics = dict(zip(metric_names, metric_values))

with open("/content/drive/MyDrive/Colab Notebooks/evaluation_metrics.json", "w") as f:
    json.dump(evaluation_metrics, f, indent=4)

# Write the text classification report next to it.
with open("/content/drive/MyDrive/Colab Notebooks/classification_report.txt", "w") as f:
    f.write(class_report)

# Redraw the ROC curve on its own figure, save it as PNG and close the figure.
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
roc_ax.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate (FPR)')
roc_ax.set_ylabel('True Positive Rate (TPR)')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
roc_fig.savefig("/content/drive/MyDrive/Colab Notebooks/roc_curve.png")
plt.close(roc_fig)

# Redraw the confusion-matrix heatmap, save it as PNG and close the figure.
tick_names = ["Legitimate", "DGA"]
cm_fig, cm_ax = plt.subplots(figsize=(6, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=tick_names,
    yticklabels=tick_names,
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix')
cm_fig.savefig("/content/drive/MyDrive/Colab Notebooks/confusion_matrix.png")
plt.close(cm_fig)