In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import os
import re

def create_embedding_df(base_dir, label):
    data = []
    for file_name in os.listdir(base_dir):
        file_path = os.path.join(base_dir, file_name)
        if os.path.isfile(file_path) and not re.search("^OUTPUT_ANDROPY_", file_name):
            try:
                with open(file_path, "r") as file:
                    file_content = file.read()
                    data.append({'fileContent': file_content, 'label': label})
            except Exception as e:
                print(f"Error reading file {file_name}: {e}")
    df = pd.DataFrame(data, columns=['fileContent', 'label'])
    return df

# Define base directories
base_dir_train = "/content/drive/MyDrive/Android_Malware_Features-main"
dir_BW = os.path.join(base_dir_train, "Benign", "Features_files")
dir_adware = os.path.join(base_dir_train, "Malware", "Adware", "Features_files")
dir_banking = os.path.join(base_dir_train, "Malware", "Banking", "Features_files")
dir_riskware = os.path.join(base_dir_train, "Malware", "Riskware", "Features_files")

# Create DataFrames
df_train_BW = create_embedding_df(dir_BW, 0)
df_train_adware = create_embedding_df(dir_adware, 1)  # Adware with label 1
df_train_banking = create_embedding_df(dir_banking, 2)  # Banking with label 2
df_train_riskware = create_embedding_df(dir_riskware, 3)  # Riskware with label 3

# Concatenate DataFrames
df_train = pd.concat([df_train_BW, df_train_adware, df_train_banking, df_train_riskware], ignore_index=True)

# Shuffle the DataFrame rows
df_train = df_train.sample(frac=1).reset_index(drop=True)

# Extract texts and labels
texts = df_train['fileContent'].tolist()
labels = df_train['label'].tolist()


In [4]:
print(len(texts))
print(len(dir_BW ))
print(len(dir_adware ))
print(len(dir_banking ))
print(len(dir_riskware))

11063
74
82
83
84


In [None]:
print(labels)

[3, 1, 0, 0, 0, 2, 0, 3, 3, 0, 0, 3, 3, 2, 0, 3, 0, 2, 1, 0, 0, 3, 3, 0, 0, 1, 0, 2, 0, 0, 3, 3, 3, 2, 0, 2, 2, 0, 3, 0, 0, 3, 0, 3, 0, 3, 2, 2, 3, 2, 3, 1, 2, 0, 3, 0, 1, 0, 2, 3, 0, 3, 2, 3, 3, 3, 0, 0, 2, 0, 3, 0, 1, 3, 2, 1, 0, 3, 3, 0, 0, 0, 2, 0, 2, 0, 3, 2, 1, 0, 3, 1, 0, 0, 3, 2, 2, 0, 0, 2, 0, 3, 2, 3, 1, 0, 2, 3, 1, 3, 3, 0, 3, 1, 3, 3, 3, 0, 0, 0, 3, 1, 0, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 1, 0, 3, 1, 1, 2, 0, 2, 3, 1, 0, 0, 2, 1, 0, 3, 0, 1, 0, 1, 0, 3, 2, 3, 0, 3, 3, 1, 3, 3, 3, 0, 0, 0, 0, 1, 0, 2, 3, 3, 2, 1, 2, 3, 0, 0, 0, 0, 1, 2, 3, 2, 3, 1, 3, 2, 2, 3, 3, 0, 2, 0, 2, 2, 3, 2, 0, 3, 3, 0, 0, 0, 0, 2, 0, 3, 0, 1, 0, 2, 1, 3, 3, 2, 3, 3, 3, 3, 0, 1, 3, 2, 0, 0, 3, 2, 2, 3, 1, 0, 2, 2, 0, 2, 0, 0, 3, 3, 3, 0, 0, 0, 0, 0, 3, 2, 3, 2, 0, 3, 2, 2, 2, 3, 0, 0, 0, 0, 1, 3, 3, 0, 0, 3, 2, 1, 2, 0, 3, 3, 0, 2, 2, 2, 0, 2, 0, 3, 0, 3, 3, 2, 0, 2, 3, 2, 3, 0, 1, 1, 3, 0, 0, 1, 3, 0, 0, 2, 2, 3, 3, 2, 1, 0, 2, 3, 3, 1, 0, 2, 3, 3, 0, 2, 3, 0, 3, 3, 2, 2, 2, 3, 3, 2, 0, 0, 0, 1, 3, 0, 

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np

def tokenize_and_sequence_in_batches(full_texts, texts, labels, max_len, max_words, validation_samples, test_samples, batch_size=1000, shuffle=True):
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(full_texts)
    word_index = tokenizer.word_index
    print('{} unique tokens found'.format(len(word_index)))

    num_batches = len(texts) // batch_size + 1
    all_data = []
    all_labels = []

    for i in range(num_batches):
        print(f'Processing batch {i + 1}/{num_batches}')
        batch_texts = texts[i * batch_size: (i + 1) * batch_size]
        batch_labels = labels[i * batch_size: (i + 1) * batch_size]

        sequences = tokenizer.texts_to_sequences(batch_texts)
        data = pad_sequences(sequences, maxlen=max_len)
        batch_labels = np.asarray(batch_labels)

        all_data.append(data)
        all_labels.append(batch_labels)

    all_data = np.vstack(all_data)
    all_labels = np.concatenate(all_labels)

    print('Data tensor shape: {}\nLabel tensor shape: {}'.format(all_data.shape, all_labels.shape))

    if shuffle:
        indices = np.arange(all_data.shape[0])
        np.random.shuffle(indices)
        all_data = all_data[indices]
        all_labels = all_labels[indices]

    x_train_val, x_test, y_train_val, y_test = train_test_split(all_data, all_labels, test_size=test_samples, random_state=42)
    x_train, x_val, y_train, y_val = train_test_split(x_train_val, y_train_val, test_size=validation_samples, random_state=42)

    return tokenizer, word_index, x_train, y_train, x_val, y_val, x_test, y_test

# Parameters for tokenization and sequence padding



In [29]:

print(len(word_index))
print(len(x_train))
print(len(y_train))
print(len(x_val))
print(len(y_val))
print(len(x_test))
print(len(y_test))

4923689
7964
7964
1992
1992
1107
1107


In [6]:
max_words = 10000
max_len = 100
validation_samples = 0.2
test_samples = 0.1

# Tokenize and sequence data
tokenizer, word_index, x_train, y_train, x_val, y_val, x_test, y_test = tokenize_and_sequence_in_batches(
    texts, texts, labels, max_len, max_words, validation_samples, test_samples
)

4923689 unique tokens found
Processing batch 1/12
Processing batch 2/12
Processing batch 3/12
Processing batch 4/12
Processing batch 5/12
Processing batch 6/12
Processing batch 7/12
Processing batch 8/12
Processing batch 9/12
Processing batch 10/12
Processing batch 11/12
Processing batch 12/12
Data tensor shape: (11063, 100)
Label tensor shape: (11063,)


In [7]:
from keras.utils import to_categorical

# One-hot encode labels
y_train = to_categorical(y_train, num_classes=4)
y_val = to_categorical(y_val, num_classes=4)
y_test = to_categorical(y_test, num_classes=4)


In [8]:
print(y_train)
print(y_val)
print(y_test)

[[0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]]
[[0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 ...
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]]
[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]


In [21]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense

# Define the model parameters
max_words = 10000  # Vocabulary size
max_len = 100      # Sequence length

# Build the model
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(2))  # Adjusted pooling size to prevent dimension reduction
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(2))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dense(4, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()  # To check the final output dimensions


Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 100, 128)          1280000   
                                                                 
 conv1d_15 (Conv1D)          (None, 96, 128)           82048     
                                                                 
 max_pooling1d_10 (MaxPooli  (None, 48, 128)           0         
 ng1D)                                                           
                                                                 
 conv1d_16 (Conv1D)          (None, 44, 128)           82048     
                                                                 
 max_pooling1d_11 (MaxPooli  (None, 22, 128)           0         
 ng1D)                                                           
                                                                 
 conv1d_17 (Conv1D)          (None, 18, 128)          

In [23]:
# Train the model
history = model.fit(x_train, y_train, epochs=20, batch_size=128, validation_data=(x_val, y_val))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate the model on the test set
y_pred = model.predict(x_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)

# Calculate performance metrics
accuracy = accuracy_score(y_true, y_pred_classes)
precision = precision_score(y_true, y_pred_classes, average='weighted')
recall = recall_score(y_true, y_pred_classes, average='weighted')
f1 = f1_score(y_true, y_pred_classes, average='weighted')

# Print performance metrics
print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")


Test Accuracy: 0.9801
Test Precision: 0.9802
Test Recall: 0.9801
Test F1 Score: 0.9802


In [26]:
# Save the model
model.save("/content/drive/MyDrive/CNN_new.h5", save_format='h5')
print("Model saved successfully")

  saving_api.save_model(


Model saved successfully
