In [None]:
# Mount Google Drive so that files under /content/drive/MyDrive are reachable
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Data loading and train/test split
!mkdir /content/data
!unzip "/content/drive/MyDrive/Temp/Abhinaya/11_Oct-17_Oct/1550K MALWARE ANALYSIS DATASETS_API IMPORT.zip" -d /content/data

# A sample set 
!head -100 "/content/data/MALWARE ANALYSIS DATASETS_API IMPORT.csv" > /content/sample.csv

# Train set
!head -1240001 "/content/data/MALWARE ANALYSIS DATASETS_API IMPORT.csv" > /content/data/train.csv

# Test set
!head -1 "/content/data/MALWARE ANALYSIS DATASETS_API IMPORT.csv" > /content/data/test.csv
!tail -150000 "/content/data/MALWARE ANALYSIS DATASETS_API IMPORT.csv" >> /content/data/test.csv


In [None]:
# Load the small sample CSV for a quick sanity check
import pandas as pd

sample_path = "/content/sample.csv"
data = pd.read_csv(sample_path)
# Inspect with data.shape / data.head() when needed

# Locations of the train and test CSV files
train_path, test_path = "/content/data/train.csv", "/content/data/test.csv"

import tensorflow as tf
# Custom Data Generators
def _cycle_batches(batches):
    """Yield ``(features, labels)`` NumPy pairs from ``batches``, restarting forever.

    Each item of ``batches`` must unpack into a ``(features, labels)`` pair.
    ``features`` is converted through ``pd.DataFrame(...).to_numpy()`` and
    ``labels`` through ``pd.Series(...).to_numpy()``.

    Raises:
        ValueError: if a complete pass over ``batches`` produces nothing
            (an empty dataset, or a one-shot iterator that is already
            exhausted). Without this check the endless loop would spin
            forever without ever yielding.
    """
    while True:
        produced = False
        for features, labels in batches:
            produced = True
            yield pd.DataFrame(features).to_numpy(), pd.Series(labels).to_numpy()
        if not produced:
            raise ValueError(
                "dataset produced no batches on a full pass "
                "(empty, or a one-shot iterator that is exhausted)"
            )


def train_data(dataset=None):
    """Endless generator of ``(features, labels)`` NumPy batches for training.

    Args:
        dataset: iterable of ``(features, labels)`` pairs. When omitted, the
            module-level ``train`` object is used.
            NOTE(review): ``train`` is not defined in any visible cell of this
            notebook -- make sure it is created before calling without arguments.

    Yields:
        Tuples ``(features_array, labels_array)``; the iterable is restarted
        each time it is exhausted.

    Raises:
        ValueError: if the dataset yields no batches at all.
    """
    yield from _cycle_batches(train if dataset is None else dataset)


def test_data(dataset=None):
    """Endless generator of ``(features, labels)`` NumPy batches for testing.

    Args:
        dataset: iterable of ``(features, labels)`` pairs. When omitted, the
            module-level ``test`` object is used.
            NOTE(review): ``test`` is not defined in any visible cell of this
            notebook -- make sure it is created before calling without arguments.

    Yields:
        Tuples ``(features_array, labels_array)``; the iterable is restarted
        each time it is exhausted.

    Raises:
        ValueError: if the dataset yields no batches at all.
    """
    yield from _cycle_batches(test if dataset is None else dataset)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

In [None]:
# Dimensions of the network: every sample is a flat vector of n_timesteps
# values that is reshaped to (n_timesteps, n_features) for the 1-D
# convolutions; the final layer emits n_outputs sigmoid score(s).
n_timesteps = 1000
n_features = 1
n_outputs = 1

# Two Conv1D -> MaxPooling1D -> Dropout stages followed by a dense classifier head
model = Sequential([
    tf.keras.layers.Reshape(
        (n_timesteps, n_features),
        input_shape=(n_timesteps * n_features,),
    ),
    Conv1D(filters=256, kernel_size=50, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.5),
    Conv1D(filters=64, kernel_size=50, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.5),
    Flatten(),
    Dense(100, activation='relu'),
    Dense(n_outputs, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# 1240000 matches the number of data rows written to train.csv.
# NOTE(review): 128 is presumably the batch size produced by train_data() -- confirm.
model.fit(train_data(), steps_per_epoch=1240000 // 128, epochs=2)



<keras.callbacks.History at 0x7f532d4af2d0>

In [None]:
# Evaluation on test data
import numpy as np, pandas as pd
def get_acc(test_path, model):
    """Evaluate ``model`` on the CSV at ``test_path`` and print the metrics.

    The CSV is read with pandas; its ``label`` column is used as the ground
    truth and every column from positional index 3 onward is passed to
    ``model.predict``. Scores below 0.5 become 0.0, all others 1.0.

    Args:
        test_path: path of the CSV file to evaluate on.
        model: object exposing ``predict(features)``.

    Returns:
        A tuple ``(acc, y_pred)``: the accuracy and the thresholded
        predictions (same shape as the output of ``model.predict``).
    """
    frame = pd.read_csv(test_path)
    y_test = frame["label"].to_numpy()
    features = frame.iloc[:, 3:]

    scores = model.predict(features)
    y_pred = np.where(scores < 0.5, 0., 1.)

    # accuracy, precision, recall, F-1, confusion matrix
    from sklearn.metrics import precision_score,f1_score, recall_score, confusion_matrix, accuracy_score
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy - ",acc)

    remaining_metrics = (
        ("Recall - ", recall_score),
        ("Precision - ", precision_score),
        ("F1 - ", f1_score),
        ("CM - ", confusion_matrix),
    )
    for caption, metric in remaining_metrics:
        print(caption, metric(y_test, y_pred))

    return acc, y_pred

# Evaluate the trained model on the test CSV (prints the metrics)
get_acc(test_path, model)

#del y_pred ;import gc ;gc.collect()


Accuracy -  0.82232
Recall -  0.7526084056037359
Precision -  0.8742986453392851
F1 -  0.808902400550664
CM -  [[66940  8110]
 [18542 56408]]


**Post-Training:**
<br>Once your model is trained, save and store the model. Then, create a function (or method) that takes a PE file as its argument, runs it through the trained model, and returns the output (i.e., Malware or Benign).

In [None]:
# Save model
import os

model_path="/content/drive/MyDrive/Temp/Abhinaya/11_Oct-17_Oct/model.h5"
# Persist the trained model so that it can be loaded back from model_path.
# The save is skipped when a file already exists at model_path, so re-running
# this cell never overwrites a previously stored model.
if not os.path.exists(model_path):
    model.save(model_path)

from keras.models import load_model
def validate(path):
    """Run the saved model on the CSV at ``path`` and return its predictions.

    The model is loaded from the module-level ``model_path`` on every call and
    evaluated with ``get_acc``, which also prints the metrics.

    Args:
        path: path of the CSV file to score.

    Returns:
        The thresholded predictions returned by ``get_acc``.
    """
    saved_model = load_model(model_path)
    _, predictions = get_acc(path, saved_model)
    return predictions

# Test for the sample data: score the sample CSV with the saved model and
# flatten the returned predictions to a 1-D array
y_pred_sample=validate("/content/sample.csv").ravel()

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [1.]])

**Task 2 - Deploy your model on the cloud:**
<br>In this task, you will be using Amazon SageMaker to deploy your model on the cloud and create an endpoint (~ API) so that other applications can make use of the model.

In [None]:
# SageMaker is throwing an error with this version of TF/Py
import tarfile

import tensorflow as tf

# Write the model in SavedModel format below export/Servo/1/
export_dir = 'export/Servo/1/'
tf.saved_model.save(obj=model, export_dir=export_dir)

# Pack the whole "export" directory tree into a gzip-compressed tarball
output_filename = "/content/model.tar.gz"
with tarfile.open(name=output_filename, mode="w:gz") as tar:
    tar.add("export")

In [None]:
# Upload the tar to ASW S3
!pip install boto3
import boto3
from boto3.session import Session

AWS_ACCESS_KEY_ID = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXX'
AWS_SECRET_ACCESS_KEY = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXX'

session = Session(aws_access_key_id=AWS_ACCESS_KEY_ID,aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
s3 = session.resource('s3')

BUCKET = "ai-sec-mid"
s3.Bucket(BUCKET).upload_file(output_filename, "model.tar.gz")

In [None]:
# Create an End point for the trained model
!pip install sagemaker
from sagemaker.tensorflow.model import TensorFlowModel

import os
os.environ['AWS_DEFAULT_REGION'] = 'us-east-2'
os.environ['AWS_ACCESS_KEY_ID'] = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXX'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'XXXXXXXXXXXXXXXXXXXXXXXXXXXX'

model = TensorFlowModel(model_data='s3://ai-sec-mid/model.tar.gz', 
                        role='AmazonSageMaker-ExecutionRole-20211013T210978',
                        framework_version = '2.4.1', entry_point='train.py')

predictor = model.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')


**Task 3 – Create a client:**
<br>This task is quite simple as well: write Python code that *loads* a PE file, converts it into a feature vector that is compatible with your MalConv/EMBER model, runs the vector on the cloud API, and then prints the results (i.e., Malware or Benign – or probabilities of each).

In [None]:
# Predict for the sample data
from sagemaker.deserializers import JSONDeserializer
from sagemaker.predictor import Predictor
from sagemaker.serializers import JSONSerializer

endpoint = 'sagemaker-tensorflow-serving-2021-10-16-07-35-22-029'

# Distinct names are used for the sample frame so the `test_data` generator
# function defined earlier in the notebook is not overwritten.
sample_df = pd.read_csv("/content/sample.csv")
y_test = sample_df.label.to_numpy()
sample_features = sample_df.iloc[:, 3:]  # same feature columns as in get_acc

# Exchange JSON with the endpoint: a Predictor with default settings sends raw
# bytes and cannot serialize a DataFrame.
predictor = Predictor(endpoint, serializer=JSONSerializer(), deserializer=JSONDeserializer())
inference_response = predictor.predict(data=sample_features.to_numpy().tolist())

# Threshold the scores returned by the endpoint (not a stale `y_pred` from an
# earlier cell).
# NOTE(review): assumes the TensorFlow Serving response has the form
# {"predictions": [[score], ...]} -- confirm against the deployed endpoint.
y_pred = np.where(np.asarray(inference_response['predictions']) < 0.5, 0., 1.)