In [None]:
from __future__ import print_function, division
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import matplotlib.pyplot as plt
from skimage import io, transform
from sklearn.model_selection import train_test_split
import numpy as np
from torch.utils.data import TensorDataset
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.tensorboard import SummaryWriter
from sklearn.metrics import classification_report
%matplotlib inline
import datetime
import time






Custom Datset - Done 

create two two-layer fully-connected neural networks. The networks have an input dimension
of N (given by the number of features in an app),
a  hidden layer dimension of H (your choice), and perform classification over 2 classes.

The first network uses a linear activation after the first fully connected layer,
and the sigmoid function to compute the output. In other words, the first network
has the following architecture (where FC means “fully connected layer”): input - FC - linear - FC - sigmoid. (done)

The second network uses the ReLU activation function after the first layer, and the sigmoid to compute the output. Thus, the architecture of the second network is: input - FC - ReLU - FC - sigmoid. (done)

The binary cross-entropy (i.e., BCELoss) is used to compute the loss in both cases. Use a subset of the training data as validation to perform hyperparamter tuning.
(done)    


Log your models’ training and use TensorBoard to plot learning curves that show the variation of 
the loss/accuracy with the number of epochs (fixed) for both training and validations sets, 
for two different values of H, for both networks. The learning rate can also be fixed,
unless you’d like to experiment with two parameters (i.e., H and learning rate) at the same time.(done)

Identify your best model according to the validation data, and use it to evaluate the performance 
on the test dataset in terms of accuracy. 

In addition, report the precision, recall 
and F-measure on the malware class. 

Between the linear and non-linear networks, which one has better performance?

(Optional) Compare the model training times on CPU versus GPU.

In [None]:
#Imported my data from Google Drive
#from google.colab import drive
#drive.mount('/content/gdrive')


In [None]:
def load_data(file):
    return pd.read_csv(file)
androidtrain = load_data("/content/sample_data/AndroidAppsTrain.csv")
androidtest = load_data("/content/sample_data/AndroidAppsTest.csv")


In [None]:
def split_data(dataset):
    X_data,y_data = dataset.iloc[:,:-1],dataset.iloc[:,-1]
    y_data = y_data.to_frame()  
    return train_test_split(X_data,y_data,train_size = 0.7,random_state = 42,shuffle=True)
X_train_set,X_val_set,y_train_set,y_val_set =  split_data(androidtrain)

def split_test(dataset):
  X_data,y_data = dataset.iloc[:,:-1],dataset.iloc[:,-1]
  y_data = y_data.to_frame() 
  return  X_data,y_data
X_test,y_test = split_test(androidtest)

In [None]:
print(X_train_set.shape)
print(y_train_set.shape)

(103179, 471)
(103179, 1)


In [None]:
#Custom PyTorch Dataset
class CustomDataSet(Dataset):
    def __init__(self,features,target,transform = None):
        self.X = features
        self.y = target
        self.transform = transform
        
    def __len__(self):
        return len(self.X)

    def __getitem__(self,idx):      
        data_features = torch.Tensor(self.X.values)
        data_target = torch.Tensor(self.y.values)
        return data_features,data_target
            
            

In [None]:
X_train_set = torch.from_numpy(X_train_set.to_numpy()).float()
y_train_set = torch.from_numpy(y_train_set.to_numpy()).float()
X_val_set = torch.from_numpy(X_val_set.to_numpy()).float()
y_val_set =torch.from_numpy(y_val_set.to_numpy()).float()

X_test = torch.from_numpy(X_test.to_numpy()).float()
y_test =torch.from_numpy(y_test.to_numpy()).float()

In [None]:
#androidtrain_data = CustomDataSet(X_train_set,y_train_set)
###androidval_data = CustomDataSet(X_val_set,y_val_set)
#androidtest_data = CustomDataSet(X_test,y_test)


androidtrain_data = TensorDataset(X_train_set,y_train_set)
androidval_data = TensorDataset(X_val_set,y_val_set)
androidtest_data = TensorDataset(X_test,y_test)



In [None]:
batch_size = 128
# split to train val
dataset_size = len(androidtrain_data)
print(dataset_size)
print(len(androidtest_data))
print(len(androidval_data))

103179
72599
44220


In [None]:
#Load into DataLoader
train_dataloader = DataLoader(androidtrain_data, batch_size=batch_size)
val_dataloader = DataLoader(androidval_data, batch_size=batch_size)
test_dataloader = DataLoader(androidtest_data, batch_size=batch_size,shuffle=True,num_workers = 2)
    


In [None]:
print(val_dataloader.batch_size)
print(train_dataloader.sampler)

128
<torch.utils.data.sampler.SequentialSampler object at 0x7f87183e9518>


In [None]:
#len(train_dataloader)

In [None]:
#dataiter = iter(train_dataloader)
#myfeatures, labels = dataiter.next()
#print(myfeatures.shape)
#print(labels.shape)

In [None]:
#Define First model with Linear and sigmoid output layer
class Net(nn.Module):
    def __init__(self,input_size,output_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size,5) # Change hidden layer  =[5,10]
        self.fc2 = nn.Linear(5,output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        output = self.sigmoid(x)
        return output

output_dim = 1
input_dim = 471
net = Net(input_dim,output_dim)
if torch.cuda.is_available():
    net.cuda()
learning_rate = 0.01
optimizer1 = optim.SGD(net.parameters(), lr = learning_rate)
criterion1 = nn.BCELoss() 

In [None]:
#Define Second Model with ReLu and sigmoid output layer 
class Net2(nn.Module):
    def __init__(self,input_size,output_size):
        super(Net2, self).__init__()
        self.fc1 = nn.Linear(input_size,5) # Change hidden layer  =[5,10]
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(5,output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        relu = self.relu(x)
        output = self.fc2(relu)
        output = self.sigmoid(output)
        return output

output_dim = 1
input_dim = 471
net2 = Net2(input_dim,output_dim)
if torch.cuda.is_available():
    net2.cuda()
learning_rate2 = 0.01
optimizer2 = optim.SGD(net2.parameters(), lr=learning_rate2)
criterion2 = nn.BCELoss()



In [None]:
#Train model with net()....Linear Model
writer = SummaryWriter('runs/linear_5/')
epochs = 50
train_losses, val_losses = [], [] 
train_accuracy = []
val_accuracy =[]
for e in range(epochs):
    running_loss = 0
    total_train = 0
    correct_train = 0
    val_loss = 0
    total_val = 0
    correct_val = 0
    
    
    for X,y in train_dataloader:
         if torch.cuda.is_available():
          optimizer1.zero_grad()
          
          X_1 = X.cuda()
          y_1 = y.cuda()
          pred = net(X_1)
          #print("X:",X_1.shape)
          #print("y:",y_1.shape)
          #print("Pred:",pred.shape) 
          loss = criterion1(pred, y_1)
          
          loss.backward()
          optimizer1.step()        
          running_loss += loss.item()
        
          #accuracy
          predicted = pred.data > 0.5
          correct_train += (predicted==y_1.view_as(predicted)).sum().item()
          total_train += int(y_1.shape[0])
          #print("GPU train")
         else:
          optimizer1.zero_grad()
          pred = net(X)
          #print("X:",X.shape)
          #print("y:",y.shape)
          #print("Pred:",pred.shape)     
          loss = criterion1(pred, y)
          
          loss.backward()
          optimizer1.step()        
          running_loss += loss.item()

          #accuracy
          predicted = pred.data > 0.5
          correct_train += (predicted==y.view_as(predicted)).sum().item()
          total_train += int(y.shape[0])
          #print("CPU train")
 


    with torch.no_grad():
      for X,y in val_dataloader:
        if torch.cuda.is_available():
          X_1 = X.cuda()
          y_1 = y.cuda()
          pred = net(X_1)
          val_loss += criterion1(pred, y_1)
          
          #accuracy
          predicted_t = pred.data > 0.5
          correct_val += (predicted_t==y_1.view_as(predicted_t)).sum().item()
          total_val += int(y_1.shape[0])
          #print("GPU_val train")
      
        else:
          pred = net(X)
          #print("X:",X.shape)
          #print("y:",y.shape)
          #print("Pred:",pred.shape)           
          val_loss += criterion1(pred, y)
          
          #accuracy
          predicted_t = pred.data > 0.5
          correct_val += (predicted_t==y.view_as(predicted_t)).sum().item()
          total_val += int(y.shape[0])
          #print("CPU_val train")
                          
    writer.add_scalar("Train Loss/Epoch", running_loss/len(train_dataloader), e+1)
    writer.add_scalar("Train Accuracy/Epoch", ( 100*correct_train / total_train), e+1)
    writer.add_scalar("Validation Loss/Epoch", (val_loss/len(val_dataloader)), e+1)
    writer.add_scalar("Validation Accuracy/Epoch", (100* correct_val / total_val), e+1)
    train_losses.append(running_loss/len(train_dataloader))
    val_losses.append(val_loss/len(val_dataloader))
    train_accuracy.append(100 * correct_train // total_train)
    val_accuracy.append(100 * correct_val // total_val)
    print("Epoch: {}/{}.. ".format(e+1, epochs),
          "Training Loss: {:.3f}.. ".format(running_loss/len(train_dataloader)),
          "Training Accuracy:%d %%"%( 100*correct_train / total_train),
          "Val Loss: {:.3f}.. ".format((val_loss/len(val_dataloader))),
          "val Accuracy: :%d %%"%(100* correct_val / total_val))
    
writer.flush()
    
print('Finished Training')

writer.close()

Epoch: 1/50..  Training Loss: 0.328..  Training Accuracy:90 % Val Loss: 0.235..  val Accuracy: :92 %
Epoch: 2/50..  Training Loss: 0.201..  Training Accuracy:93 % Val Loss: 0.175..  val Accuracy: :94 %
Epoch: 3/50..  Training Loss: 0.166..  Training Accuracy:94 % Val Loss: 0.155..  val Accuracy: :94 %
Epoch: 4/50..  Training Loss: 0.151..  Training Accuracy:95 % Val Loss: 0.145..  val Accuracy: :95 %
Epoch: 5/50..  Training Loss: 0.143..  Training Accuracy:95 % Val Loss: 0.139..  val Accuracy: :95 %
Epoch: 6/50..  Training Loss: 0.138..  Training Accuracy:95 % Val Loss: 0.135..  val Accuracy: :95 %
Epoch: 7/50..  Training Loss: 0.135..  Training Accuracy:95 % Val Loss: 0.133..  val Accuracy: :95 %
Epoch: 8/50..  Training Loss: 0.132..  Training Accuracy:95 % Val Loss: 0.131..  val Accuracy: :95 %
Epoch: 9/50..  Training Loss: 0.131..  Training Accuracy:95 % Val Loss: 0.129..  val Accuracy: :95 %
Epoch: 10/50..  Training Loss: 0.129..  Training Accuracy:95 % Val Loss: 0.128..  val Accur

In [None]:
writer.close()

In [None]:
#Train model with net()....Non-Linear Model
writer = SummaryWriter('runs/nonlinear_5/') 
epochs = 50
train_losses2, val_losses2 = [], [] 
train_accuracy2 = []
val_accuracy2 =[]
for e in range(epochs):
    running_loss2 = 0
    total_train2 = 0
    correct_train2 = 0
    val_loss2 = 0
    total_val2 = 0
    correct_val2 = 0
    
    
    for X_new,y_new in train_dataloader:
         if torch.cuda.is_available():
          optimizer2.zero_grad()
          X_2 = X_new.cuda()
          y_2 = y_new.cuda()
          pred2 = net2(X_2)
          #print("X:",X_1.shape)
          #print("y:",y_1.shape)
          #print("Pred:",pred.shape) 
          loss2 = criterion2(pred2, y_2)
          loss2.backward()
          optimizer2.step()        
          running_loss2 += loss2.item()
        
          #accuracy
          predicted2 = pred2.data > 0.5
          #top_p,predicted2 = pred2.topk(1,dim=1)
          correct_train2 += (predicted2==y_2.view_as(predicted2)).sum().item()
          total_train2 += int(y_2.shape[0])
          #print("GPU train")
         else:
          optimizer2.zero_grad()
          pred2 = net2(X_new)
          #print("X:",X.shape)
          #print("y:",y.shape)
          #print("Pred:",pred.shape)     
          loss2 = criterion2(pred2, y_new)
          loss2.backward()
          optimizer2.step()        
          running_loss2 += loss2.item()

          #accuracy
          predicted2 = pred2.data > 0.5
          #top_p,predicted2 = pred2.topk(1,dim=1)
          correct_train2 += (predicted2==y_new.view_as(predicted2)).sum().item()
          total_train2 += int(y_new.shape[0])
          #print("CPU train")


    with torch.no_grad():
      for X_new,y_new in val_dataloader:
        if torch.cuda.is_available():
          X_2 = X_new.cuda()
          y_2 = y_new.cuda()
          pred2 = net2(X_2)
          
          val_loss2 += criterion2(pred2, y_2)
          
          #accuracy
          predicted_t2 = pred2.data > 0.5
          #top_p,predicted_t2 = pred2.topk(1,dim=1)
          correct_val2 += (predicted_t2==y_2.view_as(predicted_t2)).sum().item()
          total_val2 += int(y_2.shape[0])
          #print("GPU_val train")
        
      
        else:
          pred2 = net2(X_new)
          #print("X:",X.shape)
          #print("y:",y.shape)
          #print("Pred:",pred.shape)           
          val_loss2 += criterion2(pred, y_new)
          #accuracy
          #top_p,predicted2_t = pred2.topk(1,dim=1)
          predicted2_t = pred2.data > 0.5
          correct_val2 += (predicted_t2==y_new.view_as(predicted_t2)).sum().item()
          total_val2 += int(y_new.shape[0])
          #print("CPU_val train")

    
    writer.add_scalar("Train Loss/Epoch", running_loss2/len(train_dataloader), e+1)
    writer.add_scalar("Train Accuracy/Epoch", ( 100*correct_train2 / total_train2), e+1)
    writer.add_scalar("Validation Loss/Epoch", (val_loss2/len(val_dataloader)), e+1)
    writer.add_scalar("Validation Accuracy/Epoch", (100* correct_val2 / total_val2), e+1)
    print("Epoch: {}/{}.. ".format(e+1, epochs),
          "Training Loss: {:.3f}.. ".format(running_loss2/len(train_dataloader)),
          "Training Accuracy:%d %%"%( 100*correct_train2 / total_train2),
          "Val Loss: {:.3f}.. ".format((val_loss2/len(val_dataloader))),
          "val Accuracy: :%d %%"%(100* correct_val2 / total_val2))
    
writer.flush()
    
print('Finished Validating')

writer.close()
     

Epoch: 1/50..  Training Loss: 0.374..  Training Accuracy:90 % Val Loss: 0.288..  val Accuracy: :90 %
Epoch: 2/50..  Training Loss: 0.264..  Training Accuracy:90 % Val Loss: 0.238..  val Accuracy: :90 %
Epoch: 3/50..  Training Loss: 0.221..  Training Accuracy:90 % Val Loss: 0.192..  val Accuracy: :92 %
Epoch: 4/50..  Training Loss: 0.177..  Training Accuracy:93 % Val Loss: 0.161..  val Accuracy: :94 %
Epoch: 5/50..  Training Loss: 0.155..  Training Accuracy:94 % Val Loss: 0.147..  val Accuracy: :95 %
Epoch: 6/50..  Training Loss: 0.145..  Training Accuracy:95 % Val Loss: 0.140..  val Accuracy: :95 %
Epoch: 7/50..  Training Loss: 0.139..  Training Accuracy:95 % Val Loss: 0.136..  val Accuracy: :95 %
Epoch: 8/50..  Training Loss: 0.136..  Training Accuracy:95 % Val Loss: 0.133..  val Accuracy: :95 %
Epoch: 9/50..  Training Loss: 0.133..  Training Accuracy:95 % Val Loss: 0.131..  val Accuracy: :95 %
Epoch: 10/50..  Training Loss: 0.131..  Training Accuracy:95 % Val Loss: 0.129..  val Accur

Answer: According to the validation data, my best model is the net2() nonlinear model since it converges faster than the linear model, hence, I am using it on my test data.

In [None]:
#Test model on Test Dataset
#Train model with net2()....Non-Linear Model with hidden layer 
writer = SummaryWriter('runs/test/') 
epochs = 50
test_losses2 = [] 
test_accuracy2 =[]
all_predicts =[]
all_labels =[]
for e in range(epochs):
  test_loss2 = 0
  total_test2 = 0
  correct_test2 = 0
    
  with torch.no_grad():
      for X_new,y_new in test_dataloader:
        if torch.cuda.is_available():
          X_2 = X_new.cuda()
          y_2 = y_new.cuda()
          pred2 = net2(X_2)
          test_loss2 += criterion2(pred2, y_2)
          #accuracy
          predicted_t2 = pred2.data > 0.5
          final_predicted = predicted_t2.int().cpu().data.numpy()
          label = y_2.cpu()
          label = torch.gt(label , 0).int().data.numpy()
          correct_test2 += (predicted_t2==y_2.view_as(predicted_t2)).sum().item()
          total_test2 += int(y_2.shape[0])
          #print("GPU_val train")
      
        else:
          pred2 = net2(X_new)
          #print("X:",X.shape)
          #print("y:",y.shape)
          #print("Pred:",pred.shape)           
          test_loss2 += criterion2(pred, y_new)
          #accuracy
          predicted2_t = pred2.data > 0.5
          final_predicted = predicted_t2.data.numpy()
          label = torch.gt(y_new , 0)
          correct_test2 += (predicted_t2==y_new.view_as(predicted_t2)).sum().item()
          total_test2 += int(y_new.shape[0])                
  all_labels.append(label)
  all_predicts.append(final_predicted)       
  
  writer.add_scalar("Validation Loss/Epoch", (test_loss2/len(test_dataloader)), e+1)
  writer.add_scalar("Validation Accuracy/Epoch", (100* correct_test2 / total_test2), e+1)
  print("Epoch: {}/{}.. ".format(e+1, epochs),
          "Test Loss: {:.3f}.. ".format((test_loss2/len(test_dataloader))),
          "Test Accuracy: :%d %%"%(100* correct_test2 / total_test2))
writer.flush()
    
print('Finished Testing')

writer.close()


Epoch: 1/50..  Test Loss: 0.115..  Test Accuracy: :96 %
Epoch: 2/50..  Test Loss: 0.115..  Test Accuracy: :96 %
Epoch: 3/50..  Test Loss: 0.115..  Test Accuracy: :96 %
Epoch: 4/50..  Test Loss: 0.115..  Test Accuracy: :96 %
Epoch: 5/50..  Test Loss: 0.115..  Test Accuracy: :96 %
Epoch: 6/50..  Test Loss: 0.115..  Test Accuracy: :96 %
Epoch: 7/50..  Test Loss: 0.115..  Test Accuracy: :96 %
Epoch: 8/50..  Test Loss: 0.115..  Test Accuracy: :96 %
Epoch: 9/50..  Test Loss: 0.115..  Test Accuracy: :96 %
Epoch: 10/50..  Test Loss: 0.115..  Test Accuracy: :96 %
Epoch: 11/50..  Test Loss: 0.115..  Test Accuracy: :96 %
Epoch: 12/50..  Test Loss: 0.115..  Test Accuracy: :96 %
Epoch: 13/50..  Test Loss: 0.115..  Test Accuracy: :96 %
Epoch: 14/50..  Test Loss: 0.115..  Test Accuracy: :96 %
Epoch: 15/50..  Test Loss: 0.115..  Test Accuracy: :96 %
Epoch: 16/50..  Test Loss: 0.115..  Test Accuracy: :96 %
Epoch: 17/50..  Test Loss: 0.115..  Test Accuracy: :96 %
Epoch: 18/50..  Test Loss: 0.115..  Test

In [None]:
# Evaluate Precision, Recall and F-measure
y_true = all_labels
y_pred = all_predicts
def flatten(x):
    result = []
    for el in x:
        if hasattr(el, "__iter__") and not isinstance(el, str):
            result.extend(flatten(el))
        else:
            result.append(el)
    return result
y_true = flatten(y_true)
y_pred = flatten(y_pred)
target_names = ['class 0', 'class 1']
print(classification_report(y_true, y_pred, target_names=target_names))



              precision    recall  f1-score   support

     class 0       0.98      0.99      0.98      1055
     class 1       0.92      0.73      0.81        95

    accuracy                           0.97      1150
   macro avg       0.95      0.86      0.90      1150
weighted avg       0.97      0.97      0.97      1150



# New Section

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Answers:

For my experiments, I used two different hidden layers[5,10] to test my two models. After running with different learning rates, I noticed I got a better performace with my non linear model(net2()) since it converges the fastest with hidden model = 10. 

Answer:

For my test model, I got an accuracy of 96% same with both my linear and non-linear model.

Answer:

The non-linear model is slightly better since it converges faster, however both converges at 96% accuracy.