<a href="https://colab.research.google.com/github/anupkashyap/malicious-app-classification/blob/main/classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install lazypredict

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lazypredict
  Downloading lazypredict-0.2.12-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.12


In [2]:
from google.colab import drive

# Directory under which Google Drive is attached to the Colab filesystem.
GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [3]:
#Import libraries
import lazypredict
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
import json
import os
import pandas as pd
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import tree
from sklearn.svm import SVC

import sys
# Replace the process-wide stdout with a file opened for writing: from this
# point on, everything written to sys.stdout (including every print() call in
# the cells below) goes to "output.txt" in the current working directory
# rather than to the cell output. Opening with "w" truncates any existing file.
# NOTE(review): the handle is never flushed or closed in the visible cells, so
# the file may be incomplete until the kernel exits — confirm this is intended.
sys.stdout=open("output.txt","w")

In [4]:
#Config
# Controls how many of the collected permissions the "top permissions" cell keeps.
PERMISSION_COUNT=75
# Fraction of each class held out for testing (passed as test_size to train_test_split).
TEST_SIZE=.2
# Short permission names (no package prefix). They are compared for exact
# equality against the last dot-separated component of each app's permissions,
# so entries must not carry stray whitespace: a trailing space makes the
# permission impossible to match and its one-hot column constantly 0.
DANGEROUS_PERMISSIONS = [
  'READ_CALENDAR','WRITE_CALENDAR','CAMERA','READ_CONTACTS','WRITE_CONTACTS',
  'GET_ACCOUNTS','ACCESS_FINE_LOCATION','ACCESS_COARSE_LOCATION','RECORD_AUDIO',
  'READ_PHONE_STATE','READ_PHONE_NUMBERS','CALL_PHONE','ANSWER_PHONE_CALLS',
  'READ_CALL_LOG','WRITE_CALL_LOG','ADD_VOICEMAIL','USE_SIP',
  'PROCESS_OUTGOING_CALLS','BODY_SENSORS','SEND_SMS','RECEIVE_SMS','READ_SMS',
  'RECEIVE_WAP_PUSH','RECEIVE_MMS','READ_EXTERNAL_STORAGE',
  'WRITE_EXTERNAL_STORAGE','ACCESS_MEDIA_LOCATION','ACCEPT_HANDOVER',
  'ACCESS_BACKGROUND_LOCATION','ACTIVITY_RECOGNITION',
]
# Keys of each app record whose value lengths are used as count features.
FEATURE_LIST=["permissions", "libraries","activities", "files", "features", "providers"]

In [5]:
#Load features
# Parse the benign and malware feature dumps from the mounted Drive folder.
# The paths are relative to the current working directory. Context managers
# close each file as soon as it has been parsed instead of leaking the handle.
with open('gdrive/MyDrive/featuresBenign.json') as benign_file:
  featuresben = json.load(benign_file)
with open('gdrive/MyDrive/featuresMalware.json') as malware_file:
  featuresmal = json.load(malware_file)

In [6]:
#Get list of all permissions
# Count, over benign and malware apps together, how often each permission is
# requested. Permissions are keyed by their short name (the last dot-separated
# component). The lookup uses that same short name, so repeated occurrences
# accumulate instead of being reset to 1.
permissions={}
for apps in (featuresben, featuresmal):
  for app in apps:
    for p in app['permissions']:
      short_name = p.split('.')[-1]
      permissions[short_name] = permissions.get(short_name, 0) + 1
# Number of distinct short permission names seen.
len(permissions)

188

In [7]:
#Get the top 'PERMISSION_COUNT' permissions
# Rank permission names by their count, highest first, and keep exactly
# PERMISSION_COUNT of them. The ranking is stored in its own variable (the
# sort result must be kept, and `permissions` itself is left untouched so the
# cell can be re-run safely). Ties keep their original insertion order because
# sorted() is stable.
permissions_by_count = sorted(permissions, key=permissions.get, reverse=True)

top_permissions = permissions_by_count[:PERMISSION_COUNT]

print(top_permissions)


In [8]:
#Create one hot encoding for the frequently used permissions 
# `dataframes` collects (benign_df, malware_df, label) tuples, one per feature
# set; it is (re)initialised here, so this cell must run before the other
# feature-set cells.
dataframes=[]

freq_columns = list(top_permissions) + ['Class']
freq_frames = []
for apps in (featuresben, featuresmal):
  encoding = []
  for app in apps:
    # Short names requested by this app, computed once per app (not once per
    # column) and stored in a set for O(1) membership tests.
    requested = {t.split('.')[-1] for t in app['permissions']}
    row = [1 if p in requested else 0 for p in top_permissions]
    # Last column is the class label: 1 for malicious, 0 otherwise.
    row.append(1 if app['isMalicious'] else 0)
    encoding.append(row)
  freq_frames.append(pd.DataFrame(encoding, columns=freq_columns))

benign_df_freq, malicous_df_freq = freq_frames

dataframes.append((benign_df_freq,malicous_df_freq,"FREQUENTLY USED PERMISSIONS"))

In [9]:
#Create one hot encoding for the dangerous permissions 
# One row per app, one 0/1 column per entry of DANGEROUS_PERMISSIONS, plus a
# final 'Class' column (1 for malicious, 0 otherwise).
dangerous_columns = list(DANGEROUS_PERMISSIONS) + ['Class']
dangerous_frames = []
for apps in (featuresben, featuresmal):
  encoding = []
  for app in apps:
    # Build the app's set of short permission names once instead of
    # rebuilding a list for every dangerous permission checked.
    requested = {t.split('.')[-1] for t in app['permissions']}
    row = [1 if p in requested else 0 for p in DANGEROUS_PERMISSIONS]
    row.append(1 if app['isMalicious'] else 0)
    encoding.append(row)
  dangerous_frames.append(pd.DataFrame(encoding, columns=dangerous_columns))

benign_df_dangerous, malicous_df_dangerous = dangerous_frames

dataframes.append((benign_df_dangerous,malicous_df_dangerous,"Dangerous Permissions"))

In [10]:
#Feature Set 2
#Extract feature counts from the apps
# For every app, record how many items it has under each key of FEATURE_LIST,
# followed by the class label.
#  - Counts are taken in FEATURE_LIST order so that each value lands under the
#    column that names it, regardless of key order inside the JSON record.
#  - A key that is absent from a record is counted as 0 items.
#  - The label is read directly from app['isMalicious'] (same rule as the
#    one-hot cells) rather than from whichever key the loop happened to end on.
feature_count_columns = list(FEATURE_LIST) + ["Class"]

feature_count_benign = [
  [len(app.get(f, [])) for f in FEATURE_LIST] + [1 if app['isMalicious'] else 0]
  for app in featuresben
]
benign_df_feature_count= pd.DataFrame(feature_count_benign,columns=feature_count_columns)

feature_count_malicious = [
  [len(app.get(f, [])) for f in FEATURE_LIST] + [1 if app['isMalicious'] else 0]
  for app in featuresmal
]
malicous_df_feature_count= pd.DataFrame(feature_count_malicious,columns=feature_count_columns)

# Register the feature-count frames themselves under the "FEATURE COUNT" label.
dataframes.append((benign_df_feature_count,malicous_df_feature_count,"FEATURE COUNT"))

In [11]:
#Feature set 3
#Combination of all previous features
# Column-wise join of the dangerous-permission indicators (without their
# 'Class' column) and the feature-count frame, which supplies the single
# 'Class' column of the combined frame.

def _without_class(frame):
  """Return `frame` restricted to its columns other than 'Class'."""
  return frame[frame.columns.difference(['Class'])]

malicous_df_combined = pd.concat(
  [_without_class(malicous_df_dangerous), malicous_df_feature_count],
  axis=1,
)
benign_df_combined = pd.concat(
  [_without_class(benign_df_dangerous), benign_df_feature_count],
  axis=1,
)

combined_entry = (benign_df_combined, malicous_df_combined, "COMBINED FEATURES")
dataframes.append(combined_entry)

# Type checks on the new frame and on the fourth registered feature set.
print(type(benign_df_combined))
print(type(dataframes[3][0]))


In [12]:
# Number of (benign_df, malware_df, label) feature sets registered so far.
len(dataframes)

4

In [13]:
#Benchmark every feature set with LazyClassifier
# Benign and malware frames are split separately (same test fraction and seed)
# and then stacked, so both classes appear in the train and the test portions.
classifier = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
for benign_df, malware_df, label in dataframes:
  benign_X = benign_df[benign_df.columns.difference(['Class'])]
  malware_X = malware_df[malware_df.columns.difference(['Class'])]

  Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    benign_X, benign_df['Class'], test_size=TEST_SIZE, random_state=123)
  Xm_train, Xm_test, ym_train, ym_test = train_test_split(
    malware_X, malware_df['Class'], test_size=TEST_SIZE, random_state=123)

  X_train = pd.concat([Xb_train, Xm_train])
  X_test = pd.concat([Xb_test, Xm_test])
  y_train = pd.concat([yb_train, ym_train])
  y_test = pd.concat([yb_test, ym_test])

  models, predictions = classifier.fit(X_train, X_test, y_train, y_test)

  print("\n\n"+"#"*30)
  print(label)
  print(models)

100%|██████████| 29/29 [00:04<00:00,  6.46it/s]
100%|██████████| 29/29 [00:03<00:00,  7.47it/s]
100%|██████████| 29/29 [00:03<00:00,  7.80it/s]
100%|██████████| 29/29 [00:03<00:00,  8.46it/s]


In [14]:
#Random Forest Classifier
# For each feature set: fit on the training portion, report accuracy on the
# held-out portion, then print 5-fold cross-validation results computed on the
# training portion only.
banner = "#"*30
print("\n\n"+banner)
print("RANDOM FOREST CLASSIFIER")
print(banner)
for benign_df, malware_df, title in dataframes:
  # Split each class on its own so both end up in train and test.
  benign_X = benign_df[benign_df.columns.difference(['Class'])]
  malware_X = malware_df[malware_df.columns.difference(['Class'])]
  Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    benign_X, benign_df['Class'], test_size=TEST_SIZE, random_state=123)
  Xm_train, Xm_test, ym_train, ym_test = train_test_split(
    malware_X, malware_df['Class'], test_size=TEST_SIZE, random_state=123)

  X = pd.concat([Xb_train, Xm_train])
  y = pd.concat([yb_train, ym_train])
  X_holdout = pd.concat([Xb_test, Xm_test])
  y_holdout = pd.concat([yb_test, ym_test])

  mod = RandomForestClassifier(max_depth=5, random_state=0)
  rule = "-"*30
  print("\n\n"+rule)
  print(title)
  print(rule)
  mod.fit(X, y)
  print("Accuracy :", mod.score(X_holdout, y_holdout))
  scores = cross_validate(mod, X, y, cv=5, scoring=['f1','precision','accuracy'])
  for metric, values in scores.items():
    print(metric, values)

In [15]:
# Decision Tree Classifier
# For each feature set: fit on the training portion, report accuracy on the
# held-out portion, then print 5-fold cross-validation results computed on the
# training portion only.
# The section banner is printed once, before the loop, matching the other
# classifier cells; each feature set then gets only its own dashed heading.
print("\n\n"+"#"*30)
print("DECISION TREE CLASSIFIER")
print("#"*30)
for d in dataframes:
  # Split benign and malware frames separately so both classes are present in
  # the train and the test portions.
  Xb_train, Xb_test, yb_train, yb_test = train_test_split(d[0][d[0].columns.difference(['Class'])], d[0]['Class'],test_size=TEST_SIZE,random_state =123)
  Xm_train, Xm_test, ym_train, ym_test = train_test_split(d[1][d[1].columns.difference(['Class'])], d[1]['Class'],test_size=TEST_SIZE,random_state =123)

  X = pd.concat([Xb_train, Xm_train])
  y = pd.concat([yb_train, ym_train])
  # Fixed seed so repeated runs build the same tree and print the same scores
  # (the splits and the random forest cell are already seeded).
  mod = tree.DecisionTreeClassifier(random_state=0)
  print("\n\n"+"-"*30)
  print(d[2])
  print("-"*30)
  mod.fit(X, y)
  print("Accuracy :",mod.score(pd.concat([Xb_test,Xm_test]),pd.concat([yb_test,ym_test])))
  scores = cross_validate(mod, X, y, cv=5, scoring = ['f1','precision','accuracy'])
  for s in scores:
    print(s,scores[s])

In [16]:
#SVC Classifier
# Same evaluation protocol as the other classifier cells, using a support
# vector classifier with gamma='auto'.
section_rule = "#"*30
print("\n\n"+section_rule)
print("Support Vector CLASSIFIER")
print(section_rule)

for benign_frame, malware_frame, set_name in dataframes:
  benign_inputs = benign_frame[benign_frame.columns.difference(['Class'])]
  malware_inputs = malware_frame[malware_frame.columns.difference(['Class'])]

  # Per-class split with a shared seed and test fraction.
  Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    benign_inputs, benign_frame['Class'], test_size=TEST_SIZE, random_state=123)
  Xm_train, Xm_test, ym_train, ym_test = train_test_split(
    malware_inputs, malware_frame['Class'], test_size=TEST_SIZE, random_state=123)

  X = pd.concat([Xb_train, Xm_train])
  y = pd.concat([yb_train, ym_train])
  mod = SVC(gamma = 'auto')

  set_rule = "-"*30
  print("\n\n"+set_rule)
  print(set_name)
  print(set_rule)

  mod.fit(X, y)
  test_inputs = pd.concat([Xb_test, Xm_test])
  test_labels = pd.concat([yb_test, ym_test])
  print("Accuracy :", mod.score(test_inputs, test_labels))

  # Cross-validation runs on the training portion only.
  scores = cross_validate(mod, X, y, cv=5, scoring=['f1','precision','accuracy'])
  for name in scores:
    print(name, scores[name])