# Analysis - NATICUSdroid

*by: Zahrizhal Ali*

# Import Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Dataset Overview

📚 Data berisi izin yang diambil dari lebih dari 29.000 aplikasi Android, termasuk aplikasi jinak (tidak berbahaya) dan malware (berbahaya). Aplikasi ini dirilis antara tahun 2010 dan 2019.

## Read Dataset

In [2]:
# Locations of the two CSV splits, relative to the notebook's working directory.
train_csv = 'data/training.csv'
test_csv = 'data/testing.csv'

# Load each split into its own DataFrame.
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [3]:
# Report row and column counts for both splits in a single call; the
# newline separator produces one message per line, and the leading "\n" on
# the third message leaves a blank line between the row and column counts.
print(
    f"Jumlah data train: {len(df_train)} data",
    f"Jumlah data test: {len(df_test)} data",
    f"\nJumlah column data Train {len(df_train.columns)}",
    f"Jumlah column data Test {len(df_test.columns)}",
    sep="\n",
)

Jumlah data train: 20531 data
Jumlah data test: 8799 data

Jumlah column data Train 88
Jumlah column data Test 87


## Dataset Train Info

In [4]:
# Print a concise summary of the training frame: index range, and for each
# column its non-null count and dtype.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20531 entries, 0 to 20530
Data columns (total 88 columns):
 #   Column                                                                         Non-Null Count  Dtype
---  ------                                                                         --------------  -----
 0   index                                                                          20531 non-null  int64
 1   android.permission.GET_ACCOUNTS                                                20531 non-null  int64
 2   com.sonyericsson.home.permission.BROADCAST_BADGE                               20531 non-null  int64
 3   android.permission.READ_PROFILE                                                20531 non-null  int64
 4   android.permission.MANAGE_ACCOUNTS                                             20531 non-null  int64
 5   android.permission.WRITE_SYNC_SETTINGS                                         20531 non-null  int64
 6   android.permission.READ_EXTERNAL_STORA

📝 **Note:** Every column reports 20531 non-null entries, which matches the total number of rows (20531), so we can infer that the training set has no missing values.

## Dataset Description

In [5]:
# Display descriptive statistics (count, mean, std, min, quartiles, max)
# for the columns of the training frame.
df_train.describe()

Unnamed: 0,index,android.permission.GET_ACCOUNTS,com.sonyericsson.home.permission.BROADCAST_BADGE,android.permission.READ_PROFILE,android.permission.MANAGE_ACCOUNTS,android.permission.WRITE_SYNC_SETTINGS,android.permission.READ_EXTERNAL_STORAGE,android.permission.RECEIVE_SMS,com.android.launcher.permission.READ_SETTINGS,android.permission.WRITE_SETTINGS,...,com.android.launcher.permission.UNINSTALL_SHORTCUT,com.sec.android.iap.permission.BILLING,com.htc.launcher.permission.UPDATE_SHORTCUT,com.sec.android.provider.badge.permission.WRITE,android.permission.ACCESS_NETWORK_STATE,com.google.android.finsky.permission.BIND_GET_INSTALL_REFERRER_SERVICE,com.huawei.android.launcher.permission.READ_SETTINGS,android.permission.READ_SMS,android.permission.PROCESS_INCOMING_CALLS,Result
count,20531.0,20531.0,20531.0,20531.0,20531.0,20531.0,20531.0,20531.0,20531.0,20531.0,...,20531.0,20531.0,20531.0,20531.0,20531.0,20531.0,20531.0,20531.0,20531.0,20531.0
mean,10265.0,0.223808,0.034679,0.045736,0.012469,0.005894,0.172325,0.059958,0.013443,0.111831,...,0.02937,0.00414,0.033949,0.035069,0.948566,0.023282,0.023672,0.052409,0.003507,0.499781
std,5926.933524,0.416805,0.182971,0.208916,0.110969,0.076545,0.377672,0.237415,0.115165,0.315166,...,0.168846,0.064212,0.181101,0.183959,0.220888,0.150801,0.152027,0.222855,0.059117,0.500012
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5132.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
50%,10265.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,15397.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
max,20530.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
# Display the column labels of the training frame.
df_train.columns

Index(['index', 'android.permission.GET_ACCOUNTS',
       'com.sonyericsson.home.permission.BROADCAST_BADGE',
       'android.permission.READ_PROFILE', 'android.permission.MANAGE_ACCOUNTS',
       'android.permission.WRITE_SYNC_SETTINGS',
       'android.permission.READ_EXTERNAL_STORAGE',
       'android.permission.RECEIVE_SMS',
       'com.android.launcher.permission.READ_SETTINGS',
       'android.permission.WRITE_SETTINGS',
       'com.google.android.providers.gsf.permission.READ_GSERVICES',
       'android.permission.DOWNLOAD_WITHOUT_NOTIFICATION',
       'android.permission.GET_TASKS',
       'android.permission.WRITE_EXTERNAL_STORAGE',
       'android.permission.RECORD_AUDIO',
       'com.huawei.android.launcher.permission.CHANGE_BADGE',
       'com.oppo.launcher.permission.READ_SETTINGS',
       'android.permission.CHANGE_NETWORK_STATE',
       'com.android.launcher.permission.INSTALL_SHORTCUT',
       'android.permission.android.permission.READ_PHONE_STATE',
       'android.per

# Exploratory Data Analysis

📝 **Note:** Since all features use a numeric data type, we don't need to perform any categorical-related operations.

## Find Missing Values

In [42]:
# Boolean mask of the training frame: True wherever a cell is null.
null_mask = df_train.isnull()

# Per-column flag: True when the column contains at least one null.
missing_values = null_mask.any()

# Count every null cell in the frame. Summing the per-column flags above
# would only give the number of affected columns, not the number of
# missing values that the message below reports.
total_missing_values = null_mask.sum().sum()

print(f"Total missing value: {total_missing_values}")
print(missing_values)

Total missing value: 0
index                                                                     False
android.permission.GET_ACCOUNTS                                           False
com.sonyericsson.home.permission.BROADCAST_BADGE                          False
android.permission.READ_PROFILE                                           False
android.permission.MANAGE_ACCOUNTS                                        False
                                                                          ...  
com.google.android.finsky.permission.BIND_GET_INSTALL_REFERRER_SERVICE    False
com.huawei.android.launcher.permission.READ_SETTINGS                      False
android.permission.READ_SMS                                               False
android.permission.PROCESS_INCOMING_CALLS                                 False
Result                                                                    False
Length: 88, dtype: bool


## Case 1: Investigate Column

📚 **Info**: Android permissions provide controls that increase user awareness and limit an app's access to sensitive data

🔑 **Ref:** According to the paper, there are two types of Android permission: _NATIVE PERMISSION_ and _CUSTOM PERMISSION_.

* Native permissions are usually declared as `android.permission.PERMISSION_NAME`
* Custom permissions are usually declared as `com.company.package.PERMISSION_NAME`
* Columns whose names start with `me.` are counted separately as "other" permissions

In [31]:
# Group the df_train column names by their prefix in a single pass.
# Each prefix is tested independently, and names are kept in column order.
native_permission = []
custom_permission = []
other_permission = []

for column_name in df_train.columns:
    if column_name.startswith('android.permission'):
        native_permission.append(column_name)
    if column_name.startswith('com.'):
        custom_permission.append(column_name)
    if column_name.startswith('me.'):
        other_permission.append(column_name)

# Report how many columns fell into each prefix group.
print(f"Amount of Native Permission column: {len(native_permission)}")
print(f"Amount of Custom Permission column: {len(custom_permission)}")
print(f"Amount of Other Permission column: {len(other_permission)}")


Amount of Native Permission column: 58
Amount of Custom Permission column: 26
Amount of Other Permission column: 2
