In [2]:
import os
import sys
sys.path.append('/root/malware/ELSA')
import pandas as pd 
import numpy as np

import logging
import json
from datetime import datetime
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_extraction import DictVectorizer

import operator
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from tqdm import tqdm
import pickle


# Root directory of the dataset; every CSV and raw-feature path in this
# notebook is built from it with os.path.join. The ELSA_DATA_DIR environment
# variable overrides the default, so the notebook can run on another machine
# without editing the code.
FOLDER = os.environ.get(
    "ELSA_DATA_DIR",
    "/scratch1/NOT_BACKED_UP/cavallarogrp/datasets/processed_dataset/elsa_data",
)

In [2]:
# Inspect a single raw training feature file to see which top-level keys
# the JSON schema contains.
sample_feature_path = os.path.join(
    FOLDER,
    'train_feature_raw',
    "000042D8EC830FD5F2DAF1C4D6B35E302395A85F3F1A63A09D5C4CE914296293.json",
)
with open(sample_feature_path, "r") as f:
    feature = json.load(f)

print(feature.keys())
feature

dict_keys(['req_permissions', 'activities', 'services', 'providers', 'receivers', 'features', 'intent_filters', 'used_permissions', 'api_calls', 'suspicious_calls', 'urls'])


{'req_permissions': ['android.permission.READ_PHONE_STATE',
  'android.permission.WRITE_EXTERNAL_STORAGE',
  'com.anddoes.launcher.permission.UPDATE_COUNT',
  'com.huawei.android.launcher.permission.WRITE_SETTINGS',
  'android.permission.INTERNET',
  'com.oppo.launcher.permission.WRITE_SETTINGS',
  'android.permission.GET_ACCOUNTS',
  'com.htc.launcher.permission.UPDATE_SHORTCUT',
  'com.google.android.c2dm.permission.RECEIVE',
  'android.permission.CHANGE_WIFI_STATE',
  'android.permission.RECEIVE_SMS',
  'android.permission.GET_TASKS',
  'android.permission.READ_EXTERNAL_STORAGE',
  'android.permission.SYSTEM_ALERT_WINDOW',
  'android.permission.RECEIVE_MMS',
  'com.pantech.fingerprint.security',
  'android.permission.ACCESS_COARSE_LOCATION',
  'com.huawei.android.launcher.permission.CHANGE_BADGE',
  'android.permission.ACCESS_BACKGROUND_SERVICE',
  'org.fidoalliance.uaf.permissions.FIDO_CLIENT',
  'com.sonyericsson.home.permission.BROADCAST_BADGE',
  'android.permission.VIBRATE',
  

In [6]:
# Flatten the raw feature dict into a single {"<key>::<value>": 1} mapping,
# i.e. one binary indicator per (key, value) pair, to match the dictionary
# format used by the lab's previous work.
transfered_feature = {
    key + "::" + value: 1
    for key, values in feature.items()
    for value in values
}

{'req_permissions::android.permission.READ_PHONE_STATE': 1,
 'req_permissions::android.permission.WRITE_EXTERNAL_STORAGE': 1,
 'req_permissions::com.anddoes.launcher.permission.UPDATE_COUNT': 1,
 'req_permissions::com.huawei.android.launcher.permission.WRITE_SETTINGS': 1,
 'req_permissions::android.permission.INTERNET': 1,
 'req_permissions::com.oppo.launcher.permission.WRITE_SETTINGS': 1,
 'req_permissions::android.permission.GET_ACCOUNTS': 1,
 'req_permissions::com.htc.launcher.permission.UPDATE_SHORTCUT': 1,
 'req_permissions::com.google.android.c2dm.permission.RECEIVE': 1,
 'req_permissions::android.permission.CHANGE_WIFI_STATE': 1,
 'req_permissions::android.permission.RECEIVE_SMS': 1,
 'req_permissions::android.permission.GET_TASKS': 1,
 'req_permissions::android.permission.READ_EXTERNAL_STORAGE': 1,
 'req_permissions::android.permission.SYSTEM_ALERT_WINDOW': 1,
 'req_permissions::android.permission.RECEIVE_MMS': 1,
 'req_permissions::com.pantech.fingerprint.security': 1,
 'req_p

In [3]:
# Per-sample metadata for the training set; the recorded output shows the
# columns sha256, dex_date and vt_detection.
training_info_path = os.path.join(FOLDER, "training_set_info.csv")
df_info = pd.read_csv(training_info_path)
df_info

Unnamed: 0,sha256,dex_date,vt_detection
0,00002100E99DD61C62888906022139942B487CE5CD0DA8...,2018-01-22 17:27:36,0.0
1,000042D8EC830FD5F2DAF1C4D6B35E302395A85F3F1A63...,2018-08-10 11:10:52,0.0
2,00005CE360664A0E9D0F32EE10A8F8D76DFD2233E35C39...,2016-08-21 04:55:52,0.0
3,00009E82FC84439087058C0DBFE0DD67EDCF5D5F02B0DE...,1980-01-01 00:00:00,0.0
4,0000D3F7EE33C44094C8006501899876C7DC315B34A3F9...,1980-01-01 00:00:00,0.0
...,...,...,...
74995,FFF9A0163AC163B99D95F46C57B71996303822859084E3...,1980-01-01 00:00:00,0.0
74996,FFFAAFFE2D9B8BDA6C964AF30231CDC2B88E2993B9D024...,2017-01-13 20:44:42,0.0
74997,FFFB3DCB5A05CE6F86AEFC5666ED4BC658F9909B74CC7F...,1980-01-01 00:00:00,0.0
74998,FFFC0B3829330D22FB42E7FE6D919C42D2DB4EF97035AC...,2016-09-02 15:13:08,0.0


In [4]:
# Load the training split, normalise the hashes to upper case so they can be
# joined against df_info, attach vt_detection, and parse the timestamp
# strings. Written as one chain so the cell yields df_train in a single step.
df_train = (
    pd.read_csv(os.path.join(FOLDER, "training_set.csv"))
    .assign(sha256=lambda frame: frame['sha256'].apply(lambda digest: digest.upper()))
    .merge(df_info[['sha256', 'vt_detection']], on='sha256', how='left')
    .assign(
        timestamp=lambda frame: frame['timestamp'].apply(
            lambda raw: datetime.strptime(raw, '%Y-%m-%d %H:%M:%S')
        )
    )
)
df_train

Unnamed: 0,sha256,timestamp,label,vt_detection
0,0175CF5222FD6A221A3E71878D937C724C377885E3EDE5...,2017-03-20 05:16:14,0,0.0
1,72397CBEE4DA830D870D85F3FCCA41E729772E89B2C5E0...,2017-01-25 22:27:10,0,0.0
2,18C152C6E1C09CFA402783758585DE1BE04345021718A4...,2017-01-12 15:22:09,0,0.0
3,B2DE9BCCA4FA7522A5E9B39B295D1E8C94A1FA5EF55AF7...,2017-01-17 09:52:26,0,0.0
4,14D08E2D9E0C354AD08FFE13D063D8EDDC73E4B56DB133...,2017-01-13 22:25:29,0,0.0
...,...,...,...,...
74995,24B498C9B18F9375D2682608F63F3A811330EB4AF91124...,2019-12-06 14:42:49,1,16.0
74996,12928C70AE7F98F8D155BBAE728FA6D803B3F827DBCA1F...,2019-10-03 22:15:24,1,30.0
74997,111CBC65E111B679ECB9A2C1D17189DCE540F52D5B0A9A...,2019-11-28 02:33:42,1,10.0
74998,64972B8DB0F08CED4131129C14CA27F341F629F6E49AA8...,2019-12-15 18:04:57,1,21.0


In [5]:
from collections import Counter

# Bucket the training samples by calendar month ("YYYY-MM") and report, for
# each month, the number of samples and the label distribution.
df_train['year-month'] = df_train['timestamp'].map(lambda stamp: stamp.strftime('%Y-%m'))
groups = df_train.groupby('year-month')
for month, month_frame in groups:
    label_counts = Counter(month_frame['label'])
    print(month, len(month_frame))
    print(label_counts)
    

2017-01 1964
Counter({0: 1783, 1: 181})
2017-02 1547
Counter({0: 1386, 1: 161})
2017-03 2739
Counter({0: 2456, 1: 283})
2017-04 2399
Counter({0: 2129, 1: 270})
2017-05 1886
Counter({0: 1685, 1: 201})
2017-06 1965
Counter({0: 1811, 1: 154})
2017-07 1163
Counter({0: 960, 1: 203})
2017-08 2180
Counter({0: 1965, 1: 215})
2017-09 2907
Counter({0: 2700, 1: 207})
2017-10 1330
Counter({0: 1086, 1: 244})
2017-11 1528
Counter({0: 1366, 1: 162})
2017-12 3392
Counter({0: 3173, 1: 219})
2018-01 1946
Counter({0: 1782, 1: 164})
2018-02 1418
Counter({0: 1322, 1: 96})
2018-03 2886
Counter({0: 2521, 1: 365})
2018-04 746
Counter({0: 657, 1: 89})
2018-05 1856
Counter({0: 1481, 1: 375})
2018-06 3648
Counter({0: 3487, 1: 161})
2018-07 1683
Counter({0: 1408, 1: 275})
2018-08 2016
Counter({0: 1811, 1: 205})
2018-09 2551
Counter({0: 2406, 1: 145})
2018-10 2808
Counter({0: 2613, 1: 195})
2018-11 1895
Counter({0: 1619, 1: 276})
2018-12 1547
Counter({0: 1393, 1: 154})
2019-01 741
Counter({0: 688, 1: 53})
2019-02 

In [8]:
from tqdm import tqdm

# Export one pickled DataFrame per training month. Each pickle holds that
# month's rows of df_train plus a `json_features` column containing, per
# sample, a {"<key>::<value>": 1} dict built from the raw feature JSON.

# Derive both folders from FOLDER (same location as before with the default
# root) and make sure the output folder exists before writing into it.
feature_saved_folder = os.path.join(FOLDER, "train_features")
train_raw_folder = os.path.join(FOLDER, 'train_feature_raw')
os.makedirs(feature_saved_folder, exist_ok=True)

# Group df_train directly rather than reading the notebook-level `groups`
# variable: another cell rebinds `groups` to the test set, and re-running this
# cell afterwards would otherwise export test rows into the training folder.
for name, group in df_train.groupby('year-month'):
    print(name, len(group))
    json_features = []
    for sha256 in tqdm(group['sha256'].tolist()):
        with open(os.path.join(train_raw_folder, sha256 + ".json"), "r") as f:
            feature = json.load(f)
        # One binary indicator per (key, value) pair of the raw feature dict.
        json_features.append({
            key + "::" + value: 1
            for key, values in feature.items()
            for value in values
        })

    # assign() returns a new frame, so the sub-frame handed out by groupby is
    # never written to (avoids the chained-assignment pattern pandas warns about).
    group_with_features = group.assign(json_features=json_features)
    with open(os.path.join(feature_saved_folder, name + ".pkl"), "wb") as f:
        pickle.dump(group_with_features, f)

    

2017-01 1964


100%|██████████| 1964/1964 [00:00<00:00, 10560.65it/s]


2017-02 1547


100%|██████████| 1547/1547 [00:00<00:00, 6181.13it/s]


2017-03 2739


100%|██████████| 2739/2739 [00:00<00:00, 10574.33it/s]


2017-04 2399


100%|██████████| 2399/2399 [00:00<00:00, 11916.24it/s]


2017-05 1886


100%|██████████| 1886/1886 [00:00<00:00, 9939.32it/s]


2017-06 1965


100%|██████████| 1965/1965 [00:00<00:00, 7833.43it/s]


2017-07 1163


100%|██████████| 1163/1163 [00:17<00:00, 67.84it/s] 


2017-08 2180


100%|██████████| 2180/2180 [00:15<00:00, 140.46it/s]  


2017-09 2907


100%|██████████| 2907/2907 [00:29<00:00, 98.88it/s]   


2017-10 1330


100%|██████████| 1330/1330 [00:30<00:00, 43.67it/s]


2017-11 1528


100%|██████████| 1528/1528 [00:34<00:00, 44.41it/s]


2017-12 3392


100%|██████████| 3392/3392 [01:22<00:00, 41.15it/s]


2018-01 1946


100%|██████████| 1946/1946 [00:36<00:00, 53.70it/s]


2018-02 1418


100%|██████████| 1418/1418 [00:27<00:00, 51.80it/s]


2018-03 2886


100%|██████████| 2886/2886 [00:58<00:00, 49.16it/s]


2018-04 746


100%|██████████| 746/746 [00:10<00:00, 72.88it/s]


2018-05 1856


100%|██████████| 1856/1856 [00:39<00:00, 47.36it/s] 


2018-06 3648


100%|██████████| 3648/3648 [01:08<00:00, 53.15it/s]


2018-07 1683


100%|██████████| 1683/1683 [00:37<00:00, 44.64it/s]


2018-08 2016


100%|██████████| 2016/2016 [00:39<00:00, 50.61it/s]


2018-09 2551


100%|██████████| 2551/2551 [00:58<00:00, 43.32it/s]


2018-10 2808


100%|██████████| 2808/2808 [01:00<00:00, 46.48it/s]


2018-11 1895


100%|██████████| 1895/1895 [00:50<00:00, 37.28it/s]


2018-12 1547


100%|██████████| 1547/1547 [00:33<00:00, 46.39it/s]


2019-01 741


100%|██████████| 741/741 [00:19<00:00, 37.79it/s]


2019-02 1403


100%|██████████| 1403/1403 [00:21<00:00, 65.66it/s]


2019-03 4106


100%|██████████| 4106/4106 [01:44<00:00, 39.42it/s]


2019-04 2303


100%|██████████| 2303/2303 [00:41<00:00, 55.59it/s]


2019-05 1786


100%|██████████| 1786/1786 [00:36<00:00, 49.11it/s]


2019-06 2161


100%|██████████| 2161/2161 [00:56<00:00, 38.04it/s]


2019-07 2349


100%|██████████| 2349/2349 [00:55<00:00, 42.40it/s]


2019-08 2325


100%|██████████| 2325/2325 [00:56<00:00, 41.39it/s]


2019-09 1576


100%|██████████| 1576/1576 [00:34<00:00, 45.79it/s]


2019-10 2172


100%|██████████| 2172/2172 [00:42<00:00, 50.77it/s]


2019-11 1772


100%|██████████| 1772/1772 [00:49<00:00, 36.02it/s]


2019-12 2306


100%|██████████| 2306/2306 [00:42<00:00, 54.04it/s] 


In [17]:
# Export one pickled DataFrame per month for each listed test round. Each
# pickle holds that month's test rows (with vt_detection and label merged in
# from the round's info CSV) plus a `json_features` column of
# {"<key>::<value>": 1} dicts built from the raw feature JSON files.

# Named `test_rounds` so the builtin round() is not shadowed for the rest of
# the notebook.
test_rounds = [4]

for r in test_rounds:
    raw_folder = os.path.join(FOLDER, f'test_set_features_round_{r}_raw')
    out_folder = os.path.join(FOLDER, f"test_features_round_{r}")
    os.makedirs(out_folder, exist_ok=True)

    df_test = pd.read_csv(os.path.join(FOLDER, f"test_set_round_{r}.csv"))
    # Upper-case the hashes so they can be joined against the info CSV.
    df_test['sha256'] = df_test['sha256'].apply(lambda x: x.upper())
    df_test_info = pd.read_csv(os.path.join(FOLDER, f"test_set_round_{r}_info.csv"))
    df_test = df_test.merge(df_test_info[['sha256','vt_detection','label']], on='sha256', how='left')
    df_test['timestamp'] = df_test['timestamp'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    df_test['year-month'] = df_test['timestamp'].apply(lambda x: x.strftime('%Y-%m'))

    # Iterate the groupby directly instead of rebinding the notebook-level
    # `groups`, which the training cells use for the per-month training groups.
    for name, group in df_test.groupby('year-month'):
        print(name, len(group))
        json_features = []
        for sha256 in tqdm(group['sha256'].tolist()):
            with open(os.path.join(raw_folder, sha256 + ".json"), "r") as f:
                feature = json.load(f)
            # One binary indicator per (key, value) pair of the raw feature dict.
            json_features.append({
                key + "::" + value: 1
                for key, values in feature.items()
                for value in values
            })

        # assign() returns a new frame, so the sub-frame handed out by groupby
        # is never written to.
        group_with_features = group.assign(json_features=json_features)
        with open(os.path.join(out_folder, name + ".pkl"), "wb") as f:
            pickle.dump(group_with_features, f)


2021-07 1952


100%|██████████| 1952/1952 [00:00<00:00, 7445.37it/s]


2021-08 1986


100%|██████████| 1986/1986 [00:00<00:00, 7068.81it/s]


2021-09 2312


100%|██████████| 2312/2312 [00:00<00:00, 7676.94it/s]


2021-10 1115


100%|██████████| 1115/1115 [00:00<00:00, 7371.01it/s]


2021-11 1824


100%|██████████| 1824/1824 [00:00<00:00, 8835.64it/s]


2021-12 3311


100%|██████████| 3311/3311 [00:00<00:00, 7828.13it/s]


2022-01 3644


100%|██████████| 3644/3644 [00:00<00:00, 7502.30it/s]


2022-02 3869


100%|██████████| 3869/3869 [00:00<00:00, 6273.65it/s]


2022-03 1307


100%|██████████| 1307/1307 [00:00<00:00, 7124.74it/s]


2022-04 1251


100%|██████████| 1251/1251 [00:00<00:00, 6961.39it/s]


2022-05 1421


100%|██████████| 1421/1421 [00:00<00:00, 6617.26it/s]


2022-06 1008


100%|██████████| 1008/1008 [00:00<00:00, 6516.95it/s]


In [14]:
# Report the time span covered by the training set, both by `timestamp` and
# by `dex_date`.

# The cell that builds df_train merges only `vt_detection` from df_info, so
# bring `dex_date` in here when it is missing; otherwise this cell raises
# KeyError on a fresh kernel.
if 'dex_date' not in df_train.columns:
    df_train = df_train.merge(df_info[['sha256', 'dex_date']], on='sha256', how='left')

# pd.to_datetime accepts both the raw strings and already-parsed datetimes, so
# this cell can run after `timestamp` has been parsed (and can be re-run),
# whereas datetime.strptime rejects anything that is not a str.
df_train['timestamp'] = pd.to_datetime(df_train['timestamp'], format='%Y-%m-%d %H:%M:%S')
df_train['dex_date'] = pd.to_datetime(df_train['dex_date'], format='%Y-%m-%d %H:%M:%S')

# max timestamp and min timestamp
max_timestamp = df_train['timestamp'].max()
min_timestamp = df_train['timestamp'].min()
# NOTE(review): the recorded output shows dex_date ranging from 1980 to 2107,
# so dex_date presumably contains placeholder or unreliable values - confirm
# before using it for any temporal split.
max_dex_date = df_train['dex_date'].max()
min_dex_date = df_train['dex_date'].min()

print(max_timestamp, min_timestamp, max_dex_date, min_dex_date)

2019-12-31 23:14:17 2017-01-01 09:42:30 2107-11-30 00:00:00 1980-01-01 00:00:00
