In [1]:
import json
import os
import pandas as pd
import numpy as np
import glob

# Root holding one sub-folder per month ("YYYY-MM") of per-sample "<sha256>.json" Drebin feature files;
# the aggregated per-month outputs are also written under its "features" sub-folder.
DREBIN_FEATURE_PATH = "/scratch1/NOT_BACKED_UP/cavallarogrp/datasets/processed_dataset/Androzoo_info/drebin_22_23"
# Folder with the per-month sample lists ("YYYY-MM.txt") read by the aggregation loop.
ANDROZOO_INFO_PATH = "/cs/academic/phd3/xinrzhen/xinran/invariant_training/raw_dataset/androzoo_family_with_gray"
# Folder with 'final_meta_info_file.tsv' and "drebin_14_21.json".
# NOTE(review): this name is spelled with two zeros ("ANDROZ00"), unlike ANDROZOO_INFO_PATH above — easy to mistype.
ANDROZ00_INFO = "/scratch1/NOT_BACKED_UP/cavallarogrp/datasets/processed_dataset/Androzoo_info"


In [2]:
# Load the tab-separated metadata table and show a preview plus its column names.
meta_info_file = os.path.join(ANDROZ00_INFO, 'final_meta_info_file.tsv')
df_meta = pd.read_csv(meta_info_file, sep='\t')
print(df_meta.head())
print(df_meta.columns)

                                              sha256             dex_date  \
0  0000143ef8d00e3a65c5c8c380221d00678fed906fdc2e...  2016-01-19 13:28:36   
1  0000205479c9ba53e1b209d5fc484a8b5d12235b83374e...  2016-05-03 17:25:52   
2  0000230ae0799eaa963f0ae5337a0e6e1ac32acf1fa4e3...  2014-05-15 19:23:12   
3  0000241b6c550b52927c74c854b7a8d9dba6e86a3576d5...  2015-11-09 17:06:28   
4  000026e846ffcf60e2a9e135251b4f411300e903dce95d...  2015-09-13 21:46:06   

   label  family  
0    0.0  benign  
1    0.0  benign  
2    0.0  benign  
3    0.0  benign  
4    1.0     NaN  
Index(['sha256', 'dex_date', 'label', 'family'], dtype='object')


In [5]:
# Parse the "drebin_14_21.json" file stored next to the metadata table.
drebin_14_21_path = os.path.join(ANDROZ00_INFO, "drebin_14_21.json")
with open(drebin_14_21_path, "r") as f:
    drebin_14_21 = json.loads(f.read())

In [2]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from collections import Counter

# Months to aggregate: every calendar month of 2022 and 2023, as zero-padded strings.
years = [str(year_number) for year_number in (2022, 2023)]
months = [f"{month_number:02d}" for month_number in range(1, 13)]

# Destination root for the per-month 'features.pkl' files.
output_directory = '/scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23'
# Headerless CSV whose three columns are named sha256, dex_date and vt_detection below.
info_22_23 = '/cs/academic/phd3/xinrzhen/xinran/invariant_training/data_process/22_23_sha256.txt'

df_info = pd.read_csv(
    info_22_23,
    header=None,
    names=["sha256", "dex_date", "vt_detection"],
)

def find_vt_detection(sha256):
    """Look up the ``vt_detection`` value stored in ``df_info`` for ``sha256``.

    When several rows share the same ``sha256`` the first one wins.
    Returns 0 when ``sha256`` does not occur in ``df_info`` at all.
    """
    matches = df_info.loc[df_info['sha256'] == sha256, 'vt_detection'].values
    return matches[0] if len(matches) > 0 else 0

# Read one JSON feature file; the aggregation loop runs this in parallel worker threads.
def load_feature(sha256,name):
    """Return the feature entry stored for ``sha256`` in month folder ``name``.

    The file read is ``{DREBIN_FEATURE_PATH}/{name}/{sha256}.json`` and the value
    returned is whatever that JSON object holds under the key ``sha256``.
    Returns None when the file does not exist or has no such key; JSON parsing
    errors are not caught here and propagate to the caller.
    """
    feature_path = f"{DREBIN_FEATURE_PATH}/{name}/{sha256}.json"
    if not os.path.exists(feature_path):  # no feature file for this sample
        return None
    with open(feature_path, 'r') as f:
        payload = json.load(f)
    return payload.get(sha256, None)
    

def delete_feature(sha256, name):
    """Remove ``{DREBIN_FEATURE_PATH}/{name}/{sha256}.json`` if that path exists.

    Does nothing when the path is absent.
    """
    feature_path = f"{DREBIN_FEATURE_PATH}/{name}/{sha256}.json"
    if not os.path.exists(feature_path):
        return
    os.remove(feature_path)

# Aggregate the per-sample Drebin feature files month by month.
#
# For every "YYYY-MM" this loop:
#   1. reads the monthly sample list and attaches the vt_detection value of each sample,
#   2. writes the unfiltered metadata to "<save_folder>/<YYYY-MM>/<YYYY-MM>-meta.csv",
#   3. loads every available "<sha256>.json" feature file in parallel,
#   4. writes the features (in metadata row order) to "<save_folder>/<YYYY-MM>/<YYYY-MM>.json",
#   5. pickles metadata + features to "<output_directory>/<YYYY-MM>/features.pkl".
#
# Features are keyed by sha256 and re-ordered to match the metadata rows before they are attached:
# as_completed() yields futures in completion order, so appending results as they arrive would
# pair features with the wrong sha256 / label.
save_folder = os.path.join(DREBIN_FEATURE_PATH, "features")
# sha256s whose feature file raised while loading; accumulates over ALL months (read by later cells).
fail_list = []
monthly_frames = []
for year in years:
    for month in months:
        name = f"{year}-{month}"
        print(f"Processing {name}")
        month_output_dir = os.path.join(output_directory, name)
        os.makedirs(month_output_dir, exist_ok=True)
        month_save_dir = os.path.join(save_folder, name)
        os.makedirs(month_save_dir, exist_ok=True)

        # Headerless CSV listing the samples of this month.
        df = pd.read_csv(f"{ANDROZOO_INFO_PATH}/{year}-{month}.txt", header=None, names=["apk_name", "label", "releaseTime" , "family_label"])
        # .copy() so the column assignments below act on an independent frame (no SettingWithCopyWarning).
        df_meta = df[["apk_name", "label", "releaseTime" , "family_label"]].copy()
        df_meta['vt_detection'] = df_meta['apk_name'].apply(find_vt_detection)
        print(Counter(df_meta['vt_detection']))
        print(Counter(df_meta['label']))
        df_meta.columns = ["sha256",  "label", "dex_date", "family","vt_detection"]
        # The meta CSV intentionally contains every listed sample, including those without features.
        df_meta.to_csv(f"{save_folder}/{year}-{month}/{year}-{month}-meta.csv", index=False)
        listed_sha256s = df['apk_name'].values.tolist()
        print(f"Total {len(listed_sha256s)} samples in read txt")

        # Listed samples for which no "<sha256>.json" feature file exists in this month's folder.
        existed_files = glob.glob(f"{DREBIN_FEATURE_PATH}/{name}/*.json")
        existed_sha256 = {os.path.basename(file).split(".")[0] for file in existed_files}
        difference = set(listed_sha256s) - existed_sha256
        # Sorted so the submission order is deterministic between runs.
        sha256s = sorted(set(listed_sha256s) & existed_sha256)
        print(f"Total {len(difference)} difference files")
        print(difference)
        # NOTE(review): `difference` only holds samples whose feature file was NOT found by the glob
        # above, so for plain hash names this call has nothing to remove; kept as in the original.
        for sha256 in difference:
            delete_feature(sha256, name)
            print(f"Delete {sha256}")

        # Load all feature files concurrently, remembering which sha256 each future belongs to.
        features_by_sha256 = {}
        with ThreadPoolExecutor() as executor:
            future_to_sha256 = {executor.submit(load_feature, sha256, name): sha256 for sha256 in sha256s}
            for future in tqdm(as_completed(future_to_sha256), total=len(future_to_sha256)):
                sha256 = future_to_sha256[future]
                try:
                    feature = future.result()
                except Exception as e:
                    # Typically a malformed JSON file; record it and keep going.
                    print(e)
                    fail_list.append(sha256)
                    continue

                if feature is None:
                    # File vanished or does not contain an entry for its own sha256: skip the sample.
                    print(f"No feature entry for {sha256}, skipped")
                    continue
                features_by_sha256[sha256] = feature

        print(f"fail list: {fail_list}")
        print(f"fail list length: {len(fail_list)}, final length: {len(features_by_sha256)}")

        # Keep only the rows with a loaded feature, then build X in exactly that row order.
        df_meta = df_meta[df_meta['sha256'].isin(list(features_by_sha256))].copy()
        X = [features_by_sha256[sha256] for sha256 in df_meta['sha256']]
        print(f"Total {len(X)} samples")

        assert len(X) == len(df_meta)

        save_path = os.path.join(month_save_dir, f"{name}.json")
        with open(save_path, "w") as f:
            json.dump(X, f)
        df_meta['json_features'] = X
        # Unparseable dates become NaT instead of raising.
        df_meta['dex_date'] = pd.to_datetime(df_meta['dex_date'], errors='coerce')
        print(f"final saved shape: {df_meta.shape}")
        file_name = 'features.pkl'
        file_path = os.path.join(month_output_dir, file_name)

        df_meta.to_pickle(file_path)
        print(f"Save to {file_path}")
        monthly_frames.append(df_meta)

df_all = pd.concat(monthly_frames)
# print(f"Fail list: {len(fail_list)}")

# for sha256 in tqdm(fail_list):
#     delete_json_path = f"{DREBIN_FEATURE_PATH}/{year}-{month}/{sha256}.json"
#     print(f"Delete {delete_json_path}")
#     os.remove(delete_json_path)
    

Processing 2022-01
Counter({0.0: 1379, 5.0: 58, 6.0: 46, 7.0: 24, 8.0: 12, 10.0: 4, 9.0: 2, 17.0: 1, 13.0: 1, 12.0: 1})
Counter({0: 1379, 1: 149})
Total 1528 samples in read txt
Total 0 difference files
set()


100%|██████████| 1528/1528 [00:01<00:00, 1165.81it/s]


fail list: []
fail list length: 0, final length: 1528
Total 1528 samples
final saved shape: (1528, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2022-01/features.pkl
Processing 2022-02
Counter({0.0: 1407, 5.0: 58, 6.0: 41, 7.0: 33, 8.0: 12, 9.0: 4, 12.0: 2, 17.0: 1, 11.0: 1})
Counter({0: 1407, 1: 152})
Total 1559 samples in read txt
Total 0 difference files
set()


100%|██████████| 1559/1559 [00:00<00:00, 1819.82it/s]


fail list: []
fail list length: 0, final length: 1559
Total 1559 samples
final saved shape: (1559, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2022-02/features.pkl
Processing 2022-03
Counter({0.0: 1379, 5.0: 63, 6.0: 47, 7.0: 22, 9.0: 5, 8.0: 5, 12.0: 3, 10.0: 2, 11.0: 1, 19.0: 1})
Counter({0: 1379, 1: 149})
Total 1528 samples in read txt
Total 0 difference files
set()


100%|██████████| 1528/1528 [00:02<00:00, 740.80it/s] 


fail list: []
fail list length: 0, final length: 1528
Total 1528 samples
final saved shape: (1528, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2022-03/features.pkl
Processing 2022-04
Counter({0.0: 1379, 5.0: 94, 6.0: 32, 7.0: 10, 8.0: 9, 9.0: 1, 12.0: 1, 14.0: 1, 11.0: 1})
Counter({0: 1379, 1: 149})
Total 1528 samples in read txt
Total 0 difference files
set()


100%|██████████| 1528/1528 [00:01<00:00, 776.45it/s] 


fail list: []
fail list length: 0, final length: 1528
Total 1528 samples
final saved shape: (1528, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2022-04/features.pkl
Processing 2022-05
Counter({0.0: 1388, 5.0: 93, 6.0: 33, 7.0: 10, 9.0: 4, 10.0: 4, 13.0: 2, 11.0: 1, 16.0: 1, 12.0: 1, 8.0: 1})
Counter({0: 1388, 1: 150})
Total 1538 samples in read txt
Total 0 difference files
set()


100%|██████████| 1538/1538 [00:01<00:00, 917.40it/s] 


fail list: []
fail list length: 0, final length: 1538
Total 1538 samples
final saved shape: (1538, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2022-05/features.pkl
Processing 2022-06
Counter({0.0: 1509, 5.0: 101, 6.0: 38, 7.0: 13, 9.0: 3, 13.0: 2, 18.0: 1, 8.0: 1, 17.0: 1, 16.0: 1, 11.0: 1, 12.0: 1})
Counter({0: 1509, 1: 163})
Total 1672 samples in read txt
Total 0 difference files
set()


100%|██████████| 1672/1672 [00:01<00:00, 852.42it/s] 


fail list: []
fail list length: 0, final length: 1672
Total 1672 samples
final saved shape: (1672, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2022-06/features.pkl
Processing 2022-07
Counter({0.0: 1388, 5.0: 102, 6.0: 29, 7.0: 13, 8.0: 3, 17.0: 2, 11.0: 1})
Counter({0: 1388, 1: 150})
Total 1538 samples in read txt
Total 0 difference files
set()


100%|██████████| 1538/1538 [00:04<00:00, 347.08it/s]


fail list: []
fail list length: 0, final length: 1538
Total 1538 samples
final saved shape: (1538, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2022-07/features.pkl
Processing 2022-08
Counter({0.0: 1379, 5.0: 75, 6.0: 33, 7.0: 13, 11.0: 8, 9.0: 7, 8.0: 5, 10.0: 3, 12.0: 3, 13.0: 2})
Counter({0: 1379, 1: 149})
Total 1528 samples in read txt
Total 0 difference files
set()


100%|██████████| 1528/1528 [00:01<00:00, 1250.33it/s]


fail list: []
fail list length: 0, final length: 1528
Total 1528 samples
final saved shape: (1528, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2022-08/features.pkl
Processing 2022-09
Counter({0.0: 1386, 5.0: 45, 6.0: 21, 8.0: 21, 9.0: 20, 7.0: 16, 10.0: 12, 11.0: 6, 14.0: 3, 13.0: 3, 12.0: 2, 15.0: 1})
Counter({0: 1386, 1: 150})
Total 1536 samples in read txt
Total 0 difference files
set()


100%|██████████| 1536/1536 [00:01<00:00, 996.76it/s] 


fail list: []
fail list length: 0, final length: 1536
Total 1536 samples
final saved shape: (1536, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2022-09/features.pkl
Processing 2022-10
Counter({0.0: 1425, 9.0: 31, 5.0: 30, 7.0: 24, 10.0: 21, 8.0: 19, 6.0: 14, 11.0: 11, 17.0: 1, 12.0: 1, 15.0: 1, 14.0: 1})
Counter({0: 1425, 1: 154})
Total 1579 samples in read txt
Total 16 difference files
{'B2657746FC5B0E0D471430100AE6A4F300CEC8DF0B239CCE0920127E22A7A6F5', 'BD4FC45E14C74B679D345BB9FD0DACC7771A0A2BA0785F7FE1433E8DE436E8C8', '2AE8EA1BAAEC8C5CA09118F10458BDA7790A7D591A0E81E32F7B11AFE522B1BE', '4F88143F4EBB9BFF731429B30D716E0E0D145C2C6F7C5429EF17335FA2BB35C9', '1E81C75C212F250695D2351F9B3F4CA06100E94164ECC0215DA8F7EB1399D3CA', '4AD0EED69C63E53AA1103ECED17DA730542D719D80AF2381C6BBE7E860D9E874', 'AF218A28C61AA8B4DA81594EA864EB6E7258C6C2700428FFD11EF127662376C6', '1983368C0EFE4C312A4322A2DEBF8E117A6D0FA87A50FBDA431A265A1654D153', '072F7EAF18A798DD

100%|██████████| 1563/1563 [00:05<00:00, 306.14it/s]


fail list: []
fail list length: 0, final length: 1563
Total 1563 samples
final saved shape: (1563, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2022-10/features.pkl
Processing 2022-11
Counter({0.0: 1388, 5.0: 51, 6.0: 27, 8.0: 11, 14.0: 9, 7.0: 9, 15.0: 9, 11.0: 8, 9.0: 6, 13.0: 6, 10.0: 5, 12.0: 4, 16.0: 3, 19.0: 1, 18.0: 1})
Counter({0: 1388, 1: 150})
Total 1538 samples in read txt
Total 0 difference files
set()


100%|██████████| 1538/1538 [00:02<00:00, 516.17it/s]


fail list: []
fail list length: 0, final length: 1538
Total 1538 samples
final saved shape: (1538, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2022-11/features.pkl
Processing 2022-12
Counter({0.0: 1083, 5.0: 34, 11.0: 12, 6.0: 10, 12.0: 9, 7.0: 9, 10.0: 8, 8.0: 7, 15.0: 7, 14.0: 5, 13.0: 5, 9.0: 4, 18.0: 2, 20.0: 2, 21.0: 2, 19.0: 1})
Counter({0: 1083, 1: 117})
Total 1200 samples in read txt
Total 0 difference files
set()


100%|██████████| 1200/1200 [00:01<00:00, 1031.74it/s]


fail list: []
fail list length: 0, final length: 1200
Total 1200 samples
final saved shape: (1200, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2022-12/features.pkl
Processing 2023-01
Counter({0.0: 611, 5.0: 38, 6.0: 16, 7.0: 4, 8.0: 3, 11.0: 2, 9.0: 2, 10.0: 1})
Counter({0: 611, 1: 66})
Total 677 samples in read txt
Total 0 difference files
set()


100%|██████████| 677/677 [00:01<00:00, 503.78it/s]


fail list: []
fail list length: 0, final length: 677
Total 677 samples
final saved shape: (677, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2023-01/features.pkl
Processing 2023-02
Counter({0.0: 898, 5.0: 65, 6.0: 28, 10.0: 2, 8.0: 1, 7.0: 1})
Counter({0: 898, 1: 97})
Total 995 samples in read txt
Total 0 difference files
set()


100%|██████████| 995/995 [00:01<00:00, 782.34it/s] 


fail list: []
fail list length: 0, final length: 995
Total 995 samples
final saved shape: (995, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2023-02/features.pkl
Processing 2023-03
Counter({0.0: 1000, 5.0: 59, 6.0: 31, 7.0: 11, 8.0: 3, 14.0: 2, 10.0: 1, 11.0: 1})
Counter({0: 1000, 1: 108})
Total 1108 samples in read txt
Total 0 difference files
set()


100%|██████████| 1108/1108 [00:02<00:00, 384.00it/s]


fail list: []
fail list length: 0, final length: 1108
Total 1108 samples
final saved shape: (1108, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2023-03/features.pkl
Processing 2023-04
Counter({0.0: 944, 5.0: 42, 6.0: 26, 7.0: 10, 8.0: 10, 11.0: 5, 10.0: 3, 12.0: 2, 9.0: 2, 13.0: 1, 14.0: 1})
Counter({0: 944, 1: 102})
Total 1046 samples in read txt
Total 0 difference files
set()


 32%|███▏      | 330/1046 [00:00<00:00, 855.95it/s]

Extra data: line 1 column 72 (char 71)
Extra data: line 1 column 72 (char 71)


 71%|███████   | 742/1046 [00:00<00:00, 995.49it/s]

Extra data: line 1 column 72 (char 71)


 88%|████████▊ | 921/1046 [00:01<00:00, 394.28it/s]

Extra data: line 1 column 72 (char 71)
Extra data: line 1 column 72 (char 71)


100%|██████████| 1046/1046 [00:02<00:00, 490.37it/s]


fail list: ['4D052FD84FF96D5325DD2AAEE6C03DAEA0D2009531E07284B8E11BFB56D78A36', 'B1FB0B082189A388D80DA64BF1846A3CDEA7325B5F9979F17411E432E223302A', '6C91B5B97D0BE4476B5D930324FB5B4CEF64D76B9B829A0E7054CFC94B4E6569', 'ADB918706343143C3041BCFD76301F552F6B4E9F485F414E994F022BAC2A6804', '12F56A5617BC8C8639366D92A51445C1B6A7F47C1B71BD72DDF93D6A22917C5E']
fail list length: 5, final length: 1041
Total 1041 samples
final saved shape: (1041, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2023-04/features.pkl
Processing 2023-05
Counter({0.0: 527, 5.0: 28, 6.0: 18, 7.0: 5, 8.0: 3, 11.0: 2, 10.0: 1})
Counter({0: 527, 1: 57})
Total 584 samples in read txt
Total 0 difference files
set()


100%|██████████| 584/584 [00:01<00:00, 526.07it/s]


fail list: ['4D052FD84FF96D5325DD2AAEE6C03DAEA0D2009531E07284B8E11BFB56D78A36', 'B1FB0B082189A388D80DA64BF1846A3CDEA7325B5F9979F17411E432E223302A', '6C91B5B97D0BE4476B5D930324FB5B4CEF64D76B9B829A0E7054CFC94B4E6569', 'ADB918706343143C3041BCFD76301F552F6B4E9F485F414E994F022BAC2A6804', '12F56A5617BC8C8639366D92A51445C1B6A7F47C1B71BD72DDF93D6A22917C5E']
fail list length: 5, final length: 584
Total 584 samples
final saved shape: (584, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2023-05/features.pkl
Processing 2023-06
Counter({0.0: 833, 6.0: 21, 7.0: 19, 8.0: 18, 5.0: 15, 9.0: 7, 10.0: 6, 13.0: 1, 15.0: 1, 12.0: 1, 11.0: 1})
Counter({0: 833, 1: 90})
Total 923 samples in read txt
Total 0 difference files
set()


100%|██████████| 923/923 [00:03<00:00, 252.87it/s]


fail list: ['4D052FD84FF96D5325DD2AAEE6C03DAEA0D2009531E07284B8E11BFB56D78A36', 'B1FB0B082189A388D80DA64BF1846A3CDEA7325B5F9979F17411E432E223302A', '6C91B5B97D0BE4476B5D930324FB5B4CEF64D76B9B829A0E7054CFC94B4E6569', 'ADB918706343143C3041BCFD76301F552F6B4E9F485F414E994F022BAC2A6804', '12F56A5617BC8C8639366D92A51445C1B6A7F47C1B71BD72DDF93D6A22917C5E']
fail list length: 5, final length: 923
Total 923 samples
final saved shape: (923, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2023-06/features.pkl
Processing 2023-07
Counter({0.0: 879, 5.0: 32, 6.0: 23, 7.0: 17, 8.0: 8, 9.0: 5, 10.0: 4, 12.0: 3, 11.0: 2, 13.0: 1})
Counter({0: 879, 1: 95})
Total 974 samples in read txt
Total 98 difference files
{'C2B340D2F2A67F6EFDBB7EFC5A25E6A0DB9AD244516ADF54FDE4650FDFC5B63B', 'C1EDDA9A6E7AB1AC44C58DB9BD2FDB99572510AAF064C9C885DACAF5E23688AA', 'C64BDFF2838B3B37D20E87E5D7C9BD0BDA677C01DADB057F013FFEE08CA91BAC', 'F2EA7BD5C363FFE43CBE1C4B4C457AB0CE2F887CE49FE06

100%|██████████| 876/876 [00:02<00:00, 408.44it/s] 


fail list: ['4D052FD84FF96D5325DD2AAEE6C03DAEA0D2009531E07284B8E11BFB56D78A36', 'B1FB0B082189A388D80DA64BF1846A3CDEA7325B5F9979F17411E432E223302A', '6C91B5B97D0BE4476B5D930324FB5B4CEF64D76B9B829A0E7054CFC94B4E6569', 'ADB918706343143C3041BCFD76301F552F6B4E9F485F414E994F022BAC2A6804', '12F56A5617BC8C8639366D92A51445C1B6A7F47C1B71BD72DDF93D6A22917C5E']
fail list length: 5, final length: 876
Total 876 samples
final saved shape: (876, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2023-07/features.pkl
Processing 2023-08
Counter({0.0: 870, 5.0: 39, 6.0: 30, 7.0: 8, 8.0: 7, 9.0: 4, 10.0: 2, 14.0: 2, 13.0: 1, 12.0: 1})
Counter({0: 870, 1: 94})
Total 964 samples in read txt
Total 0 difference files
set()


100%|██████████| 964/964 [00:01<00:00, 528.89it/s]


fail list: ['4D052FD84FF96D5325DD2AAEE6C03DAEA0D2009531E07284B8E11BFB56D78A36', 'B1FB0B082189A388D80DA64BF1846A3CDEA7325B5F9979F17411E432E223302A', '6C91B5B97D0BE4476B5D930324FB5B4CEF64D76B9B829A0E7054CFC94B4E6569', 'ADB918706343143C3041BCFD76301F552F6B4E9F485F414E994F022BAC2A6804', '12F56A5617BC8C8639366D92A51445C1B6A7F47C1B71BD72DDF93D6A22917C5E']
fail list length: 5, final length: 964
Total 964 samples
final saved shape: (964, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2023-08/features.pkl
Processing 2023-09
Counter({0.0: 879, 5.0: 38, 6.0: 27, 7.0: 15, 8.0: 6, 10.0: 4, 9.0: 3, 13.0: 1, 16.0: 1})
Counter({0: 879, 1: 95})
Total 974 samples in read txt
Total 221 difference files
{'A0B64E61DE5DA2E8B5BF295013405171C9D7D9D239198883B9C2A0A97A0FBE1E', 'B5D8F8C891E6F6A9C1DC60CCEDCA948CAD36056684E22D1E1CE5A584AE34273B', '1875EAE24F5479A27E3706278E6017DF898CB0AC4D52765A651C0877528DE2CD', '3D13371DD0662BD5AB016DC8EC2EE424F3DAC582FB8BCF0B30AE785

100%|██████████| 753/753 [00:01<00:00, 453.26it/s]


fail list: ['4D052FD84FF96D5325DD2AAEE6C03DAEA0D2009531E07284B8E11BFB56D78A36', 'B1FB0B082189A388D80DA64BF1846A3CDEA7325B5F9979F17411E432E223302A', '6C91B5B97D0BE4476B5D930324FB5B4CEF64D76B9B829A0E7054CFC94B4E6569', 'ADB918706343143C3041BCFD76301F552F6B4E9F485F414E994F022BAC2A6804', '12F56A5617BC8C8639366D92A51445C1B6A7F47C1B71BD72DDF93D6A22917C5E']
fail list length: 5, final length: 753
Total 753 samples
final saved shape: (753, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2023-09/features.pkl
Processing 2023-10
Counter({0.0: 1194, 5.0: 58, 6.0: 35, 7.0: 14, 9.0: 8, 10.0: 6, 8.0: 6, 22.0: 1, 11.0: 1})
Counter({0: 1194, 1: 129})
Total 1323 samples in read txt
Total 588 difference files
{'0C283DADE8A271F518515BDFBA0516D1A44ADF9F8A41513AD1F0B93BEBC920C7', 'E45DD7F2A0934EF7BFA71F56B869B8C316F86697F1469E3857CA963AFE5DC770', '2DFEF5CBACA98C5BECD7A7D3A3BF50DD4096E886CFD758CB311E8632F9FF3949', '6963D70E5F76591AF048F991FB572273D32E73FE5A19668262E

100%|██████████| 735/735 [00:00<00:00, 966.16it/s] 


fail list: ['4D052FD84FF96D5325DD2AAEE6C03DAEA0D2009531E07284B8E11BFB56D78A36', 'B1FB0B082189A388D80DA64BF1846A3CDEA7325B5F9979F17411E432E223302A', '6C91B5B97D0BE4476B5D930324FB5B4CEF64D76B9B829A0E7054CFC94B4E6569', 'ADB918706343143C3041BCFD76301F552F6B4E9F485F414E994F022BAC2A6804', '12F56A5617BC8C8639366D92A51445C1B6A7F47C1B71BD72DDF93D6A22917C5E']
fail list length: 5, final length: 735
Total 735 samples
final saved shape: (735, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2023-10/features.pkl
Processing 2023-11
Counter({0.0: 1333, 5.0: 75, 6.0: 38, 7.0: 12, 8.0: 7, 9.0: 5, 10.0: 2, 15.0: 1, 27.0: 1, 11.0: 1, 12.0: 1, 28.0: 1})
Counter({0: 1333, 1: 144})
Total 1477 samples in read txt
Total 910 difference files
{'847BCF5F2884D45DA30221DC76F5D49EA2D32B29FADDFE6BAD40B52BB162F4BF', 'D3941651DFB9CD0362291D40F33C08A67B3AA8729E60B6EBA46A87CA5C3AA7F9', '2F7C7308D704BBC9913B65D1FF924906CBBCA72A42284BB113AFBD1284C7BCE8', '39F8E5D7ED58FBEC352F7F66

 75%|███████▍  | 423/567 [00:00<00:00, 1336.23it/s]

Expecting value: line 1 column 1 (char 0)


100%|██████████| 567/567 [00:00<00:00, 889.23it/s] 


fail list: ['4D052FD84FF96D5325DD2AAEE6C03DAEA0D2009531E07284B8E11BFB56D78A36', 'B1FB0B082189A388D80DA64BF1846A3CDEA7325B5F9979F17411E432E223302A', '6C91B5B97D0BE4476B5D930324FB5B4CEF64D76B9B829A0E7054CFC94B4E6569', 'ADB918706343143C3041BCFD76301F552F6B4E9F485F414E994F022BAC2A6804', '12F56A5617BC8C8639366D92A51445C1B6A7F47C1B71BD72DDF93D6A22917C5E', 'E5E5571B77C6F758F81946F556C19383728CCB840C372AC09294EF9A82DFD494']
fail list length: 6, final length: 566
Total 566 samples
final saved shape: (566, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2023-11/features.pkl
Processing 2023-12
Counter({0.0: 833, 5.0: 59, 6.0: 18, 7.0: 5, 8.0: 3, 27.0: 1, 10.0: 1, 11.0: 1, 9.0: 1, 12.0: 1})
Counter({0: 833, 1: 90})
Total 923 samples in read txt
Total 380 difference files
{'1A2FBFBD825B516E9BD498B217564F2FC05561D77B792F0D1DA73AA40353F3FB', '8E917CD230DC32B0B4A25F3574310DD93479C6155305D8A6299FF5CA733548A9', 'DE510F2233D3CBE76A0E77E6A024A0F4EFD9291AF542563

100%|██████████| 543/543 [00:00<00:00, 881.01it/s]


fail list: ['4D052FD84FF96D5325DD2AAEE6C03DAEA0D2009531E07284B8E11BFB56D78A36', 'B1FB0B082189A388D80DA64BF1846A3CDEA7325B5F9979F17411E432E223302A', '6C91B5B97D0BE4476B5D930324FB5B4CEF64D76B9B829A0E7054CFC94B4E6569', 'ADB918706343143C3041BCFD76301F552F6B4E9F485F414E994F022BAC2A6804', '12F56A5617BC8C8639366D92A51445C1B6A7F47C1B71BD72DDF93D6A22917C5E', 'E5E5571B77C6F758F81946F556C19383728CCB840C372AC09294EF9A82DFD494']
fail list length: 6, final length: 543
Total 543 samples
final saved shape: (543, 6)
Save to /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/features/drebin_22_23/2023-12/features.pkl


In [28]:
# Remove the raw_smali entry of every sample in `difference`
# (`difference` is whatever set the kernel currently holds from the aggregation loop).
for sha256 in difference:
    delete_json_path = f"/scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/raw_smali/{sha256}"
    print(f"Delete {delete_json_path}")
    if not os.path.exists(delete_json_path):
        print(f"{delete_json_path} not exists")
        continue
    os.remove(delete_json_path)

Delete /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/raw_smali/2AE8EA1BAAEC8C5CA09118F10458BDA7790A7D591A0E81E32F7B11AFE522B1BE
/scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/raw_smali/2AE8EA1BAAEC8C5CA09118F10458BDA7790A7D591A0E81E32F7B11AFE522B1BE not exists
Delete /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/raw_smali/1950B1D8A95D5F9C148983ADABD398ECC506053E1AC66C03B3F5500E5847B177
/scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/raw_smali/1950B1D8A95D5F9C148983ADABD398ECC506053E1AC66C03B3F5500E5847B177 not exists
Delete /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/raw_smali/1983368C0EFE4C312A4322A2DEBF8E117A6D0FA87A50FBDA431A265A1654D153
/scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/raw_smali/1983368C0EFE4C312A4322A2DEBF8E117A6D0FA87A50FBDA431A265A1654D153 not exists
Delete /scratch1/NOT_BACKED_UP/cavallarogrp/datasets/feature_space/raw_smali/9501C24E6BDC96315086469B8FD0D794E2C0E3A37046B713D77AA70F89F

In [26]:
# Delete the feature file of every sample that failed to load.
# `fail_list` accumulates failures from ALL months, whereas `name` only holds the last month
# processed by the aggregation loop, so relying on `name` would leave the broken files of
# earlier months in place. Every month folder is checked instead; delete_feature() is a
# no-op for folders that do not contain the file.
# NOTE(review): if the same sha256 has a feature file in several month folders, all copies are removed.
for sha256 in fail_list:
    for year in years:
        for month in months:
            delete_feature(sha256, f"{year}-{month}")
    print(f"Delete {sha256}")

Delete 7A77335464456B38847D6EF835E986EC88B83153EAFB5464DEE5335CFA0DF379


In [7]:
from sklearn.feature_extraction import DictVectorizer

# Vectorise the feature records currently held in `X` and print the resulting matrix shape.
# NOTE(review): `X` is rebound to the transformed result, so this cell is not idempotent —
# re-running it feeds the already-transformed matrix back into fit_transform. It also depends
# on which month's `X` the kernel currently holds; re-run the aggregation cell first.
vec =  DictVectorizer()
X = vec.fit_transform(X)
print(X.shape)

(1528, 114847)
