In [2]:
import os
import pandas as pd
import numpy as np
import pickle

# Root folder of the dataset; it holds one "<year>-<month>" sub-folder per month.
dataset_folder = "/Users/zhengxinran/Documents/S2LAB/dataset/tif/combine_drebin"
# Years 2015..2023 and months 01..12, kept as strings because they are
# spliced directly into the sub-folder names.
years = [str(year_number) for year_number in range(2015, 2024)]
months = [f'{month_number:02d}' for month_number in range(1, 13)]

# Family names grouped by type. Each list name doubles as the type label that
# allocate_type() returns for the families it contains.
adware_push = [
    'airpush', 'revmob', 'adwo', 'domob', 'youmi',
    'kuguo', 'zdtad', 'minimob', 'gumen', 'mobwin',
    'wkload', 'callflakes', 'ganlet', 'pandaad', 'dianjin',
    'mobidash', 'adflex', 'mecor', 'gexin', 'jiead',
    'odpa', 'dianle', 'mobeleader',
]

adware_silent = [
    'dowgin', 'leadbolt', 'gappusin', 'plankton', 'shixot',
    'tekwon', 'skplanet', 'gizmo', 'qumi', 'autoins',
]

riskware_priv = [
    'feiad', 'kyview', 'viser', 'admogo', 'feiwo',
    'appoffer', 'skymobi', 'joye', 'fengvi', 'appinventor',
    'fobus', 'invent', 'cnzz',
]

sms_trojan = ['smsreg', 'smssend', 'hipposms', 'smsagent', 'smspay']

fake_installer = [
    'fakeinst', 'shedun', 'spyagent',
    'fakeangry', 'fakeflash', 'torjok',
]

dropper = [
    'nandrobox', 'systemmonitor', 'toreoc', 'appsgeyser', 'commplat',
    'jiagu', 'ewind', 'leapp', 'anydown', 'ginmaster',
]

unknown = [
    'unknown', 'oveead', 'systush', 'gomunc',
    'reflod', 'buzztouch', 'umpay', 'kirko',
    'bauts', 'tachi', 'dnotua', 'frupi',
]

# Every family name covered by one of the groups above, in group order.
all_types = [
    *adware_push,
    *adware_silent,
    *riskware_priv,
    *sms_trojan,
    *fake_installer,
    *dropper,
    *unknown,
]

def _load_month_features(period):
    """Unpickle ``features.pkl`` from the ``<dataset_folder>/<period>`` folder.

    Parameters
    ----------
    period : str
        Sub-folder name in ``"<year>-<month>"`` form, e.g. ``"2014-01"``.

    Returns
    -------
    The object stored in the pickle (concatenated below with ``pd.concat``).

    Raises
    ------
    OSError
        If the month's ``features.pkl`` cannot be opened.

    NOTE(review): pickle.load can execute code embedded in the file, so
    dataset_folder must only point at data that is trusted.
    """
    file_path = os.path.join(dataset_folder, period, 'features.pkl')
    with open(file_path, 'rb') as f:
        return pickle.load(f)


# Training pool: the twelve months of 2014.
dfs_train = [_load_month_features(f'2014-{month}') for month in months]

# Test pool: every month of every entry in `years`, year-major order.
dfs_test = [
    _load_month_features(f'{year}-{month}')
    for year in years
    for month in months
]

df_train = pd.concat(dfs_train)
print(df_train.shape)

df_test = pd.concat(dfs_test)
print(df_test.shape)

print(df_train.columns)
print(df_test.columns)
        

# Families that occur in the 2014 frame but appear in none of the type lists.
train_family = df_train['family'].tolist()
diff_family = set(train_family).difference(all_types)
print(diff_family)


# allocate type to each row

(51953, 6)
(292701, 6)
Index(['sha256', 'dex_date', 'label', 'family', 'vt_detection',
       'json_features'],
      dtype='object')
Index(['sha256', 'dex_date', 'label', 'family', 'vt_detection',
       'json_features'],
      dtype='object')
{'benign'}


In [3]:
def allocate_type(x):
    """Map a family name to the label of the type group that contains it.

    The groups are checked in a fixed order (adware_push, adware_silent,
    riskware_priv, sms_trojan, fake_installer, dropper, unknown) and the
    first one containing `x` wins. The literal family 'benign' maps to
    'benign'. Any other value prints a "not found" message and yields None.
    """
    type_groups = (
        ('adware_push', adware_push),
        ('adware_silent', adware_silent),
        ('riskware_priv', riskware_priv),
        ('sms_trojan', sms_trojan),
        ('fake_installer', fake_installer),
        ('dropper', dropper),
        ('unknown', unknown),
    )
    for type_label, families in type_groups:
        if x in families:
            return type_label

    if x == 'benign':
        return 'benign'

    print(f"family {x} not found")
    return None

# Label every training row with its type, then show how many rows each type has.
df_train['type'] = df_train['family'].apply(allocate_type)
type_counts = df_train['type'].value_counts()
print(type_counts)
# Grouped view of the training rows, keyed by type.
group_type = df_train.groupby('type')

type
benign            46536
adware_silent      2473
adware_push        2030
unknown             303
riskware_priv       241
sms_trojan          238
dropper              76
fake_installer       56
Name: count, dtype: int64


In [4]:
# One training subset per type label. The variable suffix abbreviates the label:
# as = adware_silent, ap = adware_push, rp = riskware_priv, st = sms_trojan,
# fi = fake_installer, dp = dropper, unk = unknown, ben = benign.
df_as, df_ap, df_rp, df_st, df_fi, df_dp, df_unk, df_ben = (
    df_train[df_train['type'] == type_label]
    for type_label in (
        'adware_silent',
        'adware_push',
        'riskware_priv',
        'sms_trojan',
        'fake_installer',
        'dropper',
        'unknown',
        'benign',
    )
)

# Print each subset's shape, in the same order as the labels above.
for type_subset in (df_as, df_ap, df_rp, df_st, df_fi, df_dp, df_unk, df_ben):
    print(type_subset.shape)

(2473, 7)
(2030, 7)
(241, 7)
(238, 7)
(56, 7)
(76, 7)
(303, 7)
(46536, 7)


In [5]:
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

def drebin_svm_train(X_train, y_train):
    """Fit a LinearSVC (C=1.0, max_iter=10000) on the given data and return it.

    Parameters
    ----------
    X_train : feature matrix passed unchanged to ``LinearSVC.fit``.
    y_train : target labels passed unchanged to ``LinearSVC.fit``.

    Returns
    -------
    The fitted LinearSVC instance.
    """
    print("train Linear SVM")
    svm = LinearSVC(max_iter=10000, C=1.0)
    svm.fit(X_train, y_train)
    return svm

def drebin_svm_pred(clf, X_test, y_test):
    """Predict with `clf` on `X_test` and report macro-averaged scores.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict``.
    X_test : feature matrix to predict on.
    y_test : true labels to score the predictions against.

    Returns
    -------
    tuple
        ``(precision, recall, f1)``, each computed with ``average='macro'``.
        The same three values are also printed.
    """
    print("eval Linear SVM on overall test data")
    predicted = clf.predict(X_test)

    # Same call shape for all three metrics; evaluated in this order.
    precision, recall, f1 = (
        metric(y_test, predicted, average='macro')
        for metric in (precision_score, recall_score, f1_score)
    )

    print(f"precision: {precision}, recall: {recall}, f1: {f1}")

    return precision, recall, f1

In [27]:
# adware_silent feature selection
# set random seed
from sklearn.svm import LinearSVC
from scipy.sparse import csr_matrix, vstack, save_npz, load_npz, issparse
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import VarianceThreshold, SelectFromModel, SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Seed NumPy's global random state so draws from it repeat on a fresh run.
# NOTE(review): the estimators and train_test_split calls in this notebook pass
# their own random_state=42, so they do not depend on this global seed.
np.random.seed(42)

def ensure_sparse(X):
    """Return `X` as a ``scipy.sparse.csr_matrix``.

    An input that is already a ``csr_matrix`` is returned as the same object.
    Anything else (dense data or another sparse format) is converted by
    passing it to the ``csr_matrix`` constructor.
    """
    return X if isinstance(X, csr_matrix) else csr_matrix(X)

def select_features(data, seg = 'json_features'):
    """Vectorize a column of feature dicts and keep the model-selected columns.

    The dicts in ``data[seg]`` are turned into a sparse matrix with
    ``DictVectorizer(sparse=True, sort=True)``. A ``SelectFromModel`` wrapped
    around a 100-tree ``RandomForestClassifier`` (random_state=42) is fitted
    against ``data['label']``, with the number of kept columns capped at
    ``min(10000, n_columns)``. Progress information is printed along the way.

    Parameters
    ----------
    data : object indexable by column name with a ``.values`` attribute per
        column (presumably a pandas DataFrame - confirm against callers).
    seg : str
        Name of the column holding the per-row feature dicts.

    Returns
    -------
    tuple
        ``(X_train_selected, y_train, selected_feature_names)``: the CSR matrix
        restricted to the kept columns, the label array, and the names of the
        kept columns in matrix column order.
    """
    feature_dicts = data[seg].values
    y_train = data['label'].values

    # vectorize
    vec = DictVectorizer(sparse=True, sort=True)
    X_all = vec.fit_transform(feature_dicts)
    print(f"before feature selection: {X_all.shape}")
    all_feature_names = vec.get_feature_names_out()
    print(f"all_feature_names: {len(all_feature_names)}, {X_all.shape}")

    # Never ask for more columns than exist.
    max_features = min(10000, X_all.shape[1])

    # feature selection (an L1 LinearSVC selector was tried here as an alternative)
    forest = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
    selector = SelectFromModel(forest, max_features=max_features)
    selector.fit(X_all, y_train)

    feature_mask = selector.get_support()
    print(feature_mask)

    selected_feature_names = all_feature_names[feature_mask]
    print(selected_feature_names[:10])

    # Positions of the kept columns, ascending, so that the columns of the
    # result line up one-to-one with selected_feature_names.
    selected_indices = np.flatnonzero(feature_mask)
    X_train_selected = ensure_sparse(X_all[:, selected_indices])

    print(X_train_selected.shape)
    return X_train_selected, y_train, selected_feature_names


# Read the stored feature names: one name per line, surrounding whitespace removed.
path = "/Users/zhengxinran/Documents/S2LAB/dataset/tif/processed_features/selected_features_linearsvc.txt"
with open(path, 'r') as f:
    feature_names = [line.strip() for line in f]

def filter_features(feature_dict, feature_name_list):
    """Return the entries of `feature_dict` whose key is in `feature_name_list`.

    Parameters
    ----------
    feature_dict : dict
        Mapping of feature name to value.
    feature_name_list : container
        Names to keep. A list or tuple is converted to a frozenset once, so
        each key lookup is a hash probe instead of a linear scan. Any other
        container (set, dict, ...) is used for membership tests as given.

    Returns
    -------
    dict
        A new dict with the kept items, in `feature_dict`'s iteration order.
    """
    allowed = feature_name_list
    if isinstance(feature_name_list, (list, tuple)):
        try:
            allowed = frozenset(feature_name_list)
        except TypeError:
            # Some entry is unhashable: fall back to scanning the sequence,
            # which is what the membership test did originally.
            allowed = feature_name_list
    return {k: v for k, v in feature_dict.items() if k in allowed}
    
# set benign_num: nine benign rows for every adware_silent row.
benign_num = df_as.shape[0] * 9
# A fixed random_state makes this cell draw the same benign rows every time it
# is run, instead of depending on how far the global NumPy RNG has advanced.
df_ben_selected = df_ben.sample(benign_num, random_state=42)

data = pd.concat([df_ben_selected, df_as])
# For each row, keep only the features named in feature_names. A set makes
# each per-key membership test a hash lookup.
feature_name_set = set(feature_names)

def filter_features_fast(feature_dict):
    """Return the items of `feature_dict` whose key is in `feature_name_set`."""
    return {k: v for k, v in feature_dict.items() if k in feature_name_set}

data["selected_features"] = data["json_features"].apply(filter_features_fast)


In [None]:
# Run the same selection twice: once on the full feature dicts and once on the
# dicts already restricted to the names loaded from file.
X_type_selected, y_type_selected, selected_feature_names_type = select_features(
    data, seg='json_features'
)
X_def_selected, y_def_selected, selected_feature_names_def = select_features(
    data, seg='selected_features'
)

# Number of features kept by each run.
for kept_names in (selected_feature_names_type, selected_feature_names_def):
    print(len(kept_names))

before feature selection: (24730, 7660)
all_feature_names: 7660, (24730, 7660)
[False False False ... False False False]
['activities::CreateShortcuts' 'activities::Dashboard'
 'activities::IntroFlow' 'activities::MAME4droid Define Keys'
 'activities::MAME4droid Help' 'activities::MAME4droid Player Keys'
 'activities::MAME4droid Preferences' 'activities::Main'
 'activities::NativeBrowser' 'activities::Select Key']
(24730, 559)
559


In [29]:
# Names kept when selecting from the full dicts but not from the filtered ones.
diff_feature_names = set(selected_feature_names_type).difference(selected_feature_names_def)
print(len(diff_feature_names))
# print(diff_feature_names)   

9449


In [32]:
# train test split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows with a fixed seed.
# NOTE(review): select_features() was fitted on all rows of `data` before this
# split, so the held-out rows also influenced which features were kept.
X_train, X_test, y_train_selected, y_test_selected = train_test_split(
    X_type_selected,
    y_type_selected,
    test_size=0.2,
    random_state=42,
)

# Fit the linear SVM on the training part, then score it on the held-out part.
clf = drebin_svm_train(X_train, y_train_selected)
precision, recall, f1 = drebin_svm_pred(clf, X_test, y_test_selected)

train Linear SVM
eval Linear SVM on overall test data
precision: 0.9910877344406948, recall: 0.9839702870535665, f1: 0.987495771094882


In [33]:
# train test split
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows with a fixed seed, this time using the matrix built
# from the pre-filtered feature dicts.
# NOTE(review): select_features() was fitted on all rows of `data` before this
# split, so the held-out rows also influenced which features were kept.
X_train, X_test, y_train_selected, y_test_selected = train_test_split(
    X_def_selected,
    y_def_selected,
    test_size=0.2,
    random_state=42,
)

# Fit the linear SVM on the training part, then score it on the held-out part.
clf = drebin_svm_train(X_train, y_train_selected)
precision, recall, f1 = drebin_svm_pred(clf, X_test, y_test_selected)

train Linear SVM
eval Linear SVM on overall test data
precision: 0.9900713399288681, recall: 0.983858028409651, f1: 0.9869393125452428


In [3]:
# Load the stored feature names, one per line of the file.
feature_path = "/scratch_NOT_BACKED_UP/NOT_BACKED_UP/xinran/dataset/processed_features/selected_features_randomforest.txt"
with open(feature_path, 'r') as f:
    raw_text = f.read()
features = raw_text.splitlines()

# Total count, then the first 1000 names as the cell's displayed value.
print(len(features))
features[:1000]


5814


['activities:::pushservice',
 'activities::About',
 'activities::AboutActivity',
 'activities::AboutScreen',
 'activities::AboutUs',
 'activities::Activity2',
 'activities::ActivityPlay',
 'activities::Ad Activity',
 'activities::Ad Video Player',
 'activities::Add Contact',
 'activities::Add Your Name',
 'activities::Android',
 'activities::AppsgeyserApp message',
 'activities::Bankside Village Apts',
 'activities::CallLog',
 'activities::ChangeAppActivity',
 'activities::ChapterActivity',
 'activities::Contact',
 'activities::CreateShortcuts',
 'activities::Dashboard',
 'activities::DetailActivity',
 'activities::Dreammaker',
 'activities::ESDFL',
 'activities::Email',
 'activities::FamousActivity',
 'activities::FavActivity',
 'activities::Favourites',
 'activities::FileChooser',
 'activities::FireStar',
 'activities::Floppy Pig',
 'activities::ForceRate',
 'activities::GameActivity',
 'activities::GameFeat',
 'activities::Help',
 'activities::HelpActivity',
 'activities::HomeActivi