<div style="color:blue; text-align: center; font-weight: bold; font-family: 'Courier New', monospace; font-size: 40px; ">
 LUDA ANALYSIS NOTEBOOK</div>

In [1]:
import os
import pickle
import pandas as pd
from pprint import pprint

In [2]:
# Raw input dataset (CSV), loaded with pd.read_csv in the "Data" section.
DATA = '../data/data_demo.csv'
# Preprocessed dataset (CSV), loaded in the "Preprocessed" and "Cluster analysis" sections.
PREPROCESSED_DATA = '../data/data_demo_preprocessed.csv'
# Folder handed to load_result(), which reads index_to_keep.pkl and labels.pkl from it.
MATRIX_OUTPUT = '../luda_output/mymatrix/'

In [3]:
def value_counts(df_col, limit=None):
    """Return absolute and relative frequencies of the values in a Series.

    Parameters
    ----------
    df_col : pandas.Series
        Column whose distinct values are counted.
    limit : int or None, optional
        Keep only the ``limit`` most frequent values. ``None`` (default)
        keeps every value.

    Returns
    -------
    pandas.DataFrame
        Indexed by distinct value, ordered by descending frequency, with
        two columns: ``count`` (occurrences) and ``normalized`` (share of
        all counted values).
    """
    counts = df_col.value_counts()
    # Same quantity as value_counts(normalize=True), derived from a single
    # pass so both columns are guaranteed to share index and ordering.
    shares = counts / counts.sum()
    # .iloc keeps the truncation positional: plain `[:limit]` is label-based
    # on a float index and would not select the top `limit` rows there.
    top_counts = counts.iloc[:limit].rename('count')
    top_shares = shares.iloc[:limit].rename('normalized')
    return pd.concat([top_counts, top_shares], axis=1)

# Explore your data

## Data

In [4]:
# Load the raw dataset from DATA and display it as the cell output.
df = pd.read_csv(DATA)
df

Unnamed: 0,url,label
0,http://173.243.112.132/serve/config.bin,malicious
1,http://194.15.112.29/2ja/panel/config.bin,malicious
2,http://216.170.125.134/neat/serverphp/config.bin,malicious
3,http://58.22.101.109/xz/cfg.bin,malicious
4,http://83.149.95.197/1/cfg.bin,malicious
...,...,...
10195,http://fhs.mcmaster.ca/main/benefactors/braley...,benign
10196,http://youtube.com/watch?v=_WQSaqs-fOs,benign
10197,http://randomdomain34623.com/B5iioj3SFI5gE_JbH...,benign
10198,http://randomdomain42219.com/-xwiPbFONIb8/AAAA...,benign


In [5]:
# Count and share of each value in the 'label' column of the raw data.
value_counts(df['label'])

Unnamed: 0,count,normalized
benign,10000,0.980392
malicious,200,0.019608


In [6]:
# Report how many distinct URLs each label has (duplicated URLs are counted once).
for label in ['malicious', 'benign']:
    # Use a descriptive name rather than `_`: in IPython `_` holds the last
    # cell output, and assigning to it shadows that variable.
    urls_for_label = df.loc[df['label'] == label, 'url']
    print(f"{urls_for_label.nunique()} unique {label} URLs")



200 unique malicious URLs
9978 unique benign URLs


## Preprocessed

In [7]:
# Load the preprocessed dataset from PREPROCESSED_DATA.
df_preprocessed = pd.read_csv(PREPROCESSED_DATA)

In [8]:
# Number of rows per label in the preprocessed data.
df_preprocessed['label'].value_counts()

benign       10000
malicious      200
Name: label, dtype: int64

# Cluster analysis

Explore your clusters before running the automatic regex generation.

In [9]:
def load_result(data_path, folder):
    """Attach cluster labels to the data and print each cluster's paths.

    Parameters
    ----------
    data_path : str
        CSV file to read. It must contain a ``path`` column, which is what
        gets printed for each cluster.
    folder : str
        Directory containing ``index_to_keep.pkl`` (positional row indices
        to keep from the CSV) and ``labels.pkl`` (one cluster label per kept
        row, in the same order).

    Returns
    -------
    pandas.DataFrame
        The kept rows with an added ``cluster`` column.

    Notes
    -----
    Prints the cluster sizes, then the list of paths for every cluster
    except the one labelled ``-1``, which is skipped (presumably the
    noise / unclustered label -- confirm against the clustering step).
    """
    df = pd.read_csv(data_path)
    # SECURITY: pickle.load can execute arbitrary code while deserializing;
    # only point `folder` at output produced by a trusted run.
    with open(os.path.join(folder, 'index_to_keep.pkl'), 'rb') as f:
        index_to_keep = pickle.load(f)
    with open(os.path.join(folder, 'labels.pkl'), 'rb') as f:
        labels = pickle.load(f)
    # Take an explicit copy so adding a column does not write into a slice of
    # the full frame (avoids SettingWithCopyWarning and view/copy ambiguity).
    df = df.iloc[index_to_keep, :].copy()
    df['cluster'] = labels
    series_cluster_count = df['cluster'].value_counts()
    print('Clusters : ')
    print(series_cluster_count)
    for cluster, n_samples in series_cluster_count.items():
        if cluster == -1:
            continue
        print('#####Cluster {} - {} samples: #### \n'.format(cluster, n_samples))
        pprint(df.loc[df['cluster'] == cluster, 'path'].to_list())
        print('\n')
    return df


def get_stat_cluster(df_features):
    """Summarise clusters by distinct-domain count and path count.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Frame with ``cluster``, ``domain`` and ``path`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per cluster (the ``-1`` cluster excluded) with columns
        ``cluster``, ``domain`` (distinct domains in the cluster) and
        ``path`` (non-null paths in the cluster), sorted by ``path``
        in descending order.

    Notes
    -----
    Prints the number of clustered paths, that number as a percentage of
    the distinct paths in ``df_features``, the sum of the per-cluster
    distinct-domain counts, and the number of clusters.
    """
    by_cluster = df_features.groupby('cluster')
    stats = pd.DataFrame({
        'domain': by_cluster['domain'].nunique(),
        'path': by_cluster['path'].count(),
    }).reset_index()
    # Cluster -1 is left out of both the report and the returned frame.
    stats = stats[stats['cluster'] != -1]

    n_path = stats['path'].sum()
    n_unique_paths = df_features['path'].nunique()
    pct_clustered = round(100 * n_path / n_unique_paths, 2)
    n_domains = stats['domain'].sum()
    print('{} paths ({} %) clustered from {} domains !'.format(n_path, pct_clustered, n_domains))
    print('Cluster number: {}'.format(stats['cluster'].nunique()))
    return stats.sort_values('path', ascending=False)

In [10]:
# Attach cluster labels from MATRIX_OUTPUT to the preprocessed data; load_result prints each cluster's paths.
df_features = load_result(PREPROCESSED_DATA, MATRIX_OUTPUT)

Clusters : 
 0     27
 4     17
 6     16
 1     16
 10    15
 15    12
 11    12
 3     10
 8      9
 16     8
 7      8
 9      8
 5      8
 12     8
 13     8
 2      8
 14     8
-1      2
Name: cluster, dtype: int64
#####Cluster 0 - 27 samples: #### 

['/neat/serverphp/config.bin',
 '/serverphp/config.bin',
 '/Zeus/serverphp/config.bin',
 '/files/serverphp/config.bin',
 '/high/serverphp/config.bin',
 '/work/server.php/config.bin',
 '/nice/serverphp/config.bin',
 '/online/serverphp/config.bin',
 '/adm/serverphp/config.bin',
 '/plain/serverphp/config.bin',
 '/dbb/serverphp/config.bin',
 '/figo/serverphp/config.bin',
 '/fine/serverphp/config.bin',
 '/sys/serverphp/config.bin',
 '/dbd/serverphp/config.bin',
 '/nku/serverphp/config.bin',
 '/lg/server-php/config.bin',
 '/crome/serverphp/config.bin',
 '/db/serverphp/config.bin',
 '/good/serverphp/config.bin',
 '/serverp/config.bin',
 '/dolls/serverphp/config.bin',
 '/pus1/serverphp/config.bin',
 '/lg/server.php/config.bin',
 '/ekene/Sever