In [8]:
"""
BOTection - MultiClass Classifier
Created by balahmadi @balahmadi_OX

@author: balahmadi - 2020
"""

# 4. Multi-Class Classification (Family Classification Detection) to classify n-flows to a Malware Family
#   Train a RF multi-class classifier to classify n-flows to its malware family.


import pandas as pd
import numpy as np
import math
from sklearn.decomposition import PCA
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.preprocessing import scale
import dill as pickle 
from sklearn.model_selection import train_test_split
import sys  
import os
import csv 

n_flows = [10,15,35,30,25,20,10]

for n in n_flows:
    
    with open("./Data/MM_StateTransition/dataset_" + str(n), "rb") as f:
        dataset = pickle.load(f)
   
    malDataset = dataset.loc[dataset.Class =='Malicious']
    malDataset.dropna(axis=1, how='any')  
    to_drop=["Family","Class"]
    
    y = malDataset['Family']
    X = malDataset.drop(to_drop, axis=1)
    col_names=malDataset.columns    
    X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42, stratify=y)
   
    X_train=X_train.as_matrix().astype(np.float)
    X_test=X_test.as_matrix().astype(np.float)
   
    # Binarize the output
    y_train = pd.factorize(y_train)
    y_test = pd.factorize(y_test)
   
    class_Names=y_test[1]
                               
    y_train = y_train[0]
    y_test = y_test[0]
    
    print ('-------------- Results: n = ' + str(n) + ' ---------------')

    
    print ('-------------- Precision - Recall - F1 Score Report ---------------')
    classifier = RF(n_estimators=101, max_features=None, class_weight ='balanced')
    model = classifier.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print (metrics.classification_report(y_test, y_pred, target_names = class_Names , digits=4))

    print ('-------------- Precision - Recall - F1 Score Report (Cross Validation) ---------------')
    # When applying cross-validation
    y_pred = cross_val_predict(RF(n_estimators=101, max_features=None, class_weight ='balanced'), X, y, cv=10)
    print (metrics.classification_report(y, y_pred, target_names =  class_Names , digits=4))

          Class                        Family Flows  OTH           RSTO  \
                                                     OTH   S0   SF  OTH   
0     Malicious            capture-win19.pcap     1  0.0  0.0  0.0  0.0   
1     Malicious            capture-win19.pcap     2  0.0  0.0  0.0  0.0   
2     Malicious            capture-win19.pcap     3  0.0  0.0  0.0  0.0   
3     Malicious            capture-win19.pcap     4  0.0  0.0  0.0  0.0   
4     Malicious            capture-win19.pcap     5  0.0  0.0  0.0  0.0   
5     Malicious            capture-win19.pcap     6  0.0  0.0  0.0  0.0   
6     Malicious            capture-win19.pcap     7  0.0  0.0  1.0  0.0   
7     Malicious            capture-win19.pcap     8  0.0  0.0  1.0  0.0   
8     Malicious            capture-win19.pcap     9  0.0  0.0  1.0  0.0   
9     Malicious            capture-win19.pcap    10  0.0  0.0  1.0  0.0   
10    Malicious            capture-win19.pcap    11  0.0  0.0  1.0  0.0   
11    Malicious          



                              precision    recall  f1-score   support

          capture-win19.pcap     0.0053    0.0263    0.0088       190
2013-08-20_capture-win2.pcap     0.0366    0.0074    0.0123       947
2013-11-06_capture-win5.pcap     0.9038    1.0000    0.9495        47

                   micro avg     0.0498    0.0498    0.0498      1184
                   macro avg     0.3153    0.3446    0.3235      1184
                weighted avg     0.0660    0.0498    0.0489      1184

                              precision    recall  f1-score   support

          capture-win19.pcap     0.9382    0.9313    0.9348      2869
2013-08-20_capture-win2.pcap     0.9408    1.0000    0.9695       143
2013-11-06_capture-win5.pcap     0.6780    0.6922    0.6850       575

                   micro avg     0.8957    0.8957    0.8957      3587
                   macro avg     0.8523    0.8745    0.8631      3587
                weighted avg     0.8966    0.8957    0.8961      3587

          Clas

                              precision    recall  f1-score   support

2013-08-20_capture-win2.pcap     0.9968    0.9946    0.9957       934
          capture-win19.pcap     0.0270    0.0053    0.0089       188
2013-11-06_capture-win5.pcap     0.0000    0.0000    0.0000        33

                   micro avg     0.8052    0.8052    0.8052      1155
                   macro avg     0.3413    0.3333    0.3349      1155
                weighted avg     0.8105    0.8052    0.8066      1155

                              precision    recall  f1-score   support

2013-08-20_capture-win2.pcap     0.9550    0.9445    0.9497      2831
          capture-win19.pcap     0.9333    0.9899    0.9608        99
2013-11-06_capture-win5.pcap     0.7445    0.7772    0.7605       570

                   micro avg     0.9186    0.9186    0.9186      3500
                   macro avg     0.8776    0.9039    0.8903      3500
                weighted avg     0.9201    0.9186    0.9192      3500

          Clas

                              precision    recall  f1-score   support

2013-08-20_capture-win2.pcap     1.0000    0.9955    0.9978       895
2013-11-06_capture-win5.pcap     0.0000    0.0000    0.0000        21
          capture-win19.pcap     0.0385    0.0055    0.0096       182

                   micro avg     0.8124    0.8124    0.8124      1098
                   macro avg     0.3462    0.3337    0.3358      1098
                weighted avg     0.8215    0.8124    0.8149      1098

                              precision    recall  f1-score   support

2013-08-20_capture-win2.pcap     0.9804    0.9945    0.9874      2711
2013-11-06_capture-win5.pcap     0.9275    0.9846    0.9552        65
          capture-win19.pcap     0.9783    0.9018    0.9385       550

                   micro avg     0.9790    0.9790    0.9790      3326
                   macro avg     0.9621    0.9603    0.9604      3326
                weighted avg     0.9790    0.9790    0.9787      3326

          Clas

                              precision    recall  f1-score   support

2013-08-20_capture-win2.pcap     0.0109    0.0022    0.0037       905
          capture-win19.pcap     0.0000    0.0000    0.0000       183
2013-11-06_capture-win5.pcap     0.9200    1.0000    0.9583        23

                   micro avg     0.0225    0.0225    0.0225      1111
                   macro avg     0.3103    0.3341    0.3207      1111
                weighted avg     0.0279    0.0225    0.0228      1111

                              precision    recall  f1-score   support

2013-08-20_capture-win2.pcap     0.9862    0.9394    0.9623      2741
          capture-win19.pcap     0.9306    0.9853    0.9571        68
2013-11-06_capture-win5.pcap     0.7621    0.9351    0.8398       555

                   micro avg     0.9397    0.9397    0.9397      3364
                   macro avg     0.8930    0.9533    0.9197      3364
                weighted avg     0.9481    0.9397    0.9420      3364

          Clas

                              precision    recall  f1-score   support

2013-08-20_capture-win2.pcap     0.9978    0.9978    0.9978       915
          capture-win19.pcap     0.9946    0.9946    0.9946       185
2013-11-06_capture-win5.pcap     0.9130    0.9130    0.9130        23

                   micro avg     0.9955    0.9955    0.9955      1123
                   macro avg     0.9685    0.9685    0.9685      1123
                weighted avg     0.9955    0.9955    0.9955      1123

                              precision    recall  f1-score   support

2013-08-20_capture-win2.pcap     0.9890    0.9383    0.9630      2771
          capture-win19.pcap     0.9333    0.9859    0.9589        71
2013-11-06_capture-win5.pcap     0.7607    0.9482    0.8442       560

                   micro avg     0.9409    0.9409    0.9409      3402
                   macro avg     0.8943    0.9575    0.9220      3402
                weighted avg     0.9502    0.9409    0.9433      3402

          Clas

                              precision    recall  f1-score   support

2013-08-20_capture-win2.pcap     0.0107    0.0022    0.0036       925
          capture-win19.pcap     0.0000    0.0000    0.0000       186
2013-11-06_capture-win5.pcap     0.9000    0.9310    0.9153        29

                   micro avg     0.0254    0.0254    0.0254      1140
                   macro avg     0.3036    0.3111    0.3063      1140
                weighted avg     0.0316    0.0254    0.0262      1140

                              precision    recall  f1-score   support

2013-08-20_capture-win2.pcap     0.9760    0.9886    0.9823      2801
          capture-win19.pcap     0.9239    0.9770    0.9497        87
2013-11-06_capture-win5.pcap     0.9504    0.8814    0.9146       565

                   micro avg     0.9708    0.9708    0.9708      3453
                   macro avg     0.9501    0.9490    0.9489      3453
                weighted avg     0.9705    0.9708    0.9704      3453

          Clas

                              precision    recall  f1-score   support

          capture-win19.pcap     0.0053    0.0263    0.0088       190
2013-08-20_capture-win2.pcap     0.0366    0.0074    0.0123       947
2013-11-06_capture-win5.pcap     0.9038    1.0000    0.9495        47

                   micro avg     0.0498    0.0498    0.0498      1184
                   macro avg     0.3153    0.3446    0.3235      1184
                weighted avg     0.0660    0.0498    0.0489      1184

                              precision    recall  f1-score   support

          capture-win19.pcap     0.9405    0.9313    0.9359      2869
2013-08-20_capture-win2.pcap     0.9408    1.0000    0.9695       143
2013-11-06_capture-win5.pcap     0.6818    0.7043    0.6929       575

                   micro avg     0.8977    0.8977    0.8977      3587
                   macro avg     0.8544    0.8786    0.8661      3587
                weighted avg     0.8991    0.8977    0.8983      3587

