# Feature Importance

In [1]:
from supportFiles.myFunc import loadModel, getDSName, setTarget, loadDataset, getFeatureList
from joblib import dump, load
import pandas as pd
import numpy as np
import os

'AB-TRAP_CIC'

### define functions

In [5]:
hocuspocus = lambda a,b: [[r*q for r,q in zip(p,b)] for p in a]

# get the top "qty" features from "relevantes" with names in "cols"
def getTopFeat(relevantes, cols, qty):
    relevantes = relevantes/np.sum(relevantes)
    relevantes = [100*round(num,3) for num in relevantes]
    fi_df = pd.DataFrame({'feature_importance': relevantes, 'feature_name': cols})
    fi_df.sort_values(by=['feature_importance'], ascending=False, inplace=True)
    #return fi_df['feature_name'].iloc[0:qty].values
    lero = fi_df['feature_name'].iloc[0:qty].values
    bla = fi_df['feature_importance'].iloc[0:qty].values
    out = []
    for i in range(len(lero)):
        #if bla[i] < 1:
        #    out.extend([''])
        #else:
        #    out.extend(["({1:.0f}%) {0}".format(lero[i],bla[i])])
        out.extend(["({1:.0f}%) {0}".format(lero[i],bla[i])])
    return out
    #data = pd.DataFrame(data, columns=[modelName])
    #data["Data Set"] = DS
    #return data[["Data Set", model]]

# build a LaTeX table line for feature importance
def getLine(datasetName, models, cols):
    qty = 10
    line = pd.DataFrame({})
    relevantes = models['DT'].best_estimator_.feature_importances_
    line['DT'] = getTopFeat(relevantes, cols, qty)
    relevantes = list(map(abs, models["LR"].best_estimator_.coef_))[0]
    line['LR'] = getTopFeat(relevantes, cols, qty)
    c = hocuspocus(models["MLP"].best_estimator_.coefs_[0],models["MLP"].best_estimator_.coefs_[1])
    relevantes = []
    for entry in c:
        relevantes.append(sum(list(map(abs, entry))[0]) )
    line['MLP'] = getTopFeat(relevantes, cols, qty)
    relevantes = list(map(abs, models["SVM"].best_estimator_.decision_function(np.identity(len(cols)))))
    line['SVM'] = getTopFeat(relevantes, cols, qty)
    relevantes = list(map(abs, models["XGB"].best_estimator_.feature_importances_))
    line['XGB'] = getTopFeat(relevantes, cols, qty)
    line.at[0,'Data Set'] = "\multirow{" + qty + "}{*}{\rotatebox{90}{\\parbox{100pt}{\\textbf{CIC-" + datasetName.split("_")[1] +"}}}}"
    
    return line

### build information table

In [7]:
#Datasets available are :
# {0: 'AB-TRAP', 1: 'NB15', 2: 'CIC-IDS', 3: 'ToN-IoT', 4: 'BoT-IoT'}
tableCols = ["Data Set", "DT", "LR", "MLP", "SVM", "XGB"]
tableInfo = []
#for each Data Set
for DS in range(5):
    #get name
    trainerDSName = getDSName(DS,1,True,True)
    #load models
    models, prep, table, algo = loadModel(trainerDSName)
    #get feature list
    cols = getFeatureList(DS)[0:-1]
    #append line with list of top 10 important features
    tableInfo.append(getLine(trainerDSName, models, cols)[tableCols])
tableInfo = pd.concat(tableInfo, ignore_index=True)

loading models from SCAN_AB-TRAP_CIC
Models fetched: ['SCAN_AB-TRAP_CIC_DT.joblib', 'SCAN_AB-TRAP_CIC_LR.joblib', 'SCAN_AB-TRAP_CIC_MLP.joblib', 'SCAN_AB-TRAP_CIC_NB.joblib', 'SCAN_AB-TRAP_CIC_SVM.joblib', 'SCAN_AB-TRAP_CIC_XGB.joblib']
DT's index of best performance: 5
LR's index of best performance: 0
MLP's index of best performance: 0
NB's index of best performance: 0
SVM's index of best performance: 0
XGB's index of best performance: 0
DT
reading file: 
loading models from SCAN_NB15_CIC
Models fetched: ['SCAN_NB15_CIC_DT.joblib', 'SCAN_NB15_CIC_LR.joblib', 'SCAN_NB15_CIC_MLP.joblib', 'SCAN_NB15_CIC_NB.joblib', 'SCAN_NB15_CIC_SVM.joblib', 'SCAN_NB15_CIC_XGB.joblib']
DT's index of best performance: 10
LR's index of best performance: 0
MLP's index of best performance: 0
NB's index of best performance: 0
SVM's index of best performance: 0
XGB's index of best performance: 0
DT
reading file: 
loading models from SCAN_CIC-IDS_CIC
Models fetched: ['SCAN_CIC-IDS_CIC_DT.joblib', 'SCAN_CIC-ID

In [8]:
pd.set_option('display.max_colwidth', None)
#table = pd.DataFrame(tableInfo)
table = tableInfo
mylabel = "tab:importFeat"
mycaption = "\\textit{Features} ordenadas por importância para cada modelo e \\textit{Data Set}"
tableName = "top10importFeat"
table.fillna('', inplace=True)
featFile = open("./dissertation/{0}.tex".format(tableName),"w")
featFile.write(table.to_latex(column_format='c'*table.columns.size, index=False, caption=mycaption, label=mylabel, position="H"))
featFile.close()

print(table.to_latex(multirow=True, column_format='c'*table.columns.size, index=False, caption=mycaption, label=mylabel, position="H"))

\begin{table}[H]
\centering
\caption{\textit{Features} ordenadas por importância para cada modelo e \textit{Data Set}}
\label{tab:importFeat}
\begin{tabular}{cccccc}
\toprule
                                                              Data Set &                     DT &                     LR &                     MLP &                    SVM &                     XGB \\
\midrule
\textbackslash multirow\{10\}\{*\}\{\textbackslash rotatebox\{90\}\{\textbackslash parbox\{100pt\}\{\textbackslash textbf\{CIC-AB-TRAP\}\}\}\} &       (67\%) bwd\_pkts\_s &     (9\%) flow\_duration &           (8\%) dst\_port &  (8\%) subflow\_bwd\_byts &        (46\%) bwd\_pkts\_s \\
                                                                       &    (7\%) bwd\_header\_len &  (6\%) subflow\_bwd\_byts &        (7\%) pkt\_len\_std &   (8\%) totlen\_bwd\_pkts &       (18\%) pkt\_len\_max \\
                                                                       & (6\%) init\_fwd\_win\_byts &   (6\%) tot

In [4]:
countForDS = []
for i in range(0,tableInfo.shape[0], 10):
    temp = list(np.reshape(tableInfo[tableCols[1:]][i:(i+10)].values,(1,50))[0])
    relevantes = []
    for col in cols:
        relevantes.extend([temp.count(col)])
    
    fi_df = pd.DataFrame({'feature_importance': relevantes, 'feature_name': cols})
    fi_df.sort_values(by=['feature_importance'], ascending=False, inplace=True)
    print("\n",i)
    print(fi_df[['feature_importance','feature_name']].head(3))
    fi_df = fi_df[fi_df.feature_importance > 0]
    countForDS.extend(fi_df['feature_name'])


 0
    feature_importance     feature_name
34                   4   bwd_header_len
2                    4    flow_duration
6                    3  totlen_bwd_pkts

 10
    feature_importance  feature_name
0                    5      dst_port
42                   3  fin_flag_cnt
41                   2   pkt_len_var

 20
    feature_importance  feature_name
3                    3  tot_fwd_pkts
51                   3  pkt_size_avg
36                   2    bwd_pkts_s

 30
    feature_importance    feature_name
0                    5        dst_port
33                   3  fwd_header_len
45                   2    psh_flag_cnt

 40
    feature_importance       feature_name
34                   4     bwd_header_len
54                   4   subflow_fwd_pkts
58                   4  init_bwd_win_byts


In [5]:
countForModel = []
q = 3
for model in tableCols[1:]:
    lero = tableInfo[model].value_counts()
    print("\n"+model)
    for i in range(q):
        print(i, lero.index[i],lero[i])
        if lero[i]>0:
            countForModel.extend([lero.index[i]])


DT
0 dst_port 5
1 init_fwd_win_byts 3
2 fwd_header_len 2

LR
0 flow_iat_max 3
1 flow_duration 3
2 totlen_bwd_pkts 2

MLP
0 dst_port 3
1 pkt_len_min 2
2 tot_fwd_pkts 2

SVM
0 bwd_header_len 3
1 fwd_act_data_pkts 3
2 totlen_bwd_pkts 2

XGB
0 bwd_pkts_s 3
1 totlen_bwd_pkts 2
2 fwd_iat_mean 2
