In [22]:
import pandas as pd

from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import ExtraTreesClassifier

In [23]:
# Load the cleaned flow dataset and preview its first three rows.
# NOTE(review): the preview lists `Unnamed: *` columns, which look like row
# indices written by earlier `to_csv` calls — confirm they are meant to be
# treated as features further down.
df = pd.read_csv(filepath_or_buffer="./datasets/original/dataSetCleaned.csv")
df.head(n=3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Src Port,Dst Port,Protocol,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Device Class
0,0,0,58239.0,443.0,6.0,39685.0,2.0,0.0,0.0,0.0,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Mobile
1,1,1,56889.0,443.0,6.0,14719483.0,9.0,7.0,339.0,3900.0,...,32.0,195918.0,0.0,195918.0,195918.0,14523565.0,0.0,14523565.0,14523565.0,Mobile
2,2,2,80.0,60542.0,6.0,34116.0,1.0,1.0,0.0,0.0,...,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Mobile


In [29]:
# Rows per device class, before the label is converted to a binary target.
df.loc[:, 'Device Class'].value_counts()

Mobile     1140338
Non Tor     828450
Iot         110394
Name: Device Class, dtype: int64

In [30]:
def to_category(value):
    """Encode a device-class label as a binary target.

    Returns 1 when ``value`` equals "Iot" and 0 for any other value.
    """
    return 1 if value == "Iot" else 0

# Binarise the target once: "Iot" -> 1, every other class -> 0.
# The guard makes the cell safe to re-run: once the column is numeric,
# applying `to_category` again would compare 0/1 against "Iot" and silently
# turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['Device Class']):
    df['Device Class'] = df['Device Class'].apply(to_category)

In [31]:
# Class balance of the target after encoding.
df.loc[:, 'Device Class'].value_counts()

0    1968788
1     110394
Name: Device Class, dtype: int64

In [32]:
# `linhas`: every column name of `df` as a plain list.
# `colunas`: the names of the four per-feature scores computed in this notebook.
linhas = df.columns.tolist()
colunas = [
    "Mutual information",
    "Importance",
    "Correlation",
    "Variance",
]

In [33]:
# Separate the target column from the remaining columns, which are all used
# as features.
y = df['Device Class']
X = df.drop(columns=['Device Class'])

## Information Gain

* [information gain](https://machinelearningmastery.com/information-gain-and-mutual-information/)
* [scikit-learn mutual information](http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html)

In [34]:
# Mutual information between each column of X and the target.
# NOTE(review): this call was commented out, which left `ig_values` undefined
# for the summary table built later in the notebook (NameError on a fresh
# kernel). It is presumably slow on the full dataset — consider caching.
# The seed is pinned because the estimator perturbs continuous features with
# random noise, so results would otherwise vary between runs.
ig_values = mutual_info_classif(X, y, random_state=42)
ig_values

## Feature Importance

* [sklearn feature importance](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html)

In [35]:
# Fit an extra-trees ensemble on all features and read the fitted model's
# feature importances (one value per column of X, in column order).
# The seed is fixed because the trees are built with randomness; without it
# the importances, and any ranking derived from them, change on every run.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, y)
fi_values = model.feature_importances_
fi_values

array([0.31163765, 0.1721111 , 0.02374209, 0.03425698, 0.00926761,
       0.0128969 , 0.0028181 , 0.00203529, 0.00221452, 0.00152614,
       0.00888355, 0.00375368, 0.00470733, 0.00588669, 0.00716858,
       0.00577339, 0.00649623, 0.00662887, 0.00234767, 0.00916487,
       0.00432967, 0.00395146, 0.00685677, 0.00640542, 0.01366615,
       0.0072437 , 0.0046466 , 0.00705191, 0.00297006, 0.00554192,
       0.00253949, 0.00202931, 0.00373386, 0.00164687, 0.00465418,
       0.        , 0.        , 0.00441244, 0.00562589, 0.00480361,
       0.01246696, 0.00724967, 0.01178774, 0.00510587, 0.00826739,
       0.00776177, 0.00862664, 0.01659955, 0.00062661, 0.00367284,
       0.00901392, 0.00498542, 0.00799327, 0.00600969, 0.00611667,
       0.00261766, 0.00193772, 0.00206434, 0.00170739, 0.03759533,
       0.01065206, 0.00160389, 0.07786432, 0.00117842, 0.0006077 ,
       0.00334833, 0.00251561, 0.00407202, 0.00214411, 0.00515046,
       0.00323006])

## Correlation

In [36]:
# Correlation of every column with the target, followed by the names of the
# columns whose correlation is undefined (NaN).
aux = df.corr().loc[:, 'Device Class']
aux.loc[aux.isna()].index.tolist()

['Bwd PSH Flags', 'Bwd URG Flags']

In [37]:
# Correlation of every feature with the target.
# The target's own entry (its correlation with itself) is dropped so the
# array has exactly one value per column of X and therefore lines up with
# `fi_values`; leaving it in makes the arrays differ in length by one.
co_values = df.corr()['Device Class'].drop('Device Class').values
co_values

array([ 3.88365113e-01, -3.21784247e-01,  4.80948085e-02,  1.33169113e-01,
       -7.36351809e-03,  6.15818203e-03,  5.37859646e-02,  4.49395484e-02,
        2.95253137e-02,  3.30139436e-02,  3.38522969e-02, -4.72336906e-02,
        4.59675623e-03,  1.92374570e-02, -1.89610714e-02, -6.65530955e-02,
       -4.11126658e-02, -1.98907762e-02, -2.31745746e-02,  8.73454937e-02,
       -3.44178667e-02,  4.44111432e-03, -3.23761412e-03, -3.58939283e-02,
        1.17126658e-02, -2.13220585e-02,  1.13181883e-02,  1.01241560e-02,
       -2.57101406e-02,  4.78197931e-02,  4.83063782e-03,  2.66627179e-02,
        2.50381629e-02, -9.55574423e-03,  7.97248494e-03,             nan,
                   nan,  4.85564139e-02,  3.56113889e-02,  5.41943429e-02,
        1.23069969e-01, -4.89446567e-02, -1.81207286e-04, -2.60245396e-02,
       -1.21109066e-02, -1.59056963e-02,  2.22071870e-02,  3.38672507e-02,
       -7.46600712e-03,  7.97248494e-03,  7.38252609e-03, -5.88276061e-03,
       -3.35840261e-02,  

## Variance

In [None]:
# Per-feature variance, computed over X so the target column is excluded and
# the array has one entry per feature, matching `fi_values`.
vr_values = X.var().values
vr_values

In [None]:
# Gather the four per-feature score arrays under their display names and
# turn them into a single table (one row per feature, one column per score).
feature_dictionary = dict(
    [
        ("Mutual information", ig_values),
        ("Importance", fi_values),
        ("Correlation", co_values),
        ("Variance", vr_values),
    ]
)

features_df = pd.DataFrame(data=feature_dictionary)
features_df.head()

In [None]:
# Persist the per-feature score table; the row index is not written.
features_df.to_csv(path_or_buf="./resultados/features_df.csv", index=False)

## Final Scores

In [4]:
# Reload the score table from the path the previous section writes to.
# That file is saved with `index=False`, so no column is parsed as the index;
# the default 0..n-1 row labels then match the feature order.
final_scores = pd.read_csv("./resultados/features_df.csv")
final_scores.head(10)

Unnamed: 0,Mutual information,Importance,Correlation,Variance
0,0.207529,0.337283,0.388365,3.602505e+11
1,0.081483,0.172033,-0.321784,9.804030e+10
2,0.055187,0.021675,0.048095,4.590485e+08
3,0.090831,0.028616,0.133169,3.717237e+08
4,0.198192,0.011126,-0.007364,2.699031e+01
5,0.056752,0.010392,0.006158,1.467099e+15
6,0.019895,0.003073,0.053786,3.410610e+04
7,0.011199,0.002375,0.044940,6.040979e+04
8,0.059909,0.001949,0.029525,5.419955e+10
9,0.025981,0.001320,0.033014,1.065404e+11


In [16]:
# Magnitude of the correlation, so features can be ranked regardless of sign.
final_scores['Abs Correlation'] = abs(final_scores['Correlation'])

In [17]:
# Ten rows with the highest mutual-information score.
final_scores.sort_values(by="Mutual information", ascending=False).iloc[:10]

Unnamed: 0,Mutual information,Importance,Correlation,Variance,Abs Correlation
0,0.207529,0.337283,0.388365,360250500000.0,0.388365
4,0.198192,0.011126,-0.007364,26.99031,0.007364
62,0.190465,0.071198,-0.087304,117.5233,0.087304
59,0.176396,0.04152,-0.077153,612520700.0,0.077153
60,0.104439,0.010991,-0.030302,112830600.0,0.030302
51,0.095622,0.003645,-0.005883,0.3997772,0.005883
3,0.090831,0.028616,0.133169,371723700.0,0.133169
1,0.081483,0.172033,-0.321784,98040300000.0,0.321784
43,0.08079,0.00598,-0.026025,32547.18,0.026025
50,0.076279,0.008803,0.007383,0.2438427,0.007383


In [32]:
# Ten rows with the highest feature-importance score.
final_scores.sort_values(by="Importance", ascending=False).iloc[:10]

Unnamed: 0,Mutual information,Importance,Correlation,Variance,Abs Correlation,Colunas
62,0.190465,0.071198,-0.087304,117.5233,0.087304,Fwd Seg Size Min
59,0.176396,0.04152,-0.077153,612520700.0,0.077153,FWD Init Win Bytes
3,0.090831,0.028616,0.133169,371723700.0,0.133169,Dst Port
2,0.055187,0.021675,0.048095,459048500.0,0.048095,Src Port
47,0.030071,0.014035,0.033867,0.1901711,0.033867,SYN Flag Count
24,0.04456,0.011665,0.011713,1417488000000000.0,0.011713,Fwd IAT Total
4,0.198192,0.011126,-0.007364,26.99031,0.007364,Protocol
60,0.104439,0.010991,-0.030302,112830600.0,0.030302,Bwd Init Win Bytes
5,0.056752,0.010392,0.006158,1467099000000000.0,0.006158,Flow Duration
40,0.044135,0.009863,0.12307,119176200.0,0.12307,Bwd Packets/s


In [31]:
# Ten rows with the largest absolute correlation with the target.
final_scores.sort_values(by="Abs Correlation", ascending=False).iloc[:10]

Unnamed: 0,Mutual information,Importance,Correlation,Variance,Abs Correlation,Colunas
3,0.090831,0.028616,0.133169,371723700.0,0.133169,Dst Port
40,0.044135,0.009863,0.12307,119176200.0,0.12307,Bwd Packets/s
64,0.00858,0.000974,0.090121,310517900000.0,0.090121,Active Std
19,0.054174,0.008102,0.087345,903433000.0,0.087345,Flow Packets/s
62,0.190465,0.071198,-0.087304,117.5233,0.087304,Fwd Seg Size Min
59,0.176396,0.04152,-0.077153,612520700.0,0.077153,FWD Init Win Bytes
68,0.008121,0.001592,0.076151,12482310000000.0,0.076151,Idle Std
15,0.019063,0.005691,-0.066553,4928.357,0.066553,Bwd Packet Length Min
39,0.053994,0.005763,0.054194,559240100.0,0.054194,Fwd Packets/s
6,0.019895,0.003073,0.053786,34106.1,0.053786,Total Fwd Packet


In [34]:
# Ten rows with the largest variance.
final_scores.sort_values(by="Variance", ascending=False).iloc[:10]

Unnamed: 0,Mutual information,Importance,Correlation,Variance,Abs Correlation,Colunas
5,0.056752,0.010392,0.006158,1467099000000000.0,0.006158,Flow Duration
24,0.04456,0.011665,0.011713,1417488000000000.0,0.011713,Fwd IAT Total
29,0.029909,0.005601,0.04782,950183700000000.0,0.04782,Bwd IAT Total
22,0.071537,0.006289,-0.003238,423340000000000.0,0.003238,Flow IAT Max
69,0.035706,0.004679,0.010871,381258500000000.0,0.010871,Idle Max
27,0.056038,0.00563,0.010124,355894500000000.0,0.010124,Fwd IAT Max
67,0.031759,0.002646,-0.001629,347403400000000.0,0.001629,Idle Mean
70,0.031078,0.005665,-0.014168,333054500000000.0,0.014168,Idle Min
32,0.042326,0.003328,0.025038,310016500000000.0,0.025038,Bwd IAT Max
25,0.043441,0.005637,-0.021322,170081600000000.0,0.021322,Fwd IAT Mean


In [25]:
# Attach the feature name to every score row.
# Assumes the row labels are the positional column numbers of `df` (as in
# the 0..70 index shown for this table) — TODO confirm. Looking names up by
# label replaces the hard-coded slice of 71 entries and stays correct when
# the cell is re-run after rows have already been removed.
final_scores['Colunas'] = df.columns.values[final_scores.index.to_numpy()]
# Discard rows 0 and 1 — presumably the leaked `Unnamed:` index columns
# (verify). Reassigning with errors="ignore" instead of an in-place drop
# keeps the cell from raising KeyError on a second run.
final_scores = final_scores.drop(index=[0, 1], errors="ignore")
final_scores

Unnamed: 0,Mutual information,Importance,Correlation,Variance,Abs Correlation,Colunas
2,0.055187,0.021675,0.048095,4.590485e+08,0.048095,Src Port
3,0.090831,0.028616,0.133169,3.717237e+08,0.133169,Dst Port
4,0.198192,0.011126,-0.007364,2.699031e+01,0.007364,Protocol
5,0.056752,0.010392,0.006158,1.467099e+15,0.006158,Flow Duration
6,0.019895,0.003073,0.053786,3.410610e+04,0.053786,Total Fwd Packet
7,0.011199,0.002375,0.044940,6.040979e+04,0.044940,Total Bwd packets
8,0.059909,0.001949,0.029525,5.419955e+10,0.029525,Total Length of Fwd Packet
9,0.025981,0.001320,0.033014,1.065404e+11,0.033014,Total Length of Bwd Packet
10,0.055065,0.009619,0.033852,1.157967e+05,0.033852,Fwd Packet Length Max
11,0.026651,0.004784,-0.047234,4.681729e+03,0.047234,Fwd Packet Length Min


In [33]:
# Names of the ten features with the highest importance score, then a
# two-row preview of the dataset restricted to those columns.
keeped_columns = (
    final_scores
    .sort_values(by="Importance", ascending=False)
    .loc[:, 'Colunas']
    .iloc[:10]
    .tolist()
)
df.loc[:, keeped_columns].head(n=2)

Unnamed: 0,Fwd Seg Size Min,FWD Init Win Bytes,Dst Port,Src Port,SYN Flag Count,Fwd IAT Total,Protocol,Bwd Init Win Bytes,Flow Duration,Bwd Packets/s
0,32.0,349.0,443.0,58239.0,0.0,39685.0,6.0,-1.0,39685.0,0.0
1,32.0,65535.0,443.0,56889.0,1.0,14719483.0,6.0,118.0,14719483.0,0.47556


In [26]:
# Save the annotated score table. The row index is not written, so any later
# read of this file must not pass `index_col`.
final_scores.to_csv(path_or_buf="./features_df.csv", index=False)