In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

### Train data: fit the variance-threshold selector and keep the selected columns

In [2]:
# Read the training CSV from the 2_processed_data folder; the selector below is fitted on this frame.
train_csv_path = '../../ml-data/cyberattack_detection/2_processed_data/A1_train_data.csv'
train = pd.read_csv(train_csv_path, sep=',')

In [3]:
# Per-column variance of the training frame, displayed to inspect the feature scales.
# Note: pandas reports the sample variance (ddof=1), whereas VarianceThreshold compares
# the population variance (ddof=0) against its threshold, so the two differ by (n-1)/n.
train.var(ddof=1)

TotalPackets       1.000000e+00
TotalBytes         1.000000e+00
SourceBytes        1.000000e+00
PacketsSec         1.000000e+00
TotalBytesSec      1.000000e+00
SourceBytesSec     1.000000e+00
Protocol           4.323418e+00
SrcIPAddress       1.087833e+07
SrcPort            1.128722e+09
Direction          8.882026e-01
DestIPAddress      5.283201e+10
DestPort           3.477727e+08
State              1.154620e+04
SrcTypeService     3.679547e-01
DestTypeService    1.647496e+00
dtype: float64

In [4]:
# Selector that keeps only the features whose variance on the fitted data is above 0.9.
# NOTE(review): in the variances printed above, the first six columns are exactly 1.0,
# so this cut-off can never remove them; only Direction (~0.888) and SrcTypeService
# (~0.368) fall below it. The other columns' variances depend on their raw scale --
# confirm that one absolute threshold across all columns is intended.
varianceThreshold = VarianceThreshold(threshold=0.9)

In [5]:
# Learn the per-feature variances from the training frame.
# fit() returns the selector itself, which is what this cell displays.
varianceThreshold.fit(X=train)

VarianceThreshold(threshold=0.9)

In [6]:
# Boolean mask aligned with train's columns: True = feature kept, False = feature dropped.
varianceThreshold.get_support(indices=False)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True, False,  True])

In [7]:
# Drop the low-variance columns from the training data.
# The result is used as raw values only; column labels are re-attached in a later cell.
rawTrainVT = varianceThreshold.transform(X=train)

In [8]:
# Names of the columns that survived the threshold, in their original order.
# The same labels are reused for both the train and the test output frames.
selected_mask = varianceThreshold.get_support()
column_names = train.columns.values[selected_mask]

In [9]:
# Wrap the transformed values in a DataFrame and label them with the selected column names.
# NOTE(review): the frame is rebuilt from the transformed array, and the preview below
# shows code-like columns (e.g. Protocol) as floats such as 11.0 -- confirm that
# downstream consumers are fine with float values for those columns.
trainVT = pd.DataFrame(rawTrainVT, columns=column_names)

In [10]:
# Preview the first ten rows of the reduced training frame.
trainVT.head(n=10)

Unnamed: 0,TotalPackets,TotalBytes,SourceBytes,PacketsSec,TotalBytesSec,SourceBytesSec,Protocol,SrcIPAddress,SrcPort,DestIPAddress,DestPort,State,DestTypeService
0,-0.008024,-0.008615,-0.003897,-0.296301,-0.308511,-0.141071,11.0,5663.0,123467.0,475015.0,57944.0,120.0,0.0
1,-0.010103,-0.008919,-0.004151,-0.296351,-0.30852,-0.141076,11.0,5663.0,123658.0,46220.0,34823.0,327.0,4.0
2,-0.010363,-0.008918,-0.004149,-0.296351,-0.30852,-0.141076,11.0,5663.0,123546.0,76319.0,34823.0,327.0,4.0
3,-0.010363,-0.008917,-0.004151,-0.296351,-0.30852,-0.141076,11.0,5663.0,123606.0,452301.0,34823.0,327.0,4.0
4,-0.010623,-0.008919,-0.004149,-0.296351,-0.30852,-0.141076,11.0,5663.0,123487.0,79211.0,34823.0,327.0,4.0
5,-0.010623,-0.008919,-0.00415,-0.296351,-0.30852,-0.141076,11.0,5663.0,123586.0,74826.0,34823.0,327.0,4.0
6,-0.010623,-0.008918,-0.004149,-0.296351,-0.30852,-0.141076,11.0,5663.0,123505.0,50030.0,34823.0,327.0,4.0
7,-0.010103,-0.008918,-0.004151,-0.296351,-0.30852,-0.141076,11.0,5663.0,123565.0,554260.0,34823.0,327.0,4.0
8,-0.010103,-0.008918,-0.004146,-0.296351,-0.30852,-0.141076,11.0,5663.0,123639.0,493475.0,34823.0,327.0,4.0
9,-0.010103,-0.008919,-0.004147,-0.296351,-0.30852,-0.141076,11.0,5663.0,123717.0,496785.0,34823.0,327.0,4.0


In [11]:
# Write the reduced training set to the 3_selected_features folder.
# to_csv does not create missing parent folders, so make sure the output directory
# exists first; this keeps the cell runnable on a fresh checkout of the data tree.
train_out_path = Path('../../ml-data/cyberattack_detection/3_selected_features/A1_train_data_VT.csv')
train_out_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: the row index is not written to the file.
trainVT.to_csv(train_out_path, sep=',', index=False)

### Test data: apply the selector fitted on the train data

In [12]:
# Read the test CSV from the 2_processed_data folder.
test_csv_path = '../../ml-data/cyberattack_detection/2_processed_data/A1_test_data.csv'
test = pd.read_csv(test_csv_path, sep=',')

In [13]:
# Apply the selector that was fitted on the training data; it is deliberately not re-fitted
# here, so the test set keeps exactly the same columns as the training set.
rawTestTV = varianceThreshold.transform(X=test)

In [14]:
# Wrap the transformed test values in a DataFrame, reusing the column names selected on train.
testTV = pd.DataFrame(rawTestTV, columns=column_names)

In [15]:
# Preview the first five rows of the reduced test frame.
testTV.head(n=5)

Unnamed: 0,TotalPackets,TotalBytes,SourceBytes,PacketsSec,TotalBytesSec,SourceBytesSec,Protocol,SrcIPAddress,SrcPort,DestIPAddress,DestPort,State,DestTypeService
0,0.010949,-0.007308,-0.001807,-0.296307,-0.308519,-0.141075,11.0,1638.0,161264.0,465335.0,31090.0,189.0,0.0
1,0.00913,-0.007336,-0.001802,-0.296307,-0.308518,-0.141075,11.0,1638.0,161264.0,465335.0,31090.0,189.0,0.0
2,0.011209,-0.007299,-0.00189,-0.296307,-0.308518,-0.141075,11.0,1638.0,161264.0,465335.0,31090.0,189.0,0.0
3,0.013548,-0.006943,-0.001699,-0.296307,-0.308518,-0.141075,11.0,1638.0,161264.0,465335.0,31090.0,189.0,0.0
4,0.00965,-0.00731,-0.002012,-0.296307,-0.308518,-0.141075,11.0,1638.0,161264.0,465335.0,31090.0,189.0,0.0


In [16]:
# Write the reduced test set to the 3_selected_features folder.
# to_csv does not create missing parent folders, so make sure the output directory
# exists first; this keeps the cell runnable on a fresh checkout of the data tree.
test_out_path = Path('../../ml-data/cyberattack_detection/3_selected_features/A1_test_data_VT.csv')
test_out_path.parent.mkdir(parents=True, exist_ok=True)
# index=False: the row index is not written to the file.
testTV.to_csv(test_out_path, sep=',', index=False)

End