# Importar bibliotecas básicas

In [99]:
!pip install scikit-learn



In [100]:
!pip install sklearn.preprocessing



In [101]:
!pip install cython



In [102]:
!pip install --upgrade pip



In [103]:
!pip install seaborn



In [104]:
!pip install plotly



In [105]:
!pip install --upgrade numpy



In [106]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import sklearn

# Importar dados

In [107]:
data = pd.read_csv('data/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv')

In [108]:
data.sample(300)

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
167143,3269,7,1,1,0,6,0,0,0.000000,0.000000,...,40,0.0,0.0,0,0,0.0,0.0,0,0,PortScan
170718,32777,68,1,1,0,6,0,0,0.000000,0.000000,...,40,0.0,0.0,0,0,0.0,0.0,0,0,PortScan
140360,7800,32,1,1,2,6,2,2,2.000000,0.000000,...,24,0.0,0.0,0,0,0.0,0.0,0,0,PortScan
104027,5903,37,1,1,2,6,2,2,2.000000,0.000000,...,24,0.0,0.0,0,0,0.0,0.0,0,0,PortScan
83320,443,272473,10,9,559,3988,202,0,55.900000,73.025795,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10677,53,874,2,2,62,94,31,31,31.000000,0.000000,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
52639,22,1307566,41,42,2728,6634,456,0,66.536585,110.129945,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
268311,53,131010,4,2,132,272,33,33,33.000000,0.000000,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
183025,1062,71,1,1,0,6,0,0,0.000000,0.000000,...,40,0.0,0.0,0,0,0.0,0.0,0,0,PortScan


# Tratamento de dados

In [109]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286467 entries, 0 to 286466
Data columns (total 79 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0    Destination Port             286467 non-null  int64  
 1    Flow Duration                286467 non-null  int64  
 2    Total Fwd Packets            286467 non-null  int64  
 3    Total Backward Packets       286467 non-null  int64  
 4   Total Length of Fwd Packets   286467 non-null  int64  
 5    Total Length of Bwd Packets  286467 non-null  int64  
 6    Fwd Packet Length Max        286467 non-null  int64  
 7    Fwd Packet Length Min        286467 non-null  int64  
 8    Fwd Packet Length Mean       286467 non-null  float64
 9    Fwd Packet Length Std        286467 non-null  float64
 10  Bwd Packet Length Max         286467 non-null  int64  
 11   Bwd Packet Length Min        286467 non-null  int64  
 12   Bwd Packet Length Mean       286467 non-nul

In [110]:
data.isnull().sum()

 Destination Port              0
 Flow Duration                 0
 Total Fwd Packets             0
 Total Backward Packets        0
Total Length of Fwd Packets    0
                              ..
Idle Mean                      0
 Idle Std                      0
 Idle Max                      0
 Idle Min                      0
 Label                         0
Length: 79, dtype: int64

In [111]:
# Replace inf and -inf with NaN
data.replace([np.inf, -np.inf], np.nan, inplace=True)
print(data)

         Destination Port   Flow Duration   Total Fwd Packets  \
0                      22         1266342                  41   
1                      22         1319353                  41   
2                      22             160                   1   
3                      22         1303488                  41   
4                   35396              77                   1   
...                   ...             ...                 ...   
286462                443          196135                  49   
286463                443          378424                  49   
286464                443          161800                  70   
286465                443          142864                  50   
286466                443          186928                  46   

         Total Backward Packets  Total Length of Fwd Packets  \
0                            44                         2664   
1                            44                         2664   
2                          

In [112]:
data.dropna(inplace=True)

In [113]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 286096 entries, 0 to 286466
Data columns (total 79 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0    Destination Port             286096 non-null  int64  
 1    Flow Duration                286096 non-null  int64  
 2    Total Fwd Packets            286096 non-null  int64  
 3    Total Backward Packets       286096 non-null  int64  
 4   Total Length of Fwd Packets   286096 non-null  int64  
 5    Total Length of Bwd Packets  286096 non-null  int64  
 6    Fwd Packet Length Max        286096 non-null  int64  
 7    Fwd Packet Length Min        286096 non-null  int64  
 8    Fwd Packet Length Mean       286096 non-null  float64
 9    Fwd Packet Length Std        286096 non-null  float64
 10  Bwd Packet Length Max         286096 non-null  int64  
 11   Bwd Packet Length Min        286096 non-null  int64  
 12   Bwd Packet Length Mean       286096 non-null  fl

In [114]:
data.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,22,1266342,41,44,2664,6954,456,0,64.97561,109.864573,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,22,1319353,41,44,2664,6954,456,0,64.97561,109.864573,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,22,160,1,1,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,22,1303488,41,42,2728,6634,456,0,66.536585,110.129945,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,35396,77,1,2,0,0,0,0,0.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


# Divisão entre previsores e classe

In [115]:
df_train_y = data[' Label']
df_train_x = data.drop(' Label', axis=1)


In [116]:
for i,r in data.iterrows():
    if data.at[i, ' Label'] == "BENIGN":
        data.at[i, ' Label'] = 1
    else:
        data.at[i, ' Label'] = 0
        

# Normalização

In [117]:
from sklearn.preprocessing import MinMaxScaler

minmax_scaler = MinMaxScaler()
x_norm        = minmax_scaler.fit_transform(df_train_x)
data          = pd.DataFrame(x_norm)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,68,69,70,71,72,73,74,75,76,77
0,0.000336,0.01055296,0.012829,0.012105,0.011466,0.000972,0.032737,0.0,0.019038,0.031022,...,0.011673,0.533333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000336,0.01099472,0.012829,0.012105,0.011466,0.000972,0.032737,0.0,0.019038,0.031022,...,0.011673,0.533333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000336,1.441667e-06,0.0,0.000275,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.533333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000336,0.01086251,0.012829,0.011554,0.011741,0.000928,0.032737,0.0,0.019495,0.031097,...,0.011673,0.533333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.541314,7.500002e-07,0.0,0.00055,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.533333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# PCA - redução de dimensões

In [118]:
# from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt
# import numpy as np

# pca = PCA().fit(x_norm)
# plt.plot(np.cumsum(pca.explained_variance_ratio_), label='Variancia explicada')
# plt.axhline(0.95,color='red',linestyle='--',label='95% percentual')
# plt.grid(True, linestyle='--')
# plt.legend()

In [119]:
# pca       = PCA(n_components=8)
# dados_pca = pca.fit_transform(x_norm)
# sns.scatterplot(x=dados_pca[:,0],y=dados_pca[:,1],hue=Y)

# Salvar bases de dados

In [120]:
data.columns

RangeIndex(start=0, stop=78, step=1)

In [121]:
data = data.drop_duplicates()

In [122]:
# Convert only float64 columns to float32
data = data.apply(lambda x: x.astype('float32') if x.dtype == 'float64' or x.dtype == 'int64' else x)

print(data.dtypes)


0     float32
1     float32
2     float32
3     float32
4     float32
       ...   
73    float32
74    float32
75    float32
76    float32
77    float32
Length: 78, dtype: object


In [123]:
df_train_x.to_csv('train_x', index=False)
df_train_y.to_csv('train_y', index=False)
