# Modelo predictivo
El fichero analizado es 'TotalFeatures-ISCXFlowMeter.csv'

### Importaciones

In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import pandas as pd

### Funciones

In [2]:
labelencoder = LabelEncoder()

In [3]:
def train_val_test_split(df, rstate=42, shuffle=True, stratify=None):
    strat = df[stratify] if stratify else None
    train_set, test_set = train_test_split(
        df, test_size=0.4, random_state=rstate, shuffle=shuffle, stratify=strat)
    strat = test_set[stratify] if stratify else None
    val_set, test_set = train_test_split(
        test_set, test_size=0.5, random_state=rstate, shuffle=shuffle, stratify=strat)
    return (train_set, val_set, test_set)

In [4]:
def remove_labels(df, label_name):
    X = df.drop(label_name, axis=1)
    y = df[label_name].copy()
    return (X, y)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

### Lectura de los datos

In [None]:
df = pd.read_csv('TotalFeatures-ISCXFlowMeter.csv')
df

Unnamed: 0,duration,total_fpackets,total_bpackets,total_fpktl,total_bpktl,min_fpktl,min_bpktl,max_fpktl,max_bpktl,mean_fpktl,...,mean_idle,max_idle,std_idle,FFNEPD,Init_Win_bytes_forward,Init_Win_bytes_backward,RRT_samples_clnt,Act_data_pkt_forward,min_seg_size_forward,calss
0,1020586,668,1641,35692,2276876,52,52,679,1390,53.431138,...,0.0,-1,0.000000e+00,2,4194240,1853440,1640,668,32,benign
1,80794,1,1,75,124,75,124,75,124,75.000000,...,0.0,-1,0.000000e+00,2,0,0,0,1,0,benign
2,998,3,0,187,0,52,-1,83,-1,62.333333,...,0.0,-1,0.000000e+00,4,101888,-1,0,3,32,benign
3,189868,9,9,1448,6200,52,52,706,1390,160.888889,...,0.0,-1,0.000000e+00,2,4194240,2722560,8,9,32,benign
4,110577,4,6,528,1422,52,52,331,1005,132.000000,...,0.0,-1,0.000000e+00,2,155136,31232,5,4,32,benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631950,530,1,1,74,334,74,334,74,334,74.000000,...,0.0,-1,0.000000e+00,2,0,0,0,1,0,benign
631951,50240627,23,24,4767,6107,52,52,533,855,207.260870,...,9842879.0,9964749,1.196806e+05,2,317952,107008,11,23,32,GeneralMalware
631952,35471450,1,2,52,104,52,52,52,52,52.000000,...,35300000.0,35290631,0.000000e+00,2,3904,88704,1,1,32,asware
631953,41713629,12,26,1821,18643,40,40,489,1390,151.750000,...,20200000.0,32711382,1.770000e+07,2,227456,2432,23,12,20,benign


### Constantes

In [None]:
OBJECTIVE_VAR = 'calss'
OBJECTIVE_VAR

### Descripción datos

La variable de clasificación `['calss']` es la única que aparece como un objeto, en formato String. 

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 631955 entries, 0 to 631954
Data columns (total 80 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   duration                 631955 non-null  int64  
 1   total_fpackets           631955 non-null  int64  
 2   total_bpackets           631955 non-null  int64  
 3   total_fpktl              631955 non-null  int64  
 4   total_bpktl              631955 non-null  int64  
 5   min_fpktl                631955 non-null  int64  
 6   min_bpktl                631955 non-null  int64  
 7   max_fpktl                631955 non-null  int64  
 8   max_bpktl                631955 non-null  int64  
 9   mean_fpktl               631955 non-null  float64
 10  mean_bpktl               631955 non-null  float64
 11  std_fpktl                631955 non-null  float64
 12  std_bpktl                631955 non-null  float64
 13  total_fiat               631955 non-null  int64  
 14  tota

In [None]:
df.describe()

Unnamed: 0,duration,total_fpackets,total_bpackets,total_fpktl,total_bpktl,min_fpktl,min_bpktl,max_fpktl,max_bpktl,mean_fpktl,...,min_idle,mean_idle,max_idle,std_idle,FFNEPD,Init_Win_bytes_forward,Init_Win_bytes_backward,RRT_samples_clnt,Act_data_pkt_forward,min_seg_size_forward
count,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,...,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0,631955.0
mean,21952450.0,6.728514,10.431934,954.0172,12060.42,141.475727,44.357688,263.675901,183.248084,174.959706,...,19973270.0,20312280.0,20752380.0,466387.5,2.360896,962079.6,310451.9,9.733144,6.72471,19.965713
std,190057800.0,174.161354,349.424019,82350.4,482471.6,157.68088,89.099554,289.644383,371.863224,162.024811,...,189798600.0,189790200.0,189972100.0,6199704.0,3.04181,1705655.0,664795.6,347.877923,174.13813,14.914261
min,-18.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,-1.0,0.0,...,-1.0,0.0,-1.0,0.0,2.0,-1.0,-1.0,0.0,0.0,0.0
25%,0.0,1.0,0.0,69.0,0.0,52.0,-1.0,52.0,-1.0,52.0,...,-1.0,0.0,-1.0,0.0,2.0,0.0,-1.0,0.0,1.0,0.0
50%,24450.0,1.0,0.0,184.0,0.0,52.0,-1.0,83.0,-1.0,83.0,...,-1.0,0.0,-1.0,0.0,2.0,87616.0,-1.0,0.0,1.0,32.0
75%,1759751.0,3.0,1.0,427.0,167.0,108.0,52.0,421.0,115.0,356.0,...,1013498.0,1291379.0,1306116.0,0.0,2.0,304640.0,90496.0,1.0,3.0,32.0
max,44310760000.0,48255.0,74768.0,40496440.0,103922200.0,1390.0,1390.0,1500.0,1390.0,1390.0,...,44310720000.0,44300000000.0,44310720000.0,847000000.0,2269.0,4194240.0,4194240.0,74524.0,48255.0,44.0


In [None]:
df[OBJECTIVE_VAR].value_counts()

benign            471597
asware            155613
GeneralMalware      4745
Name: calss, dtype: int64

### Manipulación datos
Podemos transformar la variable objetivo a un valor numérico.

In [None]:
original_df = df.copy()
df[OBJECTIVE_VAR] = labelencoder.fit_transform(df[OBJECTIVE_VAR])
df[OBJECTIVE_VAR].value_counts()


2    471597
1    155613
0      4745
Name: calss, dtype: int64

#### Definition
- 0: GeneralMalware
- 1: asware
- 2: benign

In [None]:
corr_matrix = df.corr()
corr_matrix[OBJECTIVE_VAR].sort_values(ascending=False)

calss              1.000000
min_flowpktl       0.298014
min_fpktl          0.271343
mean_fpktl         0.211892
fAvgSegmentSize    0.211892
                     ...   
furg_cnt                NaN
burg_cnt                NaN
flow_urg                NaN
flow_cwr                NaN
flow_ece                NaN
Name: calss, Length: 80, dtype: float64

In [None]:
df.corr()

Unnamed: 0,duration,total_fpackets,total_bpackets,total_fpktl,total_bpktl,min_fpktl,min_bpktl,max_fpktl,max_bpktl,mean_fpktl,...,mean_idle,max_idle,std_idle,FFNEPD,Init_Win_bytes_forward,Init_Win_bytes_backward,RRT_samples_clnt,Act_data_pkt_forward,min_seg_size_forward,calss
duration,1.000000,0.004837,0.004011,0.001673,0.003518,-0.064100,-0.027231,0.008761,0.042925,-0.043746,...,0.998901,0.999458,0.047582,0.016532,0.027610,0.029712,0.003785,0.004838,0.082955,-0.067066
total_fpackets,0.004837,1.000000,0.924622,0.425756,0.904007,-0.018958,0.005252,0.024685,0.086255,-0.007910,...,0.001614,0.002267,0.017229,0.016089,0.050201,0.059224,0.902713,0.999866,0.018198,-0.018377
total_bpackets,0.004011,0.924622,1.000000,0.156780,0.997268,-0.017667,0.006912,0.018170,0.086886,-0.016104,...,0.000922,0.001617,0.016230,-0.000493,0.048190,0.058435,0.997580,0.924746,0.015124,-0.019430
total_fpktl,0.001673,0.425756,0.156780,1.000000,0.090082,-0.003099,0.000803,0.021278,0.022088,0.022409,...,0.000335,0.000609,0.009896,0.001657,0.013283,0.015991,0.088422,0.425789,0.005477,-0.000679
total_bpktl,0.003518,0.904007,0.997268,0.090082,1.000000,-0.014926,0.005966,0.012560,0.079905,-0.017328,...,0.000812,0.001452,0.014336,-0.000293,0.043571,0.053134,0.999616,0.904129,0.012139,-0.019838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Init_Win_bytes_backward,0.029712,0.059224,0.058435,0.015991,0.053134,-0.268444,0.038319,0.429893,0.593143,-0.030004,...,0.026959,0.029512,0.097316,-0.052507,0.811204,1.000000,0.056761,0.059242,0.333701,-0.069405
RRT_samples_clnt,0.003785,0.902713,0.997580,0.088422,0.999616,-0.016659,0.006156,0.015727,0.084280,-0.017595,...,0.000893,0.001560,0.015200,-0.000437,0.046784,0.056761,1.000000,0.902834,0.014299,-0.019679
Act_data_pkt_forward,0.004838,0.999866,0.924746,0.425789,0.904129,-0.018947,0.005264,0.024705,0.086278,-0.007893,...,0.001617,0.002269,0.017233,0.000734,0.050220,0.059242,0.902834,1.000000,0.018229,-0.018391
min_seg_size_forward,0.082955,0.018198,0.015124,0.005477,0.012139,-0.686154,-0.189824,-0.074763,0.217989,-0.524024,...,0.077943,0.079324,0.048803,0.052177,0.394743,0.333701,0.014299,0.018229,1.000000,-0.258352


### Particionado de las muestras
Particionamos la muestra inicial para poder entrenar un modelo y posteriormente testear su eficacia.

In [None]:
train_set, val_set, test_set = train_val_test_split(df)
print('El Training Dataset contiene [{0}] registros'.format(len(train_set)))
print('El Validation Dataset contiene [{0}] registros'.format(len(val_set)))
print('El Test Dataset contiene [{0}] registros'.format(len(test_set)))

El Training Dataset contiene [379173] registros
El Validation Dataset contiene [126391] registros
El Test Dataset contiene [126391] registros


Separamos los vectores X y Y de los tres subconjuntos.

In [None]:
X_train, y_train = remove_labels(train_set, OBJECTIVE_VAR)
X_val, y_val = remove_labels(val_set, OBJECTIVE_VAR)
X_test, y_test = remove_labels(test_set, OBJECTIVE_VAR)

### Creación modelo predictivo - Random Forest

#### Hiperparametros
Buscamos los mejores hiperparámetros para nuestro modelo de predicción.

##### OPCIÓN 1
Buscamos los mejores valores dentro de una matriz de parámetros sobre la que testear cuales se ajustan más a las muestras X y Y. Esta será la opción que nos aporte mejores resultados.

In [None]:
param_grid = [
    {'n_estimators': [100, 500, 1000], 'max_leaf_nodes': [16, 24, 36]},
    {'bootstrap': [False], 'n_estimators': [100, 500], 'max_features': [2, 3, 4]},
  ]

rnd_clf = RandomForestClassifier(n_jobs=-1, random_state=42)

grid_search = GridSearchCV(rnd_clf, param_grid, cv=5,
                           scoring='f1_weighted', return_train_score=True)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
             param_grid=[{'max_leaf_nodes': [16, 24, 36],
                          'n_estimators': [100, 500, 1000]},
                         {'bootstrap': [False], 'max_features': [2, 3, 4],
                          'n_estimators': [100, 500]}],
             return_train_score=True, scoring='f1_weighted')

In [None]:
grid_search.best_params_

{'bootstrap': False, 'max_features': 4, 'n_estimators': 500}

In [None]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=False, max_features=4, n_estimators=500,
                       n_jobs=-1, random_state=42)

In [None]:
cvres = grid_search.cv_results_

for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print("F1 score:", mean_score, "-", "Parámetros:", params)

F1 score: 0.7923799683849088 - Parámetros: {'max_leaf_nodes': 16, 'n_estimators': 100}
F1 score: 0.7925780047375502 - Parámetros: {'max_leaf_nodes': 16, 'n_estimators': 500}
F1 score: 0.7926827876796984 - Parámetros: {'max_leaf_nodes': 16, 'n_estimators': 1000}
F1 score: 0.8055197663651619 - Parámetros: {'max_leaf_nodes': 24, 'n_estimators': 100}
F1 score: 0.805532754533511 - Parámetros: {'max_leaf_nodes': 24, 'n_estimators': 500}
F1 score: 0.806128393074076 - Parámetros: {'max_leaf_nodes': 24, 'n_estimators': 1000}
F1 score: 0.8162156111668564 - Parámetros: {'max_leaf_nodes': 36, 'n_estimators': 100}
F1 score: 0.8168965814969431 - Parámetros: {'max_leaf_nodes': 36, 'n_estimators': 500}
F1 score: 0.8167781554292425 - Parámetros: {'max_leaf_nodes': 36, 'n_estimators': 1000}
F1 score: 0.9209855440608206 - Parámetros: {'bootstrap': False, 'max_features': 2, 'n_estimators': 100}
F1 score: 0.9213535726827834 - Parámetros: {'bootstrap': False, 'max_features': 2, 'n_estimators': 500}
F1 score

El conjunto de parámetros que mejor se ajusta al modelo descrito seria:

``
F1 score: 0.9239721144145896 - Parámetros: {'bootstrap': False, 'max_features': 4, 'n_estimators': 500}
``

##### Opción 2
Como segunda opción definimos esos parámetros utilizando RandomizedSearchCV, que buscará los mejores parámetros provando valores alatorios. Si bien esta opción nos aporta peores resultados, es una forma alternativa de generar hiperparámetros

In [None]:
rdn_clf2 = RandomForestClassifier(n_jobs=-1, random_state=42)

parameters = {
    "max_depth": range(1, 11),
    "min_samples_split": range(2, 21),
    "min_samples_leaf": range(1, 5)
}

rdn_search = RandomizedSearchCV(rdn_clf2, parameters, cv=5,
                                scoring='f1_weighted', return_train_score=True)

rdn_search.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
                   param_distributions={'max_depth': range(1, 11),
                                        'min_samples_leaf': range(1, 5),
                                        'min_samples_split': range(2, 21)},
                   return_train_score=True, scoring='f1_weighted')

In [None]:
rdn_search.best_params_

{'min_samples_split': 13, 'min_samples_leaf': 3, 'max_depth': 10}

In [None]:
rdn_search.best_estimator_

RandomForestClassifier(max_depth=10, min_samples_leaf=3, min_samples_split=13,
                       n_jobs=-1, random_state=42)

In [None]:
cvres2 = rdn_search.cv_results_
for mean_score, params in zip(cvres2["mean_test_score"], cvres2["params"]):
    print("F1 score:", mean_score, "-", "Parámetros:", params)

F1 score: 0.8834181849250303 - Parámetros: {'min_samples_split': 13, 'min_samples_leaf': 3, 'max_depth': 10}
F1 score: 0.853092438963707 - Parámetros: {'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 8}
F1 score: 0.7863405341167612 - Parámetros: {'min_samples_split': 16, 'min_samples_leaf': 2, 'max_depth': 4}
F1 score: 0.8099940636668856 - Parámetros: {'min_samples_split': 3, 'min_samples_leaf': 3, 'max_depth': 5}
F1 score: 0.8099057283221462 - Parámetros: {'min_samples_split': 7, 'min_samples_leaf': 4, 'max_depth': 5}
F1 score: 0.637609857931601 - Parámetros: {'min_samples_split': 17, 'min_samples_leaf': 2, 'max_depth': 1}
F1 score: 0.637609857931601 - Parámetros: {'min_samples_split': 18, 'min_samples_leaf': 3, 'max_depth': 1}
F1 score: 0.7560664862771537 - Parámetros: {'min_samples_split': 17, 'min_samples_leaf': 1, 'max_depth': 3}
F1 score: 0.8301212648237131 - Parámetros: {'min_samples_split': 9, 'min_samples_leaf': 4, 'max_depth': 7}
F1 score: 0.8819879911241412 - Par

El conjunto de parámetros que mejor se ajusta al modelo descrito seria:

``
F1 score: 0.853092438963707 - Parámetros: {'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 8}
``

### Modelo predictivo
Dado el resultado de los análisis anteriores, encontramos los parámetros para definir el modelo final.

In [None]:
grid_search.best_estimator_.get_params()

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 4,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

Seleccionamos el mejor modelo de estimación.

In [None]:
clf_model = grid_search.best_estimator_

Y con los datos contenidos en el dataset de entrenamiento podemos entrenar el modelo para ofrecer resultados.

In [None]:
y_train_pred = clf_model.predict(X_train)

Podemos evaluar el "score" del modelo entrenado evaluando la Y del modelo de entrenamiento contra la predicción resultante de ese modelo.

In [None]:
print("F1 score Train Set:", f1_score(y_train_pred, y_train, average='weighted'))

F1 score Train Set: 0.981243306508183


Con esos datos nos da un "accuracy" de más de un **98%**.

Finalmente, evaluamos el conjunto de datos de evaluación:

In [None]:
y_val_pred = clf_rnd.predict(X_val)

In [None]:
print("F1 score Validation Set:", f1_score(y_val_pred, y_val, average='weighted'))

F1 score Validation Set: 0.9329474731171657


El score obtenido es de **93%**.