In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime 
from csv_dataset import CsvDataset
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest

import math

from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

import wrangling 
from sklearn.base import BaseEstimator, TransformerMixin

from aggregatorResetIndex import AggregatorResetIndex
from ascending_orderer import AscendingOrderer
from column_dropper import ColumnDropper
from column_selector import ColumnSelector
from ObjectToCategory import ObjectToCategory
from GroupByTimeFrec import GroupByTimeFrec

In [2]:
# Build the CSV that the loading cell reads ('rdo1.csv').
# NOTE(review): the first two names look like input captures and the third
# like the merged output -- confirm against CsvDataset.merge_to_csv.
merge_csv_args = (
    '2017-01-19-pseudoDarkleech-Rig-V-2nd-run-sends-Cerber-ransomware.csv',
    'test5.csv',
    'rdo1.csv',
)
CsvDataset.merge_to_csv(*merge_csv_args)

In [None]:
# Columns to load from the CSV, mapped to the dtype pandas should use for each
# one (declared explicitly so pandas does not have to infer them).
flow_column_dtypes = {
    'ts': 'str',
    'te': 'str',
    'td': 'float',
    'sa': 'str',
    'da': 'str',
    'sp': 'int',
    'dp': 'int',
    'pr': 'str',
    'flg': 'str',
    'ipkt': 'int',
    'ibyt': 'int',
    'type': 'str',
}

data = pd.read_csv(
    "rdo1.csv",
    # Date columns: parse them as datetimes instead of leaving them as text.
    parse_dates=["ts", "te"],
    # The start time becomes the index of the frame.
    index_col="ts",
    # Load only the columns listed in the dtype mapping.
    usecols=list(flow_column_dtypes),
    # C parser: better performance and memory usage.
    engine="c",
    dtype=flow_column_dtypes,
)


In [None]:
# Work on a copy of the loaded frame that also carries its index (the `ts`
# column chosen as index when loading) as a regular column named `ts_n`.
data2 = data.assign(ts_n=data.index)

# Derive the `type_c` column from `type`: the transformer is fitted on the
# original frame and then applied to the working copy.
t1 = ObjectToCategory('type', 'type_c')
t1.fit(data)
data2 = t1.transform(data2)
data2

In [None]:
# Isolation Forest anomaly detector: 100 trees, each built from 256 sampled
# rows, with 1% of the rows expected to be outliers (contamination=0.01).
# random_state is pinned so that refitting after a kernel restart builds the
# same trees and therefore yields the same predictions.
estimator = IsolationForest(
    n_estimators=100,
    contamination=0.01,
    max_samples=256,
    random_state=42,
)
estimator

In [None]:
# Preprocessing pipeline, one (name, transformer) pair per line. The steps run
# in the order listed; each transformer is a project-local class.
preprocessing_steps = [
    ('orderAsc', AscendingOrderer(['ts_n', 'te'])),
    ('groupFlows', GroupByTimeFrec("5s")),
    ('aggregate', AggregatorResetIndex(['sa', 'pr', 'da', 'ts_n', 'type_c'], 'count')),
    ('selectColumns', ColumnSelector(['sa', 'pr', 'da', 'ts_n', 'type_c'])),
    ('aggregate2', AggregatorResetIndex(['sa', 'pr', 'ts_n', 'type_c'], 'count')),
    ('categorize_sa', ObjectToCategory('sa', 'sa_c')),
    ('categorize_pr', ObjectToCategory('pr', 'pr_c')),
    ('categorize_ts', ObjectToCategory('ts_n', 'ts_c')),
    ('dropColumns', ColumnDropper(['id', 'sa', 'da', 'pr', 'ts_n', 'ipkt', 'ibyt'])),
]
transforms = Pipeline(preprocessing_steps)
transforms

In [None]:
# Run the prepared frame (data2) through every step of the `transforms` pipeline.
# NOTE(review): no fit() call on `transforms` is visible in this notebook, so
# this relies on each step's transform() working without a prior fit() --
# confirm, and check whether the installed scikit-learn accepts transform() on
# an unfitted Pipeline.
df_transformed = transforms.transform(data2)
df_transformed

In [None]:
# Split the transformed frame into labels and features.
# Y: the `type_c` column (label derived from `type`).
# X: every remaining column.
Y = df_transformed['type_c']
# `columns=` replaces the positional axis argument (`drop('type_c', 1)`), which
# newer pandas releases no longer accept.
X = df_transformed.drop(columns='type_c')
X.shape

In [None]:
# Rank the features with an extra-trees classifier trained on the labels.
# random_state is pinned so the importances (and therefore the selected
# columns) are the same on every run.
clf = ExtraTreesClassifier(random_state=42)
clf = clf.fit(X, Y)

# prefit=True reuses the classifier fitted above. With `threshold` left unset,
# SelectFromModel keeps the features whose importance is at least the MEAN
# importance (not the median) for an estimator like this one.
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(X)
# Original author's note on the result: "2 ts_c" -- presumably two features
# were kept, one of them ts_c; verify with X_new.shape.

# Wrap the selected feature matrix in a DataFrame. The index is set to the
# values of the first selected column, so it is not guaranteed to be unique.
# Only this last expression is displayed by the cell.
df_transformed_fs = pd.DataFrame(X_new, index=X_new[:, 0])
df_transformed_fs

In [None]:
# Fit the anomaly detector on the selected features and score the same rows.
# IsolationForest.predict returns -1 for outliers and 1 for inliers.
estimator.fit(X_new)
prediction = estimator.predict(X_new)
df_transformed_fs['prediction'] = prediction

# Attach the labels by POSITION. Rows of X_new are in the same order as Y, but
# df_transformed_fs is indexed by the values of its first feature column, so
# assigning the Series directly would align on index labels and could attach
# the wrong label (or NaN) to a row.
df_transformed_fs['type_c'] = Y.values
df_transformed_fs

In [None]:
# Rows flagged as outliers (prediction == -1) whose label code is 1.
# NOTE(review): label code 1 is presumably the malicious class -- confirm
# against the categories produced for `type`.
flagged_as_outlier = df_transformed_fs['prediction'] == -1
labelled_one = df_transformed_fs['type_c'] == 1
df_transformed_fs[flagged_as_outlier & labelled_one]

In [None]:
# Split the rows with label code 0 by the detector's verdict:
#   FP -> flagged as outlier (prediction == -1)
#   TN -> accepted as inlier (prediction == 1)
label_is_zero = df_transformed_fs['type_c'] == 0
predicted_outlier = df_transformed_fs['prediction'] == -1
predicted_inlier = df_transformed_fs['prediction'] == 1

FP = df_transformed_fs[predicted_outlier & label_is_zero]
TN = df_transformed_fs[predicted_inlier & label_is_zero]

# size / n_cols is the number of rows in the subset; dividing by n_rows gives
# the subset's share of ALL rows in df_transformed_fs.
n_rows, n_cols = df_transformed_fs.shape
print('FP:')
print(FP.size / n_cols / n_rows)
print('TN:')
print(TN.size / n_cols / n_rows)