In [3]:
import featuretools as ft
from featuretools.primitives import CumMean, Percentile
import pandas as pd

In [4]:
# Load the raw log and keep a 1000-row sample. The seed is fixed so the sample
# (and every entity, cutoff and feature derived from it) is identical on each
# Restart & Run All instead of changing from run to run.
cyber_df = (
    pd.read_csv("CyberFLTenDays.csv")
      .sample(1000, random_state=0)
      # keep the original row label of each sampled row as an explicit id column
      .rename_axis("log_id")
      .reset_index()
)

In [5]:
es = ft.EntitySet("CyberLL")

# Composite string keys, built by joining the individual columns with ' / '.
# They become the index columns of the normalized entities below.
pair_sep = ' / '
cyber_df["name_host_pair"] = cyber_df["src_name"].str.cat(
    [cyber_df[col] for col in ("dest_name", "src_host", "dest_host")],
    sep=pair_sep)
for side in ("src", "dest"):
    cyber_df[side + "_pair"] = cyber_df[side + "_name"].str.cat(
        cyber_df[side + "_host"],
        sep=pair_sep)

# Base entity: one row per log line, ordered in time by `secs`.
es.entity_from_dataframe("log",
                         cyber_df,
                         index="log_id",
                         time_index="secs")

# log -> name_host_pairs: one row per (src_name, dest_name, src_host, dest_host)
# combination; the listed columns are moved onto the new entity.
es.normalize_entity(
    base_entity_id="log",
    new_entity_id="name_host_pairs",
    index="name_host_pair",
    additional_variables=[
        "src_name", "dest_name",
        "src_host", "dest_host",
        "src_pair",
        "dest_pair",
        "label",
    ])

# For each side (src first, then dest):
#   name_host_pairs -> <side>_pairs -> <side>_names and <side>_hosts
for side in ("src", "dest"):
    pairs_id = side + "_pairs"
    es.normalize_entity(
        base_entity_id="name_host_pairs",
        new_entity_id=pairs_id,
        index=side + "_pair",
        additional_variables=[side + "_name", side + "_host"])
    for part in ("name", "host"):
        es.normalize_entity(
            base_entity_id=pairs_id,
            new_entity_id=side + "_" + part + "s",
            index=side + "_" + part)

# Last expression of the cell: display the entity set summary.
es

Entityset: CyberLL
  Entities:
    log (shape = [1000, 7])
    name_host_pairs (shape = [943, 5])
    src_pairs (shape = [913, 4])
    src_names (shape = [814, 2])
    src_hosts (shape = [487, 2])
    ...And 3 more
  Relationships:
    log.name_host_pair -> name_host_pairs.name_host_pair
    name_host_pairs.src_pair -> src_pairs.src_pair
    src_pairs.src_name -> src_names.src_name
    src_pairs.src_host -> src_hosts.src_host
    name_host_pairs.dest_pair -> dest_pairs.dest_pair
    ...and 2 more

In [6]:
def generate_cutoffs(cyber_df, index_col, after_n_obs):
    """Return one cutoff row per ``index_col`` value, taken after ``after_n_obs`` observations.

    Rows are ordered by ``secs`` first, so "after n observations" means the
    n earliest rows of each group in time, independent of the row order of
    ``cyber_df`` (which may be shuffled, e.g. by ``DataFrame.sample``).

    Parameters
    ----------
    cyber_df : pandas.DataFrame
        Must contain the columns ``index_col`` and ``"secs"``. Not modified.
    index_col : str
        Column whose distinct values define the groups.
    after_n_obs : int
        Non-negative number of earlier observations required. The row at
        0-based position ``after_n_obs`` (in ``secs`` order) of each group is
        used as that group's cutoff.

    Returns
    -------
    pandas.DataFrame
        Columns ``[index_col, "secs"]``, indexed by ``index_col`` and sorted
        by it. Contains one row for every group with more than
        ``after_n_obs`` rows; groups with fewer rows are omitted. Empty (but
        with the same columns) when no group qualifies.

    Raises
    ------
    ValueError
        If ``after_n_obs`` is negative.
    """
    if after_n_obs < 0:
        raise ValueError("after_n_obs must be non-negative")

    # Stable sort: rows sharing the same `secs` keep their incoming order.
    ordered = cyber_df.sort_values("secs", kind="mergesort")

    # 0-based rank of every row inside its group, in time order. A group has a
    # row at rank `after_n_obs` exactly when it has more than `after_n_obs`
    # rows, so no separate size filter is needed.
    position = ordered.groupby(index_col).cumcount()
    at_cutoff = (position == after_n_obs).values

    cutoffs = ordered.loc[at_cutoff, [index_col, "secs"]]
    # Keep the key both as the index and as a column.
    return cutoffs.set_index(index_col, drop=False).sort_index()

In [7]:
# predict after 3 observations
# Passed to generate_cutoffs, which uses it as the 0-based row position of each
# group's cutoff row (3 -> the 4th row) and skips groups without that many rows.
after_n_obs = 3

In [8]:
# Features with `src_names` as the target entity:
# one cutoff row per src_name that has enough observations.
cutoffs = generate_cutoffs(cyber_df, "src_name", after_n_obs)
fm, fl = ft.dfs(
    entityset=es,
    target_entity="src_names",
    cutoff_time=cutoffs,
    max_depth=3,
    verbose=True,
)

Built 388 features
Elapsed: 00:10 | Remaining: 00:00 | Progress: 100%|██████████|| Calculated: 17/17 cutoff times


In [9]:
# Features with `src_hosts` as the target entity:
# one cutoff row per src_host that has enough observations.
cutoffs = generate_cutoffs(cyber_df, "src_host", after_n_obs)
fm, fl = ft.dfs(
    entityset=es,
    target_entity="src_hosts",
    cutoff_time=cutoffs,
    max_depth=3,
    verbose=True,
)

Built 388 features
Elapsed: 00:20 | Remaining: 00:00 | Progress: 100%|██████████|| Calculated: 31/31 cutoff times


In [10]:
# Features with `dest_names` as the target entity:
# one cutoff row per dest_name that has enough observations.
cutoffs = generate_cutoffs(cyber_df, "dest_name", after_n_obs)
fm, fl = ft.dfs(
    entityset=es,
    target_entity="dest_names",
    cutoff_time=cutoffs,
    max_depth=3,
    verbose=True,
)

Built 388 features
Elapsed: 00:09 | Remaining: 00:00 | Progress: 100%|██████████|| Calculated: 16/16 cutoff times


In [11]:
# Features with `dest_hosts` as the target entity:
# one cutoff row per dest_host that has enough observations.
cutoffs = generate_cutoffs(cyber_df, "dest_host", after_n_obs)
fm, fl = ft.dfs(
    entityset=es,
    target_entity="dest_hosts",
    cutoff_time=cutoffs,
    max_depth=3,
    verbose=True,
)

Built 388 features
Elapsed: 00:21 | Remaining: 00:00 | Progress: 100%|██████████|| Calculated: 30/30 cutoff times


In [12]:
# Features with `name_host_pairs` (src_name/dest_name/src_host/dest_host) as
# the target entity, restricted to the CumMean and Percentile transforms.
cutoffs = generate_cutoffs(cyber_df, "name_host_pair", after_n_obs)
fm, fl = ft.dfs(
    entityset=es,
    target_entity="name_host_pairs",
    cutoff_time=cutoffs,
    trans_primitives=[CumMean, Percentile],
    max_depth=2,
    verbose=True,
)

Built 59 features
Elapsed: 00:02 | Remaining: 00:00 | Progress: 100%|██████████|| Calculated: 5/5 cutoff times


In [13]:
# merge entities together to access the index variables created in the process of normalizing
# Every join names its key explicitly. Without `on=`, pandas joins on ALL
# column names the two frames share, so any other column the normalized
# entities happen to have in common (e.g. an auto-generated column with the
# same name in src_pairs and dest_pairs -- TODO confirm) would silently become
# part of the inner-join key and drop rows. Non-key columns that overlap now
# get pandas' default `_x` / `_y` suffixes instead.
merged = (es['log'].df
                   .merge(es['name_host_pairs'].df, on='name_host_pair')
                   .merge(es['src_pairs'].df, on='src_pair')
                   .merge(es['dest_pairs'].df, on='dest_pair'))

In [14]:
# Features with `src_pairs` (src_name/src_host) as the target entity.
# Cutoffs come from `merged` because `src_pair` is looked up there.
cutoffs = generate_cutoffs(merged, 'src_pair', after_n_obs)
fm, fl = ft.dfs(
    entityset=es,
    target_entity="src_pairs",
    cutoff_time=cutoffs,
    trans_primitives=[CumMean, Percentile],
    max_depth=2,
    verbose=True,
)

Built 123 features
Elapsed: 00:03 | Remaining: 00:00 | Progress: 100%|██████████|| Calculated: 3/3 cutoff times


In [15]:
# Features with `dest_pairs` (dest_name/dest_host) as the target entity.
# Cutoffs come from `merged` because `dest_pair` is looked up there.
cutoffs = generate_cutoffs(merged, 'dest_pair', after_n_obs)
fm, fl = ft.dfs(
    entityset=es,
    target_entity="dest_pairs",
    cutoff_time=cutoffs,
    trans_primitives=[CumMean, Percentile],
    max_depth=2,
    verbose=True,
)

Built 123 features
Elapsed: 00:03 | Remaining: 00:00 | Progress: 100%|██████████|| Calculated: 3/3 cutoff times
