In [13]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import IsolationForest
import joblib

# Width of one time bin, as a pandas frequency string; passed to pd.Grouper(freq=...)
# when per-packet rows are aggregated.
time_freq = '15s'
# Number of consecutive time bins (rows) that make up one "image".
image_length = 40

In [14]:
# Functions Cell

def transform(df, mac, freq=None):
    """Aggregate one device's per-packet rows into fixed-width time bins.

    Each output row is one time bin holding the mean packet 'Length' and the
    mean 'Delta Time' of the packets that fell into it. Bins without packets
    get a 'Length' of 0 and a 'Delta Time' equal to the mean over all bins.

    Input:
        df:   DataFrame with 'Epoch Time', 'Length' and 'Delta Time' columns.
              It is not modified.
        mac:  string (MAC address) written into the 'MAC Source' column.
        freq: optional pandas frequency string for the bin width; defaults to
              the module-level time_freq.
    Return:
        DataFrame indexed by bin start time ('Epoch Time') with the columns
        'Length', 'Delta Time' and 'MAC Source'.
    """
    bin_width = time_freq if freq is None else freq

    # Unparseable timestamps become NaT instead of raising.
    timestamps = pd.to_datetime(df['Epoch Time'], errors='coerce')

    # Work on a new frame so the caller's 'Epoch Time' column is left as it was.
    features = df[['Length', 'Delta Time']].set_index(
        pd.DatetimeIndex(timestamps, name='Epoch Time')
    )
    binned = features.groupby(pd.Grouper(freq=bin_width)).mean()

    # Empty bins come back as NaN: no packets means zero volume, while the
    # delta is filled with the overall mean of the binned deltas.
    binned['Length'] = binned['Length'].fillna(0)
    binned['Delta Time'] = binned['Delta Time'].fillna(binned['Delta Time'].mean())

    binned['MAC Source'] = mac
    return binned

def train_and_save_S_model(df, mac):
    """Train an IsolationForest for one device and save it to disk.

    The forest is fitted on the 'Length' and 'Delta Time' columns of df and
    written to '../models2/<mac without colons>.sav'.

    Input:
        df:  DataFrame with 'Length' and 'Delta Time' columns.
        mac: string (MAC address) used to name the model file.
    Return: void
    """
    import os

    X = df[['Length', 'Delta Time']]
    # Fit on the raw array (no column names), matching how the model is
    # queried later with .values.
    clf = IsolationForest(random_state=0, n_estimators=2).fit(X.values)

    model_dir = '../models2'
    os.makedirs(model_dir, exist_ok=True)
    # os.path.join instead of a hard-coded '\\' so the file lands inside
    # model_dir on every platform.
    joblib.dump(clf, os.path.join(model_dir, mac.replace(':', '') + '.sav'))


def detect_outliers(df, comparison_model):
    """Count the rows of an image that the model flags as outliers.

    Input:
        df:               DataFrame (one "image") with 'Length' and
                          'Delta Time' columns. It is not modified.
        comparison_model: fitted model whose predict() returns -1 for
                          outliers (e.g. an IsolationForest).
    Return:
        int, the number of outlier rows (a count, not a percentage).
    """
    features = df[['Length', 'Delta Time']].values
    if len(features) == 0:
        # Nothing to score; avoids calling predict() on an empty array.
        return 0

    predictions = np.asarray(comparison_model.predict(features))
    return int(np.count_nonzero(predictions == -1))


def compare_models(df_1, df_2, outlier_model, label):
    """Return the absolute difference in outlier counts between two images.

    Both images are scored with the same outlier model. The two counts, their
    signed difference and the label are printed for inspection.

    Input:
        df_1:          DataFrame (anchor image)
        df_2:          DataFrame (test image)
        outlier_model: model (IsolationForest)
        label:         value printed after the counts
    Return:
        int, |outliers(df_2) - outliers(df_1)|
    """
    anchor_count = detect_outliers(df_1, outlier_model)
    test_count = detect_outliers(df_2, outlier_model)
    signed_gap = test_count - anchor_count

    print(test_count, anchor_count, '=', signed_gap, label)
    return abs(signed_gap)


def extract_images_and_labes(df, length=None):
    """Split a binned DataFrame into consecutive fixed-size "images".

    Rows are taken in their existing order, `length` rows per image; the last
    image may be shorter. Each image is labelled with the 'MAC Source' of its
    first row, so the input should hold a single MAC or be ordered so that
    each MAC's rows are contiguous.
    e.g. if time_freq = '15s' and image_length = 60, each image covers
    15 minutes of data.

    Input:
        df:     DataFrame with 'Length', 'Delta Time' and 'MAC Source'
                columns. It is not modified.
        length: optional number of rows per image; defaults to the
                module-level image_length.
    Return:
        (list of DataFrames with columns 'Length' and 'Delta Time',
         list of MAC labels), one entry per image. Each image keeps its
         0-based row positions as index.
    """
    rows_per_image = image_length if length is None else length
    if rows_per_image <= 0:
        raise ValueError('image length must be a positive number of rows')

    # reset_index returns a new frame, so the caller's index and columns are
    # never touched.
    frame = pd.DataFrame(df).reset_index(drop=True)

    mac_images = []
    mac_labels = []
    for start in range(0, len(frame), rows_per_image):
        chunk = frame.iloc[start:start + rows_per_image]
        mac_images.append(chunk[['Length', 'Delta Time']])
        mac_labels.append(chunk['MAC Source'].iloc[0])
    return mac_images, mac_labels

def transform_data(df, macs=None):
    """Bin raw packets per device and cut the result into labelled images.

    For every selected MAC present in df, the packets are aggregated with
    transform() (mean 'Length' and mean 'Delta Time' per time bin), the last
    bin is dropped, and the remaining bins are split into images with
    extract_images_and_labes(). Using transform() keeps the image columns
    ('Length', 'Delta Time') identical to the ones the outlier models are
    fitted on.

    Input:
        df:   DataFrame of raw packets with 'MAC Source', 'Epoch Time',
              'Length' and 'Delta Time' columns.
        macs: optional list of MAC addresses to keep; defaults to the four
              authorised devices.
    Return:
        List of DataFrames (images), List of Strings (MAC label per image)
    """
    if macs is None:
        macs = ['00:0c:29:9d:9e:9e', '00:80:f4:09:51:3b', '48:5b:39:64:40:79', '00:0c:29:e6:14:0d']

    selected = df[df['MAC Source'].isin(macs)]

    images = []
    labels = []
    for mac in selected['MAC Source'].unique():
        device_packets = selected[selected['MAC Source'] == mac]
        binned = transform(device_packets, mac)

        # Get rid of the extra row at the end before cutting into images.
        imgs, lbls = extract_images_and_labes(binned[:-1])
        images += imgs
        labels += lbls

    return images, labels


def make_pairs(images, labels, images_extra, labels_extra):
    """Create positive and negative image pairs for Decision Tree training.

    For every image in `images` two pairs are produced, in this order:
      * a positive pair (image, random image from `images` with the same
        label), labelled ['Normal'];
      * a negative pair (image, random image from `images_extra` with a
        different label), labelled ['Rouge'].
    The result therefore holds len(images) * 2 pairs, e.g. 90 pairs of
    images with 2 feature columns and image_length rows each.

    Partners are drawn with numpy's global random state.

    Input:
        images:       List of DataFrames (images of the device of interest)
        labels:       List of Strings (label of each entry in images)
        images_extra: List of DataFrames (images of all devices)
        labels_extra: List of Strings (label of each entry in images_extra)
    Return:
        (List of (image, partner) tuples, List of one-element label lists)
    Raises:
        ValueError: if some image has no differently-labelled candidate in
                    images_extra.
    """
    own_labels = np.asarray(labels, dtype=object)
    extra_labels = np.asarray(labels_extra, dtype=object)

    pairImages = []
    pairLabels = []
    for image, label in zip(images, labels):
        # Positive partner: only images carrying the same label qualify
        # (the image itself is always a candidate).
        same_label = np.flatnonzero(own_labels == label)
        pos_img = images[int(np.random.choice(same_label))]
        pairImages.append((image, pos_img))
        pairLabels.append(['Normal'])

        # Negative partner: any extra image with a different label.
        other_label = np.flatnonzero(extra_labels != label)
        if other_label.size == 0:
            raise ValueError(
                'no image with a label other than %r in images_extra' % (label,)
            )
        neg_img = images_extra[int(np.random.choice(other_label))]
        pairImages.append((image, neg_img))
        pairLabels.append(['Rouge'])

    return (pairImages, pairLabels)

def train_and_save_C_model(encodings, labels, mac):
    """Train and save the comparison model for one device.

    A DecisionTreeClassifier is fitted on the encodings (absolute outlier
    differences of image pairs) and the labels saying whether each pair
    matches, then written to '../models2/<mac without colons>_compare.sav'.

    Input:
        encodings: List of one-element lists of ints (one row per pair)
        labels:    List of strings (one label per pair)
        mac:       string (MAC address) used to name the model file
    Return:
        the fitted DecisionTreeClassifier
    """
    import os

    model = DecisionTreeClassifier()
    model.fit(encodings, labels)

    model_dir = '../models2'
    os.makedirs(model_dir, exist_ok=True)
    # Join directory and file name explicitly; plain string concatenation
    # dropped the separator and wrote '../models2<mac>_compare.sav' next to
    # the directory instead of inside it.
    # NOTE(review): any loader that relied on the old location must be
    # pointed at this path.
    filename = os.path.join(model_dir, mac.replace(':', '') + '_compare.sav')
    joblib.dump(model, filename)
    return model
    



In [15]:
# Fit and store one outlier (IsolationForest) model per authorised device,
# using the 6 h clean capture.

data = '../../test_data/csv2/eth2dump-clean-6h_1.csv'
authorised_macs = [
    '00:0c:29:e6:14:0d',
    '00:0c:29:9d:9e:9e',
    '48:5b:39:64:40:79',
    '00:80:f4:09:51:3b',
]
df = pd.read_csv(data)
df = df[df['MAC Source'].isin(authorised_macs)]

# One sub-frame per device, in order of first appearance in the capture.
grouped_ip = df.groupby(df['MAC Source'])
macs_arr = [grouped_ip.get_group(address) for address in df['MAC Source'].unique()]

for mac in macs_arr:
    name = mac['MAC Source'].iloc[0]
    # Bin the device's packets, then train and save its model.
    df = transform(mac, name)
    train_and_save_S_model(df, name)



In [16]:
# Start training the DT difference models for each authorised device.

# Iterate through the clean captures and combine them into one DataFrame,
# binning each device's packets by the time frequency declared in time_freq.
import glob

macs = ['00:0c:29:9d:9e:9e','00:80:f4:09:51:3b','48:5b:39:64:40:79','00:0c:29:e6:14:0d']
dfs = []
# glob returns paths in arbitrary order; sort so files[0:3] always selects
# the same three captures.
files = sorted(glob.glob('../../test_data/csv2/*.csv'))
for file in files[0:3]:
    print(file)
    df = pd.read_csv(file)
    df = df[df['MAC Source'].isin(macs)]

    binned_frames = []
    for label in df['MAC Source'].unique():
        binned = transform(df[df['MAC Source'] == label], label)
        binned_frames.append(binned[:-1])  # drop the last bin of each device

    # A capture without any authorised device contributes nothing;
    # pd.concat would raise on an empty list.
    if binned_frames:
        dfs.append(pd.concat(binned_frames))
df = pd.concat(dfs)

C:\Users\carlo\Documents\College\reu_cyber\test_data\csv2\eth2dump-clean-0,5h_1.csv
C:\Users\carlo\Documents\College\reu_cyber\test_data\csv2\eth2dump-clean-1h_1.csv
C:\Users\carlo\Documents\College\reu_cyber\test_data\csv2\eth2dump-clean-6h_1.csv


In [17]:
# Extract the time-frame images of every device; they are the pool from which
# negative matches are drawn later.
# NOTE(review): df concatenates several devices and captures, and images are cut
# every image_length rows by position, so an image can straddle two devices and
# is then labelled with the first MAC it contains - confirm this is intended.
all_images, all_labels = extract_images_and_labes(df)
len(all_images)

135

In [18]:
# Split the binned clean data into one frame per MAC address.
grouped_ip = df.groupby(df['MAC Source'])
macs_arr = [grouped_ip.get_group(address) for address in df['MAC Source'].unique()]

# EXPLANATION:
#
# For each device (MAC):
#   1. cut its binned data into time-interval images
#   2. build positive and negative image pairs plus the label of each pair
#   3. score every pair with the device's saved outlier model and keep the
#      absolute difference in outlier counts as the pair's encoding
#   4. train the decision tree on the encodings (similarity scores) and
#      their labels (match or not)
#
# Afterwards each model knows which outlier differences indicate a matching
# pair and which do not.

for mac_df in macs_arr:
    mac_label = mac_df['MAC Source'].unique()[0]
    print('mac_df =',mac_label)

    mac_images, mac_labels = extract_images_and_labes(mac_df)
    pairs, labels = make_pairs(mac_images, mac_labels, all_images, all_labels)

    # Outlier model saved earlier for this device.
    filename = '../models2'
    model = joblib.load(filename + '/' + mac_label.replace(':','') + '.sav')

    # One single-feature row per pair, in pair order.
    encodings = [
        [compare_models(anchor, candidate, model, tag[0])]
        for (anchor, candidate), tag in zip(pairs, labels)
    ]
    labels_return = [tag[0] for tag in labels]

    model = train_and_save_C_model(encodings, labels_return, mac_label)
    

mac_df = 00:0c:29:9d:9e:9e
1 0 = 1 Normal
40 0 = 40 Rouge
5 5 = 0 Normal
40 5 = 35 Rouge
2 0 = 2 Normal
40 0 = 40 Rouge
5 8 = -3 Normal
40 8 = 32 Rouge
1 2 = -1 Normal
40 2 = 38 Rouge
2 3 = -1 Normal
40 3 = 37 Rouge
2 4 = -2 Normal
40 4 = 36 Rouge
2 4 = -2 Normal
40 4 = 36 Rouge
1 1 = 0 Normal
40 1 = 39 Rouge
3 2 = 1 Normal
37 2 = 35 Rouge
6 4 = 2 Normal
40 4 = 36 Rouge
3 2 = 1 Normal
40 2 = 38 Rouge
3 1 = 2 Normal
40 1 = 39 Rouge
3 14 = -11 Normal
40 14 = 26 Rouge
4 3 = 1 Normal
40 3 = 37 Rouge
1 1 = 0 Normal
40 1 = 39 Rouge
0 4 = -4 Normal
40 4 = 36 Rouge
3 2 = 1 Normal
40 2 = 38 Rouge
5 2 = 3 Normal
40 2 = 38 Rouge
2 14 = -12 Normal
40 14 = 26 Rouge
3 4 = -1 Normal
40 4 = 36 Rouge
4 1 = 3 Normal
40 1 = 39 Rouge
4 4 = 0 Normal
40 4 = 36 Rouge
3 4 = -1 Normal
40 4 = 36 Rouge
4 6 = -2 Normal
40 6 = 34 Rouge
2 7 = -5 Normal
40 7 = 33 Rouge
3 3 = 0 Normal
37 3 = 34 Rouge
3 3 = 0 Normal
40 3 = 37 Rouge
6 3 = 3 Normal
40 3 = 37 Rouge
6 5 = 1 Normal
40 5 = 35 Rouge
3 1 = 2 Normal
40 1 = 39 