## SKLearn Practice!
CODE OBTAINED FROM wisepythagoras @ https://github.com/wisepythagoras/website-fingerprinting/blob/master/utils.py

This code appears to use a 40-dimensional feature made of packet sizes (one entry per packet, for the first 40 packets), as well as five one-dimensional features: the ratio of incoming to outgoing packets, the number of incoming packets, the number of outgoing packets, the total number of packets, and the total size of the incoming packets.

From this example, I think the way to combine a multidimensional feature with the single-dimensional features is simply to unpack the multidimensional one and put everything into one flat array. Our training data should then be an array of these combined feature arrays.

In [1]:
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from joblib import dump
import random

# Load scikit-learn's bundled Iris dataset.
iris = datasets.load_iris()

In [4]:
def shuffle(x, y):
    """ Shuffle two parallel sequences in place, keeping their elements paired.

    One random permutation is drawn with ``random.shuffle`` and applied to
    both sequences, so whatever was at ``x[i]`` and ``y[i]`` before the call
    ends up at the same (new) index in both afterwards.

    The previous implementation only swapped a randomly chosen element with
    its left-hand neighbour ``len(x) - 1`` times, which leaves most elements
    at or next to their original position. When the input is grouped by
    label, a head/tail split of the result is then heavily biased.

    Args:
        x: Mutable sequence of samples; must support ``len``, indexing and
            slice assignment (e.g. a ``list``).
        y: Mutable sequence of labels with the same length as ``x``.

    Returns:
        The tuple ``(x, y)``: the same two objects that were passed in,
        now reordered.

    Raises:
        ValueError: If ``x`` and ``y`` do not have the same length.
    """

    if len(x) != len(y):
        raise ValueError(
            "x and y must have the same length (got {} and {})".format(len(x), len(y))
        )

    # Draw a single permutation of the indices and reuse it for both
    # sequences so that samples and labels stay aligned.
    order = list(range(len(x)))
    random.shuffle(order)

    # Build the reordered copies first and only then write them back, so no
    # element is overwritten before it has been read.
    shuffled_x = [x[i] for i in order]
    shuffled_y = [y[i] for i in order]
    x[:] = shuffled_x
    y[:] = shuffled_y

    return x, y


def train(streams, labels, train_fraction=0.9, model_path="./classifier-nb.dmp"):
    """ Train a k-nearest-neighbours classifier and print its hold-out accuracy.

    The samples are shuffled, the first ``train_fraction`` of them is used to
    fit a ``KNeighborsClassifier`` (default settings), the fitted model is
    written to ``model_path`` with joblib, and the accuracy on the remaining
    samples is printed.

    Args:
        streams: Sequence of feature vectors, one per sample. It is handed to
            ``shuffle`` together with ``labels`` before splitting.
        labels: Sequence of class labels, parallel to ``streams``.
        train_fraction: Fraction of the samples used for training; must be
            strictly between 0 and 1. Defaults to 0.9 (a 90/10 split).
        model_path: Where the fitted classifier is dumped. The default file
            name says "nb" although the model is a KNN classifier; it is kept
            unchanged so existing snapshots/loaders keep working.

    Raises:
        ValueError: If ``train_fraction`` is not in (0, 1), or if there are
            too few samples for the training split to be non-empty.
    """

    if not 0 < train_fraction < 1:
        raise ValueError(
            "train_fraction must be strictly between 0 and 1, got {}".format(train_fraction)
        )

    # Shuffle the arrays so the split below is not ordered by label.
    streams, labels = shuffle(streams, labels)

    # TODO: I should do 10-fold cross validation like they do in the paper, this is like 90/10 but 10 times
    # there are sklearn methods to do this
    # NOTE: I think these streams / training data ratios may be wrong for me

    stream_amount = len(streams)
    training_size = int(stream_amount * train_fraction)

    if training_size == 0:
        raise ValueError(
            "Not enough streams ({}) to build a training set".format(stream_amount)
        )

    # The first `train_fraction` of the shuffled samples is the training set.
    training_x = streams[:training_size]
    training_y = labels[:training_size]

    # Everything after that is held out for testing.
    testing_x = streams[training_size:]
    testing_y = labels[training_size:]

    print("Training size: {}".format(training_size))
    print("Testing size:  {}".format(stream_amount - training_size))

    # Initialize the classifier.
    # NOTE: I will not use KNeighbours, instead random forest
    clf = KNeighborsClassifier()

    # Now lets train our KNN classifier.
    clf = clf.fit(training_x, training_y)

    # Save a snapshot of this classifier.
    dump(clf, model_path, compress=9)

    # Get the prediction for the held-out samples and report the accuracy.
    predictions = clf.predict(testing_x)

    print("Accuracy: %s%%" % (accuracy_score(testing_y, predictions) * 100,))

{'data': array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
     

In [None]:
# SOME CODE I FOUND ON THE WEB

import os
import re
import sys
import json
import utils


# Read the configuration and start training.
# NOTE: config stores a list of the websites that have been captured (more specifically the domain names)
with open('config.json') as fp:
    print("* Parsing configuration")

    # Load the configuration from the file.
    config = json.load(fp)

# The config file is only needed for json.load, so it is closed before the
# (potentially long) pcap processing starts.

# This is where all the streams (one feature vector per pcap) are going to live.
streams = []

# This is where all the labels are going to live: `labels` holds the numeric
# label of each stream, `labels_str` the matching domain name.
labels = []
labels_str = []

# base_labels[n - 1] is the domain that numeric label n stands for.
base_labels = list(config['pcaps'])

# NOTE: this would not match my format, also re == regex
# Raw string so that `\.` reaches the regex engine as an escaped dot instead
# of being treated as an (invalid) string escape.
pat = re.compile(r".*-curl\.pcap$")

#utils.empty_csv()

# Labels start from 1 and increment with every domain.
for current_label, domain in enumerate(config['pcaps'], start=1):
    print(" - {}".format(domain))
    i = 0

    # Traverse the directory for all the pcaps.
    # NOTE: my files don't match this format
    domain_dir = './pcaps/{}'.format(domain)

    for filename in os.listdir(domain_dir):
        # Only *.pcap captures are used, and those ending in -curl.pcap are skipped.
        if not filename.endswith(".pcap") or pat.match(filename) is not None:
            continue

        # (debug) uncomment to cap the number of captures read per domain:
        # if i > 20:
        #     break

        # This is the pcap file we'll be reading at this point.
        pcap_path = os.path.join(domain_dir, filename)

        # Read the pcap file.
        data = utils.read_pcap_file(pcap_path)

        # Append the data to the streams array.
        streams.append(data)

        # Append everything to the log.
        # NOTE(review): the logging helpers shown in this notebook are named
        # empty_log/append_to_log -- confirm that utils exposes append_to_csv.
        utils.append_to_csv(domain, data)

        # Add a label for the new file.
        labels.append(current_label)
        labels_str.append(domain)

        i += 1

    print(f"    {i} pcap files")

# Finally train the classifier.
utils.train(streams, labels)

In [None]:
## FOR LOGGING

def empty_log():
    """ Reset ./fingerprints.csv to an empty file (creating it if needed). """

    # Opening in write mode truncates the file; there is nothing to write.
    with open("./fingerprints.csv", mode='w'):
        pass


def append_to_log(domain, data):
    """ Add one row to ./fingerprints.csv: the domain followed by every value in data. """

    # Comma-separate the values; an empty `data` leaves just "<domain>,".
    joined_values = ','.join(map(str, data))
    row = f"{domain},{joined_values}\n"

    with open("./fingerprints.csv", 'a') as log_file:
        log_file.write(row)


In [None]:
## FOR CONTEXT THIS IS WHAT THEIR READ PCAP FUNC LOOKS LIKE:

def read_pcap_file(file):
    """ Read a pcap file and return its fingerprint feature vector.

    The source address of the first usable packet is taken as the "outgoing"
    side; every later packet with a different source counts as incoming.

    Args:
        file: Path of the pcap file to read.

    Returns:
        A list of 45 numbers, in this order:

        * ``[0]``    total size of the incoming packets
        * ``[1]``    total number of packets
        * ``[2]``    number of outgoing packets
        * ``[3]``    number of incoming packets
        * ``[4]``    incoming/outgoing packet ratio (the divisor is 1 when
          there are no outgoing packets)
        * ``[5:45]`` sizes (``len`` of the captured buffer) of the first 40
          packets, positive for outgoing and negative for incoming,
          zero-padded when the capture has fewer than 40 packets
    """

    # This is the array that will contain the signed sizes of the first 40 packets.
    sizes = [0] * 40
    i = 0

    # Hold the address of the outgoing agent.
    outgoing_addr = None

    outgoing_packets = 0
    incoming_packets = 0
    total_number_of_packets = 0

    # This will contain the total size of the incoming packets.
    incoming_size = 0

    # Open the capture in a context manager so the handle is closed even if
    # parsing fails part-way through (it was previously never closed).
    with open(file, 'rb') as fp:
        # Create the pcap object
        pcap = dpkt.pcap.Reader(fp)

        # Loop through all the packets and save the sizes.
        for ts, buf in pcap:
            packet_size = len(buf)
            is_outgoing = True

            # Parse the Ethernet packet.
            eth = dpkt.ethernet.Ethernet(buf)

            # Parse the IP packet.
            ip = eth.data

            # A payload without a `src` attribute cannot be assigned a
            # direction; accessing `ip.src` on it used to raise
            # AttributeError and abort the whole file, so skip it instead.
            if not hasattr(ip, 'src'):
                continue

            # Get the source addresses.
            src = inet_to_str(ip.src)

            if total_number_of_packets == 0:
                # Get the address of the outgoing agents. The target user is the
                # outgoing agent, and the incoming packets are the server/website.
                outgoing_addr = src
                outgoing_packets += 1

            elif src == outgoing_addr:
                # Increment the outgoing packets.
                outgoing_packets += 1

            else:
                # Increment the incoming packets and their accumulated size.
                incoming_packets += 1
                incoming_size += packet_size

                # This is an incoming packet.
                is_outgoing = False

            if i < 40:
                # Add the size to the array; the sign encodes the direction.
                sizes[i] = packet_size if is_outgoing else -packet_size

                # Increment the index.
                i += 1

            # Increment the total amount of packets.
            total_number_of_packets += 1

    # Get the ratio.
    ratio = float(incoming_packets) / (outgoing_packets if outgoing_packets != 0 else 1)

    # Print some details.
    print(f'OUT: {outgoing_packets},' +
            f'IN: {incoming_packets},' +
            f'TOTAL: {total_number_of_packets},' +
            f'SIZE: {incoming_size},' +
            f'RATIO: {ratio}')

    # Put the summary features in front of the per-packet sizes. This is the
    # same order the earlier reverse/append/reverse sequence produced.
    summary = [
        incoming_size,
        total_number_of_packets,
        outgoing_packets,
        incoming_packets,
        ratio,
    ]

    # Finally return the combined feature vector.
    return summary + sizes
