# TMA attack detection for IoT

In [None]:
# Configurations
# printer: verbosity switch; the cells below guard their optional output
# (per-column deletion messages, DataFrame previews) with `if printer:`.
printer = False
# filename: path of the labeled connection log handed to read_csv().
filename = 'conn.log.labeled'

---
# Imports

In [None]:
# Mount the user's Google Drive at /content/drive (Colab-only module).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Move into the "My Drive" folder (relative path) and list its contents,
# so that the relative `filename` configured above can be resolved from here.
%cd drive/My\ Drive
%ls

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
# from ttictoc import TicToc
import pandas as pd
import numpy as np
import joblib
try:
    from StringIO import StringIO ## for Python 2
except ImportError:
    from io import StringIO ## for Python 3

---
# Data loading

In [None]:
def read_csv(filename, headmark='#fields', comment='#', sep='\t'):
    """Load a delimited log file that carries its header in a comment line.

    Every line starting with ``comment`` is discarded, except the line that
    starts with ``headmark``: the marker and the single separator character
    that follows it are removed and the remainder is used as the header row.
    All other lines are passed through unchanged as data rows.

    Args:
        filename: Path of the text file to read.
        headmark: Prefix identifying the header line. It is only looked for
            among lines that start with ``comment``.
        comment: Prefix identifying metadata lines to drop.
        sep: Field separator forwarded to ``pandas.read_csv``.

    Returns:
        pandas.DataFrame built from the header line and the data lines.
    """
    # Strip the marker plus the one separator character right after it.
    # With the default marker this is the 8 characters of "#fields<sep>".
    header_offset = len(headmark) + 1

    kept = []
    # Context manager so the handle is closed even if reading fails.
    with open(filename) as handle:
        for line in handle:
            if not line.startswith(comment):
                kept.append(line)
            elif line.startswith(headmark):
                kept.append(line[header_offset:])
    return pd.read_csv(StringIO("".join(kept)), sep=sep)

# Parse the configured log file into the working DataFrame `dt`,
# which every later cell reads and modifies.
dt = read_csv(filename)

## Data sizing & previewing

In [None]:
# Columns removed from `dt`; the remaining columns are what the later cells work on.
w = ['uid','id.orig_h','id.resp_h','proto','conn_state','history','service','duration','orig_bytes','resp_bytes','local_orig','local_resp']
for col in w:
    try:
        del dt[col]
    except KeyError:
        # Only a missing column is tolerated; any other error now surfaces
        # instead of being hidden by a bare `except`.
        if printer: print('{} : not found'.format(col))
    else:
        if printer: print('{} : deleted'.format(col))

In [None]:
# Record the table dimensions; NROWS is reused later to compute the split point.
NROWS, NCOLS = dt.shape
print()
print("- rows  =",NROWS)
print("- atrs =",NCOLS)
# A bare `dt.head()` nested in an `if` is never rendered by the notebook
# (only a trailing top-level expression is), so print the preview explicitly.
if printer: print(dt.head())

## Shuffle

In [None]:
# Randomize the row order (no seed is set, so each run differs); the later
# train/check split is positional and therefore depends on this order.
dt = pd.DataFrame(shuffle(dt))
# A bare `dt.head()` nested in an `if` is never rendered by the notebook,
# so print the preview explicitly.
if printer: print(dt.head())

## Single validation

In [None]:
# Positional split point: rows before THRESHOLD (two thirds, rounded down)
# form the training part, the rest the check part.
THRESHOLD = (2 * NROWS) // 3
print('67% + 33%')
print('{} + {}'.format(THRESHOLD, NROWS - THRESHOLD))

In [None]:
from sklearn import preprocessing

# The column with this exact (space-separated) name is kept as strings;
# presumably it is the class label -- it is the last column the classifiers
# later use as target. Every other column is cast to float.
label_col = 'tunnel_parents   label   detailed-label'

for col in dt.columns:
    dt[col] = dt[col].astype('str' if col == label_col else 'float')

# A bare `dt.head()` nested in an `if` is never rendered by the notebook,
# so print the preview explicitly.
if printer: print(dt.head())

---
# Classification methods

In [None]:
def PClassification(name, clf, saveFilename=False, loadFilename=False, ejTrain=True):
    """Evaluate a classifier on the positional split of ``dt`` and print a report.

    The first ``THRESHOLD`` rows of the global ``dt`` are the training part
    and the remaining rows the check part; the last column is the target and
    all preceding columns are the features.

    Args:
        name: Title printed above the report.
        clf: Estimator exposing ``fit`` and ``predict``.
        saveFilename: If truthy, path the model is dumped to with joblib
            (after the optional training step).
        loadFilename: If truthy, path a model is loaded from with joblib;
            the loaded model replaces ``clf``.
        ejTrain: If truthy, the model is fitted on the training part.

    Returns:
        The model that produced the predictions.
    """
    # Separate features from target first, then cut both at the split point.
    target_pos = dt.shape[1] - 1
    features = dt.iloc[:, :target_pos]
    target = dt.iloc[:, target_pos]
    train_data, check_data = features.iloc[:THRESHOLD], features.iloc[THRESHOLD:]
    train_answ, check_answ = target.iloc[:THRESHOLD], target.iloc[THRESHOLD:]

    # Optional load -> train -> save pipeline, in that order.
    if loadFilename:
        clf = joblib.load(loadFilename)
    if ejTrain:
        clf.fit(train_data, train_answ)
    if saveFilename:
        joblib.dump(clf, saveFilename)

    # Re-index the true labels from 0 so they line up with the predictions.
    preds = pd.Series(clf.predict(check_data), name='preds')
    reals = check_answ.reset_index(drop=True).rename('reales')

    # Report: title, accuracy percentage, then the confusion table.
    accuracy_pct = 100 * accuracy_score(reals, preds)
    print(name)
    print('acc: {:.2f}%'.format(accuracy_pct))
    print()
    print(pd.crosstab(reals, preds))
    print()
    return clf

## Decision Tree

In [None]:
%%time
# Fit a decision tree with default hyper-parameters and print its report;
# the fitted model returned by PClassification is the cell's output value.
from sklearn.tree import DecisionTreeClassifier
PClassification('Decision Tree', DecisionTreeClassifier())

## Random Forest (n_est=10)

In [None]:
%%time
# Fit a 10-tree random forest and print its report;
# the fitted model returned by PClassification is the cell's output value.
from sklearn.ensemble import RandomForestClassifier
PClassification('Random Forest', RandomForestClassifier(n_estimators=10))

---
# Testing parameters

In [None]:
def classification(clf):
    """Fit ``clf`` on the training part of ``dt`` and score it on the check part.

    Uses the same positional split as the rest of the notebook: the first
    ``THRESHOLD`` rows of the global ``dt`` train the model, the remaining
    rows are predicted; the last column is the target.

    Args:
        clf: Estimator exposing ``fit`` and ``predict``; it is fitted in place.

    Returns:
        Accuracy on the check part as a percentage rounded to two decimals.
    """
    target_pos = dt.shape[1] - 1
    features = dt.iloc[:, :target_pos]
    target = dt.iloc[:, target_pos]

    clf.fit(features.iloc[:THRESHOLD], target.iloc[:THRESHOLD])

    # Re-index the true labels from 0 so they line up with the predictions.
    preds = pd.Series(clf.predict(features.iloc[THRESHOLD:]), name='preds')
    reals = target.iloc[THRESHOLD:].reset_index(drop=True).rename('reales')

    return round(100 * accuracy_score(reals, preds), 2)

## Decision Tree (max_depth)

In [None]:
# Benchmark decision trees for max_depth 1..10: mean time per fit/predict run
# and trimmed-mean accuracy over N repetitions.
# Timing uses the standard library because the TicToc import is commented out
# in the imports cell, which made `TicToc()` raise NameError here.
import time

df = pd.DataFrame(columns=['max_depth', 'time(ms)', 'acc(%)'])
N = 100
trim = N // 20  # runs dropped at each end of the sorted scores (5% per side)

for i in range(1,10+1):
    start = time.perf_counter()
    v = [classification(DecisionTreeClassifier(max_depth=i)) for j in range(N)]
    elapsed = time.perf_counter() - start
    # Slice with an explicit upper bound: `[trim:-trim]` would be empty when trim == 0.
    r = round(np.mean(sorted(v)[trim:N - trim]),2)
    tm = round(1000*elapsed/N,2)  # seconds for N runs -> milliseconds per run
    df.loc[i-1] = [i,tm,r]

In [None]:
# Display the benchmark table (max_depth, time per run, accuracy).
df

In [None]:
# Plot columns 1 and 2 (time, accuracy) against column 0 (max_depth),
# one panel per series with a shared x axis.
p = df.plot(
    x=0,
    y=[1, 2],
    title='Decision Tree',
    subplots=True,
    sharex=True,
    legend=True,
    grid=True,
)

## Random Forest (max_depth)

In [None]:
# Benchmark 10-tree random forests for max_depth 1..10: mean time per
# fit/predict run and trimmed-mean accuracy over N repetitions.
# Timing uses the standard library because the TicToc import is commented out
# in the imports cell, which made `TicToc()` raise NameError here.
import time

df = pd.DataFrame(columns=['max_depth', 'time(ms)', 'acc(%)'])
N = 100
trim = N // 20  # runs dropped at each end of the sorted scores (5% per side)

for i in range(1,10+1):
    start = time.perf_counter()
    v = [classification(RandomForestClassifier(n_estimators=10,max_depth=i)) for j in range(N)]
    # Stop the clock before computing the statistic so only model work is timed.
    elapsed = time.perf_counter() - start
    # Slice with an explicit upper bound: `[trim:-trim]` would be empty when trim == 0.
    r = round(np.mean(sorted(v)[trim:N - trim]),2)
    tm = round(1000*elapsed/N,2)  # seconds for N runs -> milliseconds per run
    df.loc[i-1] = [i,tm,r]

In [None]:
# Display the benchmark table (max_depth, time per run, accuracy).
df

In [None]:
# Plot columns 1 and 2 (time, accuracy) against column 0 (max_depth),
# one panel per series with a shared x axis.
p = df.plot(
    x=0,
    y=[1, 2],
    title='Random Forest',
    subplots=True,
    sharex=True,
    legend=True,
    grid=True,
)