# TMA attack detection for IoT

In [None]:
# Configurations
# Verbosity switch: enables the optional diagnostic statements guarded by
# `if printer:` in the cells below.
printer = False
# Path of the labelled connection log that is parsed in the data-loading section.
filename = 'conn.log.labeled'

The following scikit-learn classifiers support incremental learning (each implements `partial_fit`):
- [sklearn.naive_bayes.MultinomialNB](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB)
- [sklearn.naive_bayes.BernoulliNB](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html#sklearn.naive_bayes.BernoulliNB)
- [sklearn.linear_model.Perceptron](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Perceptron.html#sklearn.linear_model.Perceptron)
- [sklearn.linear_model.SGDClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier)
- [sklearn.linear_model.PassiveAggressiveClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html#sklearn.linear_model.PassiveAggressiveClassifier)
- [sklearn.neural_network.MLPClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier)



---
# Imports

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so the log file
# can be read from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd drive/My\ Drive
%ls

/content/drive/My Drive
[0m[01;34mBD[0m/  conn.log.labeled  conn.log.labeled_old  profile.jpg  [01;34mtfg-javier-frances[0m/


In [None]:
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
# from ttictoc import TicToc
import pandas as pd
import numpy as np
import joblib
try:
    from StringIO import StringIO ## for Python 2
except ImportError:
    from io import StringIO ## for Python 3

---
# Data loading

In [None]:
def read_csv(filename, headmark='#fields', comment='#', sep='\t'):
  """Load a commented, delimiter-separated log file into a DataFrame.

  Lines starting with ``comment`` are skipped, except the one starting with
  ``headmark``: that line carries the column names and becomes the CSV header
  once the marker (and the separator that follows it) is removed.

  Parameters
  ----------
  filename : str
      Path of the log file to read.
  headmark : str
      Prefix that identifies the header line.
  comment : str
      Prefix that identifies lines to ignore.
  sep : str
      Field separator, also passed to ``pandas.read_csv``.

  Returns
  -------
  pandas.DataFrame
      The parsed table.
  """
  kept = []
  # `with` guarantees the file handle is closed even if parsing fails.
  with open(filename) as handle:
    for line in handle:
      if not line.startswith(comment):
        kept.append(line)
      elif line.startswith(headmark):
        # Strip the marker by its real length (not a hard-coded 8) and then
        # the single separator that sits between the marker and the names.
        header = line[len(headmark):]
        if header.startswith(sep):
          header = header[len(sep):]
        kept.append(header)
  return pd.read_csv(StringIO("".join(kept)), sep=sep)

# Parse the configured log file into the working DataFrame used by the following cells.
dt = read_csv(filename)

## Data sizing & previewing

In [None]:
# Columns to discard before modelling; any that are absent are simply reported.
w = ['uid','id.orig_h','id.resp_h','proto','conn_state','history','service','duration','orig_bytes','resp_bytes','local_orig','local_resp']
for col in w:
  # Explicit membership test instead of a bare `except:`, which would also
  # hide unrelated failures (typos, KeyboardInterrupt, ...).
  if col in dt.columns:
    del dt[col]
    if printer: print('{} : deleted'.format(col))
  else:
    if printer: print('{} : not found'.format(col))

In [None]:
# Record the dataset dimensions; NROWS is reused below to compute the
# train/test split point.
NROWS, NCOLS = dt.shape
print()
print("- rows  =",NROWS)
print("- atrs =",NCOLS)
# A bare `dt.head()` nested inside an `if` is never rendered by the notebook
# (only a cell's final top-level expression is), so print it explicitly.
if printer: print(dt.head())


- rows  = 1008748
- atrs = 9


## Shuffle

In [None]:
# Shuffle the rows so the positional train/test split is not tied to the
# original row order. The fixed seed makes the split, and therefore every
# score reported below, reproducible across runs.
dt = pd.DataFrame(shuffle(dt, random_state=0))
# Print explicitly: an expression nested in an `if` is not auto-displayed.
if printer: print(dt.head())

## Single hold-out validation

In [None]:
# Split point of the hold-out validation: rows before THRESHOLD are used for
# training, rows from THRESHOLD onwards for testing.
THRESHOLD = (2 * NROWS) // 3
print('67% + 33%')
print(THRESHOLD, '+', NROWS - THRESHOLD)

67% + 33%
672498 + 336250


In [None]:
from sklearn import preprocessing

# The label column keeps this composite name because the header joins its
# last three field names with spaces, while the file is split on tabs.
# Labels stay as strings (no LabelEncoder step); every other column is
# converted to float so it can be used as a numeric feature.
for col in dt.columns:
  if col=='tunnel_parents   label   detailed-label':
    dt[col] = dt[col].astype('str')
  else:
    dt[col] = dt[col].astype('float')

# Print explicitly: an expression nested in an `if` is not auto-displayed.
if printer: print(dt.head())

---
# Classification methods

In [None]:
def PClassification(name, clf, saveFilename=False, loadFilename=False, ejTrain=True):
    """Train and/or evaluate a classifier on the hold-out split of the global ``dt``.

    The first ``THRESHOLD`` rows of ``dt`` form the training set and the
    remaining rows the test set. The last column is the label; all other
    columns are the features.

    Parameters
    ----------
    name : str
        Title printed above the results.
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is replaced by
        the loaded model when ``loadFilename`` is given, so any placeholder
        may be passed in that case.
    saveFilename : str or False
        If truthy, the model is written to this path with ``joblib.dump``
        after the optional training step.
    loadFilename : str or False
        If truthy, the model is read from this path with ``joblib.load``.
    ejTrain : bool
        If True, ``fit`` is called on the training split.
        NOTE(review): for a loaded model, ``fit`` normally retrains from
        scratch rather than continuing training - confirm this is intended.

    Raises
    ------
    ValueError
        If no usable estimator is available after the optional load step.

    Returns
    -------
    None
        The accuracy (in percent) and the confusion table are printed.
    """
    # Dataset: positional split, label in the last column
    split_row = THRESHOLD
    label_pos = dt.shape[1]-1
    train_data = dt.iloc[:split_row,:label_pos]
    train_answ = dt.iloc[:split_row,label_pos]
    check_data = dt.iloc[split_row:,:label_pos]
    check_answ = dt.iloc[split_row:,label_pos]

    # Load model
    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code - only load model files that you created yourself.
    if loadFilename: clf = joblib.load(loadFilename)

    # Fail early with a clear message instead of an AttributeError further down.
    if not hasattr(clf, 'predict'):
        raise ValueError('PClassification needs an estimator: pass `clf` or a valid `loadFilename`')

    # Train model
    if ejTrain: clf.fit(train_data, train_answ)

    # Save model
    if saveFilename: joblib.dump(clf, saveFilename)

    # Prediction: both series get a 0..n-1 index so crosstab aligns them row by row
    preds = pd.Series(clf.predict(check_data), name='preds')
    reals = check_answ.reset_index(drop=True).rename('reales')

    # Results
    print(name)
    print('acc: {:.2f}%'.format(100*accuracy_score(reals, preds)))
    print()
    # to_string() renders every column; the default repr elides the middle
    # columns with '...' because the class labels are very wide.
    print(pd.crosstab(reals, preds).to_string())
    print()

## Multinomial Naive-Bayes

In [None]:
%%time
# Multinomial Naive Bayes with default hyper-parameters, trained and scored
# on the hold-out split by PClassification.
from sklearn.naive_bayes import MultinomialNB
PClassification('MultinomialNB', MultinomialNB())

MultinomialNB
acc: 62.83%

preds                                            (empty)   Benign   -  ...  (empty)   Malicious   PartOfAHorizontalPortScan
reales                                                                 ...                                                 
(empty)   Benign   -                                           120342  ...                                            20544
(empty)   Malicious   C&C                                           0  ...                                                0
(empty)   Malicious   PartOfAHorizontalPortScan                     0  ...                                            90908

[3 rows x 3 columns]

CPU times: user 4.25 s, sys: 121 ms, total: 4.37 s
Wall time: 4.3 s


## Bernoulli Naive-Bayes

In [None]:
%%time
# Bernoulli Naive Bayes with default hyper-parameters, trained and scored
# on the hold-out split by PClassification.
from sklearn.naive_bayes import BernoulliNB
PClassification('BernoulliNB', BernoulliNB())

BernoulliNB
acc: 53.69%

preds                                            (empty)   Benign   -  ...  (empty)   Malicious   PartOfAHorizontalPortScan
reales                                                                 ...                                                 
(empty)   Benign   -                                             1095  ...                                           155709
(empty)   Malicious   C&C                                           0  ...                                                2
(empty)   Malicious   PartOfAHorizontalPortScan                     0  ...                                           179426

[3 rows x 3 columns]

CPU times: user 4.29 s, sys: 141 ms, total: 4.43 s
Wall time: 4.31 s


## Perceptron

In [None]:
%%time
# Perceptron with a fixed seed, trained and scored on the hold-out split.
# NOTE(review): the recorded output shows a single predicted class for every
# test row. The features are fed unscaled here, unlike the SGDClassifier cell
# (which uses a StandardScaler pipeline) - consider scaling.
from sklearn.linear_model import Perceptron
PClassification('Perceptron', Perceptron(tol=1e-3, random_state=0))

Perceptron
acc: 46.63%

preds                                            (empty)   Benign   -
reales                                                               
(empty)   Benign   -                                           156804
(empty)   Malicious   C&C                                           2
(empty)   Malicious   PartOfAHorizontalPortScan                179444

CPU times: user 4.21 s, sys: 106 ms, total: 4.31 s
Wall time: 4.19 s


## SGDClassifier

In [None]:
%%time
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
# Features are standardised before the linear model. A fixed random_state
# (the same seed as the Perceptron and PassiveAggressive cells) makes the
# reported score reproducible.
PClassification('SGDClassifier', make_pipeline(StandardScaler(), SGDClassifier(max_iter=1000, tol=1e-3, random_state=0)))

SGDClassifier
acc: 97.37%

preds                                            (empty)   Benign   -  (empty)   Malicious   PartOfAHorizontalPortScan
reales                                                                                                                
(empty)   Benign   -                                           148567                                             8237
(empty)   Malicious   C&C                                           2                                                0
(empty)   Malicious   PartOfAHorizontalPortScan                   620                                           178824

CPU times: user 4.11 s, sys: 121 ms, total: 4.23 s
Wall time: 4.13 s


## PassiveAggressiveClassifier

In [None]:
%%time
# Passive-Aggressive classifier with a fixed seed, trained and scored on the
# hold-out split.
# NOTE(review): the recorded output shows a single predicted class for every
# test row. The features are fed unscaled here, unlike the SGDClassifier cell
# (which uses a StandardScaler pipeline) - consider scaling.
from sklearn.linear_model import PassiveAggressiveClassifier
PClassification('PassiveAggressiveClassifier', PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3))

PassiveAggressiveClassifier
acc: 46.63%

preds                                            (empty)   Benign   -
reales                                                               
(empty)   Benign   -                                           156804
(empty)   Malicious   C&C                                           2
(empty)   Malicious   PartOfAHorizontalPortScan                179444

CPU times: user 3.85 s, sys: 95.9 ms, total: 3.95 s
Wall time: 3.84 s


## MLPClassifier

In [None]:
%%time
# Multi-layer perceptron (logistic activation, fixed seed) trained and scored
# on the hold-out split; verbose=True prints the loss of every iteration.
# NOTE(review): the recorded loss stays near 0.69 and a single class is
# predicted for every test row. The features are fed unscaled here -
# consider a StandardScaler pipeline as in the SGDClassifier cell.
from sklearn.neural_network import MLPClassifier
PClassification('MLPClassifier 1', MLPClassifier(max_iter=100, verbose=True, activation='logistic', random_state=1))
# Disabled follow-up runs: each would load the model saved by the previous
# run ('sav1', then 'sav2'), run it through PClassification again and save
# the result under the next name ('sav2', then 'sav3').
#PClassification('MLPClassifier 2', None, 'sav2', 'sav1')
#PClassification('MLPClassifier 3', 0, 'sav3', 'sav2')

Iteration 1, loss = 0.69352318
Iteration 2, loss = 0.69151566
Iteration 3, loss = 0.69150483
Iteration 4, loss = 0.69155671
Iteration 5, loss = 0.69160449
Iteration 6, loss = 0.69156550
Iteration 7, loss = 0.69170536
Iteration 8, loss = 0.69168704
Iteration 9, loss = 0.69180217
Iteration 10, loss = 0.69155103
Iteration 11, loss = 0.69165549
Iteration 12, loss = 0.69153093
Iteration 13, loss = 0.69150479
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
MLPClassifier 1
acc: 53.37%

preds                                            (empty)   Malicious   PartOfAHorizontalPortScan
reales                                                                                          
(empty)   Benign   -                                                                      156804
(empty)   Malicious   C&C                                                                      2
(empty)   Malicious   PartOfAHorizontalPortScan                                       