In [1]:
import pandas as pd
import io
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import tensorflow.contrib.learn as skflow
import string


#from sklearn.utils.multiclass import unique_labels
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_curve, auc, average_precision_score, precision_recall_curve
from inspect import signature
%matplotlib inline

path = "UNSW-NB15_1.csv"
# This file is a CSV, just no CSV extension or headers
df = pd.read_csv(path, header=None)
print(df[0:3])

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
 # For now, just drop NA's (rows with missing values)

# The CSV file has no column heads, so add them
df.columns = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbyte',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans',
    'res',
    'Sjit',
    'Djit',
    'Stime', #QUITAR
    'Ltime', #QUITAR
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm', 
    'ct_src_dport_ltm',
    'ct_dst_sport_lt', 
    'ct_dst_src_ltm',
    'attack_cat',
    'Label'
]

df.dropna(inplace=True,axis=1)

Using TensorFlow backend.
  interactivity=interactivity, compiler=compiler, result=result)


           0      1              2     3    4    5         6    7    8   9   \
0  59.166.0.0   1390  149.171.126.6    53  udp  CON  0.001055  132  164  31   
1  59.166.0.0  33661  149.171.126.9  1024  udp  CON  0.036133  528  304  31   
2  59.166.0.6   1464  149.171.126.7    53  udp  CON  0.001119  146  178  31   

   ...  39  40  41 42  43  44  45  46   47  48  
0  ...   0   3   7  1   3   1   1   1  NaN   0  
1  ...   0   2   4  2   3   1   1   2  NaN   0  
2  ...   0  12   8  1   2   2   1   1  NaN   0  

[3 rows x 49 columns]
Read 700001 rows.


In [2]:
df[0:6]

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbyte,dbytes,sttl,...,is_ftp_login,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_lt,ct_dst_src_ltm,Label
0,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,0,0,3,7,1,3,1,1,1,0
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0,0,2,4,2,3,1,1,2,0
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0,0,12,8,1,2,2,1,1,0
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0,0,6,9,1,1,1,1,1,0
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0,0,7,9,1,1,1,1,1,0
5,59.166.0.0,32119,149.171.126.9,111,udp,CON,0.078339,568,312,31,...,0,0,2,4,2,3,1,1,2,0


In [3]:
ENCODING = 'utf-8'

def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))
        
def analyze(filename):
    print()
    print("Analyzing: {}".format(filename))
    df = pd.read_csv(filename,encoding=ENCODING)
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])

#analyze(path)


# display 5 rows
# df[0:5]

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

#Clean 'date' column and convert to Int type
def clean_date(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_removed = s.replace(" ", "")
    s_int = int(s_removed)
    return s_int

########## CLEAN IP #######################
def clean_ip(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_int = int(s)
    return s_int

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

#MINMAX -1 1
# Encode a column to a range between normalized_low and normalized_high.
def min_max_1(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

#MINMAX 0 1
def min_max_0(df, name, normalized_low=0, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

    
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    # Regression
    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [4]:

df['srcip'] = df['srcip'].apply(clean_ip)
df['dstip'] = df['dstip'].apply(clean_ip)

In [5]:

min_max_0(df, 'srcip')#
encode_numeric_zscore((df, 'sport')

SyntaxError: unexpected EOF while parsing (<ipython-input-5-ae55868a1ecf>, line 3)

In [6]:
encode_numeric_zscore(df, 'sport')

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [7]:
 df.dtypes

srcip                 int64
sport                object
dstip                 int64
dsport               object
proto                object
state                object
dur                 float64
sbyte                 int64
dbytes                int64
sttl                  int64
dttl                  int64
sloss                 int64
dloss                 int64
service              object
Sload               float64
Dload               float64
Spkts                 int64
Dpkts                 int64
swin                  int64
dwin                  int64
stcpb                 int64
dtcpb                 int64
smeansz               int64
dmeansz               int64
trans                 int64
res                   int64
Sjit                float64
Djit                float64
Stime                 int64
Ltime                 int64
Sintpkt             float64
Dintpkt             float64
tcprtt              float64
synack              float64
ackdat              float64
is_sm_ips_ports     

In [None]:
encode_text_dummy(df, 'sport')

In [1]:
sport = df['sport']

NameError: name 'df' is not defined

In [2]:
df[0:3]

NameError: name 'df' is not defined

In [4]:
import pandas as pd
import io
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import tensorflow.contrib.learn as skflow
import string


#from sklearn.utils.multiclass import unique_labels
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_curve, auc, average_precision_score, precision_recall_curve
from inspect import signature
%matplotlib inline

path = "UNSW-NB15_2.csv"
# This file is a CSV, just no CSV extension or headers
df = pd.read_csv(path, header=None)
print(df[0:3])

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
 # For now, just drop NA's (rows with missing values)

# The CSV file has no column heads, so add them
df.columns = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbyte',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans',
    'res',
    'Sjit',
    'Djit',
    'Stime', #QUITAR
    'Ltime', #QUITAR
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm', 
    'ct_src_dport_ltm',
    'ct_dst_sport_lt', 
    'ct_dst_src_ltm',
    'attack_cat',
    'Label'
]

df.dropna(inplace=True,axis=1)

df.drop('sport', 1, inplace=True)#
df.drop('dsport', 1, inplace=True)#
df.drop('state', 1, inplace=True)#
df.drop('dur', 1, inplace=True)#
df.drop('Sload', 1, inplace=True)#
df.drop('Dload', 1, inplace=True)#
df.drop('Stime', 1, inplace=True)#
df.drop('Ltime', 1, inplace=True)#
df.drop('ct_src_ltm', 1, inplace=True)#

ENCODING = 'utf-8'

def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))
        
def analyze(filename):
    print()
    print("Analyzing: {}".format(filename))
    df = pd.read_csv(filename,encoding=ENCODING)
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])

#analyze(path)


# display 5 rows
# df[0:5]

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

#Clean 'date' column and convert to Int type
def clean_date(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_removed = s.replace(" ", "")
    s_int = int(s_removed)
    return s_int

########## CLEAN IP #######################
def clean_ip(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_int = int(s)
    return s_int

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

#MINMAX -1 1
# Encode a column to a range between normalized_low and normalized_high.
def min_max_1(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

#MINMAX 0 1
def min_max_0(df, name, normalized_low=0, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

    
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    # Regression
    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


  interactivity=interactivity, compiler=compiler, result=result)


           0      1              2      3    4    5         6     7      8   \
0  59.166.0.0   6055  149.171.126.5  54145  tcp  FIN  0.072974  4238  60788   
1  59.166.0.0   7832  149.171.126.3   5607  tcp  FIN  0.144951  5174  91072   
2  59.166.0.8  11397  149.171.126.6     21  tcp  FIN  0.116107  2934   3742   

   9   ...  39  40  41 42  43  44  45  46   47  48  
0  31  ...   0  13  13  6   7   1   1   2  NaN   0  
1  31  ...   0  13  13  6   7   1   1   2  NaN   0  
2  31  ...   1   1   2  7   5   1   1   4  NaN   0  

[3 rows x 49 columns]
Read 700001 rows.


In [5]:
sport = df['sport']

KeyError: 'sport'

In [6]:
import pandas as pd
import io
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import tensorflow.contrib.learn as skflow
import string


#from sklearn.utils.multiclass import unique_labels
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_curve, auc, average_precision_score, precision_recall_curve
from inspect import signature
%matplotlib inline

path = "UNSW-NB15_2.csv"
# This file is a CSV, just no CSV extension or headers
df = pd.read_csv(path, header=None)
print(df[0:3])

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
 # For now, just drop NA's (rows with missing values)

# The CSV file has no column heads, so add them
df.columns = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbyte',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans',
    'res',
    'Sjit',
    'Djit',
    'Stime', #QUITAR
    'Ltime', #QUITAR
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm', 
    'ct_src_dport_ltm',
    'ct_dst_sport_lt', 
    'ct_dst_src_ltm',
    'attack_cat',
    'Label'
]

  interactivity=interactivity, compiler=compiler, result=result)


           0      1              2      3    4    5         6     7      8   \
0  59.166.0.0   6055  149.171.126.5  54145  tcp  FIN  0.072974  4238  60788   
1  59.166.0.0   7832  149.171.126.3   5607  tcp  FIN  0.144951  5174  91072   
2  59.166.0.8  11397  149.171.126.6     21  tcp  FIN  0.116107  2934   3742   

   9   ...  39  40  41 42  43  44  45  46   47  48  
0  31  ...   0  13  13  6   7   1   1   2  NaN   0  
1  31  ...   0  13  13  6   7   1   1   2  NaN   0  
2  31  ...   1   1   2  7   5   1   1   4  NaN   0  

[3 rows x 49 columns]
Read 700001 rows.


In [7]:
sport = df['sport']

In [9]:
sport.astype('int64')

0          6055
1          7832
2         11397
3          3804
4         14339
5         39094
6         10845
7         45642
8          1931
9         25724
10        49668
11        14951
12        27545
13        56591
14        27855
15        50296
16        30115
17         5573
18        23062
19        34913
20        16182
21        60312
22        20134
23        50600
24        55009
25        56851
26        51278
27         8780
28        63194
29         8844
          ...  
699971    48965
699972     1042
699973    28354
699974    18455
699975    47831
699976    35609
699977    33127
699978    40487
699979    12841
699980    27449
699981     2020
699982    49344
699983    15030
699984    63132
699985    61247
699986     6870
699987     7894
699988    10143
699989    61561
699990     1043
699991     1043
699992    27775
699993     1043
699994     1043
699995     1043
699996     1043
699997     1043
699998     1043
699999     1043
700000    18247
Name: sport, Length: 700

In [10]:
df['sport'].astype('int64')

0          6055
1          7832
2         11397
3          3804
4         14339
5         39094
6         10845
7         45642
8          1931
9         25724
10        49668
11        14951
12        27545
13        56591
14        27855
15        50296
16        30115
17         5573
18        23062
19        34913
20        16182
21        60312
22        20134
23        50600
24        55009
25        56851
26        51278
27         8780
28        63194
29         8844
          ...  
699971    48965
699972     1042
699973    28354
699974    18455
699975    47831
699976    35609
699977    33127
699978    40487
699979    12841
699980    27449
699981     2020
699982    49344
699983    15030
699984    63132
699985    61247
699986     6870
699987     7894
699988    10143
699989    61561
699990     1043
699991     1043
699992    27775
699993     1043
699994     1043
699995     1043
699996     1043
699997     1043
699998     1043
699999     1043
700000    18247
Name: sport, Length: 700

In [11]:
encode_text_zscore(df, 'sport')

NameError: name 'encode_text_zscore' is not defined

In [12]:
import pandas as pd
import io
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import tensorflow.contrib.learn as skflow
import string


#from sklearn.utils.multiclass import unique_labels
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_curve, auc, average_precision_score, precision_recall_curve
from inspect import signature
%matplotlib inline

path = "UNSW-NB15_2.csv"
# This file is a CSV, just no CSV extension or headers
df = pd.read_csv(path, header=None)
print(df[0:3])

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
 # For now, just drop NA's (rows with missing values)

# The CSV file has no column heads, so add them
df.columns = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbyte',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans',
    'res',
    'Sjit',
    'Djit',
    'Stime', #QUITAR
    'Ltime', #QUITAR
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm', 
    'ct_src_dport_ltm',
    'ct_dst_sport_lt', 
    'ct_dst_src_ltm',
    'attack_cat',
    'Label'
]

df.dropna(inplace=True,axis=1)

  interactivity=interactivity, compiler=compiler, result=result)


           0      1              2      3    4    5         6     7      8   \
0  59.166.0.0   6055  149.171.126.5  54145  tcp  FIN  0.072974  4238  60788   
1  59.166.0.0   7832  149.171.126.3   5607  tcp  FIN  0.144951  5174  91072   
2  59.166.0.8  11397  149.171.126.6     21  tcp  FIN  0.116107  2934   3742   

   9   ...  39  40  41 42  43  44  45  46   47  48  
0  31  ...   0  13  13  6   7   1   1   2  NaN   0  
1  31  ...   0  13  13  6   7   1   1   2  NaN   0  
2  31  ...   1   1   2  7   5   1   1   4  NaN   0  

[3 rows x 49 columns]
Read 700001 rows.


In [13]:
ENCODING = 'utf-8'

def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))
        
def analyze(filename):
    print()
    print("Analyzing: {}".format(filename))
    df = pd.read_csv(filename,encoding=ENCODING)
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])

#analyze(path)


# display 5 rows
# df[0:5]

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

#Clean 'date' column and convert to Int type
def clean_date(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_removed = s.replace(" ", "")
    s_int = int(s_removed)
    return s_int

########## CLEAN IP #######################
def clean_ip(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_int = int(s)
    return s_int

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

#MINMAX -1 1
# Encode a column to a range between normalized_low and normalized_high.
def min_max_1(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

#MINMAX 0 1
def min_max_0(df, name, normalized_low=0, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

    
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    # Regression
    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')




df['srcip'] = df['srcip'].apply(clean_ip)
df['dstip'] = df['dstip'].apply(clean_ip)

In [14]:
df['sport'].astype('int64')

0          6055
1          7832
2         11397
3          3804
4         14339
5         39094
6         10845
7         45642
8          1931
9         25724
10        49668
11        14951
12        27545
13        56591
14        27855
15        50296
16        30115
17         5573
18        23062
19        34913
20        16182
21        60312
22        20134
23        50600
24        55009
25        56851
26        51278
27         8780
28        63194
29         8844
          ...  
699971    48965
699972     1042
699973    28354
699974    18455
699975    47831
699976    35609
699977    33127
699978    40487
699979    12841
699980    27449
699981     2020
699982    49344
699983    15030
699984    63132
699985    61247
699986     6870
699987     7894
699988    10143
699989    61561
699990     1043
699991     1043
699992    27775
699993     1043
699994     1043
699995     1043
699996     1043
699997     1043
699998     1043
699999     1043
700000    18247
Name: sport, Length: 700

In [15]:
df['sport'] = df['sport'].astype('int64')

In [16]:
encode_numeric_zscore(df, 'sport')

In [17]:
encode_numeric_zscore(df, 'dur')

In [18]:
df[0:10]

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbyte,dbytes,sttl,...,ct_state_ttl,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_lt,ct_dst_src_ltm,Label
0,5916600,-1.270343,1491711265,54145,tcp,FIN,-0.163935,4238,60788,31,...,0,0,13,13,6,7,1,1,2,0
1,5916600,-1.18103,1491711263,5607,tcp,FIN,-0.140845,5174,91072,31,...,0,0,13,13,6,7,1,1,2,0
2,5916608,-1.00185,1491711266,21,tcp,FIN,-0.150098,2934,3742,31,...,0,1,1,2,7,5,1,1,4,0
3,5916600,-1.38348,1491711263,53,udp,CON,-0.187029,146,178,31,...,0,0,13,13,6,7,1,1,2,0
4,5916608,-0.853982,1491711266,14724,tcp,FIN,-0.175001,8928,320,31,...,0,0,8,20,7,5,1,1,4,0
5,5916608,0.390226,1491711263,53,udp,CON,-0.187016,130,162,31,...,0,0,8,13,6,5,1,1,1,0
6,5916600,-1.029594,1491711267,5190,tcp,FIN,-0.185534,1064,2260,31,...,0,0,13,9,1,7,2,1,1,0
7,5916603,0.719334,1491711265,80,tcp,FIN,-0.180923,1036,824,31,...,0,0,18,13,6,5,2,1,1,0
8,5916604,-1.477619,1491711266,6881,tcp,FIN,0.863609,13766,548216,31,...,0,0,12,20,7,2,1,1,1,0
9,5916608,-0.281762,1491711266,5190,tcp,FIN,-0.138169,1272,2572,31,...,0,0,8,20,2,1,1,1,1,0


In [19]:
encode_numeric_zscore(df, 'Sload')

In [20]:
encode_numeric_zscore(df, 'Dload')

In [21]:
encode_numeric_zscore(df, 'is_sm_ips_ports')

In [23]:
encode_numeric_zscore(df, 'is_ftp_login')

KeyError: 'is_ftp_login'

In [25]:
df['is_ftp_login']

KeyError: 'is_ftp_login'

In [26]:
df[0:2]

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbyte,dbytes,sttl,...,ct_state_ttl,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_lt,ct_dst_src_ltm,Label
0,5916600,-1.270343,1491711265,54145,tcp,FIN,-0.163935,4238,60788,31,...,0,0,13,13,6,7,1,1,2,0
1,5916600,-1.18103,1491711263,5607,tcp,FIN,-0.140845,5174,91072,31,...,0,0,13,13,6,7,1,1,2,0


In [27]:
min_max_0(df, 'ct_flw_http_mthd')

KeyError: 'ct_flw_http_mthd'

In [28]:
encode_numeric_zscore(df, 'ct_src_ltm')

In [30]:
import pandas as pd
import io
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import tensorflow.contrib.learn as skflow
import string


#from sklearn.utils.multiclass import unique_labels
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_curve, auc, average_precision_score, precision_recall_curve
from inspect import signature
%matplotlib inline

path = "UNSW-NB15_2.csv"
# This file is a CSV, just no CSV extension or headers
df = pd.read_csv(path, header=None)
print(df[0:3])

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
 # For now, just drop NA's (rows with missing values)

# The CSV file has no column heads, so add them
df.columns = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbyte',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans',
    'res',
    'Sjit',
    'Djit',
    'Stime', #QUITAR
    'Ltime', #QUITAR
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm', 
    'ct_src_dport_ltm',
    'ct_dst_sport_lt', 
    'ct_dst_src_ltm',
    'attack_cat',
    'Label'
]

df.dropna(inplace=True,axis=1)

# df.drop('sport', 1, inplace=True)#
# df.drop('dsport', 1, inplace=True)#
# df.drop('state', 1, inplace=True)#
# df.drop('dur', 1, inplace=True)#
# df.drop('Sload', 1, inplace=True)#
# df.drop('Dload', 1, inplace=True)#
# df.drop('Stime', 1, inplace=True)#
# df.drop('Ltime', 1, inplace=True)#
# df.drop('ct_src_ltm', 1, inplace=True)#

ENCODING = 'utf-8'

def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))
        
def analyze(filename):
    print()
    print("Analyzing: {}".format(filename))
    df = pd.read_csv(filename,encoding=ENCODING)
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])

#analyze(path)


# display 5 rows
# df[0:5]

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

#Clean 'date' column and convert to Int type
def clean_date(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_removed = s.replace(" ", "")
    s_int = int(s_removed)
    return s_int

########## CLEAN IP #######################
def clean_ip(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_int = int(s)
    return s_int

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

#MINMAX -1 1
# Encode a column to a range between normalized_low and normalized_high.
def min_max_1(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

#MINMAX 0 1
def min_max_0(df, name, normalized_low=0, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

    
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    # Regression
    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')




df['srcip'] = df['srcip'].apply(clean_ip)
df['dstip'] = df['dstip'].apply(clean_ip)

df['sport'] = df['sport'].astype('int64')
df['dsport'] = df['dsport'].astype('int64')

# Now encode the feature vector

min_max_0(df, 'srcip')#
min_max_0(df, 'sport')

min_max_0(df, 'dstip') #
min_max_0(df, 'dsport')

encode_text_dummy(df, 'proto') #
#encode_text_dummy(df, 'state')

min_max_0(df, 'dur')

min_max_0(df, 'sbyte') #
min_max_0(df, 'dbytes')#
min_max_0(df, 'sttl')#
min_max_0(df, 'dttl')#
min_max_0(df, 'sloss')#
min_max_0(df, 'dloss')#
encode_text_dummy(df, 'service') #
min_max_0(df, 'Sload')

min_max_0(df, 'Dload')

min_max_0(df, 'Spkts')#
min_max_0(df, 'Dpkts')#
min_max_0(df, 'swin')#
min_max_0(df, 'dwin')#
min_max_0(df, 'stcpb')#
min_max_0(df, 'dtcpb')#
min_max_0(df, 'smeansz')#
min_max_0(df, 'dmeansz')#
min_max_0(df, 'trans')#
min_max_0(df, 'res')#
min_max_0(df, 'Sjit')#
min_max_0(df, 'Djit')#

min_max_0(df, 'Sintpkt')#
min_max_0(df, 'Dintpkt')#
min_max_0(df, 'tcprtt')#
min_max_0(df, 'synack')#
min_max_0(df, 'ackdat')#
encode_numeric_zscore(df, 'is_sm_ips_ports')#ojo
min_max_0(df, 'ct_state_ttl')#
min_max_0(df, 'ct_flw_http_mthd')#
encode_numeric_zscore(df, 'is_ftp_login')#ojo
min_max_0(df, 'ct_ftp_cmd')#
min_max_0(df, 'ct_srv_src')#
min_max_0(df, 'ct_srv_dst')#
min_max_0(df, 'ct_dst_ltm')#
min_max_0(df, 'ct_src_ltm')

min_max_0(df, 'ct_src_dport_ltm')#
min_max_0(df, 'ct_dst_sport_lt')#
min_max_0(df, 'ct_dst_src_ltm')#
#encode_numeric_zscore(df, 'attack_cat')#
encode_text_dummy(df, 'attack_cat')

outcomes = encode_text_index(df, 'Label')#
num_classes = len(outcomes)

           0      1              2      3    4    5         6     7      8   \
0  59.166.0.0   6055  149.171.126.5  54145  tcp  FIN  0.072974  4238  60788   
1  59.166.0.0   7832  149.171.126.3   5607  tcp  FIN  0.144951  5174  91072   
2  59.166.0.8  11397  149.171.126.6     21  tcp  FIN  0.116107  2934   3742   

   9   ...  39  40  41 42  43  44  45  46   47  48  
0  31  ...   0  13  13  6   7   1   1   2  NaN   0  
1  31  ...   0  13  13  6   7   1   1   2  NaN   0  
2  31  ...   1   1   2  7   5   1   1   4  NaN   0  

[3 rows x 49 columns]
Read 700001 rows.


ValueError: invalid literal for int() with base 10: '0xcc09'

In [31]:
df['dsport'].dtype

dtype('O')

In [32]:
df['dsport']

0         54145
1          5607
2            21
3            53
4         14724
5            53
6          5190
7            80
8          6881
9          5190
10           80
11           53
12          111
13        39730
14           53
15           53
16        31010
17           80
18         5190
19         5190
20           53
21           53
22         5190
23        49439
24           53
25           21
26          143
27           53
28        31010
29           53
          ...  
699971    46884
699972       80
699973       53
699974      179
699975     5190
699976     6881
699977       53
699978       21
699979       80
699980     5190
699981       25
699982    24108
699983     6881
699984       21
699985      110
699986       80
699987     5972
699988       53
699989       53
699990       53
699991       53
699992       80
699993       53
699994       53
699995       53
699996       53
699997       53
699998       53
699999       53
700000     7662
Name: dsport, Length: 70

In [36]:
df['dsport'] = df['dsport'].astype('int64')

ValueError: invalid literal for int() with base 10: '0xcc09'

In [34]:
df['dsport'] = df['dsport'].astype('float64')

ValueError: could not convert string to float: '0xcc09'

In [35]:
min_max_0(df, 'dsport')

TypeError: '<' not supported between instances of 'str' and 'int'

In [37]:
df[0:10]

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbyte,dbytes,sttl,...,ct_state_ttl,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_lt,ct_dst_src_ltm,Label
0,5916600,6055,1491711265,54145,tcp,FIN,0.072974,4238,60788,31,...,0,0,13,13,6,7,1,1,2,0
1,5916600,7832,1491711263,5607,tcp,FIN,0.144951,5174,91072,31,...,0,0,13,13,6,7,1,1,2,0
2,5916608,11397,1491711266,21,tcp,FIN,0.116107,2934,3742,31,...,0,1,1,2,7,5,1,1,4,0
3,5916600,3804,1491711263,53,udp,CON,0.000986,146,178,31,...,0,0,13,13,6,7,1,1,2,0
4,5916608,14339,1491711266,14724,tcp,FIN,0.03848,8928,320,31,...,0,0,8,20,7,5,1,1,4,0
5,5916608,39094,1491711263,53,udp,CON,0.001026,130,162,31,...,0,0,8,13,6,5,1,1,1,0
6,5916600,10845,1491711267,5190,tcp,FIN,0.005645,1064,2260,31,...,0,0,13,9,1,7,2,1,1,0
7,5916603,45642,1491711265,80,tcp,FIN,0.020018,1036,824,31,...,0,0,18,13,6,5,2,1,1,0
8,5916604,1931,1491711266,6881,tcp,FIN,3.27602,13766,548216,31,...,0,0,12,20,7,2,1,1,1,0
9,5916608,25724,1491711266,5190,tcp,FIN,0.153293,1272,2572,31,...,0,0,8,20,2,1,1,1,1,0


In [38]:
encode_numeric_zscore(df, 'sport')

In [39]:
df['dsport']

0         54145
1          5607
2            21
3            53
4         14724
5            53
6          5190
7            80
8          6881
9          5190
10           80
11           53
12          111
13        39730
14           53
15           53
16        31010
17           80
18         5190
19         5190
20           53
21           53
22         5190
23        49439
24           53
25           21
26          143
27           53
28        31010
29           53
          ...  
699971    46884
699972       80
699973       53
699974      179
699975     5190
699976     6881
699977       53
699978       21
699979       80
699980     5190
699981       25
699982    24108
699983     6881
699984       21
699985      110
699986       80
699987     5972
699988       53
699989       53
699990       53
699991       53
699992       80
699993       53
699994       53
699995       53
699996       53
699997       53
699998       53
699999       53
700000     7662
Name: dsport, Length: 70

In [40]:
import pandas as pd
import io
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import tensorflow.contrib.learn as skflow
import string


#from sklearn.utils.multiclass import unique_labels
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_curve, auc, average_precision_score, precision_recall_curve
from inspect import signature
%matplotlib inline

path = "UNSW-NB15_2.csv"
# This file is a CSV, just no CSV extension or headers
df = pd.read_csv(path, header=None)
print(df[0:3])

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
 # For now, just drop NA's (rows with missing values)

# The CSV file has no column heads, so add them
df.columns = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbyte',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans',
    'res',
    'Sjit',
    'Djit',
    'Stime', #QUITAR
    'Ltime', #QUITAR
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm', 
    'ct_src_dport_ltm',
    'ct_dst_sport_lt', 
    'ct_dst_src_ltm',
    'attack_cat',
    'Label'
]

df.dropna(inplace=True,axis=1)

# df.drop('sport', 1, inplace=True)#
# df.drop('dsport', 1, inplace=True)#
# df.drop('state', 1, inplace=True)#
# df.drop('dur', 1, inplace=True)#
# df.drop('Sload', 1, inplace=True)#
# df.drop('Dload', 1, inplace=True)#
# df.drop('Stime', 1, inplace=True)#
# df.drop('Ltime', 1, inplace=True)#
# df.drop('ct_src_ltm', 1, inplace=True)#

ENCODING = 'utf-8'

def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))
        
def analyze(filename):
    print()
    print("Analyzing: {}".format(filename))
    df = pd.read_csv(filename,encoding=ENCODING)
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])

#analyze(path)


# display 5 rows
# df[0:5]

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

#Clean 'date' column and convert to Int type
def clean_date(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_removed = s.replace(" ", "")
    s_int = int(s_removed)
    return s_int

########## CLEAN IP #######################
def clean_ip(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_int = int(s)
    return s_int

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

#MINMAX -1 1
# Encode a column to a range between normalized_low and normalized_high.
def min_max_1(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

#MINMAX 0 1
def min_max_0(df, name, normalized_low=0, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

    
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    # Regression
    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')




df['srcip'] = df['srcip'].apply(clean_ip)
df['dstip'] = df['dstip'].apply(clean_ip)

df['sport'] = df['sport'].astype('int64')
df['dsport'] = df['dsport'].astype('int64')

# Now encode the feature vector

min_max_0(df, 'srcip')#
min_max_0(df, 'sport')

min_max_0(df, 'dstip') #
min_max_0(df, 'dsport')

encode_text_dummy(df, 'proto') #
#encode_text_dummy(df, 'state')

min_max_0(df, 'dur')

min_max_0(df, 'sbyte') #
min_max_0(df, 'dbytes')#
min_max_0(df, 'sttl')#
min_max_0(df, 'dttl')#
min_max_0(df, 'sloss')#
min_max_0(df, 'dloss')#
encode_text_dummy(df, 'service') #
min_max_0(df, 'Sload')

min_max_0(df, 'Dload')

min_max_0(df, 'Spkts')#
min_max_0(df, 'Dpkts')#
min_max_0(df, 'swin')#
min_max_0(df, 'dwin')#
min_max_0(df, 'stcpb')#
min_max_0(df, 'dtcpb')#
min_max_0(df, 'smeansz')#
min_max_0(df, 'dmeansz')#
min_max_0(df, 'trans')#
min_max_0(df, 'res')#
min_max_0(df, 'Sjit')#
min_max_0(df, 'Djit')#

min_max_0(df, 'Sintpkt')#
min_max_0(df, 'Dintpkt')#
min_max_0(df, 'tcprtt')#
min_max_0(df, 'synack')#
min_max_0(df, 'ackdat')#
encode_numeric_zscore(df, 'is_sm_ips_ports')#ojo
min_max_0(df, 'ct_state_ttl')#
min_max_0(df, 'ct_flw_http_mthd')#
encode_numeric_zscore(df, 'is_ftp_login')#ojo
min_max_0(df, 'ct_ftp_cmd')#
min_max_0(df, 'ct_srv_src')#
min_max_0(df, 'ct_srv_dst')#
min_max_0(df, 'ct_dst_ltm')#
min_max_0(df, 'ct_src_ltm')

min_max_0(df, 'ct_src_dport_ltm')#
min_max_0(df, 'ct_dst_sport_lt')#
min_max_0(df, 'ct_dst_src_ltm')#
#encode_numeric_zscore(df, 'attack_cat')#
encode_text_dummy(df, 'attack_cat')

outcomes = encode_text_index(df, 'Label')#
num_classes = len(outcomes)

# display 5 rows

#df.dropna(inplace=True,axis=1)
df[0:5]

  interactivity=interactivity, compiler=compiler, result=result)


           0      1              2      3    4    5         6     7      8   \
0  59.166.0.0   6055  149.171.126.5  54145  tcp  FIN  0.072974  4238  60788   
1  59.166.0.0   7832  149.171.126.3   5607  tcp  FIN  0.144951  5174  91072   
2  59.166.0.8  11397  149.171.126.6     21  tcp  FIN  0.116107  2934   3742   

   9   ...  39  40  41 42  43  44  45  46   47  48  
0  31  ...   0  13  13  6   7   1   1   2  NaN   0  
1  31  ...   0  13  13  6   7   1   1   2  NaN   0  
2  31  ...   1   1   2  7   5   1   1   4  NaN   0  

[3 rows x 49 columns]
Read 700001 rows.


KeyError: 'dsport'

In [41]:
df['dsport'] = df['dsport'].astype('int64')

KeyError: 'dsport'

In [42]:
df['dsport']

KeyError: 'dsport'

In [43]:
import pandas as pd
import io
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import tensorflow.contrib.learn as skflow
import string


#from sklearn.utils.multiclass import unique_labels
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_curve, auc, average_precision_score, precision_recall_curve
from inspect import signature
%matplotlib inline

path = "UNSW-NB15_2.csv"
# This file is a CSV, just no CSV extension or headers
df = pd.read_csv(path, header=None)
print(df[0:3])

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
 # For now, just drop NA's (rows with missing values)

# The CSV file has no column heads, so add them
df.columns = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbyte',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans',
    'res',
    'Sjit',
    'Djit',
    'Stime', #QUITAR
    'Ltime', #QUITAR
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm', 
    'ct_src_dport_ltm',
    'ct_dst_sport_lt', 
    'ct_dst_src_ltm',
    'attack_cat',
    'Label'
]

df.dropna(inplace=True,axis=0)

# df.drop('sport', 1, inplace=True)#
# df.drop('dsport', 1, inplace=True)#
# df.drop('state', 1, inplace=True)#
# df.drop('dur', 1, inplace=True)#
# df.drop('Sload', 1, inplace=True)#
# df.drop('Dload', 1, inplace=True)#
# df.drop('Stime', 1, inplace=True)#
# df.drop('Ltime', 1, inplace=True)#
# df.drop('ct_src_ltm', 1, inplace=True)#

ENCODING = 'utf-8'

def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))
        
def analyze(filename):
    print()
    print("Analyzing: {}".format(filename))
    df = pd.read_csv(filename,encoding=ENCODING)
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])

#analyze(path)


# display 5 rows
# df[0:5]

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

#Clean 'date' column and convert to Int type
def clean_date(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_removed = s.replace(" ", "")
    s_int = int(s_removed)
    return s_int

########## CLEAN IP #######################
def clean_ip(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_int = int(s)
    return s_int

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

#MINMAX -1 1
# Encode a column to a range between normalized_low and normalized_high.
def min_max_1(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

#MINMAX 0 1
def min_max_0(df, name, normalized_low=0, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

    
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    # Regression
    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')




df['srcip'] = df['srcip'].apply(clean_ip)
df['dstip'] = df['dstip'].apply(clean_ip)

df['sport'] = df['sport'].astype('int64')
df['dsport'] = df['dsport'].astype('int64')

# Now encode the feature vector

min_max_0(df, 'srcip')#
min_max_0(df, 'sport')

min_max_0(df, 'dstip') #
min_max_0(df, 'dsport')

encode_text_dummy(df, 'proto') #
encode_text_dummy(df, 'state')

min_max_0(df, 'dur')

min_max_0(df, 'sbyte') #
min_max_0(df, 'dbytes')#
min_max_0(df, 'sttl')#
min_max_0(df, 'dttl')#
min_max_0(df, 'sloss')#
min_max_0(df, 'dloss')#
encode_text_dummy(df, 'service') #
min_max_0(df, 'Sload')

min_max_0(df, 'Dload')

min_max_0(df, 'Spkts')#
min_max_0(df, 'Dpkts')#
min_max_0(df, 'swin')#
min_max_0(df, 'dwin')#
min_max_0(df, 'stcpb')#
min_max_0(df, 'dtcpb')#
min_max_0(df, 'smeansz')#
min_max_0(df, 'dmeansz')#
min_max_0(df, 'trans')#
min_max_0(df, 'res')#
min_max_0(df, 'Sjit')#
min_max_0(df, 'Djit')#

min_max_0(df, 'Sintpkt')#
min_max_0(df, 'Dintpkt')#
min_max_0(df, 'tcprtt')#
min_max_0(df, 'synack')#
min_max_0(df, 'ackdat')#
encode_numeric_zscore(df, 'is_sm_ips_ports')#ojo
min_max_0(df, 'ct_state_ttl')#
min_max_0(df, 'ct_flw_http_mthd')#
encode_numeric_zscore(df, 'is_ftp_login')#ojo
min_max_0(df, 'ct_ftp_cmd')#
min_max_0(df, 'ct_srv_src')#
min_max_0(df, 'ct_srv_dst')#
min_max_0(df, 'ct_dst_ltm')#
min_max_0(df, 'ct_src_ltm')

min_max_0(df, 'ct_src_dport_ltm')#
min_max_0(df, 'ct_dst_sport_lt')#
min_max_0(df, 'ct_dst_src_ltm')#
#encode_numeric_zscore(df, 'attack_cat')#
encode_text_dummy(df, 'attack_cat')

outcomes = encode_text_index(df, 'Label')#
num_classes = len(outcomes)

# display 5 rows

#df.dropna(inplace=True,axis=1)
df[0:5]


  interactivity=interactivity, compiler=compiler, result=result)


           0      1              2      3    4    5         6     7      8   \
0  59.166.0.0   6055  149.171.126.5  54145  tcp  FIN  0.072974  4238  60788   
1  59.166.0.0   7832  149.171.126.3   5607  tcp  FIN  0.144951  5174  91072   
2  59.166.0.8  11397  149.171.126.6     21  tcp  FIN  0.116107  2934   3742   

   9   ...  39  40  41 42  43  44  45  46   47  48  
0  31  ...   0  13  13  6   7   1   1   2  NaN   0  
1  31  ...   0  13  13  6   7   1   1   2  NaN   0  
2  31  ...   1   1   2  7   5   1   1   4  NaN   0  

[3 rows x 49 columns]
Read 700001 rows.


ValueError: min() arg is an empty sequence

In [44]:
df[0:10]

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbyte,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_lt,ct_dst_src_ltm,attack_cat,Label


In [45]:
import pandas as pd
import io
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import tensorflow.contrib.learn as skflow
import string


#from sklearn.utils.multiclass import unique_labels
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_curve, auc, average_precision_score, precision_recall_curve
from inspect import signature
%matplotlib inline

path = "UNSW-NB15_2.csv"
# This file is a CSV, just no CSV extension or headers
df = pd.read_csv(path, header=None)
print(df[0:3])

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
 # For now, just drop NA's (rows with missing values)

# The CSV file has no column heads, so add them
df.columns = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbyte',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans',
    'res',
    'Sjit',
    'Djit',
    'Stime', #QUITAR
    'Ltime', #QUITAR
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm', 
    'ct_src_dport_ltm',
    'ct_dst_sport_lt', 
    'ct_dst_src_ltm',
    'attack_cat',
    'Label'
]

df.dropna(inplace=True)

# df.drop('sport', 1, inplace=True)#
# df.drop('dsport', 1, inplace=True)#
# df.drop('state', 1, inplace=True)#
# df.drop('dur', 1, inplace=True)#
# df.drop('Sload', 1, inplace=True)#
# df.drop('Dload', 1, inplace=True)#
# df.drop('Stime', 1, inplace=True)#
# df.drop('Ltime', 1, inplace=True)#
# df.drop('ct_src_ltm', 1, inplace=True)#

ENCODING = 'utf-8'

def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))
        
def analyze(filename):
    print()
    print("Analyzing: {}".format(filename))
    df = pd.read_csv(filename,encoding=ENCODING)
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])

#analyze(path)


# display 5 rows
# df[0:5]

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

#Clean 'date' column and convert to Int type
def clean_date(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_removed = s.replace(" ", "")
    s_int = int(s_removed)
    return s_int

########## CLEAN IP #######################
def clean_ip(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_int = int(s)
    return s_int

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

#MINMAX -1 1
# Encode a column to a range between normalized_low and normalized_high.
def min_max_1(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

#MINMAX 0 1
def min_max_0(df, name, normalized_low=0, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

    
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    # Regression
    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')




df['srcip'] = df['srcip'].apply(clean_ip)
df['dstip'] = df['dstip'].apply(clean_ip)

df['sport'] = df['sport'].astype('int64')
df['dsport'] = df['dsport'].astype('int64')

# Now encode the feature vector

min_max_0(df, 'srcip')#
min_max_0(df, 'sport')

min_max_0(df, 'dstip') #
min_max_0(df, 'dsport')

encode_text_dummy(df, 'proto') #
encode_text_dummy(df, 'state')

min_max_0(df, 'dur')

min_max_0(df, 'sbyte') #
min_max_0(df, 'dbytes')#
min_max_0(df, 'sttl')#
min_max_0(df, 'dttl')#
min_max_0(df, 'sloss')#
min_max_0(df, 'dloss')#
encode_text_dummy(df, 'service') #
min_max_0(df, 'Sload')

min_max_0(df, 'Dload')

min_max_0(df, 'Spkts')#
min_max_0(df, 'Dpkts')#
min_max_0(df, 'swin')#
min_max_0(df, 'dwin')#
min_max_0(df, 'stcpb')#
min_max_0(df, 'dtcpb')#
min_max_0(df, 'smeansz')#
min_max_0(df, 'dmeansz')#
min_max_0(df, 'trans')#
min_max_0(df, 'res')#
min_max_0(df, 'Sjit')#
min_max_0(df, 'Djit')#

min_max_0(df, 'Sintpkt')#
min_max_0(df, 'Dintpkt')#
min_max_0(df, 'tcprtt')#
min_max_0(df, 'synack')#
min_max_0(df, 'ackdat')#
min_max_0(df, 'is_sm_ips_ports')#ojo
min_max_0(df, 'ct_state_ttl')#
min_max_0(df, 'ct_flw_http_mthd')#
min_max_0(df, 'is_ftp_login')#ojo
min_max_0(df, 'ct_ftp_cmd')#
min_max_0(df, 'ct_srv_src')#
min_max_0(df, 'ct_srv_dst')#
min_max_0(df, 'ct_dst_ltm')#
min_max_0(df, 'ct_src_ltm')

min_max_0(df, 'ct_src_dport_ltm')#
min_max_0(df, 'ct_dst_sport_lt')#
min_max_0(df, 'ct_dst_src_ltm')#
#encode_numeric_zscore(df, 'attack_cat')#
encode_text_dummy(df, 'attack_cat')

outcomes = encode_text_index(df, 'Label')#
num_classes = len(outcomes)

# display 5 rows

#df.dropna(inplace=True,axis=1)
df[0:5]

           0      1              2      3    4    5         6     7      8   \
0  59.166.0.0   6055  149.171.126.5  54145  tcp  FIN  0.072974  4238  60788   
1  59.166.0.0   7832  149.171.126.3   5607  tcp  FIN  0.144951  5174  91072   
2  59.166.0.8  11397  149.171.126.6     21  tcp  FIN  0.116107  2934   3742   

   9   ...  39  40  41 42  43  44  45  46   47  48  
0  31  ...   0  13  13  6   7   1   1   2  NaN   0  
1  31  ...   0  13  13  6   7   1   1   2  NaN   0  
2  31  ...   1   1   2  7   5   1   1   4  NaN   0  

[3 rows x 49 columns]
Read 700001 rows.


ValueError: min() arg is an empty sequence

In [46]:
df[0:5]

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbyte,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_lt,ct_dst_src_ltm,attack_cat,Label


In [47]:
import pandas as pd
import io
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import tensorflow.contrib.learn as skflow
import string


#from sklearn.utils.multiclass import unique_labels
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_curve, auc, average_precision_score, precision_recall_curve
from inspect import signature
%matplotlib inline

path = "UNSW-NB15_2.csv"
# This file is a CSV, just no CSV extension or headers
df = pd.read_csv(path, header=None)
print(df[0:3])

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
 # For now, just drop NA's (rows with missing values)

# The CSV file has no column heads, so add them
df.columns = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbyte',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans',
    'res',
    'Sjit',
    'Djit',
    'Stime', #QUITAR
    'Ltime', #QUITAR
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm', 
    'ct_src_dport_ltm',
    'ct_dst_sport_lt', 
    'ct_dst_src_ltm',
    'attack_cat',
    'Label'
]

df.dropna(inplace=True, axis=1)
df[0:5]
print(df[0:3])

           0      1              2      3    4    5         6     7      8   \
0  59.166.0.0   6055  149.171.126.5  54145  tcp  FIN  0.072974  4238  60788   
1  59.166.0.0   7832  149.171.126.3   5607  tcp  FIN  0.144951  5174  91072   
2  59.166.0.8  11397  149.171.126.6     21  tcp  FIN  0.116107  2934   3742   

   9   ...  39  40  41 42  43  44  45  46   47  48  
0  31  ...   0  13  13  6   7   1   1   2  NaN   0  
1  31  ...   0  13  13  6   7   1   1   2  NaN   0  
2  31  ...   1   1   2  7   5   1   1   4  NaN   0  

[3 rows x 49 columns]
Read 700001 rows.


Unnamed: 0,srcip,sport,dstip,proto,state,dur,sbyte,dbytes,sttl,dttl,...,ct_state_ttl,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_lt,ct_dst_src_ltm,Label
0,59.166.0.0,6055,149.171.126.5,tcp,FIN,0.072974,4238,60788,31,29,...,0,0,13,13,6,7,1,1,2,0
1,59.166.0.0,7832,149.171.126.3,tcp,FIN,0.144951,5174,91072,31,29,...,0,0,13,13,6,7,1,1,2,0
2,59.166.0.8,11397,149.171.126.6,tcp,FIN,0.116107,2934,3742,31,29,...,0,1,1,2,7,5,1,1,4,0
3,59.166.0.0,3804,149.171.126.3,udp,CON,0.000986,146,178,31,29,...,0,0,13,13,6,7,1,1,2,0
4,59.166.0.8,14339,149.171.126.6,tcp,FIN,0.03848,8928,320,31,29,...,0,0,8,20,7,5,1,1,4,0


In [48]:
print("Read {} rows.".format(len(df)))

Read 700001 rows.


In [49]:
print(df[0:3])

        srcip  sport          dstip proto state       dur  sbyte  dbytes  \
0  59.166.0.0   6055  149.171.126.5   tcp   FIN  0.072974   4238   60788   
1  59.166.0.0   7832  149.171.126.3   tcp   FIN  0.144951   5174   91072   
2  59.166.0.8  11397  149.171.126.6   tcp   FIN  0.116107   2934    3742   

   sttl  dttl  ...  ct_state_ttl  ct_ftp_cmd ct_srv_src  ct_srv_dst  \
0    31    29  ...             0           0         13          13   
1    31    29  ...             0           0         13          13   
2    31    29  ...             0           1          1           2   

   ct_dst_ltm  ct_src_ltm  ct_src_dport_ltm  ct_dst_sport_lt  ct_dst_src_ltm  \
0           6           7                 1                1               2   
1           6           7                 1                1               2   
2           7           5                 1                1               4   

   Label  
0      0  
1      0  
2      0  

[3 rows x 45 columns]


In [51]:
import pandas as pd
import io
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import tensorflow.contrib.learn as skflow
import string


#from sklearn.utils.multiclass import unique_labels
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_curve, auc, average_precision_score, precision_recall_curve
from inspect import signature
%matplotlib inline

path = "UNSW-NB15_2.csv"
# This file is a CSV, just no CSV extension or headers
df = pd.read_csv(path, header=None)
print(df[0:3])

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
 # For now, just drop NA's (rows with missing values)

# The CSV file has no column heads, so add them
df.columns = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbyte',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans',
    'res',
    'Sjit',
    'Djit',
    'Stime', #QUITAR
    'Ltime', #QUITAR
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm', 
    'ct_src_dport_ltm',
    'ct_dst_sport_lt', 
    'ct_dst_src_ltm',
    'attack_cat',
    'Label'
]

df.dropna(thresh=1)
df[0:5]
print(df[0:3])

           0      1              2      3    4    5         6     7      8   \
0  59.166.0.0   6055  149.171.126.5  54145  tcp  FIN  0.072974  4238  60788   
1  59.166.0.0   7832  149.171.126.3   5607  tcp  FIN  0.144951  5174  91072   
2  59.166.0.8  11397  149.171.126.6     21  tcp  FIN  0.116107  2934   3742   

   9   ...  39  40  41 42  43  44  45  46   47  48  
0  31  ...   0  13  13  6   7   1   1   2  NaN   0  
1  31  ...   0  13  13  6   7   1   1   2  NaN   0  
2  31  ...   1   1   2  7   5   1   1   4  NaN   0  

[3 rows x 49 columns]
Read 700001 rows.
        srcip  sport          dstip dsport proto state       dur  sbyte  \
0  59.166.0.0   6055  149.171.126.5  54145   tcp   FIN  0.072974   4238   
1  59.166.0.0   7832  149.171.126.3   5607   tcp   FIN  0.144951   5174   
2  59.166.0.8  11397  149.171.126.6     21   tcp   FIN  0.116107   2934   

   dbytes  sttl  ...  ct_ftp_cmd  ct_srv_src  ct_srv_dst ct_dst_ltm  \
0   60788    31  ...           0          13          13  

In [52]:
import pandas as pd
import io
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import tensorflow.contrib.learn as skflow
import string


#from sklearn.utils.multiclass import unique_labels
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_curve, auc, average_precision_score, precision_recall_curve
from inspect import signature
%matplotlib inline

path = "UNSW-NB15_2.csv"
# This file is a CSV, just no CSV extension or headers
df = pd.read_csv(path, header=None)
print(df[0:3])

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
 # For now, just drop NA's (rows with missing values)

# The CSV file has no column heads, so add them
df.columns = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbyte',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans',
    'res',
    'Sjit',
    'Djit',
    'Stime', #QUITAR
    'Ltime', #QUITAR
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm', 
    'ct_src_dport_ltm',
    'ct_dst_sport_lt', 
    'ct_dst_src_ltm',
    'attack_cat',
    'Label'
]

df.dropna(thresh=1)
df[0:5]

# df.drop('sport', 1, inplace=True)#
# df.drop('dsport', 1, inplace=True)#
# df.drop('state', 1, inplace=True)#
# df.drop('dur', 1, inplace=True)#
# df.drop('Sload', 1, inplace=True)#
# df.drop('Dload', 1, inplace=True)#
# df.drop('Stime', 1, inplace=True)#
# df.drop('Ltime', 1, inplace=True)#
# df.drop('ct_src_ltm', 1, inplace=True)#

ENCODING = 'utf-8'

def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))
        
def analyze(filename):
    print()
    print("Analyzing: {}".format(filename))
    df = pd.read_csv(filename,encoding=ENCODING)
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])

#analyze(path)


# display 5 rows
# df[0:5]

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

#Clean 'date' column and convert to Int type
def clean_date(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_removed = s.replace(" ", "")
    s_int = int(s_removed)
    return s_int

########## CLEAN IP #######################
def clean_ip(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_int = int(s)
    return s_int

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

#MINMAX -1 1
# Encode a column to a range between normalized_low and normalized_high.
def min_max_1(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

#MINMAX 0 1
def min_max_0(df, name, normalized_low=0, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

    
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    # Regression
    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')




df['srcip'] = df['srcip'].apply(clean_ip)
df['dstip'] = df['dstip'].apply(clean_ip)

df['sport'] = df['sport'].astype('int64')
df['dsport'] = df['dsport'].astype('int64')

# Now encode the feature vector

min_max_0(df, 'srcip')#
min_max_0(df, 'sport')

min_max_0(df, 'dstip') #
min_max_0(df, 'dsport')

encode_text_dummy(df, 'proto') #
encode_text_dummy(df, 'state')

min_max_0(df, 'dur')

min_max_0(df, 'sbyte') #
min_max_0(df, 'dbytes')#
min_max_0(df, 'sttl')#
min_max_0(df, 'dttl')#
min_max_0(df, 'sloss')#
min_max_0(df, 'dloss')#
encode_text_dummy(df, 'service') #
min_max_0(df, 'Sload')

min_max_0(df, 'Dload')

min_max_0(df, 'Spkts')#
min_max_0(df, 'Dpkts')#
min_max_0(df, 'swin')#
min_max_0(df, 'dwin')#
min_max_0(df, 'stcpb')#
min_max_0(df, 'dtcpb')#
min_max_0(df, 'smeansz')#
min_max_0(df, 'dmeansz')#
min_max_0(df, 'trans')#
min_max_0(df, 'res')#
min_max_0(df, 'Sjit')#
min_max_0(df, 'Djit')#

min_max_0(df, 'Sintpkt')#
min_max_0(df, 'Dintpkt')#
min_max_0(df, 'tcprtt')#
min_max_0(df, 'synack')#
min_max_0(df, 'ackdat')#
min_max_0(df, 'is_sm_ips_ports')#ojo
min_max_0(df, 'ct_state_ttl')#
min_max_0(df, 'ct_flw_http_mthd')#
min_max_0(df, 'is_ftp_login')#ojo
min_max_0(df, 'ct_ftp_cmd')#
min_max_0(df, 'ct_srv_src')#
min_max_0(df, 'ct_srv_dst')#
min_max_0(df, 'ct_dst_ltm')#
min_max_0(df, 'ct_src_ltm')

min_max_0(df, 'ct_src_dport_ltm')#
min_max_0(df, 'ct_dst_sport_lt')#
min_max_0(df, 'ct_dst_src_ltm')#
#encode_numeric_zscore(df, 'attack_cat')#
encode_text_dummy(df, 'attack_cat')

outcomes = encode_text_index(df, 'Label')#
num_classes = len(outcomes)

           0      1              2      3    4    5         6     7      8   \
0  59.166.0.0   6055  149.171.126.5  54145  tcp  FIN  0.072974  4238  60788   
1  59.166.0.0   7832  149.171.126.3   5607  tcp  FIN  0.144951  5174  91072   
2  59.166.0.8  11397  149.171.126.6     21  tcp  FIN  0.116107  2934   3742   

   9   ...  39  40  41 42  43  44  45  46   47  48  
0  31  ...   0  13  13  6   7   1   1   2  NaN   0  
1  31  ...   0  13  13  6   7   1   1   2  NaN   0  
2  31  ...   1   1   2  7   5   1   1   4  NaN   0  

[3 rows x 49 columns]
Read 700001 rows.


KeyboardInterrupt: 

In [53]:
import pandas as pd
import io
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import tensorflow.contrib.learn as skflow
import string


#from sklearn.utils.multiclass import unique_labels
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_curve, auc, average_precision_score, precision_recall_curve
from inspect import signature
%matplotlib inline

path = "UNSW-NB15_2.csv"
# This file is a CSV, just no CSV extension or headers
df = pd.read_csv(path, header=None)
print(df[0:3])

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
 # For now, just drop NA's (rows with missing values)

# The CSV file has no column heads, so add them
df.columns = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbyte',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans',
    'res',
    'Sjit',
    'Djit',
    'Stime', #QUITAR
    'Ltime', #QUITAR
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm', 
    'ct_src_dport_ltm',
    'ct_dst_sport_lt', 
    'ct_dst_src_ltm',
    'attack_cat',
    'Label'
]

df.dropna(thresh=1)
df[0:5]

# df.drop('sport', 1, inplace=True)#
# df.drop('dsport', 1, inplace=True)#
# df.drop('state', 1, inplace=True)#
# df.drop('dur', 1, inplace=True)#
# df.drop('Sload', 1, inplace=True)#
# df.drop('Dload', 1, inplace=True)#
# df.drop('Stime', 1, inplace=True)#
# df.drop('Ltime', 1, inplace=True)#
# df.drop('ct_src_ltm', 1, inplace=True)#

ENCODING = 'utf-8'

def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))
        
def analyze(filename):
    print()
    print("Analyzing: {}".format(filename))
    df = pd.read_csv(filename,encoding=ENCODING)
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])

#analyze(path)


# display 5 rows
# df[0:5]

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

#Clean 'date' column and convert to Int type
def clean_date(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_removed = s.replace(" ", "")
    s_int = int(s_removed)
    return s_int

########## CLEAN IP #######################
def clean_ip(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_int = int(s)
    return s_int

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

#MINMAX -1 1
# Encode a column to a range between normalized_low and normalized_high.
def min_max_1(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

#MINMAX 0 1
def min_max_0(df, name, normalized_low=0, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

    
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    # Regression
    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')




df['srcip'] = df['srcip'].apply(clean_ip)
df['dstip'] = df['dstip'].apply(clean_ip)

df['sport'] = df['sport'].astype('int64')
df['dsport'] = df['dsport'].astype('int64')

# Now encode the feature vector

min_max_0(df, 'srcip')#
min_max_0(df, 'sport')

min_max_0(df, 'dstip') #
min_max_0(df, 'dsport')

encode_text_dummy(df, 'proto') #
encode_text_dummy(df, 'state')

min_max_0(df, 'dur')

min_max_0(df, 'sbyte') #
min_max_0(df, 'dbytes')#
min_max_0(df, 'sttl')#
min_max_0(df, 'dttl')#
min_max_0(df, 'sloss')#
min_max_0(df, 'dloss')#
encode_text_dummy(df, 'service') #
min_max_0(df, 'Sload')

min_max_0(df, 'Dload')

min_max_0(df, 'Spkts')#
min_max_0(df, 'Dpkts')#
min_max_0(df, 'swin')#
min_max_0(df, 'dwin')#
min_max_0(df, 'stcpb')#
min_max_0(df, 'dtcpb')#
min_max_0(df, 'smeansz')#
min_max_0(df, 'dmeansz')#
min_max_0(df, 'trans')#
min_max_0(df, 'res')#
min_max_0(df, 'Sjit')#
min_max_0(df, 'Djit')#

min_max_0(df, 'Sintpkt')#
min_max_0(df, 'Dintpkt')#
min_max_0(df, 'tcprtt')#
min_max_0(df, 'synack')#
min_max_0(df, 'ackdat')#
min_max_0(df, 'is_sm_ips_ports')#ojo
min_max_0(df, 'ct_state_ttl')#
min_max_0(df, 'ct_flw_http_mthd')#
min_max_0(df, 'is_ftp_login')#ojo
min_max_0(df, 'ct_ftp_cmd')#
min_max_0(df, 'ct_srv_src')#
min_max_0(df, 'ct_srv_dst')#
min_max_0(df, 'ct_dst_ltm')#
min_max_0(df, 'ct_src_ltm')

min_max_0(df, 'ct_src_dport_ltm')#
min_max_0(df, 'ct_dst_sport_lt')#
min_max_0(df, 'ct_dst_src_ltm')#
#encode_numeric_zscore(df, 'attack_cat')#
encode_text_dummy(df, 'attack_cat')

outcomes = encode_text_index(df, 'Label')#
num_classes = len(outcomes)

           0      1              2      3    4    5         6     7      8   \
0  59.166.0.0   6055  149.171.126.5  54145  tcp  FIN  0.072974  4238  60788   
1  59.166.0.0   7832  149.171.126.3   5607  tcp  FIN  0.144951  5174  91072   
2  59.166.0.8  11397  149.171.126.6     21  tcp  FIN  0.116107  2934   3742   

   9   ...  39  40  41 42  43  44  45  46   47  48  
0  31  ...   0  13  13  6   7   1   1   2  NaN   0  
1  31  ...   0  13  13  6   7   1   1   2  NaN   0  
2  31  ...   1   1   2  7   5   1   1   4  NaN   0  

[3 rows x 49 columns]
Read 700001 rows.


ValueError: cannot convert float NaN to integer

In [54]:
import pandas as pd
import io
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import tensorflow.contrib.learn as skflow
import string


#from sklearn.utils.multiclass import unique_labels
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_curve, auc, average_precision_score, precision_recall_curve
from inspect import signature
%matplotlib inline

path = "UNSW-NB15_2.csv"
# This file is a CSV, just no CSV extension or headers
df = pd.read_csv(path, header=None)
print(df[0:3])

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
 # For now, just drop NA's (rows with missing values)
df.dropna(thresh=1)
# The CSV file has no column heads, so add them
df.columns = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbyte',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans',
    'res',
    'Sjit',
    'Djit',
    'Stime', #QUITAR
    'Ltime', #QUITAR
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm', 
    'ct_src_dport_ltm',
    'ct_dst_sport_lt', 
    'ct_dst_src_ltm',
    'attack_cat',
    'Label'
]

df.dropna(thresh=1)
print(df[0:3])



           0      1              2      3    4    5         6     7      8   \
0  59.166.0.0   6055  149.171.126.5  54145  tcp  FIN  0.072974  4238  60788   
1  59.166.0.0   7832  149.171.126.3   5607  tcp  FIN  0.144951  5174  91072   
2  59.166.0.8  11397  149.171.126.6     21  tcp  FIN  0.116107  2934   3742   

   9   ...  39  40  41 42  43  44  45  46   47  48  
0  31  ...   0  13  13  6   7   1   1   2  NaN   0  
1  31  ...   0  13  13  6   7   1   1   2  NaN   0  
2  31  ...   1   1   2  7   5   1   1   4  NaN   0  

[3 rows x 49 columns]
Read 700001 rows.
        srcip  sport          dstip dsport proto state       dur  sbyte  \
0  59.166.0.0   6055  149.171.126.5  54145   tcp   FIN  0.072974   4238   
1  59.166.0.0   7832  149.171.126.3   5607   tcp   FIN  0.144951   5174   
2  59.166.0.8  11397  149.171.126.6     21   tcp   FIN  0.116107   2934   

   dbytes  sttl  ...  ct_ftp_cmd  ct_srv_src  ct_srv_dst ct_dst_ltm  \
0   60788    31  ...           0          13          13  

In [55]:
df.dropna(thresh=0)
print(df[0:3])


        srcip  sport          dstip dsport proto state       dur  sbyte  \
0  59.166.0.0   6055  149.171.126.5  54145   tcp   FIN  0.072974   4238   
1  59.166.0.0   7832  149.171.126.3   5607   tcp   FIN  0.144951   5174   
2  59.166.0.8  11397  149.171.126.6     21   tcp   FIN  0.116107   2934   

   dbytes  sttl  ...  ct_ftp_cmd  ct_srv_src  ct_srv_dst ct_dst_ltm  \
0   60788    31  ...           0          13          13          6   
1   91072    31  ...           0          13          13          6   
2    3742    31  ...           1           1           2          7   

   ct_src_ltm  ct_src_dport_ltm  ct_dst_sport_lt  ct_dst_src_ltm  attack_cat  \
0           7                 1                1               2         NaN   
1           7                 1                1               2         NaN   
2           5                 1                1               4         NaN   

   Label  
0      0  
1      0  
2      0  

[3 rows x 49 columns]


In [56]:
df.dropna(axis=0)
print(df[0:3])

        srcip  sport          dstip dsport proto state       dur  sbyte  \
0  59.166.0.0   6055  149.171.126.5  54145   tcp   FIN  0.072974   4238   
1  59.166.0.0   7832  149.171.126.3   5607   tcp   FIN  0.144951   5174   
2  59.166.0.8  11397  149.171.126.6     21   tcp   FIN  0.116107   2934   

   dbytes  sttl  ...  ct_ftp_cmd  ct_srv_src  ct_srv_dst ct_dst_ltm  \
0   60788    31  ...           0          13          13          6   
1   91072    31  ...           0          13          13          6   
2    3742    31  ...           1           1           2          7   

   ct_src_ltm  ct_src_dport_ltm  ct_dst_sport_lt  ct_dst_src_ltm  attack_cat  \
0           7                 1                1               2         NaN   
1           7                 1                1               2         NaN   
2           5                 1                1               4         NaN   

   Label  
0      0  
1      0  
2      0  

[3 rows x 49 columns]


In [57]:
df.size

34300049

In [58]:
df.shape

(700001, 49)

In [59]:
df.dropna(axis=1)
print(df[0:3])
df.shape

        srcip  sport          dstip dsport proto state       dur  sbyte  \
0  59.166.0.0   6055  149.171.126.5  54145   tcp   FIN  0.072974   4238   
1  59.166.0.0   7832  149.171.126.3   5607   tcp   FIN  0.144951   5174   
2  59.166.0.8  11397  149.171.126.6     21   tcp   FIN  0.116107   2934   

   dbytes  sttl  ...  ct_ftp_cmd  ct_srv_src  ct_srv_dst ct_dst_ltm  \
0   60788    31  ...           0          13          13          6   
1   91072    31  ...           0          13          13          6   
2    3742    31  ...           1           1           2          7   

   ct_src_ltm  ct_src_dport_ltm  ct_dst_sport_lt  ct_dst_src_ltm  attack_cat  \
0           7                 1                1               2         NaN   
1           7                 1                1               2         NaN   
2           5                 1                1               4         NaN   

   Label  
0      0  
1      0  
2      0  

[3 rows x 49 columns]


In [60]:
df.dropna(inplace=True, axis=1)
print(df[0:3])
df.shape

        srcip  sport          dstip dsport proto state       dur  sbyte  \
0  59.166.0.0   6055  149.171.126.5  54145   tcp   FIN  0.072974   4238   
1  59.166.0.0   7832  149.171.126.3   5607   tcp   FIN  0.144951   5174   
2  59.166.0.8  11397  149.171.126.6     21   tcp   FIN  0.116107   2934   

   dbytes  sttl  ...  ct_ftp_cmd  ct_srv_src  ct_srv_dst ct_dst_ltm  \
0   60788    31  ...           0          13          13          6   
1   91072    31  ...           0          13          13          6   
2    3742    31  ...           1           1           2          7   

   ct_src_ltm  ct_src_dport_ltm  ct_dst_sport_lt  ct_dst_src_ltm  attack_cat  \
0           7                 1                1               2         NaN   
1           7                 1                1               2         NaN   
2           5                 1                1               4         NaN   

   Label  
0      0  
1      0  
2      0  

[3 rows x 49 columns]


(700001, 49)

In [61]:
df.dropna(inplace=True, axis=1)
print(df[0:3])
df.shape

        srcip  sport          dstip proto state       dur  sbyte  dbytes  \
0  59.166.0.0   6055  149.171.126.5   tcp   FIN  0.072974   4238   60788   
1  59.166.0.0   7832  149.171.126.3   tcp   FIN  0.144951   5174   91072   
2  59.166.0.8  11397  149.171.126.6   tcp   FIN  0.116107   2934    3742   

   sttl  dttl  ...  ct_state_ttl  ct_ftp_cmd ct_srv_src  ct_srv_dst  \
0    31    29  ...             0           0         13          13   
1    31    29  ...             0           0         13          13   
2    31    29  ...             0           1          1           2   

   ct_dst_ltm  ct_src_ltm  ct_src_dport_ltm  ct_dst_sport_lt  ct_dst_src_ltm  \
0           6           7                 1                1               2   
1           6           7                 1                1               2   
2           7           5                 1                1               4   

   Label  
0      0  
1      0  
2      0  

[3 rows x 45 columns]


(700001, 45)

In [63]:
import pandas as pd
import io
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import tensorflow.contrib.learn as skflow
import string


#from sklearn.utils.multiclass import unique_labels
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_curve, auc, average_precision_score, precision_recall_curve
from inspect import signature
%matplotlib inline

path = "UNSW-NB15_2.csv"
# This file is a CSV, just no CSV extension or headers
df = pd.read_csv(path, header=None)
print(df[0:3])

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
 # For now, just drop NA's (rows with missing values)
#df.dropna(thresh=1)
# The CSV file has no column heads, so add them
df.columns = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbyte',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans',
    'res',
    'Sjit',
    'Djit',
    'Stime', #QUITAR
    'Ltime', #QUITAR
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm', 
    'ct_src_dport_ltm',
    'ct_dst_sport_lt', 
    'ct_dst_src_ltm',
    'attack_cat',
    'Label'
]

df.dropna(inplace=True, axis=1)
print(df[0:3])
df.shape

           0      1              2      3    4    5         6     7      8   \
0  59.166.0.0   6055  149.171.126.5  54145  tcp  FIN  0.072974  4238  60788   
1  59.166.0.0   7832  149.171.126.3   5607  tcp  FIN  0.144951  5174  91072   
2  59.166.0.8  11397  149.171.126.6     21  tcp  FIN  0.116107  2934   3742   

   9   ...  39  40  41 42  43  44  45  46   47  48  
0  31  ...   0  13  13  6   7   1   1   2  NaN   0  
1  31  ...   0  13  13  6   7   1   1   2  NaN   0  
2  31  ...   1   1   2  7   5   1   1   4  NaN   0  

[3 rows x 49 columns]
Read 700001 rows.
        srcip  sport          dstip proto state       dur  sbyte  dbytes  \
0  59.166.0.0   6055  149.171.126.5   tcp   FIN  0.072974   4238   60788   
1  59.166.0.0   7832  149.171.126.3   tcp   FIN  0.144951   5174   91072   
2  59.166.0.8  11397  149.171.126.6   tcp   FIN  0.116107   2934    3742   

   sttl  dttl  ...  ct_state_ttl  ct_ftp_cmd ct_srv_src  ct_srv_dst  \
0    31    29  ...             0           0         1

(700001, 45)

In [64]:
df.dtypes

srcip                object
sport                 int64
dstip                object
proto                object
state                object
dur                 float64
sbyte                 int64
dbytes                int64
sttl                  int64
dttl                  int64
sloss                 int64
dloss                 int64
service              object
Sload               float64
Dload               float64
Spkts                 int64
Dpkts                 int64
swin                  int64
dwin                  int64
stcpb                 int64
dtcpb                 int64
smeansz               int64
dmeansz               int64
trans                 int64
res                   int64
Sjit                float64
Djit                float64
Stime                 int64
Ltime                 int64
Sintpkt             float64
Dintpkt             float64
tcprtt              float64
synack              float64
ackdat              float64
is_sm_ips_ports       int64
ct_state_ttl        

In [66]:
ENCODING = 'utf-8'

def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))
        
def analyze(filename):
    print()
    print("Analyzing: {}".format(filename))
    df = pd.read_csv(filename,encoding=ENCODING)
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])

#analyze(path)


# display 5 rows
# df[0:5]

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

#Clean 'date' column and convert to Int type
def clean_date(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_removed = s.replace(" ", "")
    s_int = int(s_removed)
    return s_int

########## CLEAN IP #######################
def clean_ip(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_int = int(s)
    return s_int

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

#MINMAX -1 1
# Encode a column to a range between normalized_low and normalized_high.
def min_max_1(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

#MINMAX 0 1
def min_max_0(df, name, normalized_low=0, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

    
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    # Regression
    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')




df['srcip'] = df['srcip'].apply(clean_ip)
df['dstip'] = df['dstip'].apply(clean_ip)

df['sport'] = df['sport'].astype('int64')
df['dsport'] = df['dsport'].astype('int64')

# Now encode the feature vector

min_max_0(df, 'srcip')#
min_max_0(df, 'sport')

min_max_0(df, 'dstip') #
#min_max_0(df, 'dsport')

encode_text_dummy(df, 'proto') #
encode_text_dummy(df, 'state')

min_max_0(df, 'dur')

min_max_0(df, 'sbyte') #
min_max_0(df, 'dbytes')#
min_max_0(df, 'sttl')#
min_max_0(df, 'dttl')#
min_max_0(df, 'sloss')#
min_max_0(df, 'dloss')#
encode_text_dummy(df, 'service') #
min_max_0(df, 'Sload')

min_max_0(df, 'Dload')

min_max_0(df, 'Spkts')#
min_max_0(df, 'Dpkts')#
min_max_0(df, 'swin')#
min_max_0(df, 'dwin')#
min_max_0(df, 'stcpb')#
min_max_0(df, 'dtcpb')#
min_max_0(df, 'smeansz')#
min_max_0(df, 'dmeansz')#
min_max_0(df, 'trans')#
min_max_0(df, 'res')#
min_max_0(df, 'Sjit')#
min_max_0(df, 'Djit')#

min_max_0(df, 'Sintpkt')#
min_max_0(df, 'Dintpkt')#
min_max_0(df, 'tcprtt')#
min_max_0(df, 'synack')#
min_max_0(df, 'ackdat')#
min_max_0(df, 'is_sm_ips_ports')#ojo
min_max_0(df, 'ct_state_ttl')#
#min_max_0(df, 'ct_flw_http_mthd')#
#min_max_0(df, 'is_ftp_login')#ojo
min_max_0(df, 'ct_ftp_cmd')#
min_max_0(df, 'ct_srv_src')#
min_max_0(df, 'ct_srv_dst')#
min_max_0(df, 'ct_dst_ltm')#
min_max_0(df, 'ct_src_ltm')

min_max_0(df, 'ct_src_dport_ltm')#
min_max_0(df, 'ct_dst_sport_lt')#
min_max_0(df, 'ct_dst_src_ltm')#

#encode_text_dummy(df, 'attack_cat')

outcomes = encode_text_index(df, 'Label')#
num_classes = len(outcomes)

# display 5 rows

#df.dropna(inplace=True,axis=1)
df[0:5]

TypeError: 'int' object is not iterable

In [67]:
import pandas as pd
import io
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import tensorflow.contrib.learn as skflow
import string


#from sklearn.utils.multiclass import unique_labels
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_curve, auc, average_precision_score, precision_recall_curve
from inspect import signature
%matplotlib inline

path = "UNSW-NB15_2.csv"
# This file is a CSV, just no CSV extension or headers
df = pd.read_csv(path, header=None)
print(df[0:3])

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
 # For now, just drop NA's (rows with missing values)

# The CSV file has no column heads, so add them
df.columns = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbyte',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans',
    'res',
    'Sjit',
    'Djit',
    'Stime', #QUITAR
    'Ltime', #QUITAR
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm', 
    'ct_src_dport_ltm',
    'ct_dst_sport_lt', 
    'ct_dst_src_ltm',
    'attack_cat',
    'Label'
]


df[0:5]

#df.drop('dsport', 1, inplace=True)#
# df.drop('state', 1, inplace=True)#
# df.drop('dur', 1, inplace=True)#
# df.drop('Sload', 1, inplace=True)#
# df.drop('Dload', 1, inplace=True)#
# df.drop('Stime', 1, inplace=True)#
# df.drop('Ltime', 1, inplace=True)#
# df.drop('ct_src_ltm', 1, inplace=True)#

ENCODING = 'utf-8'

def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))
        
def analyze(filename):
    print()
    print("Analyzing: {}".format(filename))
    df = pd.read_csv(filename,encoding=ENCODING)
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])

#analyze(path)


# display 5 rows
# df[0:5]

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

#Clean 'date' column and convert to Int type
def clean_date(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_removed = s.replace(" ", "")
    s_int = int(s_removed)
    return s_int

########## CLEAN IP #######################
def clean_ip(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_int = int(s)
    return s_int

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

#MINMAX -1 1
# Encode a column to a range between normalized_low and normalized_high.
def min_max_1(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

#MINMAX 0 1
def min_max_0(df, name, normalized_low=0, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

    
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    # Regression
    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')




df['srcip'] = df['srcip'].apply(clean_ip)
df['dstip'] = df['dstip'].apply(clean_ip)

df['sport'] = df['sport'].astype('int64')
#df['dsport'] = df['dsport'].astype('int64')

# Now encode the feature vector

min_max_0(df, 'srcip')#
min_max_0(df, 'sport')

min_max_0(df, 'dstip') #
#min_max_0(df, 'dsport')

encode_text_dummy(df, 'proto') #
encode_text_dummy(df, 'state')

min_max_0(df, 'dur')

min_max_0(df, 'sbyte') #
min_max_0(df, 'dbytes')#
min_max_0(df, 'sttl')#
min_max_0(df, 'dttl')#
min_max_0(df, 'sloss')#
min_max_0(df, 'dloss')#
encode_text_dummy(df, 'service') #
min_max_0(df, 'Sload')

min_max_0(df, 'Dload')

min_max_0(df, 'Spkts')#
min_max_0(df, 'Dpkts')#
min_max_0(df, 'swin')#
min_max_0(df, 'dwin')#
min_max_0(df, 'stcpb')#
min_max_0(df, 'dtcpb')#
min_max_0(df, 'smeansz')#
min_max_0(df, 'dmeansz')#
min_max_0(df, 'trans')#
min_max_0(df, 'res')#
min_max_0(df, 'Sjit')#
min_max_0(df, 'Djit')#

min_max_0(df, 'Sintpkt')#
min_max_0(df, 'Dintpkt')#
min_max_0(df, 'tcprtt')#
min_max_0(df, 'synack')#
min_max_0(df, 'ackdat')#
min_max_0(df, 'is_sm_ips_ports')#ojo
min_max_0(df, 'ct_state_ttl')#
#min_max_0(df, 'ct_flw_http_mthd')#
#min_max_0(df, 'is_ftp_login')#ojo
min_max_0(df, 'ct_ftp_cmd')#
min_max_0(df, 'ct_srv_src')#
min_max_0(df, 'ct_srv_dst')#
min_max_0(df, 'ct_dst_ltm')#
min_max_0(df, 'ct_src_ltm')

min_max_0(df, 'ct_src_dport_ltm')#
min_max_0(df, 'ct_dst_sport_lt')#
min_max_0(df, 'ct_dst_src_ltm')#

#encode_text_dummy(df, 'attack_cat')

outcomes = encode_text_index(df, 'Label')#
num_classes = len(outcomes)

# display 5 rows

#df.dropna(inplace=True,axis=1)
df[0:5]

           0      1              2      3    4    5         6     7      8   \
0  59.166.0.0   6055  149.171.126.5  54145  tcp  FIN  0.072974  4238  60788   
1  59.166.0.0   7832  149.171.126.3   5607  tcp  FIN  0.144951  5174  91072   
2  59.166.0.8  11397  149.171.126.6     21  tcp  FIN  0.116107  2934   3742   

   9   ...  39  40  41 42  43  44  45  46   47  48  
0  31  ...   0  13  13  6   7   1   1   2  NaN   0  
1  31  ...   0  13  13  6   7   1   1   2  NaN   0  
2  31  ...   1   1   2  7   5   1   1   4  NaN   0  

[3 rows x 49 columns]
Read 700001 rows.


TypeError: '<' not supported between instances of 'str' and 'int'

In [68]:
df.dtypes

srcip               float64
sport               float64
dstip               float64
dsport               object
dur                 float64
sbyte               float64
dbytes              float64
sttl                float64
dttl                float64
sloss               float64
dloss               float64
Sload               float64
Dload               float64
Spkts               float64
Dpkts               float64
swin                float64
dwin                float64
stcpb               float64
dtcpb               float64
smeansz             float64
dmeansz             float64
trans               float64
res                 float64
Sjit                float64
Djit                float64
Stime                 int64
Ltime                 int64
Sintpkt             float64
Dintpkt             float64
tcprtt              float64
                     ...   
proto-xtp             uint8
proto-zero            uint8
state-ACC             uint8
state-CLO             uint8
state-CON           

In [69]:
import pandas as pd
import io
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import tensorflow.contrib.learn as skflow
import string


#from sklearn.utils.multiclass import unique_labels
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_curve, auc, average_precision_score, precision_recall_curve
from inspect import signature
%matplotlib inline

path = "UNSW-NB15_2.csv"
# This file is a CSV, just no CSV extension or headers
df = pd.read_csv(path, header=None)
print(df[0:3])

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
 # For now, just drop NA's (rows with missing values)

# The CSV file has no column heads, so add them
df.columns = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbyte',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans',
    'res',
    'Sjit',
    'Djit',
    'Stime', #QUITAR
    'Ltime', #QUITAR
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm', 
    'ct_src_dport_ltm',
    'ct_dst_sport_lt', 
    'ct_dst_src_ltm',
    'attack_cat',
    'Label'
]

df.dtypes

  interactivity=interactivity, compiler=compiler, result=result)


           0      1              2      3    4    5         6     7      8   \
0  59.166.0.0   6055  149.171.126.5  54145  tcp  FIN  0.072974  4238  60788   
1  59.166.0.0   7832  149.171.126.3   5607  tcp  FIN  0.144951  5174  91072   
2  59.166.0.8  11397  149.171.126.6     21  tcp  FIN  0.116107  2934   3742   

   9   ...  39  40  41 42  43  44  45  46   47  48  
0  31  ...   0  13  13  6   7   1   1   2  NaN   0  
1  31  ...   0  13  13  6   7   1   1   2  NaN   0  
2  31  ...   1   1   2  7   5   1   1   4  NaN   0  

[3 rows x 49 columns]
Read 700001 rows.


srcip                object
sport                 int64
dstip                object
dsport               object
proto                object
state                object
dur                 float64
sbyte                 int64
dbytes                int64
sttl                  int64
dttl                  int64
sloss                 int64
dloss                 int64
service              object
Sload               float64
Dload               float64
Spkts                 int64
Dpkts                 int64
swin                  int64
dwin                  int64
stcpb                 int64
dtcpb                 int64
smeansz               int64
dmeansz               int64
trans                 int64
res                   int64
Sjit                float64
Djit                float64
Stime                 int64
Ltime                 int64
Sintpkt             float64
Dintpkt             float64
tcprtt              float64
synack              float64
ackdat              float64
is_sm_ips_ports     

In [70]:

ENCODING = 'utf-8'

def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))
        
def analyze(filename):
    print()
    print("Analyzing: {}".format(filename))
    df = pd.read_csv(filename,encoding=ENCODING)
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])

#analyze(path)


# display 5 rows
# df[0:5]

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

#Clean 'date' column and convert to Int type
def clean_date(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_removed = s.replace(" ", "")
    s_int = int(s_removed)
    return s_int

########## CLEAN IP #######################
def clean_ip(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_int = int(s)
    return s_int

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

#MINMAX -1 1
# Encode a column to a range between normalized_low and normalized_high.
def min_max_1(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

#MINMAX 0 1
def min_max_0(df, name, normalized_low=0, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

    
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    # Regression
    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')




df['srcip'] = df['srcip'].apply(clean_ip)
df['dstip'] = df['dstip'].apply(clean_ip)

df['sport'] = df['sport'].astype('int64')
#df['dsport'] = df['dsport'].astype('int64')

# Now encode the feature vector

min_max_0(df, 'srcip')#
min_max_0(df, 'sport')

min_max_0(df, 'dstip') #
#min_max_0(df, 'dsport')

encode_text_dummy(df, 'proto') #
encode_text_dummy(df, 'state')

min_max_0(df, 'dur')

min_max_0(df, 'sbyte') #
min_max_0(df, 'dbytes')#
min_max_0(df, 'sttl')#
min_max_0(df, 'dttl')#
min_max_0(df, 'sloss')#
min_max_0(df, 'dloss')#
encode_text_dummy(df, 'service') #
min_max_0(df, 'Sload')

min_max_0(df, 'Dload')

min_max_0(df, 'Spkts')#
min_max_0(df, 'Dpkts')#
min_max_0(df, 'swin')#
min_max_0(df, 'dwin')#
min_max_0(df, 'stcpb')#
min_max_0(df, 'dtcpb')#
min_max_0(df, 'smeansz')#
min_max_0(df, 'dmeansz')#
min_max_0(df, 'trans')#
min_max_0(df, 'res')#
min_max_0(df, 'Sjit')#
min_max_0(df, 'Djit')#

min_max_0(df, 'Sintpkt')#
min_max_0(df, 'Dintpkt')#
min_max_0(df, 'tcprtt')#
min_max_0(df, 'synack')#
min_max_0(df, 'ackdat')#
min_max_0(df, 'is_sm_ips_ports')#ojo
min_max_0(df, 'ct_state_ttl')#
#min_max_0(df, 'ct_flw_http_mthd')#
#min_max_0(df, 'is_ftp_login')#ojo
#min_max_0(df, 'ct_ftp_cmd')#
min_text_dummy(df, 'ct_ftp_cmd')#
min_max_0(df, 'ct_srv_src')#
min_max_0(df, 'ct_srv_dst')#
min_max_0(df, 'ct_dst_ltm')#
min_max_0(df, 'ct_src_ltm')

min_max_0(df, 'ct_src_dport_ltm')#
min_max_0(df, 'ct_dst_sport_lt')#
min_max_0(df, 'ct_dst_src_ltm')#

#encode_text_dummy(df, 'attack_cat')

outcomes = encode_text_index(df, 'Label')#
num_classes = len(outcomes)

# display 5 rows

#df.dropna(inplace=True,axis=1)
df[0:5]

NameError: name 'min_text_dummy' is not defined

In [74]:
import pandas as pd
import io
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import tensorflow.contrib.learn as skflow
import string


#from sklearn.utils.multiclass import unique_labels
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_curve, auc, average_precision_score, precision_recall_curve
from inspect import signature
%matplotlib inline

path = "UNSW-NB15_2.csv"
# This file is a CSV, just no CSV extension or headers
df = pd.read_csv(path, header=None)
print(df[0:3])

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
 # For now, just drop NA's (rows with missing values)

# The CSV file has no column heads, so add them
df.columns = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbyte',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans',
    'res',
    'Sjit',
    'Djit',
    'Stime', #QUITAR
    'Ltime', #QUITAR
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm', 
    'ct_src_dport_ltm',
    'ct_dst_sport_lt', 
    'ct_dst_src_ltm',
    'attack_cat',
    'Label'
]

df[0:5]

#df.drop('dsport', 1, inplace=True)#
# df.drop('state', 1, inplace=True)#
# df.drop('dur', 1, inplace=True)#
# df.drop('Sload', 1, inplace=True)#
# df.drop('Dload', 1, inplace=True)#
df.drop('Stime', 1, inplace=True)#
df.drop('Ltime', 1, inplace=True)#
# df.drop('ct_src_ltm', 1, inplace=True)#

ENCODING = 'utf-8'

def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))
        
def analyze(filename):
    print()
    print("Analyzing: {}".format(filename))
    df = pd.read_csv(filename,encoding=ENCODING)
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])

#analyze(path)


# display 5 rows
# df[0:5]

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

#Clean 'date' column and convert to Int type
def clean_date(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_removed = s.replace(" ", "")
    s_int = int(s_removed)
    return s_int

########## CLEAN IP #######################
def clean_ip(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_int = int(s)
    return s_int

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

#MINMAX -1 1
# Encode a column to a range between normalized_low and normalized_high.
def min_max_1(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

#MINMAX 0 1
def min_max_0(df, name, normalized_low=0, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

    
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    # Regression
    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')




df['srcip'] = df['srcip'].apply(clean_ip)
df['dstip'] = df['dstip'].apply(clean_ip)

df['sport'] = df['sport'].astype('int64')
#df['dsport'] = df['dsport'].astype('int64')

# Now encode the feature vector

min_max_0(df, 'srcip')#
min_max_0(df, 'sport')

min_max_0(df, 'dstip') #
#min_max_0(df, 'dsport')

encode_text_dummy(df, 'proto') #
encode_text_dummy(df, 'state')

min_max_0(df, 'dur')

min_max_0(df, 'sbyte') #
min_max_0(df, 'dbytes')#
min_max_0(df, 'sttl')#
min_max_0(df, 'dttl')#
min_max_0(df, 'sloss')#
min_max_0(df, 'dloss')#
encode_text_dummy(df, 'service') #
min_max_0(df, 'Sload')

min_max_0(df, 'Dload')

min_max_0(df, 'Spkts')#
min_max_0(df, 'Dpkts')#
min_max_0(df, 'swin')#
min_max_0(df, 'dwin')#
min_max_0(df, 'stcpb')#
min_max_0(df, 'dtcpb')#
min_max_0(df, 'smeansz')#
min_max_0(df, 'dmeansz')#
min_max_0(df, 'trans')#
min_max_0(df, 'res')#
min_max_0(df, 'Sjit')#
min_max_0(df, 'Djit')#

min_max_0(df, 'Sintpkt')#
min_max_0(df, 'Dintpkt')#
min_max_0(df, 'tcprtt')#
min_max_0(df, 'synack')#
min_max_0(df, 'ackdat')#
min_max_0(df, 'is_sm_ips_ports')#ojo
min_max_0(df, 'ct_state_ttl')#
#min_max_0(df, 'ct_flw_http_mthd')#
#min_max_0(df, 'is_ftp_login')#ojo
#min_max_0(df, 'ct_ftp_cmd')#
encode_text_dummy(df, 'ct_ftp_cmd')#
min_max_0(df, 'ct_srv_src')#
min_max_0(df, 'ct_srv_dst')#
min_max_0(df, 'ct_dst_ltm')#
min_max_0(df, 'ct_src_ltm')

min_max_0(df, 'ct_src_dport_ltm')#
min_max_0(df, 'ct_dst_sport_lt')#
min_max_0(df, 'ct_dst_src_ltm')#

#encode_text_dummy(df, 'attack_cat')

outcomes = encode_text_index(df, 'Label')#
num_classes = len(outcomes)


           0      1              2      3    4    5         6     7      8   \
0  59.166.0.0   6055  149.171.126.5  54145  tcp  FIN  0.072974  4238  60788   
1  59.166.0.0   7832  149.171.126.3   5607  tcp  FIN  0.144951  5174  91072   
2  59.166.0.8  11397  149.171.126.6     21  tcp  FIN  0.116107  2934   3742   

   9   ...  39  40  41 42  43  44  45  46   47  48  
0  31  ...   0  13  13  6   7   1   1   2  NaN   0  
1  31  ...   0  13  13  6   7   1   1   2  NaN   0  
2  31  ...   1   1   2  7   5   1   1   4  NaN   0  

[3 rows x 49 columns]
Read 700001 rows.


In [73]:
x, y = to_xy(df,'Label')
#################################################



# Create a test/train split.  25% test
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42)

# Create neural net
model = Sequential()
model.add(Dense(10, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(50, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(10, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
model.add(Dense(y.shape[1],activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=5, verbose=1, mode='auto')
history = model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=2,epochs=1000)


# Measure accuracy
pred = model.predict(x_test)
pred = np.argmax(pred,axis=1)
y_eval = np.argmax(y_test,axis=1)
score = metrics.accuracy_score(y_eval, pred)
print("Validation score: {}".format(score))

print(outcomes)

### PLOT ACCURACY ####

plt.plot(np.arange(len(history.history['acc'])),
history.history['acc'], label='training')
plt.plot(np.arange(len(history.history['val_acc'])),
history.history['val_acc'], label='validation')
plt.title('Accuracy')
plt.xlabel('epochs')
plt.ylabel('accuracy ')
plt.legend(loc=0)
plt.show()

### PLOT CONFUSION MATRIX ###

# Not normalized
cm = confusion_matrix(y_eval, pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()
plot_confusion_matrix(cm, outcomes)

# Normalize the confusion matrix by row (i.e by the number of samples
# in each class)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Normalized confusion matrix')
print(cm_normalized)
plt.figure()
plot_confusion_matrix(cm_normalized, outcomes, title='Normalized confusion matrix')

plt.show()


### PLOT ROC ###

#Plot an ROC. pred - the predictions, y - the expected outpus.
#En mi caso creo que es pred == pred y== y_eval
def plot_roc(pred, y):
    fpr, tpr, _ = roc_curve(y, pred)
    roc_auc = auc(fpr,tpr)

    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
    plt.show()

plot_roc(pred, y_eval)


### PRECISION-RECALL ###

average_precision = average_precision_score(y_eval, pred)

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))



precision, recall, _ = precision_recall_curve(y_eval, pred)

# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
step_kwargs = ({'step': 'post'}
               if 'step' in signature(plt.fill_between).parameters
               else {})
plt.step(recall, precision, color='b', alpha=0.2,
         where='post')
plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))

ValueError: could not convert string to float: '0xcc09'

In [75]:
df[0:10]

Unnamed: 0,srcip,sport,dstip,dsport,dur,sbyte,dbytes,sttl,dttl,sloss,...,service-radius,service-smtp,service-snmp,service-ssh,service-ssl,ct_ftp_cmd-0,ct_ftp_cmd-1,ct_ftp_cmd-2,ct_ftp_cmd-4,ct_ftp_cmd-
0,0.000327,0.092393,0.077615,54145,0.001216,0.000295,0.004147,0.121569,0.114173,0.001316,...,0,0,0,0,0,0,0,0,0,0
1,0.000327,0.119509,0.077615,5607,0.002416,0.00036,0.006213,0.121569,0.114173,0.001316,...,0,0,0,0,0,0,0,0,0,0
2,0.000327,0.173907,0.077615,21,0.001935,0.000204,0.000255,0.121569,0.114173,0.002068,...,0,0,0,0,0,0,0,0,0,0
3,0.000327,0.058045,0.077615,53,1.6e-05,1e-05,1.2e-05,0.121569,0.114173,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.000327,0.218799,0.077615,14724,0.000641,0.000622,2.2e-05,0.121569,0.114173,0.000752,...,0,0,0,0,0,0,0,0,0,0
5,0.000327,0.596536,0.077615,53,1.7e-05,9e-06,1.1e-05,0.121569,0.114173,0.0,...,0,0,0,0,0,0,0,0,0,0
6,0.000327,0.165484,0.077615,5190,9.4e-05,7.4e-05,0.000154,0.121569,0.114173,0.000752,...,0,0,0,0,0,0,0,0,0,0
7,0.000327,0.696452,0.077615,80,0.000334,7.2e-05,5.6e-05,0.121569,0.114173,0.000376,...,0,0,0,0,0,0,0,0,0,0
8,0.000327,0.029465,0.077615,6881,0.0546,0.000959,0.037402,0.121569,0.114173,0.003948,...,0,0,0,0,0,0,0,0,0,0
9,0.000327,0.392523,0.077615,5190,0.002555,8.9e-05,0.000175,0.121569,0.114173,0.000752,...,0,0,0,0,0,0,0,0,0,0


In [79]:
import pandas as pd
import io
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import tensorflow.contrib.learn as skflow
import string


#from sklearn.utils.multiclass import unique_labels
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_curve, auc, average_precision_score, precision_recall_curve
from inspect import signature
#%matplotlib inline

path = "UNSW-NB15_1.csv"
# This file is a CSV, just no CSV extension or headers
df = pd.read_csv(path, header=None)
print(df[0:3])

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
 # For now, just drop NA's (rows with missing values)

# The CSV file has no column heads, so add them
df.columns = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbyte',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans',
    'res',
    'Sjit',
    'Djit',
    'Stime', #QUITAR
    'Ltime', #QUITAR
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm', 
    'ct_src_dport_ltm',
    'ct_dst_sport_lt', 
    'ct_dst_src_ltm',
    'attack_cat',
    'Label'
]

df[0:5]

#df.drop('dsport', 1, inplace=True)#
# df.drop('state', 1, inplace=True)#
# df.drop('dur', 1, inplace=True)#
# df.drop('Sload', 1, inplace=True)#
# df.drop('Dload', 1, inplace=True)#
df.drop('Stime', 1, inplace=True)#
df.drop('Ltime', 1, inplace=True)#
# df.drop('ct_src_ltm', 1, inplace=True)#

ENCODING = 'utf-8'

def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))
        
def analyze(filename):
    print()
    print("Analyzing: {}".format(filename))
    df = pd.read_csv(filename,encoding=ENCODING)
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])

#analyze(path)


# display 5 rows
# df[0:5]

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

#Clean 'date' column and convert to Int type
def clean_date(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_removed = s.replace(" ", "")
    s_int = int(s_removed)
    return s_int

########## CLEAN IP #######################
def clean_ip(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_int = int(s)
    return s_int

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

#MINMAX -1 1
# Encode a column to a range between normalized_low and normalized_high.
def min_max_1(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

#MINMAX 0 1
def min_max_0(df, name, normalized_low=0, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

    
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    # Regression
    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')




df['srcip'] = df['srcip'].apply(clean_ip)
df['dstip'] = df['dstip'].apply(clean_ip)

df['sport'] = df['sport'].astype('int64')
#df['dsport'] = df['dsport'].astype('int64')

# Now encode the feature vector

min_max_0(df, 'srcip')#
min_max_0(df, 'sport')

min_max_0(df, 'dstip') #
#min_max_0(df, 'dsport')

encode_text_dummy(df, 'proto') #
encode_text_dummy(df, 'state')

min_max_0(df, 'dur')

min_max_0(df, 'sbyte') #
min_max_0(df, 'dbytes')#
min_max_0(df, 'sttl')#
min_max_0(df, 'dttl')#
min_max_0(df, 'sloss')#
min_max_0(df, 'dloss')#
encode_text_dummy(df, 'service') #
min_max_0(df, 'Sload')

min_max_0(df, 'Dload')

min_max_0(df, 'Spkts')#
min_max_0(df, 'Dpkts')#
min_max_0(df, 'swin')#
min_max_0(df, 'dwin')#
min_max_0(df, 'stcpb')#
min_max_0(df, 'dtcpb')#
min_max_0(df, 'smeansz')#
min_max_0(df, 'dmeansz')#
min_max_0(df, 'trans')#
min_max_0(df, 'res')#
min_max_0(df, 'Sjit')#
min_max_0(df, 'Djit')#

min_max_0(df, 'Sintpkt')#
min_max_0(df, 'Dintpkt')#
min_max_0(df, 'tcprtt')#
min_max_0(df, 'synack')#
min_max_0(df, 'ackdat')#
min_max_0(df, 'is_sm_ips_ports')#ojo
min_max_0(df, 'ct_state_ttl')#
#min_max_0(df, 'ct_flw_http_mthd')#
#min_max_0(df, 'is_ftp_login')#ojo
#min_max_0(df, 'ct_ftp_cmd')#
encode_text_dummy(df, 'ct_ftp_cmd')#
min_max_0(df, 'ct_srv_src')#
min_max_0(df, 'ct_srv_dst')#
min_max_0(df, 'ct_dst_ltm')#
min_max_0(df, 'ct_src_ltm')

min_max_0(df, 'ct_src_dport_ltm')#
min_max_0(df, 'ct_dst_sport_lt')#
min_max_0(df, 'ct_dst_src_ltm')#

#encode_text_dummy(df, 'attack_cat')

outcomes = encode_text_index(df, 'Label')#
num_classes = len(outcomes)

# display 5 rows

#df.dropna(inplace=True,axis=1)
df[0:10]


##################################################
# Break into X (predictors) & y (prediction)
x, y = to_xy(df,'Label')
#################################################



# Create a test/train split.  25% test
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42)

# Create neural net
model = Sequential()
model.add(Dense(10, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(50, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(10, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
model.add(Dense(y.shape[1],activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=5, verbose=1, mode='auto')
history = model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=2,epochs=1000)


# Measure accuracy
pred = model.predict(x_test)
pred = np.argmax(pred,axis=1)
y_eval = np.argmax(y_test,axis=1)
score = metrics.accuracy_score(y_eval, pred)
print("Validation score: {}".format(score))

print(outcomes)

### PLOT ACCURACY ####

plt.plot(np.arange(len(history.history['acc'])),
history.history['acc'], label='training')
plt.plot(np.arange(len(history.history['val_acc'])),
history.history['val_acc'], label='validation')
plt.title('Accuracy')
plt.xlabel('epochs')
plt.ylabel('accuracy ')
plt.legend(loc=0)
plt.show()

### PLOT CONFUSION MATRIX ###

# Not normalized
cm = confusion_matrix(y_eval, pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()
plot_confusion_matrix(cm, outcomes)

# Normalize the confusion matrix by row (i.e by the number of samples
# in each class)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Normalized confusion matrix')
print(cm_normalized)
plt.figure()
plot_confusion_matrix(cm_normalized, outcomes, title='Normalized confusion matrix')

plt.show()


### PLOT ROC ###

#Plot an ROC. pred - the predictions, y - the expected outpus.
#En mi caso creo que es pred == pred y== y_eval
def plot_roc(pred, y):
    fpr, tpr, _ = roc_curve(y, pred)
    roc_auc = auc(fpr,tpr)

    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
    plt.show()

plot_roc(pred, y_eval)


### PRECISION-RECALL ###

average_precision = average_precision_score(y_eval, pred)

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))



precision, recall, _ = precision_recall_curve(y_eval, pred)

# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
step_kwargs = ({'step': 'post'}
               if 'step' in signature(plt.fill_between).parameters
               else {})
plt.step(recall, precision, color='b', alpha=0.2,
         where='post')
plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))

  interactivity=interactivity, compiler=compiler, result=result)


           0      1              2     3    4    5         6    7    8   9   \
0  59.166.0.0   1390  149.171.126.6    53  udp  CON  0.001055  132  164  31   
1  59.166.0.0  33661  149.171.126.9  1024  udp  CON  0.036133  528  304  31   
2  59.166.0.6   1464  149.171.126.7    53  udp  CON  0.001119  146  178  31   

   ...  39  40  41 42  43  44  45  46   47  48  
0  ...   0   3   7  1   3   1   1   1  NaN   0  
1  ...   0   2   4  2   3   1   1   2  NaN   0  
2  ...   0  12   8  1   2   2   1   1  NaN   0  

[3 rows x 49 columns]
Read 700001 rows.


ValueError: invalid literal for int() with base 10: '0x000b'

In [77]:
df[0:10]

Unnamed: 0,srcip,sport,dstip,dsport,dur,sbyte,dbytes,sttl,dttl,sloss,...,service-radius,service-smtp,service-snmp,service-ssh,service-ssl,ct_ftp_cmd-0,ct_ftp_cmd-1,ct_ftp_cmd-2,ct_ftp_cmd-4,ct_ftp_cmd-
0,0.000327,0.092393,0.077615,54145,0.001216,0.000295,0.004147,0.121569,0.114173,0.001316,...,0,0,0,0,0,0,0,0,0,0
1,0.000327,0.119509,0.077615,5607,0.002416,0.00036,0.006213,0.121569,0.114173,0.001316,...,0,0,0,0,0,0,0,0,0,0
2,0.000327,0.173907,0.077615,21,0.001935,0.000204,0.000255,0.121569,0.114173,0.002068,...,0,0,0,0,0,0,0,0,0,0
3,0.000327,0.058045,0.077615,53,1.6e-05,1e-05,1.2e-05,0.121569,0.114173,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.000327,0.218799,0.077615,14724,0.000641,0.000622,2.2e-05,0.121569,0.114173,0.000752,...,0,0,0,0,0,0,0,0,0,0
5,0.000327,0.596536,0.077615,53,1.7e-05,9e-06,1.1e-05,0.121569,0.114173,0.0,...,0,0,0,0,0,0,0,0,0,0
6,0.000327,0.165484,0.077615,5190,9.4e-05,7.4e-05,0.000154,0.121569,0.114173,0.000752,...,0,0,0,0,0,0,0,0,0,0
7,0.000327,0.696452,0.077615,80,0.000334,7.2e-05,5.6e-05,0.121569,0.114173,0.000376,...,0,0,0,0,0,0,0,0,0,0
8,0.000327,0.029465,0.077615,6881,0.0546,0.000959,0.037402,0.121569,0.114173,0.003948,...,0,0,0,0,0,0,0,0,0,0
9,0.000327,0.392523,0.077615,5190,0.002555,8.9e-05,0.000175,0.121569,0.114173,0.000752,...,0,0,0,0,0,0,0,0,0,0


In [78]:
df['ct']

0         0
1         0
2         0
3         0
4         0
5         0
6         0
7         0
8         0
9         0
10        0
11        0
12        0
13        0
14        0
15        0
16        0
17        0
18        0
19        0
20        0
21        0
22        0
23        0
24        0
25        0
26        0
27        0
28        0
29        0
         ..
699971    0
699972    0
699973    0
699974    0
699975    0
699976    0
699977    0
699978    0
699979    0
699980    0
699981    0
699982    0
699983    0
699984    0
699985    1
699986    1
699987    0
699988    0
699989    0
699990    0
699991    0
699992    0
699993    0
699994    0
699995    0
699996    0
699997    0
699998    0
699999    0
700000    0
Name: Label, Length: 700001, dtype: int64

In [80]:
df.dtypes


srcip                 int64
sport                object
dstip                 int64
dsport               object
proto                object
state                object
dur                 float64
sbyte                 int64
dbytes                int64
sttl                  int64
dttl                  int64
sloss                 int64
dloss                 int64
service              object
Sload               float64
Dload               float64
Spkts                 int64
Dpkts                 int64
swin                  int64
dwin                  int64
stcpb                 int64
dtcpb                 int64
smeansz               int64
dmeansz               int64
trans                 int64
res                   int64
Sjit                float64
Djit                float64
Sintpkt             float64
Dintpkt             float64
tcprtt              float64
synack              float64
ackdat              float64
is_sm_ips_ports       int64
ct_state_ttl          int64
ct_flw_http_mthd    

In [81]:
import pandas as pd
import io
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import pylab as pl
import tensorflow.contrib.learn as skflow
import string


#from sklearn.utils.multiclass import unique_labels
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.callbacks import EarlyStopping
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_curve, auc, average_precision_score, precision_recall_curve
from inspect import signature
#%matplotlib inline

path = "UNSW-NB15_1.csv"
# This file is a CSV, just no CSV extension or headers
df = pd.read_csv(path, header=None, na_values = '0x')
print(df[0:3])

print("Read {} rows.".format(len(df)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset
 # For now, just drop NA's (rows with missing values)

# The CSV file has no column heads, so add them
df.columns = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    'state',
    'dur',
    'sbyte',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans',
    'res',
    'Sjit',
    'Djit',
    'Stime', #QUITAR
    'Ltime', #QUITAR
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm', 
    'ct_src_dport_ltm',
    'ct_dst_sport_lt', 
    'ct_dst_src_ltm',
    'attack_cat',
    'Label'
]

df[0:5]

#df.drop('dsport', 1, inplace=True)#
# df.drop('state', 1, inplace=True)#
# df.drop('dur', 1, inplace=True)#
# df.drop('Sload', 1, inplace=True)#
# df.drop('Dload', 1, inplace=True)#
df.drop('Stime', 1, inplace=True)#
df.drop('Ltime', 1, inplace=True)#
# df.drop('ct_src_ltm', 1, inplace=True)#

ENCODING = 'utf-8'

def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))
        
def analyze(filename):
    print()
    print("Analyzing: {}".format(filename))
    df = pd.read_csv(filename,encoding=ENCODING)
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])

#analyze(path)


# display 5 rows
# df[0:5]

# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

#Clean 'date' column and convert to Int type
def clean_date(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_removed = s.replace(" ", "")
    s_int = int(s_removed)
    return s_int

########## CLEAN IP #######################
def clean_ip(s):
    s = ''.join([i for i in s if i not in frozenset(string.punctuation)])
    s_int = int(s)
    return s_int

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

#MINMAX -1 1
# Encode a column to a range between normalized_low and normalized_high.
def min_max_1(df, name, normalized_low=-1, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

#MINMAX 0 1
def min_max_0(df, name, normalized_low=0, normalized_high=1,
                         data_low=None, data_high=None):
    if data_low is None:
        data_low = min(df[name])
        data_high = max(df[name])

    df[name] = ((df[name] - data_low) / (data_high - data_low)) \
        * (normalized_high - normalized_low) + normalized_low

    
# Encode text values to indexes(i.e. [1],[2],[3] for red,green,blue).
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_

# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    # Regression
    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)

# Plot a confusion matrix.
# cm is the confusion matrix, names are the names of the classes.
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')




df['srcip'] = df['srcip'].apply(clean_ip)
df['dstip'] = df['dstip'].apply(clean_ip)

df['sport'] = df['sport'].astype('int64')
#df['dsport'] = df['dsport'].astype('int64')

# Now encode the feature vector

min_max_0(df, 'srcip')#
min_max_0(df, 'sport')

min_max_0(df, 'dstip') #
#min_max_0(df, 'dsport')

encode_text_dummy(df, 'proto') #
encode_text_dummy(df, 'state')

min_max_0(df, 'dur')

min_max_0(df, 'sbyte') #
min_max_0(df, 'dbytes')#
min_max_0(df, 'sttl')#
min_max_0(df, 'dttl')#
min_max_0(df, 'sloss')#
min_max_0(df, 'dloss')#
encode_text_dummy(df, 'service') #
min_max_0(df, 'Sload')

min_max_0(df, 'Dload')

min_max_0(df, 'Spkts')#
min_max_0(df, 'Dpkts')#
min_max_0(df, 'swin')#
min_max_0(df, 'dwin')#
min_max_0(df, 'stcpb')#
min_max_0(df, 'dtcpb')#
min_max_0(df, 'smeansz')#
min_max_0(df, 'dmeansz')#
min_max_0(df, 'trans')#
min_max_0(df, 'res')#
min_max_0(df, 'Sjit')#
min_max_0(df, 'Djit')#

min_max_0(df, 'Sintpkt')#
min_max_0(df, 'Dintpkt')#
min_max_0(df, 'tcprtt')#
min_max_0(df, 'synack')#
min_max_0(df, 'ackdat')#
min_max_0(df, 'is_sm_ips_ports')#ojo
min_max_0(df, 'ct_state_ttl')#
#min_max_0(df, 'ct_flw_http_mthd')#
#min_max_0(df, 'is_ftp_login')#ojo
#min_max_0(df, 'ct_ftp_cmd')#
encode_text_dummy(df, 'ct_ftp_cmd')#
min_max_0(df, 'ct_srv_src')#
min_max_0(df, 'ct_srv_dst')#
min_max_0(df, 'ct_dst_ltm')#
min_max_0(df, 'ct_src_ltm')

min_max_0(df, 'ct_src_dport_ltm')#
min_max_0(df, 'ct_dst_sport_lt')#
min_max_0(df, 'ct_dst_src_ltm')#

#encode_text_dummy(df, 'attack_cat')

outcomes = encode_text_index(df, 'Label')#
num_classes = len(outcomes)

# display 5 rows

#df.dropna(inplace=True,axis=1)
df[0:10]


##################################################
# Break into X (predictors) & y (prediction)
x, y = to_xy(df,'Label')
#################################################



# Create a test/train split.  25% test
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42)

# Create neural net
model = Sequential()
model.add(Dense(10, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(50, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(10, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
model.add(Dense(y.shape[1],activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=5, verbose=1, mode='auto')
history = model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=2,epochs=1000)


# Measure accuracy
pred = model.predict(x_test)
pred = np.argmax(pred,axis=1)
y_eval = np.argmax(y_test,axis=1)
score = metrics.accuracy_score(y_eval, pred)
print("Validation score: {}".format(score))

print(outcomes)

### PLOT ACCURACY ####

plt.plot(np.arange(len(history.history['acc'])),
history.history['acc'], label='training')
plt.plot(np.arange(len(history.history['val_acc'])),
history.history['val_acc'], label='validation')
plt.title('Accuracy')
plt.xlabel('epochs')
plt.ylabel('accuracy ')
plt.legend(loc=0)
plt.show()

### PLOT CONFUSION MATRIX ###

# Not normalized
cm = confusion_matrix(y_eval, pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()
plot_confusion_matrix(cm, outcomes)

# Normalize the confusion matrix by row (i.e by the number of samples
# in each class)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Normalized confusion matrix')
print(cm_normalized)
plt.figure()
plot_confusion_matrix(cm_normalized, outcomes, title='Normalized confusion matrix')

plt.show()


### PLOT ROC ###

#Plot an ROC. pred - the predictions, y - the expected outpus.
#En mi caso creo que es pred == pred y== y_eval
def plot_roc(pred, y):
    fpr, tpr, _ = roc_curve(y, pred)
    roc_auc = auc(fpr,tpr)

    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")
    plt.show()

plot_roc(pred, y_eval)


### PRECISION-RECALL ###

average_precision = average_precision_score(y_eval, pred)

print('Average precision-recall score: {0:0.2f}'.format(
      average_precision))



precision, recall, _ = precision_recall_curve(y_eval, pred)

# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
step_kwargs = ({'step': 'post'}
               if 'step' in signature(plt.fill_between).parameters
               else {})
plt.step(recall, precision, color='b', alpha=0.2,
         where='post')
plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))

           0      1              2     3    4    5         6    7    8   9   \
0  59.166.0.0   1390  149.171.126.6    53  udp  CON  0.001055  132  164  31   
1  59.166.0.0  33661  149.171.126.9  1024  udp  CON  0.036133  528  304  31   
2  59.166.0.6   1464  149.171.126.7    53  udp  CON  0.001119  146  178  31   

   ...  39  40  41 42  43  44  45  46   47  48  
0  ...   0   3   7  1   3   1   1   1  NaN   0  
1  ...   0   2   4  2   3   1   1   2  NaN   0  
2  ...   0  12   8  1   2   2   1   1  NaN   0  

[3 rows x 49 columns]
Read 700001 rows.


ValueError: invalid literal for int() with base 10: '0x000b'