# Flow Classification Task

In [23]:
from collections import Counter
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import math
import numpy as np
import seaborn as sns
import pandas as pd
import random
import collections
import string
import time

### Load data

In [5]:
#Load preprocessed data
data=pd.read_csv('numeric_data_scenario10.csv',sep='\t')
data.shape

(645357, 12)

In [42]:
Label_list=[]
Source_list=[]
Des_list=[]
for index, row in data.iterrows():
    if(row['Labels']=='Botnet'):
        Label_list.append(1)
    else:
        Label_list.append(0)
    Source_list.append(row['SrcIPAddr:Port'].split(':')[0])
    #print(row['SrcIPAddr:Port'].split(':')[0])
data['Int_label']=Label_list
data['SourceIP']=Source_list

In [76]:
data.describe()

Unnamed: 0.1,Unnamed: 0,Durat,Prot,Flags,PacketsBytes,Flows,Label,code,Int_label
count,645357.0,645357.0,645357.0,645357.0,645357.0,645357.0,645357.0,645357.0,645357.0
mean,322678.0,0.251275,10.347725,17.386899,0.416758,1.673449,1.0,7069.802872,0.501133
std,186298.663173,0.433747,9.414517,15.481148,0.493023,1.103632,0.0,5871.694814,0.499999
min,0.0,0.0,1.0,0.0,0.0,0.0,1.0,624.0,0.0
25%,161339.0,0.0,1.0,2.0,0.0,1.0,1.0,668.0,0.0
50%,322678.0,0.0,7.0,15.0,0.0,2.0,1.0,4416.0,1.0
75%,484017.0,1.0,20.0,34.0,1.0,3.0,1.0,12754.0,1.0
max,645356.0,1.0,20.0,80.0,1.0,3.0,1.0,14941.0,1.0


In [77]:
data.head()

Unnamed: 0.1,Unnamed: 0,Dateflowstart,Durat,Prot,SrcIPAddr:Port,DstIPAddr:Port,Flags,PacketsBytes,Flows,Label,Labels,code,Int_label,SourceIP
0,0,2011-08-18 10:19:13.392,0,1,147.32.86.110:48102,74.125.232.214:443,0,0,0,1,LEGITIMATE,624,0,147.32.86.110
1,1,2011-08-18 10:19:13.411,1,1,147.32.85.88:56949,147.32.80.13:80,0,1,3,1,LEGITIMATE,2503,0,147.32.85.88
2,2,2011-08-18 10:19:13.460,1,1,147.32.84.59:2768,74.125.108.208:80,0,1,3,1,LEGITIMATE,2503,0,147.32.84.59
3,3,2011-08-18 10:19:13.486,0,1,147.32.84.59:56058,74.125.232.215:443,0,0,0,1,LEGITIMATE,624,0,147.32.84.59
4,4,2011-08-18 10:19:13.689,1,1,147.32.84.164:34588,195.24.232.164:80,0,1,3,1,LEGITIMATE,2503,0,147.32.84.164


In [52]:
data_train, data_test, label_train, label_test = train_test_split(data[['SourceIP','Durat','Prot','Flags','PacketsBytes','PacketsBytes']], data['Int_label'], test_size = 0.3)
data_train_source=data_train['SourceIP']
data_test_source=data_test['SourceIP']
del data_train['SourceIP']
del data_test['SourceIP']

### Packet Level

In [53]:
print('##################################################')
print('#                Random Forest                   #')
print('##################################################')
RF=RandomForestClassifier(n_estimators=50 , n_jobs=2)
RF.fit(data_train,label_train)
predict_label=RF.predict(data_test)
#print(Counter(predict_label-label_test))
tn, fp, fn, tp = confusion_matrix(label_test, predict_label).ravel()
print('tp=',tp,'tn=',tn,'fp=',fp,'fn=',fn)
print('recall=',tp/(tp+fn),' precision=',tp/(tp+fp))
print('tpr=',tp/(tp+fn),' fpr=',fp/(fp+tn))

##################################################
#                Random Forest                   #
##################################################
tp= 95344 tn= 96375 fp= 330 fn= 1559
recall= 0.9839117467983447  precision= 0.9965507870476827
tpr= 0.9839117467983447  fpr= 0.0034124398945245852


In [104]:
print('##################################################')
print('#             Logistic Regression               #')
print('##################################################')
LR=LogisticRegression(C=10)
LR.fit(data_train,label_train)
predict_label=LR.predict(data_test)
#print(Counter(predict_label-label_test))
tn, fp, fn, tp = confusion_matrix(label_test, predict_label).ravel()
print('tp=',tp,'tn=',tn,'fp=',fp,'fn=',fn)
print('recall=',tp/(tp+fn),' precision=',tp/(tp+fp))
print('tpr=',tp/(tp+fn),' fpr=',fp/(fp+tn))

tp= 93614 tn= 96317 fp= 388 fn= 3289
recall= 0.9660588423475022  precision= 0.9958724282462075
tpr= 0.9660588423475022  fpr= 0.004012202057804663


### Host Level

Here we choose oneinfected IP source to train, and the rest known IP to test.
If one data of a host is regarded as infected, then we take that IP as a infected IP. 

In [98]:
infected_host='147.32.84.205'
host=['147.32.84.191','147.32.84.192','147.32.84.193','147.32.84.204','147.32.84.165','147.32.84.206','147.32.84.207','147.32.84.208','147.32.84.209','147.32.84.170','147.32.84.134','147.32.84.164']
train_set=data[data['SourceIP']==infected_host]
test_set=data[data['SourceIP']!=infected_host]
data_test_source=test_set['SourceIP']
train_label=train_set['Int_label']
test_label=test_set['Int_label']
train_set=train_set[['Durat','Prot','Flags','PacketsBytes','PacketsBytes']]
test_set=test_set[['Durat','Prot','Flags','PacketsBytes','PacketsBytes']]

In [99]:
print('##################################################')
print('#                Random Forest                   #')
print('##################################################')
RF=RandomForestClassifier(n_estimators=20 , n_jobs=3)
RF.fit(train_set,train_label)
predict_label=RF.predict(test_set)
#print(Counter(predict_label-label_test))
test_set['SourceIP']=data_test_source
test_set['predict_label']=predict_label
test_set['label']=test_label


In [108]:
true_label=[]
predicted_label=[]
for ip in host:
    subset=test_set[test_set['SourceIP']==ip]
    #subset.info()
    #print(subset['label'].value_counts())
    #print(subset['predict_label'].value_counts())
    if (1 in list(subset['label'])):
        true_label.append(1)
    else:
        true_label.append(0)
    if (1 in list(subset['predict_label'])):
        predicted_label.append(1)
    else:
        predicted_label.append(0)
        
tp=0
fp=0
tn=0
fn=0
for i in range(len(predicted_label)):
    if (true_label[i]==1 and predicted_label[i]==0):
        fn+=1
    if (true_label[i]==1 and predicted_label[i]==1):
        tp+=1
    if (true_label[i]==0 and predicted_label[i]==0):
        tn+=1
    if (true_label[i]==0 and predicted_label[i]==1):    
        fp+=1
        
print('tp=',tp,'tn=',tn,'fp=',fp,'fn=',fn)
print('recall=',tp/(tp+fn),' precision=',tp/(tp+fp))
print('tpr=',tp/(tp+fn),' fpr=',fp/(fp+tn))

tp= 9 tn= 0 fp= 3 fn= 0
recall= 1.0  precision= 0.75
tpr= 1.0  fpr= 1.0


In [113]:
infected_host='147.32.84.205'
host=['147.32.84.191','147.32.84.192','147.32.84.193','147.32.84.204','147.32.84.165','147.32.84.206','147.32.84.207','147.32.84.208','147.32.84.209','147.32.84.170','147.32.84.134','147.32.84.164']
train_set=data[data['SourceIP']==infected_host]
test_set=data[data['SourceIP']!=infected_host]
data_test_source=test_set['SourceIP']
train_label=train_set['Int_label']
test_label=test_set['Int_label']
train_set=train_set[['Durat','Prot','Flags','PacketsBytes','PacketsBytes']]
test_set=test_set[['Durat','Prot','Flags','PacketsBytes','PacketsBytes']]


In [114]:
LR=LogisticRegression(C=10)
LR.fit(train_set,train_label)
predict_label=LR.predict(test_set)
#print(Counter(predict_label-label_test))
test_set['SourceIP']=data_test_source
test_set['predict_label']=predict_label
test_set['label']=test_label

In [115]:
true_label=[]
predicted_label=[]
for ip in host:
    subset=test_set[test_set['SourceIP']==ip]
    #subset.info()
    #print(subset['label'].value_counts())
    #print(subset['predict_label'].value_counts())
    if (1 in list(subset['label'])):
        true_label.append(1)
    else:
        true_label.append(0)
    if (1 in list(subset['predict_label'])):
        predicted_label.append(1)
    else:
        predicted_label.append(0)
        
tp=0
fp=0
tn=0
fn=0
for i in range(len(predicted_label)):
    if (true_label[i]==1 and predicted_label[i]==0):
        fn+=1
    if (true_label[i]==1 and predicted_label[i]==1):
        tp+=1
    if (true_label[i]==0 and predicted_label[i]==0):
        tn+=1
    if (true_label[i]==0 and predicted_label[i]==1):    
        fp+=1
        
print('tp=',tp,'tn=',tn,'fp=',fp,'fn=',fn)
print('recall=',tp/(tp+fn),' precision=',tp/(tp+fp))
print('tpr=',tp/(tp+fn),' fpr=',fp/(fp+tn))
    

tp= 9 tn= 0 fp= 3 fn= 0
recall= 1.0  precision= 0.75
tpr= 1.0  fpr= 1.0
