In [132]:
#Netflows are total count of occurence of a src_ip in a particular window
import pandas as pd
import numpy as np
#Edit path to the file for extracting features
def getFeatures(file_path):
    df=pd.read_csv(file_path)
    temp = pd.get_dummies(df.Tag)
    df['attack'] = temp.Attack
    df['normal'] = temp.Normal
    del temp, df['Tag']
    group = df.groupby(['bucket','source'])
    features = group.sourcePort.nunique()
    features = pd.DataFrame(features)
    features.rename(columns={'sourcePort':'usrc_port'},inplace=True)
    features['udest_ip'] = group.destination.nunique()
    features['udest_port'] = group.destinationPort.nunique()
    features['netflows'] = group.destination.count()
    features['bytes'] = group.totalSourceBytes.sum()
    features['packets'] = group.totalSourcePackets.sum()
    features['attack'] = group.attack.sum()
    features['normal'] = group.normal.sum()
    features.reset_index(inplace=True) 
    return features
    
def featurestoArray(features):
    #Converting DF to np array
    features_array = features.copy()
    del features_array['attack'],features_array['normal'],features_array['source'],features_array['bucket']
    features_array = features_array.as_matrix()
    return features_array

def getClusterFeatures(labeled_features):
    group = labeled_features.groupby(['label'])
    clusterfeatures = group.bucket.count()
    clusterfeatures = pd.DataFrame(clusterfeatures)
    clusterfeatures.rename(columns={'bucket':'instances'},inplace=True) 
    clusterfeatures['netflows']=group.netflows.sum()
    clusterfeatures['avgnetflows']=group.netflows.mean()
    clusterfeatures['stdnetflows']=group.netflows.std()
    clusterfeatures['usrc_ip']=group.source.nunique()
    clusterfeatures['avgsrc_port']=group.usrc_port.mean()
    clusterfeatures['stdsrc_port']=group.usrc_port.std()
    clusterfeatures['avgdest_ip']=group.udest_ip.mean()
    clusterfeatures['stddest_ip']=group.udest_ip.std()
    clusterfeatures['avgdest_port']=group.udest_port.mean()
    clusterfeatures['stddest_port']=group.udest_port.std()
    clusterfeatures['avgbytes']=group.bytes.mean()
    clusterfeatures['stdbytes']=group.bytes.std()
    clusterfeatures['avgpackets']=group.packets.mean()
    clusterfeatures['stdpackets']=group.packets.std()
    clusterfeatures['attack']=group.attack.sum()
    clusterfeatures['normal']=group.normal.sum()
    #True = Botnet    False = Normal
    clusterfeatures['label'] = (group.attack.sum()/group.normal.sum()>0.01)
    return clusterfeatures

In [133]:
def getLabeledFeaturesGMM(features_array):
    X_train = features_array
    from sklearn import mixture
    #Number of clusters = number of components
    # Four covariance Type ['spherical', 'diag', 'tied', 'full'] iterations can be changed
    clf = mixture.GMM(n_components=40, covariance_type='full', n_iter=100)
    label = clf.fit_predict(X_train)
    return label           

def addLabelTofeatures(features, label):
    labeled_features = features.copy()
    labeled_features['label'] = label
    return labeled_features

In [134]:
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import Imputer 
features = getFeatures(file_path="/home/ankit/Desktop/MTP/working_directory/NDF/13Junflow_ndf.csv")
X_train, X_test = train_test_split(features, test_size=0.3,random_state=0)
#X_train = getFeatures(file_path="F:/MTP/working_directory/ndf3.csv")
#X_train

In [78]:
#Reading unlabelled test file
#X_test
# df=pd.read_csv("F:/MTP/working_directory/17Jan_df_bucket.csv")
# group = df.groupby(['bucket','SrcAddr'])
# X_test = group.Sport.nunique()
# X_test = pd.DataFrame(X_test)
# X_test['udest_ip'] = group.DstAddr.nunique()
# X_test['udest_port'] = group.Dport.nunique()
# X_test['netflows'] = group.DstAddr.count()
# X_test['bytes'] = group.TotBytes.sum()
# X_test['packets'] = group.sTos.sum()
# X_test['attack'] = group.sTos.sum() #dummy
# X_test['normal'] = group.sTos.sum() #dummy
# X_test.reset_index(inplace=True) 
# X_test.rename(columns={'Sport':'usrc_port'},inplace=True)
# X_test.rename(columns={'SrcAddr':'source'},inplace=True)
# X_test
#df

Converting Dataframe to numpy object

In [135]:
feature_array = featurestoArray(features=X_train)
label = getLabeledFeaturesGMM(features_array=feature_array)
X_train = addLabelTofeatures(features=X_train,label=label)
train_cluster_features = getClusterFeatures(labeled_features=X_train) 
train_labels = train_cluster_features['label']
del train_cluster_features['attack'],train_cluster_features['normal'],train_cluster_features['label']
train_cluster_features = Imputer.fit_transform(Imputer(strategy="most_frequent",axis=0),train_cluster_features)


feature_array = featurestoArray(features=X_test)
label = getLabeledFeaturesGMM(features_array=feature_array)
X_test = addLabelTofeatures(features=X_test,label=label)
test_cluster_features = getClusterFeatures(labeled_features=X_test)
test_labels = test_cluster_features['label']
del test_cluster_features['attack'],test_cluster_features['normal'],test_cluster_features['label'] 
test_cluster_features = Imputer.fit_transform(Imputer(strategy="most_frequent",axis=0),test_cluster_features)
del feature_array

In [136]:
train_cluster_features = train_cluster_features.astype(np.float32)
test_cluster_features = test_cluster_features.astype(np.float32)
train_labels = train_labels.astype(np.float32)
test_labels = test_labels.astype(np.float32)

In [137]:
#normalize cluster features
import sklearn.preprocessing
train_cluster_features= sklearn.preprocessing.normalize(train_cluster_features,axis=0)
test_cluster_features= sklearn.preprocessing.normalize(test_cluster_features,axis=0)

In [142]:
#from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)
clf.fit(train_cluster_features,train_labels)
cluster_label_predict=clf.predict(test_cluster_features)

In [151]:
from sklearn.metrics import confusion_matrix
#confusion_matrix(test_labels, label_predicted)
print sum(abs(label_predicted-test_labels))
temp = X_test.groupby(['label'])
t = temp.packets.count()
label_predicted = [-1 if e == 0 else e for e in label_predicted]
test_labels = [-1 if e == 0 else e for e in test_labels]

pred = [a*b for a,b in zip(label_predicted,t)]
act =  [a*b for a,b in zip(test_labels,t)]
pp =0
pn = 0
nn = 0
np = 0
for a,b in zip(pred,act):
    if a>0 and b>0:
        pp +=1;
    elif a<0 and b<0:
        nn += 1;
    elif a>0 and b<0:
        pn += 1;
    else :
        np += 1;
print(pp)
print(nn)
print(np)
print(pn)

TypeError: unsupported operand type(s) for -: 'list' and 'list'

In [147]:
X_test['pred'] = [label_predicted[x[0]] for x in zip(X_test['label'])]
X_test['act_label'] = [test_labels[x[0]] for x in zip(X_test['label'])]
X_test.source.unique()

array(['131.202.240.218', '192.168.1.105', '192.168.1.101',
       '192.168.4.119', '192.168.3.115', '192.168.2.111', '192.168.4.118',
       '192.168.2.113', '0.0.0.0', '192.168.3.117', '192.168.3.116',
       '192.168.3.114', '192.168.2.112', '192.168.2.108', '192.168.2.107',
       '192.168.5.123', '192.168.2.110', '192.168.5.122', '192.168.2.106',
       '192.168.4.120', '192.168.1.102', '192.168.1.104', '192.168.2.109',
       '192.168.1.103', '192.168.4.121', '131.202.243.90',
       '64.237.127.131', '142.167.88.44', '63.111.123.26',
       '210.188.199.237'], dtype=object)

In [148]:
#X_test.columns
malicious_IP = X_test.groupby(['pred']).get_group(1)['source']
malicious_IP.unique()

array(['192.168.4.119', '192.168.2.111', '192.168.2.112', '192.168.5.122',
       '192.168.4.120', '192.168.1.104', '192.168.2.109', '192.168.3.117',
       '192.168.3.115', '192.168.2.106', '192.168.1.101', '192.168.4.118',
       '192.168.2.110', '192.168.2.107', '192.168.4.121', '192.168.1.105',
       '192.168.1.102', '192.168.1.103', '0.0.0.0', '192.168.2.108',
       '192.168.2.113', '192.168.3.114', '192.168.3.116', '192.168.5.123',
       '142.167.88.44', '131.202.243.90'], dtype=object)

In [149]:
malicious_act = X_test.groupby(['act_label']).get_group(1)['source']
malicious_act.unique()

array(['131.202.240.218', '192.168.1.105', '192.168.3.115', '0.0.0.0',
       '192.168.3.117', '192.168.3.116', '192.168.3.114', '192.168.2.108',
       '192.168.5.123', '192.168.2.110', '192.168.2.107', '192.168.5.122',
       '192.168.2.112', '192.168.1.102', '192.168.4.119', '192.168.1.104',
       '192.168.2.106', '192.168.1.101', '192.168.2.113', '192.168.1.103',
       '192.168.4.118', '192.168.2.111', '192.168.4.121', '192.168.2.109',
       '192.168.4.120', '131.202.243.90', '64.237.127.131',
       '142.167.88.44', '63.111.123.26', '210.188.199.237'], dtype=object)

In [31]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(train_cluster_features,train_labels)
label_predicted=gnb.predict(test_cluster_features)
gnb.score(test_cluster_features,test_labels)

0.0