**TASK 2**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import copy
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
le = LabelEncoder()
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Load training data from A2_2

task1_data_dir = '/Users/adityadesu/Desktop/COMP90073/Assignment - 2/A2_2'
rootpath2 = '/Users/adityadesu/Desktop/COMP90073/Assignement - 2 Report/'

# NOTE(review): read_csv is called with its default header handling, so the
# first line of each file is consumed as a header before the names below
# replace it. If the CSV files have no header row, pass header=None so the
# first record is not lost - confirm against the data files.

# Load training data
train_data = pd.read_csv(os.path.join(task1_data_dir, '2_training_data_with_label.csv'))

# Load validation data
val_data = pd.read_csv(os.path.join(task1_data_dir, '2_validation_data_with_label.csv'))

# Load test data
test_data = pd.read_csv(os.path.join(task1_data_dir, '2_test_data_with_label.csv'))

# Give all three dataframes the same column names
features = ['stream_ID','date-time', 'duration','protocol', 'src_ip', 'src_port', 'direction', 'dst_ip', 'dst_port', 'state', 'src_type', 'dst_type','total_packets', 'two-way-byte_transf', 'src-dst-byte-transf', 'label']
train_data.columns = features
val_data.columns = features
test_data.columns = features

# Convert anomaly to 1 and normal to 0.
# The flag is derived in a single pass from the original label strings, so
# .str.contains is never applied to a column that already holds integers.
for frame in (train_data, val_data, test_data):
    frame['label'] = frame['label'].str.contains('Botnet').astype(int)

# Save labels of data
train_labels = train_data['label'].values
val_labels = val_data['label'].values
test_labels = test_data['label'].values


In [None]:
# Count the number of training rows per label value (botnet vs. normal)
train_data['label'].value_counts()

In [None]:
# Count of instances with null values (uncomment to inspect each split)
#print("Count of training instances with missing values:\n",train_data.isnull().sum())
#print("Count of validation instances with missing values:\n",val_data.isnull().sum())
#print("Count of test instances with missing values:\n",test_data.isnull().sum())

# Fill missing values with the mode of the same column in the same split.
# The filled column is assigned back rather than calling fillna(inplace=True)
# on df[col]: that form is a chained assignment, which newer pandas versions
# warn about and which does not modify the dataframe under Copy-on-Write.
# NOTE(review): validation and test are imputed with their own modes rather
# than the training mode - confirm this is intended.
columns_to_impute = ['state', 'src_type', 'dst_type']
for frame in (train_data, val_data, test_data):
    for column in columns_to_impute:
        frame[column] = frame[column].fillna(frame[column].mode()[0])

In [None]:
# Print the label distribution of the training, validation and test splits, in that order
for split_frame in (train_data, val_data, test_data):
    label_counts = split_frame['label'].value_counts()
    print(label_counts)



In [None]:
def feature_generator(df):
    ''' Derive per-row rate and size features from the raw columns
    params:
        df: dataframe with the columns duration, src_ip, total_packets,
            two-way-byte_transf, src-dst-byte-transf and label
    returns:
        new dataframe holding duration, src_ip, total_packets, the derived
        ratios (pps, bps, bps_src, bps_two-way, bytes_per_packet,
        bytes_per_packet_src) and label; the input dataframe is not modified'''
    # A zero divisor is swapped for +inf, so the affected ratio evaluates to 0
    safe_duration = df['duration'].replace({0:np.inf})
    safe_packets = df['total_packets'].replace({0:np.inf})

    two_way_bytes = df['two-way-byte_transf']
    src_bytes = df['src-dst-byte-transf']

    # Insertion order of this mapping fixes the column order of the result.
    # 'bps' and 'bps_two-way' are both two-way bytes divided by duration.
    generated_columns = {
        'duration': df['duration'],
        'src_ip': df['src_ip'],
        'total_packets': df['total_packets'],
        'pps': df['total_packets']/safe_duration,
        'bps': two_way_bytes/safe_duration,
        'bps_src': src_bytes/safe_duration,
        'bps_two-way': two_way_bytes/safe_duration,
        'bytes_per_packet': two_way_bytes/safe_packets,
        'bytes_per_packet_src': src_bytes/safe_packets,
        'label': df['label'],
    }
    return pd.DataFrame(generated_columns)

# Run the feature generator over the training, validation and test splits
(
    feature_generated_training_data,
    feature_generated_val_data,
    feature_generated_test_data,
) = [feature_generator(split_frame) for split_frame in (train_data, val_data, test_data)]

# Preview the first generated training rows
feature_generated_training_data.head()


In [None]:
# Distinct source IPs that appear in at least one row labelled 1, per split
train_bot_ips = train_data.loc[train_data['label'] == 1, 'src_ip'].unique()
val_bot_ips = val_data.loc[val_data['label'] == 1, 'src_ip'].unique()
test_bot_ips = test_data.loc[test_data['label'] == 1, 'src_ip'].unique()

In [None]:
# Label distribution of the generated training features.
# The column is selected with [] - a DataFrame is not callable.
feature_generated_training_data['label'].value_counts()

In [None]:
# Within each label group, count how many rows each source IP contributes
feature_generated_training_data.groupby('label')['src_ip'].value_counts()

In [None]:
# Collapse the per-row features into one row of summary statistics per source IP.
# The output column names are kept as they are because later cells and the saved
# CSV files use them, but several names do not describe what is actually
# computed - the mapping inside the function is the source of truth.

def extract_features_ip(df):
    ''' Aggregate per-row features into one row per source IP
    params:
        df: dataframe with the columns src_ip, total_packets, pps, duration,
            bps, bps_src, bps_two-way and bytes_per_packet
    returns:
        dataframe with a src_ip column followed by 14 statistics, one row per
        distinct src_ip. A variance is NaN for an IP that has a single row.
        No label column is added here.'''
    # output column name -> (source column, aggregation function).
    # Note the naming mismatches: *_otb_pkt_size summarise total_packets and
    # *_pkt_jitter summarise duration.
    aggregations = {
        'mean_otb_pkt_size': ('total_packets', 'mean'),
        'var_otb_pkt_size': ('total_packets', 'var'),
        'mean_pkt_cnt_per_sec': ('pps', 'mean'),
        'max_pkt_cnt_per_sec': ('pps', 'max'),
        'mean_pkt_jitter': ('duration', 'mean'),
        'var_pkt_jitter': ('duration', 'var'),
        'mean_bps': ('bps', 'mean'),
        'var_bps': ('bps', 'var'),
        'mean_bps_src': ('bps_src', 'mean'),
        'var_bps_src': ('bps_src', 'var'),
        'mean_bps_two_way': ('bps_two-way', 'mean'),
        'var_bps_two_way': ('bps_two-way', 'var'),
        'mean_bytes_per_pkt': ('bytes_per_packet', 'mean'),
        'var_bytes_per_pkt': ('bytes_per_packet', 'var'),
    }

    # Named aggregation ties every output name to its source column and
    # statistic explicitly, instead of renaming columns by position afterwards.
    df2 = df.groupby('src_ip').agg(**aggregations)

    # Turn the src_ip group key back into an ordinary first column
    return df2.reset_index()

# Aggregate the per-row features into one row per source IP for each split
feature_generated_training_data = extract_features_ip(feature_generated_training_data)
feature_generated_val_data = extract_features_ip(feature_generated_val_data)
feature_generated_test_data = extract_features_ip(feature_generated_test_data)


# Replace all NaN values with 0
feature_generated_training_data.fillna(0, inplace=True)
feature_generated_val_data.fillna(0, inplace=True)
feature_generated_test_data.fillna(0, inplace=True)

# Add the label column: 1 when the source IP is in that split's list of botnet
# IPs, else 0. Series.isin performs the membership test for the whole column
# in one vectorised call instead of scanning the IP array once per row.
feature_generated_training_data['label'] = feature_generated_training_data['src_ip'].isin(train_bot_ips).astype(int)
feature_generated_val_data['label'] = feature_generated_val_data['src_ip'].isin(val_bot_ips).astype(int)
feature_generated_test_data['label'] = feature_generated_test_data['src_ip'].isin(test_bot_ips).astype(int)



In [None]:

# Count how many rows of the test feature table carry each label value
feature_generated_test_data['label'].value_counts()

In [None]:
# Write each feature table to a CSV file in the working directory, without the index
csv_targets = (
    (feature_generated_training_data, 'training_data_agg.csv'),
    (feature_generated_val_data, 'val_data_agg.csv'),
    (feature_generated_test_data, 'test_data_agg.csv'),
)
for table, csv_name in csv_targets:
    table.to_csv(csv_name, index=False)

**Notebook can be run from here** (run the imports cell first; this requires the aggregated CSV files saved by the cell above)

In [None]:
# Read the three feature tables back from their CSV files
training_csv = 'training_data_agg.csv'
validation_csv = 'val_data_agg.csv'
test_csv = 'test_data_agg.csv'

feature_generated_training_data = pd.read_csv(training_csv)
feature_generated_val_data = pd.read_csv(validation_csv)
feature_generated_test_data = pd.read_csv(test_csv)

In [None]:
# Drop the src_ip identifier column from every split.
# errors='ignore' keeps the cell safe to re-run once the column is already
# gone, without hiding unrelated errors and without skipping the remaining
# dataframes when one of them no longer has the column.
for frame in (feature_generated_training_data, feature_generated_val_data, feature_generated_test_data):
    frame.drop(columns=['src_ip'], errors='ignore', inplace=True)
print(feature_generated_training_data.columns)

# Preview the first 14 columns of the validation data
feature_generated_val_data.iloc[:,0:14]

**Feature Selection**

In [None]:

# Number of features to keep; used both for SelectKBest and for the ranking below
n_best_features = 6

# Apply SelectKBest with the chi-squared score to the first 14 columns,
# using the integer label as the target
bestfeatures = SelectKBest(score_func=chi2, k=n_best_features)
fit = bestfeatures.fit(feature_generated_training_data.iloc[:,0:14],feature_generated_training_data['label'].astype(int))
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(feature_generated_training_data.iloc[:,0:14].columns)

# Concat the two dataframes for better visualization
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  # naming the dataframe columns

# Print the highest-scoring features
print(featureScores.nlargest(n_best_features,'Score'))

# Save the names of the highest-scoring features
best_features = featureScores.nlargest(n_best_features,'Score')['Specs'].values

# Add label to the best features so it is kept as the last column
best_features = np.append(best_features, 'label')

# Apply these features to the data. The explicit .copy() makes each result an
# independent dataframe, so later in-place writes to it are unambiguous and do
# not raise SettingWithCopyWarning.
feature_generated_training_data = feature_generated_training_data[best_features].copy()
feature_generated_val_data = feature_generated_val_data[best_features].copy()
feature_generated_test_data = feature_generated_test_data[best_features].copy()

# Preview the reduced training data
feature_generated_training_data.head()

**Feature Standardisation**

In [None]:
# Standardise the first six columns with StandardScaler (this is
# standardisation, not min-max scaling). The scaler is fitted on the training
# split only and then reused, unchanged, for the validation and test splits.

scaler = StandardScaler()
feature_generated_training_data.iloc[:,0:6] = scaler.fit_transform(feature_generated_training_data.iloc[:,0:6])
feature_generated_val_data.iloc[:,0:6] = scaler.transform(feature_generated_val_data.iloc[:,0:6])
feature_generated_test_data.iloc[:,0:6] = scaler.transform(feature_generated_test_data.iloc[:,0:6])

In [None]:
def accurate_outlier_preds(preds,labels):
    ''' Calculate the number of accurate outlier predictions
        params:
            preds: predicted labels (1 marks an outlier); any iterable such as
                   a list, numpy array or pandas Series
            labels: true labels (1 marks an outlier), in the same order as preds
        returns:
            number of positions where the prediction and the true label are both 1'''
    # zip pairs the two sequences by position. Indexing a pandas Series with
    # labels[i] would look values up by index label instead, which fails or
    # miscounts whenever the Series index is not 0..n-1.
    return sum(1 for pred, label in zip(preds, labels) if pred == 1 and label == 1)

**Logistic Regression**

In [None]:

# Features are the first six columns; the target is the label cast to int
X_train = feature_generated_training_data.iloc[:,0:6]
y_train = feature_generated_training_data['label'].astype(int)

# Build the logistic regression classifier and fit it on the training data
logisticRegr = LogisticRegression(random_state = 0)
logisticRegr.fit(X_train, y_train)

# Predictions on the same data the model was fitted on
pred = logisticRegr.predict(X_train)

# Accuracy, confusion matrix and per-class report on the training data
training_accuracy = accuracy_score(y_train, pred)
print("Accuracy score (training): {0:.3f}".format(training_accuracy))

print("Confusion matrix (training):")
print(confusion_matrix(y_train, pred))

print("Classification report (training):")
print(classification_report(y_train, pred))

# Count of rows predicted 1 whose true label is also 1
a = accurate_outlier_preds(pred, y_train)
print("Number of accurate outlier predictions (training):",a)

In [None]:
# Score the fitted model on the validation data
X_val = feature_generated_val_data.iloc[:,0:6]
y_val = feature_generated_val_data['label'].astype(int)
val_pred = logisticRegr.predict(X_val)

# Accuracy, confusion matrix and per-class report on the validation data
validation_accuracy = accuracy_score(y_val, val_pred)
print("Accuracy score (validation): {0:.3f}".format(validation_accuracy))

print("Confusion matrix (validation):")
print(confusion_matrix(y_val, val_pred))

print("Classification report (validation):")
print(classification_report(y_val, val_pred))

# Count of rows predicted 1 whose true label is also 1
a = accurate_outlier_preds(val_pred, y_val)
print("Number of accurate outlier predictions (validation):",a)

In [None]:
# Predict on test set
X_test = feature_generated_test_data.iloc[:,0:6]
y_test = feature_generated_test_data['label'].astype(int)
test_pred = logisticRegr.predict(X_test)

# View accuracy score
print("Accuracy score (test): {0:.3f}".format(accuracy_score(y_test, test_pred)))

# View confusion matrix
print("Confusion matrix (test):")
print(confusion_matrix(y_test, test_pred))

# View classification report
print("Classification report (test):")
print(classification_report(y_test, test_pred))

# Count of rows predicted 1 whose true label is also 1; printed like the
# training and validation counts so the computed value is actually reported
a = accurate_outlier_preds(test_pred, y_test)
print("Number of accurate outlier predictions (test):",a)

**Training and Testing on Oversampled botnet class data**

In [None]:
# Randomly oversample the minority class of the training data
# (features: first six columns; target: label cast to int)
training_features = feature_generated_training_data.iloc[:,0:6]
training_target = feature_generated_training_data['label'].astype(int)

ros = RandomOverSampler(sampling_strategy ='minority',random_state=0)
X_resampled, y_resampled = ros.fit_resample(training_features, training_target)

# Show the class counts after resampling
print("Number of samples in each class (training):")
resampled_counts = y_resampled.value_counts()
print(resampled_counts)

In [None]:


# Build a fresh logistic regression classifier and fit it on the resampled data
logisticRegr = LogisticRegression(random_state = 0)
logisticRegr.fit(X_resampled, y_resampled)

# Predictions on the same resampled data the model was fitted on
pred = logisticRegr.predict(X_resampled)

# Accuracy, confusion matrix and per-class report on the resampled training data
resampled_accuracy = accuracy_score(y_resampled, pred)
print("Accuracy score (training): {0:.3f}".format(resampled_accuracy))

print("Confusion matrix (training):")
resampled_confusion = confusion_matrix(y_resampled, pred)
print(resampled_confusion)

print("Classification report (training):")
resampled_report = classification_report(y_resampled, pred)
print(resampled_report)

# Count of rows predicted 1 whose true label is also 1
a = accurate_outlier_preds(pred,y_resampled)
print("Number of accurate outlier predictions (training):",a)

In [None]:
# Score the current logisticRegr model on the validation data
X_val = feature_generated_val_data.iloc[:,0:6]
y_val = feature_generated_val_data['label'].astype(int)
val_pred = logisticRegr.predict(X_val)

# Accuracy, confusion matrix and per-class report on the validation data
validation_accuracy = accuracy_score(y_val, val_pred)
print("Accuracy score (validation): {0:.3f}".format(validation_accuracy))

print("Confusion matrix (validation):")
print(confusion_matrix(y_val, val_pred))

print("Classification report (validation):")
print(classification_report(y_val, val_pred))

# Count of rows predicted 1 whose true label is also 1
a = accurate_outlier_preds(val_pred, y_val)
print("Number of accurate outlier predictions (validation):",a)

In [None]:
# Predict on test set
X_test = feature_generated_test_data.iloc[:,0:6]
y_test = feature_generated_test_data['label'].astype(int)
test_pred = logisticRegr.predict(X_test)

# View accuracy score
print("Accuracy score (test): {0:.3f}".format(accuracy_score(y_test, test_pred)))

# View confusion matrix
print("Confusion matrix (test):")
print(confusion_matrix(y_test, test_pred))

# View classification report
print("Classification report (test):")
print(classification_report(y_test, test_pred))

# Count of rows predicted 1 whose true label is also 1; printed like the
# training and validation counts so the computed value is actually reported
a = accurate_outlier_preds(test_pred, y_test)
print("Number of accurate outlier predictions (test):",a)