In [1]:
import os

import pandas as pd
import numpy as np
from tqdm import tqdm


DATA_DIR="/data/security/CSV"

files_dict = {}
for folder in os.listdir(DATA_DIR):
    subroot = os.path.join(DATA_DIR, folder)
    files_dict[folder] = [os.path.join(subroot, file) for file in os.listdir(subroot)]


print(files_dict.keys())

def read_csvs(path_list):
    """Creates DataFrame from list of CSV paths

    Args:
        path_list (List): list of paths

    Returns:
        DataFrame: dataframe constructed by concatinating
    """
    dfs = [pd.read_csv(f, low_memory=False) for f in tqdm(path_list)]
    df = pd.concat(dfs, ignore_index=True)
    return df

attack_list = ['1.Deauth', '7.SSH', '10.SQL_Injection', '12.Evil_Twin', '13.Website_spoofing']
dfs = [read_csvs(files_dict[attack]) for attack in tqdm(attack_list)]
data = pd.concat(dfs, ignore_index=True)
    # data = read_csvs(files_dict['1.Deauth', '7.SSH', '10.SQL_Injection', '12.Evil_Twin', '13.Website_spoofing'])


print(data['Label'].value_counts())

dict_keys(['1.Deauth', '12.Evil_Twin', '9.Malware', '7.SSH', '4.Rogue_AP', '5.Krack', '13.Website_spoofing', '6.Kr00k', '2.Disas', '3.ReAssoc', '8.Botnet', '10.SQL_Injection', '11.SSDP'])


100%|██████████| 33/33 [00:41<00:00,  1.25s/it]
100%|██████████| 49/49 [01:03<00:00,  1.29s/it]
100%|██████████| 52/52 [01:11<00:00,  1.38s/it]
100%|██████████| 76/76 [01:39<00:00,  1.30s/it]
100%|██████████| 54/54 [01:08<00:00,  1.26s/it]
100%|██████████| 5/5 [06:27<00:00, 77.44s/it]


: 

: 

In [2]:
pd.set_option('display.max_columns', 300)
pd.set_option('display.max_rows', 300)

In [3]:
drop_cols = []
for col in data.columns:
    if len(data[col].unique()) == 1:
        drop_cols.append(col)

In [4]:
data.drop(drop_cols, axis=1, inplace=True)

In [5]:
data.shape
df = data.copy()

In [6]:
dff = df[[c for c in df if df[c].isnull().sum() == 0]]
dff.isna().sum()
# dff.drop(['radiotap.datarate', 'radiotap.mactime', 'radiotap.mactime', 'wlan_radio.start_tsf', 'wlan_radio.timestamp', 'wlan_radio.end_tsf '], axis=1, inplace=True)

frame.len                     0
frame.number                  0
frame.time                    0
frame.time_delta              0
frame.time_delta_displayed    0
frame.time_epoch              0
frame.time_relative           0
radiotap.dbm_antsignal        0
radiotap.length               0
radiotap.present.tsft         0
radiotap.timestamp.ts         0
wlan.duration                 0
wlan.fc.ds                    0
wlan.fc.frag                  0
wlan.fc.order                 0
wlan.fc.moredata              0
wlan.fc.protected             0
wlan.fc.pwrmgt                0
wlan.fc.type                  0
wlan.fc.retry                 0
wlan.fc.subtype               0
wlan.ra                       0
wlan_radio.duration           0
wlan_radio.data_rate          0
wlan_radio.signal_dbm         0
wlan_radio.phy                0
Label                         0
dtype: int64

In [10]:
dff.drop(['frame.time'], axis=1, inplace=True)
dff.describe(include='object')

Unnamed: 0,radiotap.dbm_antsignal,radiotap.present.tsft,wlan.fc.ds,wlan.ra,Label
count,2440570,2440570,2440570,2440570,2440570
unique,691,2,4,962,2
top,-103,1-0-0,0x00000000,0c:9d:92:54:fe:34,Normal
freq,764234,1537879,1527796,950962,2428688


In [16]:
object_col_to_drop = ['wlan.ra']
dff.drop(object_col_to_drop, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff.drop(object_col_to_drop, axis=1, inplace=True)


In [17]:
dff.describe(include='object')

Unnamed: 0,radiotap.present.tsft,wlan.fc.ds,Label
count,2440570,2440570,2440570
unique,2,4,2
top,1-0-0,0x00000000,Normal
freq,1537879,1527796,2428688


In [18]:
dummies = []
cols = ['radiotap.present.tsft', 'wlan.fc.ds']
for col in cols:
   dummies.append(pd.get_dummies(df[col]))
dff.drop(cols, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dff.drop(cols, axis=1, inplace=True)


In [19]:
dummies =  pd.concat(dummies, axis=1)
dff = pd.concat((dff,dummies), axis=1)

In [20]:
dff["label"]=dff['Label'].map(lambda a: 0 if a == 'Normal' else 1)
dff.drop(['Label'], axis=1, inplace=True)

In [21]:
y = dff.label
dff.drop(['label'], inplace=True, axis=1)

In [22]:
# Normaliling the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

norm_dff = scaler.fit_transform(dff)
norm_dff.shape

(2440570, 27)

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(norm_dff, y, test_size=0.3, random_state=42)

In [24]:
# Oversampling the data
from collections import Counter
from imblearn.over_sampling import SMOTE

print('Original dataset shape %s' % Counter(y_train))
sm = SMOTE(random_state=42)

X_train, y_train = sm.fit_resample(X_train, y_train)
X_test, y_test = sm.fit_resample(X_test, y_test)

print('Original dataset shape %s' % Counter(y_train))

Original dataset shape Counter({0: 1700047, 1: 8352})
Original dataset shape Counter({0: 1700047, 1: 1700047})


In [25]:
# we'll initialize each model and store it by name in a dictionary
models = {}

# Logistic Regression
from sklearn.linear_model import LogisticRegression
models['Logistic Regression'] = LogisticRegression()

# # Support Vector Machines
# from sklearn.svm import LinearSVC, SVC
# models['Support Vector Machines linear'] = LinearSVC()
# models['Support Vector Machines plonomial'] = SVC(kernel='poly')
# models['Support Vector Machines RBf'] =  SVC(C=100.0)

# # Decision Trees
# from sklearn.tree import DecisionTreeClassifier
# models['Decision Trees'] = DecisionTreeClassifier(max_depth=3)

# # Random Forest
# from sklearn.ensemble import RandomForestClassifier
# models['Random Forest'] = RandomForestClassifier()

# # Naive Bayes
# from sklearn.naive_bayes import GaussianNB
# models['Naive Bayes'] = GaussianNB()

# # K-Nearest Neighbors
# from sklearn.neighbors import KNeighborsClassifier
# models['K-Nearest Neighbor'] = KNeighborsClassifier(n_neighbors=20)

In [26]:
# we'll loop over each one, train it by calling .fit(), make predictions, calculate metrics, and store each result in a dictionary.

from sklearn.metrics import accuracy_score, precision_score, recall_score

accuracy, precision, recall = {}, {}, {}

for key in models.keys():
    
    # Fit the classifier
    models[key].fit(X_train, y_train)
    
    # Make predictions
    predictions = models[key].predict(X_test)
    
    # Calculate metrics
    accuracy[key] = accuracy_score(predictions, y_test)
    precision[key] = precision_score(predictions, y_test)
    recall[key] = recall_score(predictions, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
df_model = pd.DataFrame(index=models.keys(), columns=['Accuracy', 'Precision', 'Recall'])
df_model['Accuracy'] = accuracy.values()
df_model['Precision'] = precision.values()
df_model['Recall'] = recall.values()

df_model

Unnamed: 0,Accuracy,Precision,Recall
Logistic Regression,0.964168,0.981944,0.948232
