In [None]:
import plotly.express as px
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report


Aggregate Data Sets

In [None]:
from pathlib import Path

# Folder holding the raw capture CSVs, and the files to aggregate from it.
DATA_DIR = Path("C:\\Masters\\MlFinalProjFinal\\Data\\CSV\\9.Malware")
files = ["Malware_26.csv","Malware_20.csv", "Malware_21.csv"]

# Read each file into its own frame, reporting progress as we go.
dataTemp = []
for file_name in files:
    dataTemp.append(pd.read_csv(DATA_DIR / file_name, low_memory=True))
    print(f"read {file_name} into memory")

# ignore_index=True gives every row a unique label; without it each file keeps
# its own 0..n-1 index and the combined frame has duplicate row labels.
df = pd.concat(dataTemp, ignore_index=True)
print(df.shape)

Label analysis

In [None]:
# Distinct labels first, then the number of rows carrying each one.
label_counts = df["Label"].value_counts()
print(df["Label"].unique())
print(label_counts)

# Bar chart of the same per-label counts.
fig = px.bar(label_counts)
fig.show()

In [None]:
# List every column name of the combined frame.
print(df.columns)

Null analysis

In [None]:
# Discard columns that hold no data at all (df is modified in place).
df.dropna(axis="columns", how="all", inplace=True)
print(df.shape)

# Boolean frame shaped like df: True wherever a value is missing.
nulls = df.isna()

In [None]:
# Columns in which no value is missing.
fully_populated = ~nulls.any().to_numpy()
nonNullCols = nulls.columns[fully_populated].tolist()

# From here on `nulls` only tracks columns with at least one missing value.
nulls.drop(columns=nonNullCols, inplace=True)

# Share of missing values per remaining column, and which shares exceed one half.
nullRatios = nulls.mean().to_numpy(dtype=float)
nullFilt = nullRatios > 0.5

# Columns that are at most half empty, followed by the fully populated ones.
colswithvals = np.array(nulls.columns[~nullFilt])
colswithvals = np.concatenate([colswithvals, nonNullCols])
print(f"Cols with greater then 50% values: {colswithvals}")
print(f'Count: {len(colswithvals)}')

IP analysis

In [None]:
ip = ["ip.dst","ip.proto","ip.src","ip.ttl","ip.version"]

# Keep only the IP columns present in df, as an independent copy.
ipOnly = df.loc[:, df.columns.isin(ip)].copy()
print(ipOnly.shape)

# Null / non-null tally for each IP column.
for ip_col in ipOnly.columns:
    print(ipOnly[ip_col].isna().value_counts())

# Drop every row that is missing any IP field.
ipOnly.dropna(axis="index", how="any", inplace=True)

print(ipOnly.columns)

#### Deleting the rows of the DataFrame that have NA values in the IP columns

In [None]:
print("df shape before: ", df.shape)

# One pass: remove every row lacking a value in any of the IP columns.
df.dropna(subset=list(ipOnly.columns), axis=0, how="any", inplace=True)

print("df shape after:", df.shape)

In [None]:
# Lift the column display limit (display.max_columns = None), then preview df.
pd.set_option('display.max_columns', None)
df.head(10)

#### Further Null Analysis

In [None]:
# Fraction of missing values for every column, then only the columns that
# actually contain NaN. Deriving both from one Series keeps each column name
# paired with its own ratio.
nan_ratio_by_col = df.isna().mean()
nan_column = nan_ratio_by_col[nan_ratio_by_col > 0].to_dict()
nan_col = list(nan_column.keys())
nan_col_ratio = list(nan_column.values())

print("\nNaN column count: ", len(nan_col))
print("Total columns:", len(df.columns))
print("Columns with NaN:")
print(nan_column)
print("\n")

# Columns in which at least half of the values are NaN.
useless_col = {name: ratio for name, ratio in nan_column.items() if ratio >= 0.5}
print("Columns with more than 50% NaN values: ")
print(useless_col)
print(f'\nNumber of useless columns {len(useless_col)}')
if useless_col:
    print(f'Column with the least ratio: {min(useless_col, key=useless_col.get)}')
else:
    # min() raises ValueError on an empty dict, so report the empty case explicitly.
    print('Column with the least ratio: none (no column is at least 50% NaN)')

In [None]:
# Keep a column only if its non-null count reaches half the number of rows.
min_non_null = df.shape[0] * 0.5
df = df.dropna(axis=1, thresh=min_non_null)

print("New df shape:", df.shape)
df.head(10)

In [None]:
#### replacing the left over NaN values with 0 or unknown
import warnings

# Suppress every warning from this point on (affects the whole session).
warnings.filterwarnings('ignore')

cnt_num = 0
non_num_feature = []
for col in df.columns:
    col_dtype = df[col].dtypes
    # int64/float64 columns, plus any column whose name contains "udp",
    # are filled with 0; every other column is filled with 'Unknown'.
    treat_as_number = col_dtype == "int64" or col_dtype == "float64" or "udp" in col
    fill_value = 0 if treat_as_number else 'Unknown'
    df[col] = df[col].fillna(fill_value)
    if treat_as_number:
        cnt_num += 1
    else:
        non_num_feature.append(col)

cnt = len(non_num_feature)

print("Numeric count and non numeric count: ",cnt_num, cnt)
print("Non numeric columns: ", non_num_feature)
df.head(10)

In [None]:
def check_feature_type(df):
    """Split the columns of `df` into int64/float64 columns and all others.

    Side effects: rebinds the module-level lists `num_feature` and
    `non_num_feature`, and prints both counts plus the non-numeric names.

    Returns:
        (num_feature, non_num_feature): two lists of column names, each in
        the column order of `df`.
    """
    global num_feature, non_num_feature
    num_feature, non_num_feature = [], []

    for name in df.columns:
        dtype = df[name].dtypes
        is_numeric = dtype == "int64" or dtype == "float64"
        bucket = num_feature if is_numeric else non_num_feature
        bucket.append(name)

    cnt_num, cnt = len(num_feature), len(non_num_feature)
    print("Numeric features: " + str(cnt_num) + " ; Non-numeric features: " + str(cnt))
    print("Non numeric columns: ", non_num_feature)

    return num_feature, non_num_feature

In [None]:
# Preview the first rows of the columns currently listed in non_num_feature.
df[non_num_feature].head(10)

#### Converting IP and MAC address information from string to int

In [None]:
import ipaddress
import re

ipv4_addr_features = ['ip.src', 'ip.dst']

# Dotted-quad shape check. Written as a raw string so "\d" and "\." are regex
# escapes rather than invalid Python string escapes.
IPV4_SHAPE = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$")


def _ipv4_to_int(value):
    """Return the integer form of an IPv4 string, or None if it is not a valid address."""
    if not isinstance(value, str) or IPV4_SHAPE.search(value) is None:
        return None
    try:
        return int(ipaddress.IPv4Address(value))
    except ipaddress.AddressValueError:
        # The shape matched but the address is still invalid, e.g. an octet above 255.
        return None


# removing rows with a non-IPv4 value and converting the remaining addresses
# to integer values using ipaddress
for col in ipv4_addr_features:
    converted = df[col].map(_ipv4_to_int)
    valid = converted.notna().astype(bool)
    # .copy() so the column assignment below writes to an independent frame.
    df = df[valid].copy()
    df[col] = converted[valid].astype("int64")

print("df shape after dropping non IPv4 rows: ", df.shape)

In [None]:
# NumPy 2.0 removed the `NaN` alias, so `from numpy import NaN` raises
# ImportError there; importing `nan` under the old name works on 1.x and 2.x.
from numpy import nan as NaN


# wlan columns that hold MAC addresses.
mac_addr_features = ['wlan.bssid','wlan.da', 'wlan.ra', 'wlan.sa', 'wlan.ta']

# converting MAC addresses in the wlan features to int
def mac_to_int(mac):
    """Convert a colon-separated MAC address to its integer value.

    The argument is stringified and lower-cased before matching, so upper-case
    hex digits are accepted. Anything that is not six colon-separated
    two-digit hex groups is reported with a printed message and mapped to 0.
    """
    text = str(mac)
    found = re.match('^((?:(?:[0-9a-f]{2}):){5}[0-9a-f]{2})$', text.lower())
    if found is not None:
        hex_digits = ''.join(found.group(0).split(':'))
        return int(hex_digits, 16)
    print(f'invalid mac address: {text}')
    return 0

# Replace each MAC-address column with its integer encoding.
for mac_col in mac_addr_features:
    df[mac_col] = df[mac_col].apply(mac_to_int)


In [None]:
# Frequency of each encoded MAC value, one column at a time.
for wlan_col in ("wlan.bssid", 'wlan.da', 'wlan.ra', 'wlan.sa', 'wlan.ta'):
    print(df[wlan_col].value_counts())

In [None]:
udp_feaures = ['udp.dstport', 'udp.srcport', 'udp.length', 'udp.time_delta', 'ip.version']

# Cast every listed column to float64 in a single astype call.
df = df.astype({feature: 'float64' for feature in udp_feaures})

In [None]:
# Classify df's columns by dtype; this also rebinds the notebook-level
# num_feature and non_num_feature lists that later cells read.
check_feature_type(df)

In [None]:
# Show the current list of numeric feature names.
print(num_feature)

In [None]:
udp_feaures = ['udp.dstport', 'udp.srcport', 'udp.length', 'udp.time_delta', 'ip.version']

# Take the listed columns out of the numeric feature list (in place).
for feat in udp_feaures:
    if feat in num_feature:
        num_feature.remove(feat)
    else:
        # Only "not present" is tolerated; any other problem (for example
        # num_feature not being defined yet) surfaces as an error.
        print(f"{feat} is not in num_feature, nothing to remove")

print(num_feature)

Note: wlan.bssid holds the same MAC address in every row. The BSSID (basic service set identifier) is the MAC address of the access point.

#### Encoding Labels + Train-test split + Scaling
For data containing IP features

Only for Numeric Features

In [None]:
# Numeric feature names that will be used as model inputs.
print(num_feature)

In [None]:
from copy import copy, deepcopy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, StandardScaler

# Fixed seed so the train/test partition (and every metric computed from it)
# is the same on each run.
SPLIT_SEED = 42

# Model inputs are the numeric feature columns; the target is the "Label" column.
X = df[num_feature]
Y = df["Label"]

# Map the label values to integer class ids.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(Y)
print(encoder.classes_)

# Hold out 20% of the rows for testing.
train_data, test_data, train_labels, test_labels = train_test_split(
    X, encoded_Y, test_size=0.2, random_state=SPLIT_SEED
)

print("Train data shape:" , train_data.shape)
print("Test data shape: ", test_data.shape)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler()
scaled_train_data = scaler.fit_transform(train_data)
scaled_test_data = scaler.transform(test_data)

#### TODO: convert the non-numeric features to categorical (encoded) values before performing PCA. So far, the mostly-NaN columns have been pruned and the remaining NaN values replaced with 0 or 'Unknown'.

## PCA Analysis

In [None]:
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.decomposition import PCA

def PCA_(k, data):
  """Print and plot the total explained-variance ratio for several PCA sizes.

  Args:
    k: iterable of `n_components` values; one PCA is fitted per entry.
    data: 2-D feature matrix (DataFrame or array) each PCA is fitted on.

  Returns:
    None. Results are printed and drawn on the current matplotlib figure.
  """
  # An integer n_components cannot exceed min(n_samples, n_features); such
  # entries are skipped with a message instead of aborting the whole sweep.
  max_components = min(np.shape(data))

  fitted_k = []
  total_var = []
  for n_comp in k:
    if isinstance(n_comp, (int, np.integer)) and n_comp > max_components:
      print("k = " + str(n_comp) + " ; skipped (data allows at most " + str(max_components) + " components)")
      continue
    pca = PCA(n_components=n_comp)
    pca.fit(data)
    explained = np.sum(pca.explained_variance_ratio_)
    print("k = "+ str(n_comp) + " ; Variance = "+ str(explained))
    fitted_k.append(n_comp)
    total_var.append(explained)

  # plotting
  print("\n")
  plt.plot(fitted_k, total_var, marker = 'o')
  plt.title('Fraction of total variance vs. number of principal components')
  plt.xlabel("Number of principal components")
  plt.ylabel("Variance")

# Component counts to evaluate.
n_components = [2, 3, 4, 5, 10, 20, 30]

print("PCA on unscaled data")
PCA_(k=n_components, data=train_data)



In [None]:
# Same sweep over component counts, this time on the standardized training data.
print("PCA on Scaled data")
PCA_(k=n_components, data=scaled_train_data)

In [None]:
# Helper func - Accuracy and error
def _score_on_test_split(model, features, scorer):
    """Predict on `features` and score the result against the global `test_labels`."""
    return scorer(test_labels, model.predict(features))

def accuracy_scaled(model):
    """Accuracy of `model` on `scaled_test_data`."""
    return _score_on_test_split(model, scaled_test_data, metrics.accuracy_score)

def accuracy_nonscaled(model):
    """Accuracy of `model` on `test_data`."""
    return _score_on_test_split(model, test_data, metrics.accuracy_score)

def error_scaled(model):
    """Zero-one loss of `model` on `scaled_test_data`."""
    return _score_on_test_split(model, scaled_test_data, metrics.zero_one_loss)

def error_nonscaled(model):
    """Zero-one loss of `model` on `test_data`."""
    return _score_on_test_split(model, test_data, metrics.zero_one_loss)

    

In [None]:
# more sophisticated metrics:
def generate_and_display_classification_rep(model, data, y_true):
    """Build the classification report for `model`'s predictions on `data`.

    Args:
        model: fitted estimator exposing `predict`.
        data: feature matrix to predict on.
        y_true: true labels for `data`.

    Returns:
        The report as a dict (`output_dict=True`). Nothing is printed or
        displayed here.
    """
    return classification_report(y_true, model.predict(data), output_dict=True)

#### Classification Algorithms in ML

In [None]:
validationSet = pd.read_csv("C:\\Masters\\MlFinalProjFinal\\Data\\ValidationSets\\malware_valid.csv")
print(validationSet.head(10))

# Numeric feature columns of the validation set, and its labels passed through
# the already-fitted encoder.
# NOTE(review): this cell does not apply the IP/MAC-to-int conversion or NaN
# filling that df received. Confirm the validation CSV is already preprocessed.
validation_data = validationSet[num_feature]
validation_lables = encoder.transform(validationSet["Label"])

In [None]:
def strip_ip_features(data):
    """Return the `num_feature` columns of `data` without "ip.dst" and "ip.src".

    Works on a copy of the global `num_feature` list, so the list itself is
    left unchanged. Raises ValueError if either IP column is absent from it.
    """
    kept_features = list(num_feature)
    for ip_column in ("ip.dst", "ip.src"):
        kept_features.remove(ip_column)
    return data[kept_features]

In [None]:
def compare_ip_no_ip(ip_model, no_ip_model, train_data, train_labels, test_data, test_labels, validation_data, validation_labels, attack_class, model_type, results_root="C:\\Masters\\MlFinalProjFinal\\Results"):
    """Train one model with and one without the IP address features and save their reports.

    `ip_model` is fitted on `train_data` as given. `no_ip_model` is fitted on
    the same rows after `strip_ip_features` removes the IP address columns.
    Each model is evaluated on the test split and on the validation set, and
    the four classification reports are written as CSV files to
    `<results_root>/<attack_class>/<model_type>/`.

    Args:
        ip_model, no_ip_model: unfitted estimators exposing `fit` / `predict`.
        train_data, train_labels: training features and labels.
        test_data, test_labels: held-out features and labels.
        validation_data, validation_labels: validation features and labels.
        attack_class: name of the first-level results sub-folder.
        model_type: name of the second-level results sub-folder.
        results_root: base output directory; defaults to the original
            hard-coded location.

    Returns:
        None.
    """
    from pathlib import Path

    # Create the output folder up front, so a missing directory is not
    # discovered only after both models have been trained.
    out_dir = Path(results_root) / attack_class / model_type
    out_dir.mkdir(parents=True, exist_ok=True)

    reports = {}

    ip_model.fit(train_data, train_labels)
    reports["ip_class_rep.csv"] = generate_and_display_classification_rep(ip_model, test_data, test_labels)
    reports["ip_class_rep_valid.csv"] = generate_and_display_classification_rep(ip_model, validation_data, validation_labels)

    no_ip_train_data = strip_ip_features(train_data)
    no_ip_test_data = strip_ip_features(test_data)
    no_ip_valid_data = strip_ip_features(validation_data)

    no_ip_model.fit(no_ip_train_data, train_labels)
    reports["no_ip_class_rep.csv"] = generate_and_display_classification_rep(no_ip_model, no_ip_test_data, test_labels)
    reports["no_ip_class_rep_valid.csv"] = generate_and_display_classification_rep(no_ip_model, no_ip_valid_data, validation_labels)

    for file_name, report in reports.items():
        pd.DataFrame(report).to_csv(out_dir / file_name)

In [None]:
# Name of this data set; the model cells pass it to compare_ip_no_ip as
# attack_class, where it becomes part of the results path.
attack_type = "malware"

Knn:

In [None]:
from sklearn.neighbors import KNeighborsClassifier


# Two default k-NN classifiers: one trained with, one without the IP address features.
knn_ip, knn_no_ip = KNeighborsClassifier(), KNeighborsClassifier()
compare_ip_no_ip(
    ip_model=knn_ip,
    no_ip_model=knn_no_ip,
    train_data=train_data,
    train_labels=train_labels,
    test_data=test_data,
    test_labels=test_labels,
    validation_data=validation_data,
    validation_labels=validation_lables,
    attack_class=attack_type,
    model_type="knn",
)

Random Forest:

In [None]:
# Two default random forests: one trained with, one without the IP address features.
rf_ip, rf_no_ip = RandomForestClassifier(), RandomForestClassifier()
compare_ip_no_ip(
    ip_model=rf_ip,
    no_ip_model=rf_no_ip,
    train_data=train_data,
    train_labels=train_labels,
    test_data=test_data,
    test_labels=test_labels,
    validation_data=validation_data,
    validation_labels=validation_lables,
    attack_class=attack_type,
    model_type="rf",
)

Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB


# Two Gaussian naive Bayes models: one trained with, one without the IP address features.
nb_ip, nb_no_ip = GaussianNB(), GaussianNB()
compare_ip_no_ip(
    ip_model=nb_ip,
    no_ip_model=nb_no_ip,
    train_data=train_data,
    train_labels=train_labels,
    test_data=test_data,
    test_labels=test_labels,
    validation_data=validation_data,
    validation_labels=validation_lables,
    attack_class=attack_type,
    model_type="nb",
)

Decision Tree:

In [None]:
# Two default decision trees: one trained with, one without the IP address features.
dt_ip, dt_no_ip = DecisionTreeClassifier(), DecisionTreeClassifier()
compare_ip_no_ip(
    ip_model=dt_ip,
    no_ip_model=dt_no_ip,
    train_data=train_data,
    train_labels=train_labels,
    test_data=test_data,
    test_labels=test_labels,
    validation_data=validation_data,
    validation_labels=validation_lables,
    attack_class=attack_type,
    model_type="dt",
)

Logit

In [None]:
lg_ip = LogisticRegression()
lg_no_ip = LogisticRegression()
# Evaluate the logistic-regression pair created above (not the decision-tree
# models), so the reports saved under "lg" describe logistic regression.
compare_ip_no_ip(lg_ip, lg_no_ip, train_data, train_labels, test_data, test_labels, validation_data, validation_lables, attack_type, "lg")