In [3]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, classification_report
from sklearn.model_selection import cross_val_score, KFold
import matplotlib.pyplot as plt
import seaborn as sn


# importing csv file from location
dataset = pd.read_csv("C:/Users/Zeyad Mohamed/Downloads/assignments-main (1)/assignments-main/SDN_traffic.csv")

print(dataset.head())
print(dataset.info())
print(dataset.describe())
print(dataset.duplicated())

#dataset needed for the analysis in CSV file
x = dataset[['forward_bps_var',
            "tp.sre", "tp.dst", "nw_proto",
            "forward pe", "forward_bc", "forward_pl",
            "forward piat", "forward_pps", "forward_bps", "forward_pl_mean",
            "forward piat mean", "forward_pps_mean", "forward_bps_mean", "forward_pl_var", "forward_piat_var",
            "forward.pps var", "forward_pl_q1", "forward_pl_q3",
            "forward piat.g1", "forward_piat_q3", "forward_pl_max", "forward_pl_min", 
            "forward plat.max", "forward.piat_win", "forward_pps_max", "forward_pps_min",
            "forward bps_max", "forward_bps_min", "forward_duration", "forward_size_packets",
            "forward size bytes", "reverse_pc", "reverse_bc", "reverse.pl", "reverse piat", "reverse PRS", 
            "reverse_bps", "reverse_pl_mean", "reverse_piat.mean", "reverse_pps_nean", "reverse bas mean", "reverse_pl_var",
            "reverse plin", "reverse_pl", "reverse_piat", "reverse_piat_var", "reverse_pps_var", "reverse_bps_var",
            "reverse_piat q1", "reverse_pl_q3", "reverse_piat_max", "reverse_piat_min", "reverse_pps_max", "reverse_pps_min",
            "reverse_piat_q3", "reverse_pl_max", "reverse_bps_max", "reverse_bps_min", "reverse_duration", 
             "reverse_size_packets", "reverse_size_bytes"]]

x.loc[1877, 'forward_bps_var'] = float(11968865203349)
x.loc[9131, 'forward_bps_var'] = float(12880593884833)
x.loc[2381, 'forward_bps_var'] = float(39987497172945)
x.loc[2562, 'forward_bps_var'] = float(663388742992)
x.loc[1931, 'forward_bps_var'] = float(37770223877794)
x.loc[2078, 'forward_bps_var'] = float(9822747730895)
x.loc[2567, 'forward_bps_var'] = float(37778223877794)
x.loc[2586, 'forward_bps_var'] = float(97227875883751)
x.loc[2754, 'forward_bps_var'] = float(18789751483737)
x.loc[2765, 'forward_bps_var'] = float(33969277035759)
x.loc[2984, 'forward_bps_var'] = float(39284786962856)
x.loc[3844, 'forward_bps_var'] = float(9169996863653)
x.loc[3349, 'forward_bps_var'] = float(37123283690575)
x.loc[3507, 'forward_bps_var'] = float(61019864598464)
x.loc[3610, 'forward_bps_var'] = float(46849628984872)
x.loc[3717, 'forward_bps_var'] = float(97158873841506)
x.loc[3845, 'forward_bps_var'] = float(11968865203349)
x.loc[3868, 'forward_bps_var'] = float(85874278395372)

X = pd.DataFrame(x)
X["forward_bps_var"] = pd.to_numeric(X["forward_bps_var"])
print(X.info())

Y = dataset[["category"]]
Y = Y.to_numpy()
Y = Y.ravel()
Labels, uniques = pd.factorize(Y)
Y = Labels
Y = Y.ravel()

X = stats.zscore(X)
X = np.nan_to_num(X)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0, test_size=0.3) 

clf = DecisionTreeClassifier(random_state=0, max_depth=2)
clf.fit(X_train, Y_train)

cv = KFold(n_splits=10, random_state=0, shuffle=True)
accuracy = clf.score(X_test, Y_test)
KFold10_accuracy = cross_val_score(clf, X_train, Y_train, scoring='accuracy', cv=cv, n_jobs=-1)
print(KFold10_accuracy.mean())

predict = clf.predict(X_test)
cm = confusion_matrix(Y_test, predict)
precision = precision_score(Y_test, predict, average='weighted', labels=np.unique(predict))
recall = recall_score(Y_test, predict, average='weighted', labels=np.unique(predict))
fiscoreMacro = f1_score(Y_test, predict, average='macro', labels=np.unique(predict))
print(classification_report(Y_test, predict, target_names=uniques))

importance = clf.feature_importances_
important_features_dict = {}
for idx, val in enumerate(importance):
    important_features_dict[idx] = val
important_features_list = sorted(important_features_dict,
                                 key=important_features_dict.get,
                                 reverse=True)
print(f'10 most important features: {important_features_list[:10]}')

fn = ['forward_bps_var',
      "tp.src", "tp.dst", "nw_proto",
      "forward_pe", "forward_bc", "forward_pl",
      "forward_piat", "forward_pps", "forward_bps", "forward_pl_mean",
      "forward_piat_mean", "forward_pps_mean", "forward_bps_mean", "forward_pl_var", "forward_piat_var",
      "forward_pps_var", "forward_pl_q1", "forward_pl_q3",
      "forward_piat_q1", "forward_piat_q3", "forward_pl_max", "forward_pl_min",
      "forward_plat_max", "forward_piat_win", "forward_pps_max", "forward_pps_min",
      "forward_bps_max", "forward_bps_min", "forward_duration", "forward_size_packets",
      "forward_size_bytes", "reverse_pc", "reverse_bc", "reverse_pl", "reverse_piat", "reverse_PRS",
      "reverse_bps", "reverse_pl_mean", "reverse_piat_mean", "reverse_pps_mean", "reverse_bps_mean", "reverse_pl_var",
      "reverse_plin", "reverse_pl", "reverse_piat", "reverse_piat_var", "reverse_pps_var", "reverse_bps_var",
      "reverse_piat_q1", "reverse_pl_q3", "reverse_piat_max", "reverse_piat_min", "reverse_pps_max", "reverse_pps_min",
      "reverse_piat_q3", "reverse_pl_max", "reverse_bps_max", "reverse_bps_min", "reverse_duration", "reverse_size_packets", 
      "reverse_size_bytes"]

la = ['WWW', 'DNS', 'FTP', 'ICMP', 'P2P', 'VOIP']

plt.figure(1, dpi=300)
fig = tree.plot_tree(clf, filled=True, feature_names=fn, class_names=la)
plt.title("Decision tree trained on all the features")
plt.show()

import seaborn as sn
import matplotlib.pyplot as plt

labels = uniques
plt.figure(2, figsize=(5, 2))
plt.title("Confusion Matrix", fontsize=10)
cm_new = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
sn.heatmap(cm_new, annot=True, cmap="YlGnBu", fmt=".2f", xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.show()

                            id_flow         nw_src  tp_src         nw_dst  \
0  b2bb77a570fcfa9325eb9e51b6116d2a  172.16.25.104   41402  34.107.221.82   
1  f07977b0d1d6645c4fe1e9efea080ff3  172.16.25.104   41406  34.107.221.82   
2  e4026ba9b6c1957516e92bdd0d04878f  172.16.25.104   38232    52.84.77.43   
3  e2d747932e41500b1463fe8ae4299ecb  172.16.25.104   38234    52.84.77.43   
4  56325703391225ad65e013e7a2b02fac  172.16.25.104   60166    52.32.34.32   

   tp_dst  nw_proto  forward_pc  forward_bc  forward_pl  forward_piat  ...  \
0      80         6           5         300       60.00           6.0  ...   
1      80         6           5         300       60.00           6.0  ...   
2     443         6           3         198       66.00          10.0  ...   
3     443         6           3         198       66.00          10.0  ...   
4     443         6           4         265       66.25           7.5  ...   

   reverse_piat_max  reverse_piat_min  reverse_pps_max  reverse_pps_

KeyError: "['tp.sre', 'tp.dst', 'forward pe', 'forward piat', 'forward piat mean', 'forward.pps var', 'forward piat.g1', 'forward plat.max', 'forward.piat_win', 'forward bps_max', 'forward size bytes', 'reverse.pl', 'reverse piat', 'reverse PRS', 'reverse_piat.mean', 'reverse_pps_nean', 'reverse bas mean', 'reverse plin', 'reverse_piat q1'] not in index"