### Understanding the dataset

In [1]:
import pandas as pd

In [3]:
code_red_df = pd.read_csv("dataset/Code_Red_I.csv")
code_red_df.head()

Unnamed: 0,H+M,H,M,S,A - no,W - no,A - NLRI,W - NLRI,Mean AS PATH,Max AS PATH,...,W - Dupli,W -Implicit,Avg Edit D,Max Edit D,IAT,IGP,EGP,Incomplete,Packet size,Label
0,0,0,0,8,58,4,130,29,6,13,...,53,8,13,1.0,6,38,0,20,248,-1
1,1,0,1,2,58,7,107,25,6,12,...,83,3,12,1.0,6,48,0,10,232,-1
2,2,0,2,20,43,6,68,22,5,8,...,27,2,8,0.8,6,42,0,1,225,-1
3,3,0,3,0,60,7,119,54,6,10,...,95,21,10,1.1,6,58,0,2,246,-1
4,4,0,4,10,52,6,102,52,5,8,...,52,23,8,0.9,6,48,0,4,241,-1


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
print(code_red_df.shape)

(7200, 22)


In [13]:
X = code_red_df.iloc[:,4:21]
Y = code_red_df.iloc[:,21]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)

#### Using Random Forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
# creating a model with 100 tress
model = RandomForestClassifier(n_estimators=100, bootstrap=True, max_features='sqrt')

#### Training...

In [17]:
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Predicting and testing

In [18]:
rf_predictions = model.predict(X_test)

### since the dataset is imabalanced, we will use ROC AUC

In [19]:
rf_probs = model.predict_proba(X_test)[:, 1]

In [29]:
from sklearn.metrics import roc_auc_score
roc_auc = roc_auc_score(y_test, rf_probs)
roc_auc

0.9168449082403742

#### Selecting important features

In [26]:
features = pd.DataFrame({'feature': list(X_train.columns), 'importance': model.feature_importances_}).sort_values(by="importance", ascending=False)

In [27]:
features

Unnamed: 0,feature,importance
13,IGP,0.163754
0,A - no,0.132944
3,W - NLRI,0.109377
11,Max Edit D,0.107056
2,A - NLRI,0.098192
8,W - Dupli,0.071458
16,Packet size,0.049719
15,Incomplete,0.045632
7,A - Dupli,0.04413
1,W - no,0.036393


### Create a new dataframe with 5 important features

In [113]:
X_train_select_feature = X_train.drop(X_train.columns[[8,16,15,7,1,9,5,10,4,12,6,14]], axis=1)
X_test_select_feature = X_test.drop(X_train.columns[[8,16,15,7,1,9,5,10,4,12,6,14]], axis=1)

# temp_df = X_train.drop(X_train.columns[[13,0,3,11,2,8,16,15,7,1]], axis=1)

# features_rem_list = [8,16,15,7,1,9,5,10,4,12,6,14]

In [114]:
model.fit(X_train_select_feature, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [115]:
rf_predictions = model.predict(X_test_select_feature)
rf_probs = model.predict_proba(X_test_select_feature)[:, 1]
roc_auc = roc_auc_score(y_test, rf_probs)
roc_auc

0.9057689816480747