## Bad vs Good Connections
### Author: Ana Javed

You are involved in a project where you are tasked to build a machine learning algorithm that distinguishes between "bad'' connections (called intrusions or attacks) and "good'' (normal) connections. Note that the number of normal connections is greater than that of bad ones. 

The dataset you will use in this assignment originated with the KDD Cup 1999 (https://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html).


In [253]:
## Importing Necessary Packages 
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import csv
import sklearn 

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


In [254]:
## Reading file into Dataframe 
url = "https://library.startlearninglabs.uw.edu/DATASCI420/2019/Datasets/Intrusion%20Detection.csv"
df = pd.read_csv(url, sep=",")

## Top 5 Rows from Dataframe
df.head()


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,Class
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,0
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0


In [255]:
## Conducting Exploratory Data Analysis: 
print(df.shape)  # (97308, 42)
print(df.dtypes) # many int64, object, and float64 types 
print(df.describe()) 

(97308, 42)
duration                         int64
protocol_type                   object
service                         object
flag                            object
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_ra

In [256]:
## Obtaining a list of Column Names that will be Used for Normalization & Dummy Variables 
other_columns = list(df.columns)
other_columns.remove('Class')
print(other_columns)

## Creating Dummy Variables for Categorical Fields 
df_expanded = pd.get_dummies(df.loc[:, other_columns], drop_first = True)



['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']


In [257]:
## Standardizing / Normalizing Values with StandardScaler 
standardization_scale = StandardScaler().fit(df_expanded)
normalized_df = pd.DataFrame(standardization_scale.transform(df_expanded), columns=df_expanded.columns)

# # Using sklearn standardization_scale 
# Y = df.loc[:, "Class"]
# Y = pd.DataFrame(Y) 
# standardization_scale = StandardScaler().fit(Y)
# y = standardization_scale.transform(Y)
# df.loc[:, "Class"] = y



In [258]:
print(normalized_df.shape)

normalized_df

(97308, 72)


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,service_urh_i,service_urp_i,flag_REJ,flag_RSTO,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF
0,-0.159396,-0.028524,0.054945,-0.003206,0.0,-0.003206,-0.052515,-0.008904,0.625084,-0.007165,...,-0.011996,-0.074493,-0.240988,-0.026444,-0.017852,-0.022899,-0.023564,-0.013219,-0.008482,0.246407
1,-0.159396,-0.026829,-0.077172,-0.003206,0.0,-0.003206,-0.052515,-0.008904,0.625084,-0.007165,...,-0.011996,-0.074493,-0.240988,-0.026444,-0.017852,-0.022899,-0.023564,-0.013219,-0.008482,0.246407
2,-0.159396,-0.026946,-0.054522,-0.003206,0.0,-0.003206,-0.052515,-0.008904,0.625084,-0.007165,...,-0.011996,-0.074493,-0.240988,-0.026444,-0.017852,-0.022899,-0.023564,-0.013219,-0.008482,0.246407
3,-0.159396,-0.027414,-0.054522,-0.003206,0.0,-0.003206,-0.052515,-0.008904,0.625084,-0.007165,...,-0.011996,-0.074493,-0.240988,-0.026444,-0.017852,-0.022899,-0.023564,-0.013219,-0.008482,0.246407
4,-0.159396,-0.027472,-0.036025,-0.003206,0.0,-0.003206,-0.052515,-0.008904,0.625084,-0.007165,...,-0.011996,-0.074493,-0.240988,-0.026444,-0.017852,-0.022899,-0.023564,-0.013219,-0.008482,0.246407
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97303,0.076807,0.010195,-0.039884,-0.003206,0.0,-0.003206,-0.052515,-0.008904,0.625084,-0.007165,...,-0.011996,-0.074493,-0.240988,37.815341,-0.017852,-0.022899,-0.023564,-0.013219,-0.008482,-4.058327
97304,-0.126283,0.034449,0.021703,-0.003206,0.0,-0.003206,3.438021,-0.008904,0.625084,0.239971,...,-0.011996,-0.074493,-0.240988,-0.026444,-0.017852,-0.022899,-0.023564,-0.013219,-0.008482,0.246407
97305,-0.029889,0.011744,-0.017395,-0.003206,0.0,-0.003206,3.438021,-0.008904,0.625084,0.981379,...,-0.011996,-0.074493,-0.240988,-0.026444,-0.017852,-0.022899,-0.023564,-0.013219,-0.008482,0.246407
97306,-0.114510,0.034449,0.021517,-0.003206,0.0,-0.003206,3.438021,-0.008904,0.625084,0.239971,...,-0.011996,-0.074493,-0.240988,-0.026444,-0.017852,-0.022899,-0.023564,-0.013219,-0.008482,0.246407


In [259]:
## Splitting Training and Testing Data
x_train, x_test, y_train, y_test = train_test_split (np.array(normalized_df),
                                                     np.array(df.loc[:,"Class"]),
                                                     test_size = 0.5,
                                                     random_state = 123)


In [260]:
# create pipeline
rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select= 5)
model = DecisionTreeClassifier()
pipeline = Pipeline(steps=[('s',rfe), ('m',model)])

In [262]:
# fit the model on all available data
pipeline.fit(x_train, y_train)

Pipeline(steps=[('s',
                 RFE(estimator=DecisionTreeClassifier(),
                     n_features_to_select=5)),
                ('m', DecisionTreeClassifier())])

In [271]:
## Separating Predicted vs Target Values 
predictions = pipeline.predict(x_test)
target = y_test

pipeline.score(x_test, y_test)

# print(predictions, target)
# print(sum(predictions), sum(target)) #14, 19


0.9998972335265343

In [272]:
# Confusion Matrix from your predicted values 
from sklearn.metrics import *

# Passing Target and Prediction Values to Confusion_Matrix
CM = confusion_matrix(target, predictions)
print ("\n\nConfusion matrix:\n", CM)

tn, fp, fn, tp = CM.ravel()
print ("\nTP, TN, FP, FN:", tp, ",", tn, ",", fp, ",", fn)


P = precision_score(target, predictions)
print ("\nPrecision Score:", np.round(P, 2))
print (' ')
R = recall_score(target, predictions)
print ("\nRecall Score:", np.round(R, 2))
print (' ')
F1 = f1_score(target, predictions)
print ("\nF1 Score:", np.round(F1, 2))
print (' ')
a = accuracy_score(target, predictions)
print ("\nAccuracy Score:", np.round(a, 2))
print (' ')





Confusion matrix:
 [[48635     0]
 [    5    14]]

TP, TN, FP, FN: 14 , 48635 , 0 , 5

Precision Score: 1.0
 

Recall Score: 0.74
 

F1 Score: 0.85
 

 Accuracy Score: 1.0
 


### Part 2: Modifying data by handling class imbalance

In [273]:
from imblearn.over_sampling import SMOTE 

sm = SMOTE(random_state= 123)
X_res, y_res = sm.fit_resample(normalized_df, df.loc[:,"Class"])

In [286]:
print(y_res.value_counts()) 

print(len(y_res))

print(X_res.shape)

0    97278
1    97278
Name: Class, dtype: int64
194556
(194556, 72)


In [289]:
# print(x_train, x_test, sum(y_train), sum(y_test))  #48738, 48540

In [290]:
## Splitting Training and Testing Data
x_train, x_test, y_train, y_test = train_test_split (X_res,
                                                     y_res,
                                                     test_size = 0.5,
                                                     random_state = 123)
# create pipeline
rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select= 5)
model = DecisionTreeClassifier()
pipeline = Pipeline(steps=[('s',rfe), ('m',model)])

# fit the model on all available data
pipeline.fit(x_train, y_train)



Pipeline(steps=[('s',
                 RFE(estimator=DecisionTreeClassifier(),
                     n_features_to_select=5)),
                ('m', DecisionTreeClassifier())])

In [293]:
## Separating Predicted vs Target Values 
predictions = pipeline.predict(x_test)
target = y_test

pipeline.score(x_test, y_test)


0.9999280412837435

In [294]:
print(sum(predictions), sum(target))

48543 48540


In [295]:
# Confusion Matrix from your predicted values 
from sklearn.metrics import *

# Passing Target and Prediction Values to Confusion_Matrix
CM = confusion_matrix(target, predictions)
print ("\n\nConfusion matrix:\n", CM)

tn, fp, fn, tp = CM.ravel()
print ("\nTP, TN, FP, FN:", tp, ",", tn, ",", fp, ",", fn)


P = precision_score(target, predictions)
print ("\nPrecision Score:", np.round(P, 2))
print (' ')
R = recall_score(target, predictions)
print ("\nRecall Score:", np.round(R, 2))
print (' ')
F1 = f1_score(target, predictions)
print ("\nF1 Score:", np.round(F1, 2))
print (' ')
a = accuracy_score(target, predictions)
print ("\nAccuracy Score:", np.round(a, 2))
print (' ')





Confusion matrix:
 [[48733     5]
 [    2 48538]]

TP, TN, FP, FN: 48538 , 48733 , 5 , 2

Precision Score: 1.0
 

Recall Score: 1.0
 

F1 Score: 1.0
 

Accuracy Score: 1.0
 
