# Project 3
## Composed Encrypted Malicious Traffic Dataset for machine learning based encrypted malicious traffic analysis
### Dataset: bank sampled 9K

### Group Members
- Shedrach Ezenwali  300377414
- Bryan Rodriguez    300369955
- Hazel Ibasco       300366644
- Nathaniel Pearson  300387657

____

____

## Library-import

In [1]:
from datetime import datetime as dt
from functools import partial

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Load the project dataset from a CSV path given relative to the working directory
df = pd.read_csv("./Dataset/Project_Dataset.csv")

In [3]:
## Take a first look at the data set (first five rows)
df.head()

Unnamed: 0,session,Flag_of_packets,Traffic_sequence,Payload_ratio,Length_of_IP_packets,Length_of_TCP_payload,Length_of_TCP_packet_header,Length_of_IP_packet_header,TCP_windows_size_value,Length_of_TCP_segment(packet),...,std_backward_pkt_length,duration_forward,duration_back,mean_of_backward_IP_header,mean_of_forward_IP_header,total_payload_per_session,IPratio,Goodput,source_IP_address,Destination_IP_address
0,66141,24,2279944530,0.90566,424,384,20,20,65535,404,...,586.204197,7.832529,6.262479,20.0,20.0,211878.0,36.5,1177.711659,10.0.2.15,95.101.15.55
1,94061,24,3701792612,0.824916,297,245,32,20,4368,277,...,605.42649,29132.055096,29132.048757,20.0,20.0,35326.0,28.846154,0.536805,192.168.1.165,184.73.174.14
2,68533,16,2094088526,0.972603,1460,1420,20,20,65535,1440,...,360.076025,1.224812,1.281815,20.0,20.0,252568.0,37.5,2015.054781,10.0.2.15,151.101.112.70
3,16275,24,1596612280,0.834025,241,201,20,20,1369,221,...,525.716548,0.40772,10.408007,20.0,20.0,9968.0,37.5,1046.429906,10.42.0.151,115.239.210.141
4,16676,24,2320058429,0.851852,351,299,32,20,340,331,...,386.96131,0.720086,0.738152,20.0,20.0,229377.0,28.269231,834.93219,10.42.0.151,172.217.6.202


In [4]:
## Let's get general information about the data, the shape and also the data type

def get_data_description(df)->None:
    """Print a one-line summary of the size of ``df``.

    Parameters
    ----------
    df : object with a two-element ``shape`` attribute (rows, columns),
        such as a pandas DataFrame.

    Returns
    -------
    None
        The summary is written to standard output only.
    """
    n_rows, n_cols = df.shape
    summary = f'We have {n_rows} rows and {n_cols} columns in the data set'
    print(summary)

# Print the row and column counts of the loaded frame
get_data_description(df)



We have 26189 rows and 113 columns in the data set


In [5]:
## Check the data type of each column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26189 entries, 0 to 26188
Columns: 113 entries, session to Destination_IP_address
dtypes: float64(97), int64(14), object(2)
memory usage: 22.6+ MB


In [6]:
## Getting the percentage of missing values in each column

def check_cols_with_null(df):
    """Print the shape of ``df`` followed by each column's missing-value share.

    For every column, one line of the form ``<column>: <pct>%`` is printed,
    where ``<pct>`` is the percentage of null entries in that column rounded
    to zero decimal places.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    total_rows = df.shape[0]
    print(df.shape)
    for name in df.columns:
        missing_count = df[name].isnull().sum()
        missing_pct = (missing_count / total_rows) * 100
        print(f'{name}: {missing_pct:.0f}%')

# Report the share of missing values for every column of the loaded frame
check_cols_with_null(df)

(26189, 113)
session: 0%
Flag_of_packets: 0%
Traffic_sequence: 0%
Payload_ratio: 0%
Length_of_IP_packets: 0%
Length_of_TCP_payload: 0%
Length_of_TCP_packet_header: 0%
Length_of_IP_packet_header: 0%
TCP_windows_size_value: 0%
Length_of_TCP_segment(packet): 0%
Time_difference_between_packets_per_session: 0%
Change_values_of_TCP_windows_length_per_session: 0%
Interval_of_arrival_time_of_forward_traffic: 0%
Interval_of_arrival_time_of_backward_traffic: 0%
Time_to_live: 0%
Ratio_to_previous_packets_in_each_session: 0%
Total_length_of_IP_packet_per_session: 0%
Total_Time_to_live_per_session: 0%
The_times_of_change_of_TCP_windows_length: 0%
The_times_of_change_of_payload_per_session: 0%
Total_length_of_forward_payload: 0%
Total_length_of_backward_payload: 0%
Total_length_of_forward_IP_header: 0%
Total_length_of_backward_IP_header: 0%
Total_length_of_forward_TCP_header: 0%
Total_length_of_backward_TCP_header: 0%
Total_length_of_forward_IP_packets: 0%
Total_length_of_backward_IP_packets: 0%
flo

## Creating-dummy-variables-for-each-categorical-feature

In [7]:
# Identify the categorical (object-dtype) features
categorical_columns = df.select_dtypes('object').columns

# Build one frame of dummy variables per categorical feature.
# NOTE(review): a column with many distinct values yields one dummy per value
# (minus the dropped first level), which can widen the frame considerably.
dummy_frames = []
for column in categorical_columns:
    # Prefix each dummy with its source column name; drop the first level
    dummies = pd.get_dummies(df[column], prefix=column, drop_first=True)

    # Replace hyphen '-' with underscore '_' in the generated column names
    dummies = dummies.rename(columns=lambda name: name.replace('-', '_'))

    dummy_frames.append(dummies)

# Swap the original categorical columns for their dummy encodings in one concat
df = pd.concat([df.drop(columns=categorical_columns), *dummy_frames], axis=1)

In [8]:
# Summarise the frame again (column count, dtypes, memory) after the encoding step
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26189 entries, 0 to 26188
Columns: 3665 entries, session to Destination_IP_address_98.139.50.166
dtypes: bool(3554), float64(97), int64(14)
memory usage: 110.9 MB


____

# Feature-Selection-and-Scaling

Performing feature selection from the dataset. 

In [9]:
# Target variable: the 'label' column
Y = df['label']

# Features: every remaining column
X = df.drop(columns='label')

## LogisticRegression

In [10]:
# Split the dataset: 20% of the rows are held out for testing,
# and the fixed random_state keeps the split the same on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

#### Creating pipelines for each feature selection method

In [11]:
# Logistic Regression as a feature selector
# The classifier is wrapped in SelectFromModel so that the 'feature_selection'
# step really is a selector: after fitting, the step exposes get_support() and
# transform(), like the SelectKBest pipeline defined later in this notebook.
# A bare LogisticRegression in this slot would only classify and would not
# report which features were kept.
# No threshold is passed, so SelectFromModel's default applies
# (NOTE(review): per the scikit-learn docs this is the mean importance for a
# non-L1 estimator — confirm this is the intended cut-off).
lr_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('feature_selection', SelectFromModel(LogisticRegression(max_iter=1000))),
])

## Linear SVM (please use regularization hyperparameter value 0.001)

In [12]:
# Linear SVM (regularization hyperparameter C=0.001) as a feature selector
# The classifier is wrapped in SelectFromModel so that the 'feature_selection'
# step really is a selector: after fitting, the step exposes get_support() and
# transform(), like the SelectKBest pipeline defined later in this notebook.
# random_state is fixed (same seed as the train/test split) so that repeated
# runs give the same fitted coefficients and therefore the same selection.
svm_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('feature_selection', SelectFromModel(
        LinearSVC(C=0.001, max_iter=1000, random_state=42)
    )),
])

## SelectKBest with mutual_info_classif as the score metric,
### with the number of features to be selected equal to 10

In [13]:
# SelectKBest: keep the 10 features with the highest mutual information score
# mutual_info_classif uses random numbers internally, so its scores (and the
# selected features) can change between runs. Binding random_state with
# functools.partial makes the selection reproducible while still giving
# SelectKBest the (X, y) -> scores callable it expects.
kbest_pipe = Pipeline([
    ('scaler', RobustScaler()),
    ('feature_selection', SelectKBest(
        partial(mutual_info_classif, random_state=42),
        k=10,
    )),
])

### Find the method with the best number of features

In [14]:
# Fit the models and get selected features

# Random Forest 

____

____

___

___

____

____