# Set up environment

In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Remove pandas' column and row display limits so frames and per-column
# summaries are rendered in full.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

# Loading the datasets into dataframes

In [29]:
# Load every capture CSV under ./datasets into its own DataFrame. The names on
# the left line up one-to-one, in order, with the paths on the right.
(
    df_fridayAF_DDos,
    df_fridayAF_PortScan,
    df_fridayMO,
    df_monday,
    df_thursdayAF_Infilteration,
    df_thursdayMO_WebAttacks,
    df_tuesday,
    df_wednesday,
) = map(
    pd.read_csv,
    (
        "./datasets/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
        "./datasets/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
        "./datasets/Friday-WorkingHours-Morning.pcap_ISCX.csv",
        "./datasets/Monday-WorkingHours.pcap_ISCX.csv",
        "./datasets/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
        "./datasets/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
        "./datasets/Tuesday-WorkingHours.pcap_ISCX.csv",
        "./datasets/Wednesday-workingHours.pcap_ISCX.csv",
    ),
)

# Concatenating the dataframes to single dataframe

In [30]:
# Stack the per-file frames row-wise into a single frame; ignore_index=True
# discards the per-file row labels and renumbers the rows from 0.
_session_frames = [
    df_fridayAF_DDos,
    df_fridayAF_PortScan,
    df_fridayMO,
    df_monday,
    df_thursdayAF_Infilteration,
    df_thursdayMO_WebAttacks,
    df_tuesday,
    df_wednesday,
]
df_data = pd.concat(_session_frames, axis=0, ignore_index=True)

# Data Preprocessing

#### Find columns with zero variance (columns where all values are the same)


In [None]:
# Count the distinct (non-NaN) values of every column in one pass; a column
# with exactly one distinct value is constant.
_distinct_counts = df_data.nunique()
zero_variance_cols = _distinct_counts.index[_distinct_counts == 1].tolist()

# Report the constant columns, or say that there are none.
print(
    f"Columns with zero variance: {zero_variance_cols}"
    if zero_variance_cols
    else "No columns with zero variance found."
)

##### Shape before removing zero variance columns

In [None]:
# Show the (rows, columns) size before the constant columns are removed.
_n_rows, _n_cols = df_data.shape
print('Shape before removing zero variance columns:', (_n_rows, _n_cols))

#### Handle columns with zero variance

In [None]:
# Remove the constant columns from df_data in place; nothing happens when the
# list is empty.
if zero_variance_cols:
    df_data.drop(columns=zero_variance_cols, inplace=True)
    print(f'Dropped zero variance columns: {zero_variance_cols}')

##### Shape after removing zero variance columns

In [None]:
# Show the (rows, columns) size after the constant columns were removed.
_n_rows, _n_cols = df_data.shape
print('Shape after removing zero variance columns:', (_n_rows, _n_cols))

#### Find spaces in column names

In [None]:
# Display the column labels as loaded, so leading/trailing spaces are visible.
df_data.columns

##### Strip leading and trailing spaces from column names

In [36]:
# Trim leading and trailing whitespace from every column label, in place.
df_data.rename(columns=str.strip, inplace=True)

#### Columns after removing spaces

In [None]:
# Display the column labels again to confirm the whitespace is gone.
df_data.columns

#### Identify Rows with NaN, inf, or -inf Values
##### The counts listed here show how many NaN values each column contains.

In [None]:
# Number of missing (NaN) entries in each column.
df_data.isnull().sum()

### Options to handle missing values

##### Option1: Drop rows with missing values

In [None]:
# Keep only the rows whose 'Flow Bytes/s' value is present. The result is a
# new frame, df_data_cleaned; df_data itself is not modified by this step.
_has_flow_rate = df_data['Flow Bytes/s'].notna()
df_data_cleaned = df_data[_has_flow_rate]

# Report the size of the cleaned frame.
print("Shape after dropping rows with missing 'Flow Bytes/s':", df_data_cleaned.shape)

In [None]:
# Re-count NaN values on the cleaned frame to confirm the drop worked.
# df_data was not modified by dropna (its result went to df_data_cleaned), so
# inspecting df_data here would only repeat the counts from before the drop.
df_data_cleaned.isna().sum()

#### Option 2: Fill Missing Values

##### fill with the mean

In [None]:
# df_data['Flow Bytes/s'].fillna(df_data['Flow Bytes/s'].mean(), inplace=True)

#### Fill with the median

In [None]:
# df_data['Flow Bytes/s'].fillna(df_data['Flow Bytes/s'].median(), inplace=True)

#### Fill with 0

In [None]:
# df_data['Flow Bytes/s'].fillna(0, inplace=True)

##### The counts listed here show how many inf or -inf values each numeric column contains.

In [None]:
# Select only numeric columns
# Restrict the check to numeric columns, since np.isinf is applied to numbers.
numeric_cols = df_data.select_dtypes(include='number')

# Per-column tally of entries that are +inf or -inf.
inf_values = np.isinf(numeric_cols).sum()

# Print the heading and the tally on separate lines.
print("Count of inf and -inf values in each numeric column:", inf_values, sep="\n")

##### Identify duplicate rows

In [None]:
# Rows that are exact repeats of an earlier row (the first occurrence of each
# group is not flagged by duplicated()).
duplicate_rows = df_data[df_data.duplicated()]

if not duplicate_rows.empty:
    print("Duplicate rows:")
    # This notebook sets display.max_rows to None, so a plain print would
    # render every duplicate row. Cap the row limit for this print only; the
    # truncated output still ends with the total row/column count.
    with pd.option_context('display.max_rows', 20):
        print(duplicate_rows)
else:
    print("No duplicate rows found.")

##### Identify columns with identical values

In [23]:
# Compare every column with each later column (by position) and record the
# names of the pairs whose contents are equal according to Series.equals.
num_columns = len(df_data.columns)
column_pairs = [
    (df_data.columns[left], df_data.columns[right])
    for left in range(num_columns)
    for right in range(left + 1, num_columns)
    if df_data.iloc[:, left].equals(df_data.iloc[:, right])
]

In [None]:
# Show the (rows, columns) size before the identical columns are removed.
_n_rows, _n_cols = df_data.shape
print("Shape before removing identical columns:", (_n_rows, _n_cols))

##### Print the column pairs with identical values

In [None]:
# Report each pair of identical columns, then drop the redundant ones.
if column_pairs:
    print("Columns with identical values:")
    for kept_col, redundant_col in column_pairs:
        print(f"{kept_col} and {redundant_col} have identical values.")

    # Keep the first column of every pair and drop the second. A column that
    # equals several others is the second member of more than one pair, so
    # de-duplicate the list (dict.fromkeys keeps first-seen order) to avoid
    # listing the same column twice.
    columns_to_drop = list(dict.fromkeys(second for _, second in column_pairs))
    df_data.drop(columns=columns_to_drop, inplace=True)
    print(f"Dropped columns: {columns_to_drop}")
else:
    print("No columns with identical values found.")

print("Shape after removing identical columns:", df_data.shape)

### Check for non-numeric columns

In [None]:
# Labels of the columns stored with object dtype.
_object_frame = df_data.select_dtypes(include='object')
non_numeric_columns = _object_frame.columns
print("Non-numeric columns:", non_numeric_columns)

In [None]:
# Preview the first five rows.
df_data.head(n=5)

In [None]:
# Print the current column labels.
_column_index = df_data.columns
print(_column_index)

In [None]:
# Distinct values of the 'Label' column, in order of first appearance.
pd.unique(df_data['Label'])

In [None]:
# Bar chart of how many rows carry each 'Label' value; the tick labels are
# rotated and right-aligned.
sns.countplot(data=df_data, x='Label')
plt.xticks(ha='right', rotation=45)
plt.show()

In [103]:
# Binary-encode the target: 'BENIGN' -> 0, every other label -> 1.
#
# The previous hand-written lookup sent every attack name to 1, but .map()
# turns any label that is not spelled exactly like a key into NaN. The table
# had separate 'Web Attack ' and 'Brute Force' keys (presumably a single
# 'Web Attack ... Brute Force' label split in two -- verify against the data),
# and two keys relied on a mis-encoded dash character. Comparing against the
# one benign name gives the same 0/1 result for every listed label without
# depending on the exact spelling of the attack names.
_raw_labels = df_data['Label']
_is_attack = _raw_labels.str.strip().ne('BENIGN')

# Keep missing labels as NaN instead of counting them as attacks.
df_data['Label'] = _is_attack.astype('int64').where(_raw_labels.notna())

In [None]:
# Bar chart of the 'Label' value counts after encoding; the tick labels are
# rotated and right-aligned.
sns.countplot(data=df_data, x='Label')
plt.xticks(ha='right', rotation=45)
plt.show()

In [None]:
# Summary statistics with the quartiles spelled out (these are the values
# describe() uses by default).
df_data.describe(percentiles=[0.25, 0.5, 0.75])

### Object typically refers to string or mixed data types
### The output usually includes:

#### count: Number of non-null entries in each column.
#### unique: Number of unique values in each column.
#### top: Most frequent value in each column.
#### freq: Frequency of the most common value.

In [None]:
# describe(include="object") raises a ValueError when the frame has no
# object-dtype columns, which can happen once 'Label' has been converted to
# numbers. Only call it when there is something to summarise; otherwise show a
# short message as the cell result.
_object_frame = df_data.select_dtypes(include="object")
(
    df_data.describe(include="object")
    if _object_frame.shape[1] > 0
    else "No object-dtype columns to describe."
)

In [None]:
# (rows, columns) of the frame at this point.
(len(df_data.index), len(df_data.columns))

In [None]:
# Number of missing entries remaining in each column.
df_data.isna().sum()

In [None]:
# Count the rows that are exact repeats of an earlier row.
_duplicate_count = df_data.duplicated().sum()
print(f"Number of duplicate rows: {_duplicate_count}")