### Importing Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats.mstats

### Import dataset

In [2]:
dataset = pd.read_csv(r'C:\Users\11 PrO\Desktop\design proj\df_unsw15.csv')


### Removing redundant columns

In [3]:

def remove_redundant_columns(dataset):
    # Create a set to store the names of columns already encountered
    seen = set()
    # List to store columns to drop
    to_drop = []

    # Iterate through columns
    for column in dataset.columns:
        # If we have already seen the column name, mark it for dropping
        if column in seen:
            to_drop.append(column)
        else:
            seen.add(column)

    # Drop redundant columns
    dataset.drop(to_drop, axis=1, inplace=True)
    return dataset

# Example usage:
# Load your dataset
# df = pd.read_csv('your_dataset.csv')

# Remove redundant columns
data1 = remove_redundant_columns(dataset)


### Check for missing values and fill them

In [4]:
def columns_with_missing_values(df):
    # Check for missing values in each column
    missing_values = df.isnull().sum()

    # Filter out columns with missing values
    missing_columns = missing_values[missing_values > 0]

    if missing_columns.empty:
        return []
    else:
        return missing_columns.index.tolist()


columns_with_missing = columns_with_missing_values(data1)
print("Columns with missing values:", columns_with_missing)



Columns with missing values: []


### Encoding categorical features

In [5]:
# fetch categorical features
def get_categorical_features(df):
    # Select columns with dtype 'object' (strings) or 'category'
    categorical_features = data1.select_dtypes(include=['object', 'category']).columns.tolist()
    return categorical_features

categorical_features = get_categorical_features(data1)
print("Categorical features:", categorical_features)


Categorical features: ['proto', 'service', 'state']


In [7]:
#ENCODING THE FEATURES
from sklearn.preprocessing import LabelEncoder

def label_encode_categorical_features(df, categorical_features):
    # Create a label encoder object
    label_encoder = LabelEncoder()

    # Iterate through each categorical feature and encode its values
    for feature in categorical_features:
        df[feature] = label_encoder.fit_transform(df[feature])

    return df

# Get categorical features
categorical_features = get_categorical_features(data1)

# Label encode categorical features
df1 = label_encode_categorical_features(data1, categorical_features)
# Now, categorical features in df have been label encoded


In [None]:
df1

Unnamed: 0.1,Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,0,112641719,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320859.5,139.300036,56320958,56320761,1
1,1,0,0,112641466,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320733.0,114.551299,56320814,56320652,1
2,2,0,0,112638623,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56319311.5,301.934596,56319525,56319098,1
3,3,22,6,6453966,15,10,1239,2273,744,0,...,32,0.0,0.0,0,0,0.0,0.000000,0,0,1
4,4,22,6,8804066,14,11,1143,2209,744,0,...,32,0.0,0.0,0,0,0.0,0.000000,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099995,1099995,80,6,5004773,5,3,499,769,499,0,...,20,0.0,0.0,0,0,0.0,0.000000,0,0,11
1099996,1099996,80,6,22,2,0,0,0,0,0,...,20,0.0,0.0,0,0,0.0,0.000000,0,0,11
1099997,1099997,80,6,796,2,0,0,0,0,0,...,20,0.0,0.0,0,0,0.0,0.000000,0,0,11
1099998,1099998,80,6,9345199,7,5,1497,4152,602,0,...,20,0.0,0.0,0,0,0.0,0.000000,0,0,11


### fill missing values

In [8]:
def columns_with_missing_values(df1):
    # Check for missing values in each column
    missing_values = df1.isnull().sum()

    # Filter out columns with missing values
    missing_columns = missing_values[missing_values > 0]

    if missing_columns.empty:
        return []
    else:
        return missing_columns.index.tolist()


columns_with_missing = columns_with_missing_values(df1)
print("Columns with missing values:", columns_with_missing)



Columns with missing values: []


### allocating X and y

In [10]:
import pandas as pd

# Remove rows with missing values
df1.dropna(inplace=True)

# Split data into features and target
X_second = df1.drop(columns=["attack_cat"])
y_second = df1['attack_cat']

### Using Pearson correlation method and removing redundant features

In [11]:
import pandas as pd
# Convert the NumPy array to a DataFrame
def pearson_correlation_matrix(df):
    # Compute the Pearson correlation matrix
    corr_matrix = df.corr(method='pearson')

    return corr_matrix
# Compute the Pearson correlation matrix
corr_matrix = pearson_correlation_matrix(df1)
print(corr_matrix)

                   Unnamed: 0        id       dur     proto   service  \
Unnamed: 0           1.000000  1.000000  0.017117 -0.003568  0.024453   
id                   1.000000  1.000000  0.017117 -0.003568  0.024453   
dur                  0.017117  0.017117  1.000000 -0.121704 -0.007036   
proto               -0.003568 -0.003568 -0.121704  1.000000  0.163221   
service              0.024453  0.024453 -0.007036  0.163221  1.000000   
state                0.375842  0.375842  0.083886 -0.152795 -0.104715   
spkts               -0.049826 -0.049826  0.264716  0.011486  0.105379   
dpkts               -0.092853 -0.092853  0.192954  0.021882  0.070046   
sbytes               0.001509  0.001509  0.211499  0.005056  0.098579   
dbytes              -0.059282 -0.059282  0.152250  0.012977  0.032849   
rate                 0.283977  0.283977 -0.117749  0.022988 -0.112416   
sttl                 0.463306  0.463306  0.011470  0.067153 -0.272897   
dttl                -0.019705 -0.019705  0.059616  

In [12]:
def filter_highly_correlated_features(corr_matrix, threshold=0.95):
    # Get the absolute correlation values
    corr_abs = corr_matrix.abs()

    # Select upper triangle of correlation matrix
    upper = corr_abs.where(np.triu(np.ones(corr_abs.shape), k=1).astype(bool))

    # Find index of feature columns with correlation greater than threshold
    highly_correlated = [column for column in upper.columns if any(upper[column] > threshold)]


    return highly_correlated

corr_matrix = pearson_correlation_matrix(df1)  # Assuming you have already computed the correlation matrix
highly_correlated_features = filter_highly_correlated_features(corr_matrix, threshold=0.95)
print("Highly correlated features:", highly_correlated_features)

Highly correlated features: ['id', 'sbytes', 'dbytes', 'sloss', 'dloss', 'dwin', 'ct_src_dport_ltm', 'ct_dst_src_ltm', 'ct_ftp_cmd', 'ct_srv_dst']


### Including only the correlated features

In [15]:
actual_df2=df1.drop(columns=['Unnamed: 0',  'dur', 'proto', 'service', 'state', 'spkts', 'dpkts','rate', 'sttl', 'dttl', 'sload', 'dload', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb','tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_dst_sport_ltm', 'is_ftp_login', 'ct_flw_http_mthd', 'ct_src_ltm', 'is_sm_ips_ports', 'label'
 ],axis=1)

In [17]:
num_features = actual_df2.shape[1]
print("Number of features in X:", num_features)

Number of features in X: 11


In [18]:
features_list = actual_df2.columns.tolist()
print("List of features in df:", features_list)


List of features in df: ['id', 'sbytes', 'dbytes', 'sloss', 'dloss', 'dwin', 'ct_src_dport_ltm', 'ct_dst_src_ltm', 'ct_ftp_cmd', 'ct_srv_dst', 'attack_cat']


### Deleting redundant features

In [19]:
def delete_redundant_features(df, threshold=0.95):
    # Compute the correlation matrix
    corr_matrix = df.corr().abs()

    # Create a boolean mask to identify highly correlated features
    mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)

    # Find index of feature columns with correlation greater than threshold
    redundant_features = np.where(corr_matrix > threshold)

    # Initialize a set to keep track of features to delete
    features_to_delete = set()

    # Identify groups of redundant features and add one feature from each group to delete
    for i, j in zip(*redundant_features):
        if mask[i, j]:
            features_to_delete.add(df.columns[j])

    # Drop the redundant features from the DataFrame
    df.drop(features_to_delete, axis=1, inplace=True)

    return df

# Example usage:
# Load your dataset
# df = pd.read_csv('your_dataset.csv')

# Delete one feature from each group of redundant features
act_df = delete_redundant_features(actual_df2, threshold=0.95)


In [20]:
num_features = actual_df2.shape[1]

print("Number of features in the dataset:", num_features)

Number of features in the dataset: 8


In [21]:
features_list = actual_df2.columns.tolist()
print("List of features in df:", features_list)

List of features in df: ['id', 'sbytes', 'dbytes', 'dwin', 'ct_src_dport_ltm', 'ct_dst_src_ltm', 'ct_ftp_cmd', 'attack_cat']
