In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
import csv

# Read the raw capture file. Quote characters are treated as ordinary text
# (QUOTE_NONE) and rows the parser cannot handle are skipped rather than
# aborting the load.
data = pd.read_csv(
    'Darknet.CSV',
    encoding='utf-8',
    quoting=csv.QUOTE_NONE,
    on_bad_lines='skip',
    low_memory=False,
)

# Show the header so the column selection further down can be checked by eye
print("Columns in dataset:", data.columns.tolist())

# Remove identifier-style columns (IPs, Flow ID, Timestamp) plus any column
# whose name contains "Unnamed"; names that are not present are ignored.
unnamed_cols = [name for name in data.columns if "Unnamed" in name]
cols_to_drop = ['Flow ID', 'Src IP', 'Dst IP', 'Timestamp'] + unnamed_cols
present_to_drop = [name for name in cols_to_drop if name in data.columns]
data = data.drop(columns=present_to_drop)

# Report how many values are missing in each remaining column
print("Missing values per column:\n", data.isnull().sum())

# Keep only columns with at least half of their values present
# (i.e. drop columns with >50% missing values)
threshold = 0.5 * len(data)
data = data.dropna(axis=1, thresh=threshold)

# Whatever is still missing becomes 0
data = data.fillna(0)

# Define feature columns, keeping only those that are still present in `data`
selected_columns = ['Src Port', 'Dst Port', 'Protocol', 'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets']
selected_columns = [col for col in selected_columns if col in data.columns]

# Include 'Label' if available
if 'Label' in data.columns:
    selected_columns.append('Label')

# Take an explicit copy: the column assignments below would otherwise write
# into a slice of the original frame and trigger SettingWithCopyWarning.
data = data[selected_columns].copy()

# Convert 'Label' to numerical format if it exists
if 'Label' in data.columns:
    label_encoder = LabelEncoder()
    labels = data['Label']
    if labels.dtype == object:
        # Missing labels were filled with the integer 0 earlier, so an object
        # column can mix int and str values; LabelEncoder rejects mixed types,
        # so give it uniform strings.
        labels = labels.astype(str)
    data['Label'] = label_encoder.fit_transform(labels)

# Collect every numeric feature column for scaling. np.number covers all
# integer and float widths, not just int64/float64.
numerical_cols = data.select_dtypes(include=[np.number]).columns.tolist()
if 'Label' in numerical_cols:
    # The encoded target must not be scaled together with the features
    numerical_cols.remove('Label')

# Scale numerical features.
# The scaler is fitted on the training split only (inside the branch below) so
# that no test-set statistics leak into the features used for training.
# NOTE: `data` itself is left unscaled; use X_train / X_test downstream.
scaler = StandardScaler()

# Ensure enough data before splitting
if data.shape[0] <= 10:
    print("Not enough data to perform train-test split. Please check dataset preprocessing.")
elif 'Label' not in data.columns:
    # Every earlier step treats 'Label' as optional, so report its absence
    # instead of failing with a KeyError.
    print("No 'Label' column found. Cannot build a labelled train-test split.")
else:
    X = data.drop(columns=['Label'])
    y = data['Label']

    # PCA only accepts numeric input; set aside anything that is not numeric
    non_numeric_cols = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()
    if non_numeric_cols:
        print("Dropping non-numeric feature columns before PCA:", non_numeric_cols)
        X = X.drop(columns=non_numeric_cols)

    if X.shape[1] == 0:
        print("No numeric feature columns available for PCA. Please check dataset preprocessing.")
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Fit the scaler on the training rows, then apply the same
        # transformation to the held-out rows.
        cols_to_scale = [col for col in numerical_cols if col in X.columns]
        if cols_to_scale:
            # Copies avoid SettingWithCopyWarning on the split frames
            X_train = X_train.copy()
            X_test = X_test.copy()
            X_train[cols_to_scale] = scaler.fit_transform(X_train[cols_to_scale])
            X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])

        # Apply PCA (Ensure max 5 components, but not more than available features)
        n_components = min(5, X_train.shape[1])
        pca = PCA(n_components=n_components)

        # The projection is learned from the training rows only
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)

        print("Shape of X_train after PCA:", X_train.shape)
        print("Shape of X_test after PCA:", X_test.shape)
        print("PCA Components:\n", pca.components_)

        # Save transformed training data (with labels) to CSV
        pca_columns = [f'PC{i+1}' for i in range(n_components)]
        pca_df = pd.DataFrame(X_train, columns=pca_columns)
        pca_df['Label'] = y_train.values
        pca_df.to_csv('Darknet_PCA.csv', index=False)
        print("PCA-transformed data saved to 'Darknet_PCA.csv'")



Columns in dataset: ['Flow ID', 'Src IP', 'Src Port', 'Dst IP', 'Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Total Fwd Packet', 'Total Bwd packets', 'Total Length of Fwd Packet', 'Total Length of Bwd Packet', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Packet Length Min', 'Packet Length Max', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count