In [9]:
from sklearn.datasets import load_iris
import pandas as pd

data = load_iris()
iris_df = pd.DataFrame(data.data, columns=data.feature_names)
iris_df['target'] = data.target

In [11]:
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [7]:
!pip install minisom

Collecting minisom
  Downloading minisom-2.3.5.tar.gz (12 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: minisom
  Building wheel for minisom (setup.py) ... [?25l[?25hdone
  Created wheel for minisom: filename=MiniSom-2.3.5-py3-none-any.whl size=12031 sha256=5e659afdf6856d1c09ca53e25493b18aa856b9d0eb38f01ced1fe6639cc0abad
  Stored in directory: /root/.cache/pip/wheels/19/db/95/5e53bc2b88a328217fdf9f2886cafbe86b0df274f4b601f572
Successfully built minisom
Installing collected packages: minisom
Successfully installed minisom-2.3.5


In [13]:
iris = iris_df.iloc[: , :-1]
iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [14]:
from sklearn.preprocessing import MinMaxScaler
from minisom import MiniSom

# Normalization
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(iris_df.iloc[:, :-1])

# SOM initialization
som = MiniSom(x=10, y=10, input_len=4, sigma=1.0, learning_rate=0.5)
som.random_weights_init(normalized_data)
som.train_random(normalized_data, 100)


In [15]:
import numpy as np

def calculate_bmu_distances(som, data):
    return [np.linalg.norm(sample - som.get_weights()[som.winner(sample)])
            for sample in data]

distances = calculate_bmu_distances(som, normalized_data)
iris_df['bmu_distance'] = distances


In [16]:
from scipy.stats import zscore

iris_df['z_score'] = zscore(distances)
anomalies = iris_df[iris_df['z_score'].abs() > 2]

In [20]:
print(len(anomalies))
anomalies

7


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,bmu_distance,z_score
41,4.5,2.3,1.3,0.3,0,0.262963,2.631764
109,7.2,3.6,6.1,2.5,2,0.240869,2.302556
114,5.8,2.8,5.1,2.4,2,0.228376,2.116404
117,7.7,3.8,6.7,2.2,2,0.417237,4.9305
118,7.7,2.6,6.9,2.3,2,0.329537,3.623743
122,7.7,2.8,6.7,2.0,2,0.254917,2.51188
131,7.9,3.8,6.4,2.0,2,0.424832,5.043677


In [23]:
import pandas as pd
import numpy as np

COLUMNS = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
]

def load_and_preprocess_kdd(file_path):
        df = pd.read_csv(
            file_path,
            header=None,
            names=COLUMNS,
            dtype={
                'protocol_type': 'category',
                'service': 'category',
                'flag': 'category',
            }
        )

        print("Data loaded successfully. First 5 rows:")
        print(df.head())

        categorical_cols = df.select_dtypes(include=['category']).columns

        print()
        print("Categorical column analysis:")
        for col in categorical_cols:
            unique_vals = df[col].unique()
            print(f"{col}: {len(unique_vals)} unique values")
            print(f"Sample values: {list(unique_vals[:3])}")
            print()

        # Vectorization strategy
        for col in categorical_cols:

            # One-hot encode
            if len(df[col].unique()) < 10:
                df = pd.concat([
                    df.drop(col, axis=1),
                    pd.get_dummies(df[col], prefix=col)
                ], axis=1)
            else:
                # Label encode
                df[col] = df[col].astype('category').cat.codes

        num_cols = df.select_dtypes(include=np.number).columns
        df[num_cols] = df[num_cols].astype('float32')

        return df

In [24]:
features = load_and_preprocess_kdd('/content/kddcup.testdata.unlabeled_10_percent')


Data loaded successfully. First 5 rows:
   duration protocol_type  service flag  src_bytes  dst_bytes  land  \
0         0           udp  private   SF        105        146     0   
1         0           udp  private   SF        105        146     0   
2         0           udp  private   SF        105        146     0   
3         0           udp  private   SF        105        146     0   
4         0           udp  private   SF        105        146     0   

   wrong_fragment  urgent  hot  ...  dst_host_count  dst_host_srv_count  \
0               0       0    0  ...             255                 254   
1               0       0    0  ...             255                 254   
2               0       0    0  ...             255                 254   
3               0       0    0  ...             255                 254   
4               0       0    0  ...             255                 254   

   dst_host_same_srv_rate  dst_host_diff_srv_rate  \
0                     1.0    

In [25]:
if features is not None:
    print()
    print("Processed features shape:", features.shape)
    print("Sample preprocessed data:")
    print(features.iloc[:3, :5])


Processed features shape: (311029, 43)
Sample preprocessed data:
   duration  service  flag  src_bytes  dst_bytes
0       0.0     12.0   5.0      105.0      146.0
1       0.0     12.0   5.0      105.0      146.0
2       0.0     12.0   5.0      105.0      146.0


In [26]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
som_input = scaler.fit_transform(features)


In [28]:
# SOM initialization
som = MiniSom(x=50, y=50, input_len=43, sigma=1.0, learning_rate=0.5)
som.random_weights_init(som_input)
som.train_random(som_input, 1000)

In [29]:
import numpy as np

def calculate_bmu_distances(som, data):
    return [np.linalg.norm(sample - som.get_weights()[som.winner(sample)])
            for sample in data]

distances = calculate_bmu_distances(som, som_input)
features['bmu_distance'] = distances


In [30]:
from scipy.stats import zscore

features['z_score'] = zscore(distances)
anomalies = features[features['z_score'].abs() > 2]

In [32]:
print(len(anomalies))
print(anomalies)

9817
        duration  service  flag  src_bytes  dst_bytes  land  wrong_fragment  \
6            0.0      4.0   5.0       29.0        0.0   0.0             0.0   
38          20.0      8.0   5.0      232.0      765.0   0.0             0.0   
44           0.0      9.0   5.0      615.0        0.0   0.0             0.0   
71           1.0     13.0   5.0     1018.0      333.0   0.0             0.0   
79           0.0      9.0   5.0      884.0        0.0   0.0             0.0   
...          ...      ...   ...        ...        ...   ...             ...   
310138       0.0      4.0   5.0       46.0      134.0   0.0             0.0   
310139       0.0      4.0   5.0       44.0       44.0   0.0             0.0   
310140       0.0      4.0   5.0       44.0       44.0   0.0             0.0   
310186       0.0      4.0   5.0       45.0      127.0   0.0             0.0   
310907       0.0     12.0   5.0      105.0      147.0   0.0             0.0   

        urgent  hot  num_failed_logins  ...  d