#  PHY Layer Authentication - EDA and Clustering Notebook
Environment Setup

In [2]:
import os
import sys
sys.path.append(os.path.abspath("../src"))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN, KMeans
import umap

from preprocess import preprocess_data
from config import FEATURES

Load Dataset

In [None]:

df = pd.read_csv("../data/training.csv", sep=";")
print(df.shape)
print(df.dtypes)
print(df.head())
print(df.describe())

ParserError: Error tokenizing data. C error: Expected 1 fields in line 609, saw 2


Preprocess Dataset

In [7]:
df_clean_subset, X_scaled_subset = preprocess_data(df_subset)
#X_scaled[:5]  # show the first few rows

[INFO] Starting preprocessing...
[INFO] Using features: ['dlBytes', 'dlMcs', 'dlBler', 'ulBytes', 'ulMcs', 'ulBler', 'ri', 'phr', 'pcmax', 'rsrq', 'sinr', 'rsrp', 'rssi', 'cqi', 'pucchSnr', 'puschSnr']
['dlBytes', 'dlMcs', 'dlBler', 'ulBytes', 'ulMcs', 'ulBler', 'ri', 'phr', 'pcmax', 'rsrq', 'sinr', 'rsrp', 'rssi', 'cqi', 'pucchSnr', 'puschSnr', 'inSync', 'rnti', 'hour', 'dayofweek']
[INFO] Using features: ['dlBytes', 'dlMcs', 'dlBler', 'ulBytes', 'ulMcs', 'ulBler', 'ri', 'phr', 'pcmax', 'rsrq', 'sinr', 'rsrp', 'rssi', 'cqi', 'pucchSnr', 'puschSnr', 'inSync', 'rnti', 'hour', 'dayofweek']


ValueError: could not convert string to float: '1.000.098'

Dimensionality Reduction with UMAP

In [None]:
reducer = umap.UMAP(n_neighbors=150, min_dist=0.1, random_state=42)
X_umap = reducer.fit_transform(X_scaled)

plt.figure(figsize=(10, 6))
plt.scatter(X_umap[:, 0], X_umap[:, 1], s=10, alpha=0.7, c='blue')
plt.title("UMAP Projection of PHY Features")
plt.xlabel("UMAP-1")
plt.ylabel("UMAP-2")
plt.grid(True)
plt.show()

In [None]:
X_umap.shape

Clustering with DBSCAN

In [None]:
db = DBSCAN(eps=0.5, min_samples=600)
labels = db.fit_predict(X_scaled)

# Optional: Add labels to original DataFrame
df_clean["cluster"] = labels
df_clean["sinr"] = df["sinr"]  # Ensure UE ID is included in the clean DataFrame
# Visualization
import seaborn as sns

plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_umap[:, 0], y=X_umap[:, 1], hue=labels, palette="tab10", s=15)
plt.title("DBSCAN Clustering on UMAP Projection")
plt.xlabel("UMAP-1")
plt.ylabel("UMAP-2")
plt.legend(title="Cluster")
plt.show()

df_clean["cluster"] = labels
# df_clean["ueId"] = df["ueId"]  # Ensure UE ID is included in the clean DataFrame

cluster_summary = df_clean.groupby("sinr")["cluster"].agg(["nunique", "count"]).reset_index()
cluster_summary = cluster_summary.sort_values("nunique", ascending=False)
cluster_summary.head()

In [None]:
# %% [markdown]
# ## Label Anomalies

# %%
# 1. Mark noise points (DBSCAN labels noise as -1)
df_clean["anomaly"] = df_clean["cluster"] == -1

# 2. UEs with inconsistent clustering: appear in >1 cluster (excluding noise)
ue_cluster_counts = df_clean[df_clean["cluster"] != -1].groupby("ueId")["cluster"].nunique()
inconsistent_ues = ue_cluster_counts[ue_cluster_counts > 1].index

df_clean["inconsistent_ue"] = df_clean["ueId"].isin(inconsistent_ues)

# 3. Final anomaly label: either noise or inconsistent clustering
df_clean["final_anomaly"] = df_clean["anomaly"] | df_clean["inconsistent_ue"]

# Show some samples
df_clean[df_clean["final_anomaly"]].head()
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x=X_umap[:, 0], y=X_umap[:, 1],
    hue=df_clean["final_anomaly"],
    palette={True: "red", False: "green"},
    alpha=0.6, s=15
)
plt.title("Anomalies in UMAP Space")
plt.xlabel("UMAP-1")
plt.ylabel("UMAP-2")
plt.show()

Clustering with KMeans

In [None]:
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans_labels = kmeans.fit_predict(X_scaled)

plt.figure(figsize=(10, 6))
sns.scatterplot(x=X_umap[:, 0], y=X_umap[:, 1], hue=kmeans_labels, palette="tab10", legend="full", s=15)
plt.title("KMeans Clustering")
plt.xlabel("UMAP-1")
plt.ylabel("UMAP-2")
plt.show()

In [None]:
# %% [markdown]
# ## Train Anomaly Detection Model

# %%
from sklearn.ensemble import IsolationForest

# Train Isolation Forest on training data
clf = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
clf.fit(X_scaled)

# Predict on training data (for visualization or analysis if needed)
y_train_pred = clf.predict(X_scaled)

# Add predictions to training dataframe
df_clean["predicted_anomaly"] = y_train_pred

# Visualize training predictions (optional)
sns.countplot(x="predicted_anomaly", data=df_clean)
plt.title("Anomaly Prediction on Training Data")
plt.xlabel("Anomaly Label")
plt.ylabel("Count")
plt.grid(True)
plt.show()


Inference on New Test Data (Unseen UE Samples)

In [None]:
import pandas as pd
# Load test data
test_path = "../data/non_authentic_2_less_antennas_diff_position.csv"  # adjust if needed
df_test = pd.read_csv(test_path)

# Preprocess test data
df_test_clean, X_test_scaled = preprocess_data(df_test)
df_test_clean["ueId"] = df["ueId"]  # Initialize anomaly column
# Predict using trained anomaly detector
y_test_pred = clf.predict(X_test_scaled)

# Add predictions to dataframe
df_test_clean["predicted_anomaly"] = y_test_pred

# Show prediction summary
print(df_test_clean["predicted_anomaly"].value_counts())

# %%
# Optional: Visualize prediction distribution
sns.countplot(x="predicted_anomaly", data=df_test_clean)
plt.title("Anomaly Prediction Distribution on Test Data")
plt.xlabel("Anomaly Label")
plt.ylabel("Count")
plt.grid(True)
plt.show()


# Get anomaly scores (lower score = more anomalous)
anomaly_scores = clf.decision_function(X_test_scaled)

# Add scores to the test DataFrame
df_test_clean["anomaly_score"] = anomaly_scores

# Set an authentication threshold (tune this based on your data)
# Higher score means more normal; set threshold carefully
AUTH_THRESHOLD = -0.025  # Adjust this based on ROC analysis later

# Classify UEs based on score
df_test_clean["authentication_status"] = df_test_clean["anomaly_score"].apply(
    lambda score: "authenticated" if score >= AUTH_THRESHOLD else "rejected"
)

# Visualize distribution
plt.figure(figsize=(10, 6))
sns.histplot(data=df_test_clean, x="anomaly_score", hue="authentication_status", bins=50, kde=True)
plt.axvline(AUTH_THRESHOLD, color="red", linestyle="--", label="Auth Threshold")
plt.title("Anomaly Score Distribution")
plt.xlabel("Anomaly Score")
plt.ylabel("Count")
plt.legend()
plt.grid(True)
plt.show()

# Filter rows where authentication failed
unauthenticated_df = df_test_clean[df_test_clean["authentication_status"] == "rejected"]

# Show the list (you can choose specific columns to display)
unauthenticated_df[["ueId", "anomaly_score"]].head()
unauthenticated_df.to_csv("../data/rejected_ues.csv", index=False)
# %%
# Optional: Save predictions to file
df_test_clean.to_csv("../data/test_data_with_predictions.csv", index=False)

