## Feature Visualisation

Notebook for exploratory data analysis of features

0. Import Features

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Selects which of the two known CSV locations is read below.
local = True

# Read the feature table from the location chosen by `local`.
csv_path = "../ANN-code/Data/features_CF4.csv" if local else "../ANN-code/features_CF4.csv"
df = pd.read_csv(csv_path)

# Keep the column index around and show it for reference.
columns = df.columns
print(columns)

# Partition the rows by the species tag embedded in 'file_name'
# ('00_C_' -> Carbon subset, '00_F_' -> Fluorine subset).
file_names = df['file_name']
df_C = df[file_names.str.contains('00_C_')]
df_F = df[file_names.str.contains('00_F_')]


Index(['file_name', 'sum_intensity_camera', 'max_intensity_camera',
       'recoil_angle_camera', 'recoil_length_camera',
       'mean_energy_deposition_camera', 'std_energy_deposition_camera',
       'skew_energy_deposition_camera', 'kurt_energy_deposition_camera',
       'head_tail_mean_difference_camera'],
      dtype='object')


1. Basic Statistical Analysis

In [None]:
# Report how many rows each species subset holds
print("Number of entries in each DataFrame:")
for tag, frame in (("df_C: ", df_C), ("df_F: ", df_F)):
    print(tag, len(frame))

In [None]:
# Summary statistics per species, computed once and reused for printing and export
stats_C = df_C.describe()
stats_F = df_F.describe()

print("Carbon (C) Statistics:\n", stats_C, "\n")
print("Fluorine (F) Statistics:\n", stats_F, "\n")

# Write both summaries into a single workbook, one sheet per species
with pd.ExcelWriter("statistics.xlsx") as writer:
    for sheet, stats in (("Carbon", stats_C), ("Fluorine", stats_F)):
        stats.to_excel(writer, sheet_name=sheet)

print("Statistics saved as an Excel file (statistics.xlsx).")


In [None]:
# Feature correlations and per-feature KDE comparison (Carbon vs Fluorine)
import os

import seaborn as sns
import matplotlib.pyplot as plt

# Folder that receives the figures saved by this notebook. It is assigned
# regardless of `local`, because every savefig call below (and in later
# cells) needs it; leaving it undefined would raise NameError.
pre_path = "Features_CF4_1_viz/"  # Update path for CF4-only dataset
os.makedirs(pre_path, exist_ok=True)  # savefig does not create missing folders

# Extract relevant features (assuming numerical columns start after 'file_name')
features = df.columns[1:]  # Exclude 'file_name'

# Compute correlation matrix
correlation_matrix = df[features].corr()

# Plot correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Feature Correlation Heatmap")
plt.savefig(pre_path + "FeatureCorrelationHeatmap.png")  # Save the plot
plt.show()
plt.close()

# Split the data into two DataFrames based on 'file_name' containing 'C' or 'F'
df_C = df[df['file_name'].str.contains('00_C_')]
df_F = df[df['file_name'].str.contains('00_F_')]

print("Number of entries in each DataFrame:")
print("df_C: ", len(df_C))
print("df_F: ", len(df_F))

# Plot feature distributions using KDE plots for Carbon and Fluorine only
for feature in features:
    plt.figure(figsize=(8, 5))
    sns.kdeplot(df_C[feature], label="Carbon (C)", fill=True, alpha=0.5)
    sns.kdeplot(df_F[feature], label="Fluorine (F)", fill=True, alpha=0.5)
    plt.title(f"Feature Distribution: {feature}")
    plt.xlabel(feature)
    plt.ylabel("Density")
    plt.legend()
    plt.savefig(pre_path + str(feature) + "_distribution.png")  # Save the plot
    plt.show()
    plt.close()  # release the figure so one per feature does not pile up

In [None]:
# Column index of the loaded table; everything after 'file_name' is a feature
columns = df.columns
features = columns[1:]

# Species subsets, selected by the tag embedded in 'file_name'
df_C = df[df['file_name'].str.contains('00_C_')]
df_F = df[df['file_name'].str.contains('00_F_')]

# One panel per feature on a 3x3 grid, Carbon and Fluorine overlaid
plt.figure(figsize=(15, 10))
for panel, feature in enumerate(features, start=1):
    ax = plt.subplot(3, 3, panel)
    for frame, tag in ((df_C, 'C'), (df_F, 'F')):
        ax.hist(frame[feature], bins=50, alpha=0.5, label=tag, density=True)
    ax.set_xlabel(feature)
    ax.set_ylabel("Density")
    ax.legend()
    ax.set_title(f"Distribution of {feature}")

plt.tight_layout()
plt.savefig(pre_path + "all_features_distributions.png")  # Save the plot
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

# Re-read the feature table from the location selected by `local`
df = pd.read_csv(
    "../ANN-code/Data/features_CF4.csv" if local else "../ANN-code/features_CF4.csv"
)

# Extract energy from file_name using regex
def extract_energy(file_name):
    """Parse the energy value encoded in a file name.

    Finds the first number immediately followed by the literal ``keV`` and
    returns it as a float. Both decimal ("12.5keV") and whole-number
    ("30keV") energies are recognised.

    Parameters
    ----------
    file_name : str
        File name expected to contain a token such as ``12.5keV``.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when `file_name` is not a string
        (e.g. a missing CSV cell) or contains no such token.
    """
    # A NaN cell would make re.search raise TypeError; treat it as "no energy".
    if not isinstance(file_name, str):
        return np.nan
    # The fractional part is optional so integer energies are not dropped.
    match = re.search(r"(\d+(?:\.\d+)?)keV", file_name)
    return float(match.group(1)) if match else np.nan

# Attach the energy parsed from each file name and keep only rows that have one
df['energy_value'] = df['file_name'].apply(extract_energy)
df = df.dropna(subset=['energy_value'])

# Species subsets
df_C = df[df['file_name'].str.contains('00_C_')]
df_F = df[df['file_name'].str.contains('00_F_')]

# Energy subsets: "low" is strictly below 30, "high" is 30 and above
df_C_low = df_C[df_C['energy_value'] < 30]
df_C_high = df_C[df_C['energy_value'] >= 30]
df_F_low = df_F[df_F['energy_value'] < 30]
df_F_high = df_F[df_F['energy_value'] >= 30]

# Every column after 'file_name' is plotted (this includes 'energy_value')
columns = df.columns
features = columns[1:]

# One row per feature; left column = low-energy subsets, right column = high-energy subsets
panels = (
    ('<', df_C_low, df_F_low),
    ('>', df_C_high, df_F_high),
)
fig, axes = plt.subplots(len(features), 2, figsize=(12, 36))
for row, feature in enumerate(features):
    for col, (sign, subset_C, subset_F) in enumerate(panels):
        ax = axes[row, col]
        ax.hist(subset_C[feature], bins=50, alpha=0.5, label=f'C {sign}30keV', density=True)
        ax.hist(subset_F[feature], bins=50, alpha=0.5, label=f'F {sign}30keV', density=True)
        ax.set_title(f"{feature} Distribution ({sign}30 keV)")
        ax.set_xlabel(feature)
        ax.set_ylabel("Density")
        ax.legend()

plt.tight_layout()
plt.savefig(pre_path + "feature_distributions_energy_comparison_30keV.png")  # save the plot
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler

# Reload the feature table from the location selected by `local`
df = pd.read_csv(
    "../ANN-code/Data/features_CF4.csv" if local else "../ANN-code/features_CF4.csv"
)

# Species subsets
df_C = df[df['file_name'].str.contains('00_C_')]
df_F = df[df['file_name'].str.contains('00_F_')]

# All columns except the leading 'file_name'
features = df.columns[1:]

# NOTE(review): the scaling statistics are learned from the Carbon rows only
# and then reused for Fluorine — confirm this is intended rather than fitting
# on both species together.
scaler = StandardScaler().fit(df_C[features])

df_C_scaled = df_C.copy()
df_F_scaled = df_F.copy()
df_C_scaled[features] = scaler.transform(df_C[features])
df_F_scaled[features] = scaler.transform(df_F[features])

# One panel per scaled feature on a 3x3 grid, Carbon and Fluorine overlaid
plt.figure(figsize=(15, 10))
for panel, feature in enumerate(features, start=1):
    ax = plt.subplot(3, 3, panel)
    for frame, tag in ((df_C_scaled, 'C'), (df_F_scaled, 'F')):
        ax.hist(frame[feature], bins=50, alpha=0.5, label=tag, density=True)
    ax.set_xlabel(feature)
    ax.set_ylabel("Density")
    ax.legend()
    ax.set_title(f"Distribution of {feature} after Standard Scaling")

plt.tight_layout()
plt.savefig(pre_path + "all_features_distributions_standardscaled.png")  # Save the plot
plt.show()


In [None]:
# find greatest energy for each species

# Extract energy from file_name using regex
def extract_energy(file_name):
    """Parse the energy value encoded in a file name.

    Finds the first number immediately followed by the literal ``keV`` and
    returns it as a float. Both decimal ("12.5keV") and whole-number
    ("30keV") energies are recognised.

    Parameters
    ----------
    file_name : str
        File name expected to contain a token such as ``12.5keV``.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when `file_name` is not a string
        (e.g. a missing CSV cell) or contains no such token.
    """
    # A NaN cell would make re.search raise TypeError; treat it as "no energy".
    if not isinstance(file_name, str):
        return np.nan
    # The fractional part is optional so integer energies are not dropped.
    match = re.search(r"(\d+(?:\.\d+)?)keV", file_name)
    return float(match.group(1)) if match else np.nan

# Tag every row with its parsed energy, then rebuild the species subsets so
# they carry the new column
df['energy_value'] = df['file_name'].apply(extract_energy)

df_C = df[df['file_name'].str.contains('00_C_')]
df_F = df[df['file_name'].str.contains('00_F_')]

# Largest energy found in each subset
max_energy_C = df_C['energy_value'].max()
max_energy_F = df_F['energy_value'].max()

print("Max energy for Carbon:", max_energy_C)
print("Max energy for Fluorine:", max_energy_F)


In [None]:
# Re-read the feature table from the location selected by `local`
df = pd.read_csv(
    "../ANN-code/Data/features_CF4.csv" if local else "../ANN-code/features_CF4.csv"
)

# Show the column index for reference
columns = df.columns
print(columns)

# Species subsets, selected by the tag embedded in 'file_name'
file_names = df['file_name']
df_C = df[file_names.str.contains('00_C_')]
df_F = df[file_names.str.contains('00_F_')]

In [None]:
# Apply PCA
from sklearn.decomposition import PCA

# Rows with any missing value are removed from both subsets before PCA
df_C = df_C.dropna()
df_F = df_F.dropna()

# NOTE(review): `features` is not assigned in this cell; it is whatever an
# earlier cell last set — confirm it matches the columns of the current df.
# The components are fitted on the Carbon rows; Fluorine is projected onto them.
pca = PCA()
df_C_pca = pca.fit_transform(df_C[features])
df_F_pca = pca.transform(df_F[features])

# Cumulative explained variance against the number of components kept
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
component_counts = range(1, len(cumulative_variance) + 1)

plt.figure(figsize=(8, 5))
plt.plot(component_counts, cumulative_variance, marker='o', linestyle='--')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA Explained Variance')
plt.grid()
# (saving is disabled) plt.savefig("PCA.png")
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re

# From here on the non-"Data" location of the CSV is used
local = False

# Read the feature table from the location selected by `local`
df = pd.read_csv(
    "../ANN-code/Data/features_CF4.csv" if local else "../ANN-code/features_CF4.csv"
)

# Extract energy from file_name using regex
def extract_energy(file_name):
    """Parse the energy value encoded in a file name.

    Finds the first number immediately followed by the literal ``keV`` and
    returns it as a float. Both decimal ("12.5keV") and whole-number
    ("30keV") energies are recognised.

    Parameters
    ----------
    file_name : str
        File name expected to contain a token such as ``12.5keV``.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when `file_name` is not a string
        (e.g. a missing CSV cell) or contains no such token.
    """
    # A NaN cell would make re.search raise TypeError; treat it as "no energy".
    if not isinstance(file_name, str):
        return np.nan
    # The fractional part is optional so integer energies are not dropped.
    match = re.search(r"(\d+(?:\.\d+)?)keV", file_name)
    return float(match.group(1)) if match else np.nan

# Attach the parsed energy and discard rows for which none was found
df['energy_value'] = df['file_name'].apply(extract_energy)
df = df.dropna(subset=['energy_value'])

# Species subsets (this cell matches on the shorter '_C_' / '_F_' tags)
df_C = df[df['file_name'].str.contains('_C_')]
df_F = df[df['file_name'].str.contains('_F_')]

# Energy subsets: strictly below 30 vs. 30 and above
df_C_low = df_C[df_C['energy_value'] < 30]
df_C_high = df_C[df_C['energy_value'] >= 30]
df_F_low = df_F[df_F['energy_value'] < 30]
df_F_high = df_F[df_F['energy_value'] >= 30]

# Every column after 'file_name' is plotted (this includes 'energy_value')
columns = df.columns
features = columns[1:]

# One subplot row per feature: left = low-energy subsets, right = high-energy subsets
fig, axes = plt.subplots(len(features), 2, figsize=(12, 15))
for feature, (ax_low, ax_high) in zip(features, axes):
    ax_low.hist(df_C_low[feature], bins=50, alpha=0.5, label='C <30keV', density=True)
    ax_low.hist(df_F_low[feature], bins=50, alpha=0.5, label='F <30keV', density=True)
    ax_low.set_title(f"{feature} Distribution (<30 keV)")

    ax_high.hist(df_C_high[feature], bins=50, alpha=0.5, label='C >30keV', density=True)
    ax_high.hist(df_F_high[feature], bins=50, alpha=0.5, label='F >30keV', density=True)
    ax_high.set_title(f"{feature} Distribution (>30 keV)")

    # Shared decoration for both panels of the row
    for ax in (ax_low, ax_high):
        ax.set_xlabel(feature)
        ax.set_ylabel("Density")
        ax.legend()

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from ipywidgets import interact, FloatSlider

# Keep using the non-"Data" location of the CSV
local = False

# Read the feature table from the location selected by `local`
df = pd.read_csv(
    "../ANN-code/Data/features_CF4.csv" if local else "../ANN-code/features_CF4.csv"
)

# Extract energy from file_name using regex
def extract_energy(file_name):
    """Parse the energy value encoded in a file name.

    Finds the first number immediately followed by the literal ``keV`` and
    returns it as a float. Both decimal ("12.5keV") and whole-number
    ("30keV") energies are recognised.

    Parameters
    ----------
    file_name : str
        File name expected to contain a token such as ``12.5keV``.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when `file_name` is not a string
        (e.g. a missing CSV cell) or contains no such token.
    """
    # A NaN cell would make re.search raise TypeError; treat it as "no energy".
    if not isinstance(file_name, str):
        return np.nan
    # The fractional part is optional so integer energies are not dropped.
    match = re.search(r"(\d+(?:\.\d+)?)keV", file_name)
    return float(match.group(1)) if match else np.nan

# Attach the parsed energy and keep only rows that have one
df = df.assign(energy_value=df['file_name'].apply(extract_energy))
df = df.dropna(subset=['energy_value'])

# Species subsets (matched on the '_C_' / '_F_' tags)
file_names = df['file_name']
df_C = df[file_names.str.contains('_C_')]
df_F = df[file_names.str.contains('_F_')]

# Columns to plot; 'file_name', 'sum_intensity_camera' and 'energy_value' are left out
features = [
    'max_intensity_camera',
    'recoil_angle_camera',
    'recoil_length_camera',
    'mean_energy_deposition_camera',
    'std_energy_deposition_camera',
    'skew_energy_deposition_camera',
    'kurt_energy_deposition_camera',
    'head_tail_mean_difference_camera',
]

def plot_histograms(energy_threshold):
    """Draw per-feature C-vs-F histograms below and at/above an energy threshold.

    Reads the notebook-level ``df_C``, ``df_F`` and ``features``. The figure
    has one row per feature: the left panel uses rows whose ``energy_value``
    is strictly below `energy_threshold`, the right panel uses the rest.
    """
    # (comparison sign shown in labels, Carbon subset, Fluorine subset) per column
    panels = (
        ('<',
         df_C[df_C['energy_value'] < energy_threshold],
         df_F[df_F['energy_value'] < energy_threshold]),
        ('>',
         df_C[df_C['energy_value'] >= energy_threshold],
         df_F[df_F['energy_value'] >= energy_threshold]),
    )
    shown_threshold = np.round(energy_threshold, 3)

    fig, axes = plt.subplots(len(features), 2, figsize=(12, 15))
    for row, feature in enumerate(features):
        for col, (sign, subset_C, subset_F) in enumerate(panels):
            ax = axes[row, col]
            ax.hist(subset_C[feature], bins=50, alpha=0.5,
                    label=f'C {sign}{energy_threshold:.1f}keV', density=True)
            ax.hist(subset_F[feature], bins=50, alpha=0.5,
                    label=f'F {sign}{energy_threshold:.1f}keV', density=True)
            ax.set_title(f"{feature} Distribution ({sign}{shown_threshold} keV)")
            ax.set_xlabel(feature)
            ax.set_ylabel("Density")
            ax.legend()

    plt.tight_layout()
    plt.show()

# Interactive widget: redraw whenever the threshold slider moves
threshold_slider = FloatSlider(min=5, max=500, step=5, value=30)
interact(plot_histograms, energy_threshold=threshold_slider);
   

In [None]:
# Saves pyplot's current figure under a fixed file name.
# NOTE(review): this runs in its own cell, after plot_histograms() has already
# called plt.show(); with the default inline backend there is presumably no
# open figure left, so this likely writes a blank image — confirm. The file
# name also hard-codes 100 keV while the slider above starts at 30.
plt.savefig("energy_feature_distribution_comparison_100keV.png")

In [None]:
# KS tests

import pandas as pd
import numpy as np
import re
from scipy.stats import ks_2samp

# Keep using the non-"Data" location of the CSV
local = False

# Read the feature table from the location selected by `local`
df = pd.read_csv(
    "../ANN-code/Data/features_CF4.csv" if local else "../ANN-code/features_CF4.csv"
)

# Extract energy from file_name using regex
def extract_energy(file_name):
    """Parse the energy value encoded in a file name.

    Finds the first number immediately followed by the literal ``keV`` and
    returns it as a float. Both decimal ("12.5keV") and whole-number
    ("30keV") energies are recognised.

    Parameters
    ----------
    file_name : str
        File name expected to contain a token such as ``12.5keV``.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when `file_name` is not a string
        (e.g. a missing CSV cell) or contains no such token.
    """
    # A NaN cell would make re.search raise TypeError; treat it as "no energy".
    if not isinstance(file_name, str):
        return np.nan
    # The fractional part is optional so integer energies are not dropped.
    match = re.search(r"(\d+(?:\.\d+)?)keV", file_name)
    return float(match.group(1)) if match else np.nan

# Attach the parsed energy and keep only rows that have one
df['energy_value'] = df['file_name'].apply(extract_energy)
df = df[df['energy_value'].notna()]

# Species subsets (matched on the '_C_' / '_F_' tags)
df_C = df[df['file_name'].str.contains('_C_')]
df_F = df[df['file_name'].str.contains('_F_')]

# Columns to compare; 'file_name', 'sum_intensity_camera' and 'energy_value' are left out
features = [
    'max_intensity_camera',
    'recoil_angle_camera',
    'recoil_length_camera',
    'mean_energy_deposition_camera',
    'std_energy_deposition_camera',
    'skew_energy_deposition_camera',
    'kurt_energy_deposition_camera',
    'head_tail_mean_difference_camera',
]

# Two-sample KS test (Carbon vs Fluorine) per feature, ignoring missing values
ks_results = {}
for feature in features:
    outcome = ks_2samp(df_C[feature].dropna(), df_F[feature].dropna())
    ks_results[feature] = {'KS Statistic': outcome.statistic, 'P-Value': outcome.pvalue}

# One row per feature, one column per reported quantity
ks_df = pd.DataFrame.from_dict(ks_results, orient='index')
print(ks_df)


In [None]:
# KS tests at low energies

import pandas as pd
import numpy as np
import re
from scipy.stats import ks_2samp
from ipywidgets import interact, FloatSlider

# Keep using the non-"Data" location of the CSV
local = False

# Read the feature table from the location selected by `local`
df = pd.read_csv(
    "../ANN-code/Data/features_CF4.csv" if local else "../ANN-code/features_CF4.csv"
)

# Extract energy from file_name using regex
def extract_energy(file_name):
    """Parse the energy value encoded in a file name.

    Finds the first number immediately followed by the literal ``keV`` and
    returns it as a float. Both decimal ("12.5keV") and whole-number
    ("30keV") energies are recognised.

    Parameters
    ----------
    file_name : str
        File name expected to contain a token such as ``12.5keV``.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when `file_name` is not a string
        (e.g. a missing CSV cell) or contains no such token.
    """
    # A NaN cell would make re.search raise TypeError; treat it as "no energy".
    if not isinstance(file_name, str):
        return np.nan
    # The fractional part is optional so integer energies are not dropped.
    match = re.search(r"(\d+(?:\.\d+)?)keV", file_name)
    return float(match.group(1)) if match else np.nan

# Attach the parsed energy and keep only rows that have one
df = df.assign(energy_value=df['file_name'].apply(extract_energy))
df = df.dropna(subset=['energy_value'])

# Species subsets (matched on the '_C_' / '_F_' tags)
file_names = df['file_name']
df_C = df[file_names.str.contains('_C_')]
df_F = df[file_names.str.contains('_F_')]

# Columns to compare; 'file_name', 'sum_intensity_camera' and 'energy_value' are left out
features = [
    'max_intensity_camera',
    'recoil_angle_camera',
    'recoil_length_camera',
    'mean_energy_deposition_camera',
    'std_energy_deposition_camera',
    'skew_energy_deposition_camera',
    'kurt_energy_deposition_camera',
    'head_tail_mean_difference_camera',
]

def ks_test_by_energy(energy_threshold):
    """Print per-feature two-sample KS results (C vs F) on each side of a threshold.

    Reads the notebook-level ``df_C``, ``df_F`` and ``features``. Rows whose
    ``energy_value`` is strictly below `energy_threshold` form the "low"
    group; all remaining rows form the "high" group. Missing feature values
    are dropped before testing.

    Parameters
    ----------
    energy_threshold : float
        Split point applied to the ``energy_value`` column.

    Returns
    -------
    None
        The two result tables are printed, low group first.
    """

    def _ks_table(subset_C, subset_F):
        """Build the KS table for one energy range; NaN where a sample is empty."""
        rows = {}
        for feature in features:
            sample_C = subset_C[feature].dropna()
            sample_F = subset_F[feature].dropna()
            if sample_C.empty or sample_F.empty:
                # A threshold outside the data range leaves one side empty;
                # report NaN instead of letting ks_2samp fail or warn.
                ks_stat, p_value = np.nan, np.nan
            else:
                ks_stat, p_value = ks_2samp(sample_C, sample_F)
            rows[feature] = {'KS Statistic': ks_stat, 'P-Value': p_value}
        return pd.DataFrame.from_dict(rows, orient='index')

    df_C_low = df_C[df_C['energy_value'] < energy_threshold]
    df_C_high = df_C[df_C['energy_value'] >= energy_threshold]
    df_F_low = df_F[df_F['energy_value'] < energy_threshold]
    df_F_high = df_F[df_F['energy_value'] >= energy_threshold]

    ks_df_low = _ks_table(df_C_low, df_F_low)
    ks_df_high = _ks_table(df_C_high, df_F_high)

    print(f"KS Test Results for < {energy_threshold} keV")
    print(ks_df_low)
    print("\n")
    # The high group includes the threshold itself, hence ">=".
    print(f"KS Test Results for >= {energy_threshold} keV")
    print(ks_df_high)

# Interactive widget: re-run the KS tables whenever the threshold slider moves
threshold_slider = FloatSlider(min=5, max=600, step=5, value=30)
interact(ks_test_by_energy, energy_threshold=threshold_slider);


Feature preprocessing

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

# Switch back to the "Data" location of the CSV
local = True

# Read the feature table from the location selected by `local`
df = pd.read_csv(
    "../ANN-code/Data/features_CF4.csv" if local else "../ANN-code/features_CF4.csv"
)

# Show the column index for reference
columns = df.columns
print(columns)

# Species subsets, selected by the tag embedded in 'file_name'
file_names = df['file_name']
df_C = df[file_names.str.contains('00_C_')]
df_F = df[file_names.str.contains('00_F_')]

# Feature columns: everything after the leading 'file_name'
features = columns[1:]

In [None]:
# Total number of missing cells in each species subset
print("Number of NaNs in each DataFrame:")
for tag, frame in (("df_C: ", df_C), ("df_F: ", df_F)):
    print(tag, frame.isna().sum().sum())


In [None]:
# Fill missing feature values with the column means of the same species subset.
# Work on explicit copies first: df_C / df_F were produced by boolean indexing
# of df, and assigning columns on such a slice triggers pandas'
# SettingWithCopyWarning.
df_C = df_C.copy()
df_F = df_F.copy()

df_C[features] = df_C[features].fillna(df_C[features].mean())
df_F[features] = df_F[features].fillna(df_F[features].mean())


print("Missing values filled with column means.")

In [None]:
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): the min/max are learned from the Carbon rows only and then
# reused for Fluorine, so Fluorine values can fall outside [0, 1] — confirm
# this is intended rather than fitting on both species together.
scaler = MinMaxScaler().fit(df_C[features])

# Scale only the feature columns; 'file_name' is carried over unchanged
df_C_scaled = df_C.copy()
df_F_scaled = df_F.copy()
df_C_scaled[features] = scaler.transform(df_C[features])
df_F_scaled[features] = scaler.transform(df_F[features])

# Stack both species (Carbon rows first) and write them out without the index
df_processed = pd.concat([df_C_scaled, df_F_scaled])
df_processed.to_csv("features_CF4_processed.csv", index=False)

print("MinMax Scaling applied to features .")

In [None]:
import matplotlib.pyplot as plt
# Folder that receives the saved figure
pre_path = "Features_CF4_1_viz/"

# One panel per scaled feature on a 3x3 grid, Carbon and Fluorine overlaid
plt.figure(figsize=(15, 10))
for panel, feature in enumerate(features, start=1):
    ax = plt.subplot(3, 3, panel)
    for frame, tag in ((df_C_scaled, 'C'), (df_F_scaled, 'F')):
        ax.hist(frame[feature], bins=50, alpha=0.5, label=tag, density=True)
    ax.set_xlabel(feature)
    ax.set_ylabel("Density")
    ax.legend()
    ax.set_title(f"Distribution of {feature} after MinMax Scaling")

plt.tight_layout()
plt.savefig(pre_path + "all_features_distributions_minmaxscaled.png")  # Save the plot
plt.show()