In [None]:
#Feature Viz for Ar/CF4

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

local = True

# Load the CSV file into a DataFrame
if local:
    df = pd.read_csv("../ANN-code/Data/features_Ar_CF4.csv")
else:
    df = pd.read_csv("../ANN-code/features_Ar_CF4.csv")

# Get columns
columns = df.columns
print(columns)

# Split the data into three DataFrames based on 'file_name' containing 'C', 'F', or 'Ar'
df_C = df[df['file_name'].str.contains('00_C_')]
df_F = df[df['file_name'].str.contains('00_F_')]
df_Ar = df[df['file_name'].str.contains('00_Ar')]  # New addition for Argon recoils

In [None]:
# number of entreis in each DataFrame
print("Number of entries in each DataFrame:")
print("df_C: ", len(df_C))
print("df_F: ", len(df_F))
print("df_Ar: ", len(df_Ar))

In [None]:

# Use .describe() to get statistics for each species
print("Carbon (C) Statistics:\n", df_C.describe(), "\n")
print("Fluorine (F) Statistics:\n", df_F.describe(), "\n")
print("Argon (Ar) Statistics:\n", df_Ar.describe(), "\n")  # New addition for Argon recoils


with pd.ExcelWriter("statistics.xlsx") as writer:
    df_C.describe().to_excel(writer, sheet_name="Carbon")
    df_F.describe().to_excel(writer, sheet_name="Fluorine")
    df_Ar.describe().to_excel(writer, sheet_name="Argon")

print("Statistics saved as an Excel file (statistics.xlsx).")


In [None]:
# Feature correlations
import seaborn as sns
import matplotlib.pyplot as plt

if local:
    pre_path = "Features_Ar_CF4_1_viz/"


# Extract relevant features (assuming numerical columns start after 'file_name')
features = df.columns[1:]  # Exclude 'file_name'

# Compute correlation matrix
correlation_matrix = df[features].corr()

# Plot correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Feature Correlation Heatmap")
plt.savefig(pre_path+"FeatureCorrelationHeatmap.png") # save the plot
plt.show()

# Split the data into three DataFrames based on 'file_name' containing 'C', 'F', or 'Ar'
df_C = df[df['file_name'].str.contains('00_C_')]
df_F = df[df['file_name'].str.contains('00_F_')]
df_Ar = df[df['file_name'].str.contains('00_Ar')]  # New addition for Argon recoils

print("Number of entries in each DataFrame:")
print("df_C: ", len(df_C))
print("df_F: ", len(df_F))
print("df_Ar: ", len(df_Ar))


# Plot feature distributions using KDE plots for all three species
for feature in features:
    plt.figure(figsize=(8, 5))
    sns.kdeplot(df_C[feature], label="Carbon (C)", fill=True, alpha=0.5)
    sns.kdeplot(df_F[feature], label="Fluorine (F)", fill=True, alpha=0.5)
    sns.kdeplot(df_Ar[feature], label="Argon (Ar)", fill=True, alpha=0.5)
    plt.title(f"Feature Distribution: {feature}")
    plt.xlabel(feature)
    plt.ylabel("Density")
    plt.legend()
    plt.savefig(pre_path+str(feature)+"_distribution.png") # save the plot
    plt.show()

In [None]:
# Get columns
columns = df.columns

df_C = df[df['file_name'].str.contains('00_C_')]
df_F = df[df['file_name'].str.contains('00_F_')]
df_Ar = df[df['file_name'].str.contains('00_Ar_')]  # New addition for Argon recoils

# Feature distribution plots
features = columns[1:]

plt.figure(figsize=(15, 10))
for i, feature in enumerate(features):
    plt.subplot(3, 3, i + 1)
    plt.hist(df_C[feature], bins=50, alpha=0.5, label='C', density=True)
    plt.hist(df_F[feature], bins=50, alpha=0.5, label='F', density=True)
    plt.hist(df_Ar[feature], bins=50, alpha=0.5, label='Ar', density=True)  # Add Argon distribution
    plt.xlabel(feature)
    plt.ylabel("Density")
    plt.legend()
    plt.title(f"Distribution of {feature}")

plt.tight_layout()
plt.savefig(pre_path+"all_features_distributions.png") # save the plot
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re


# Load the CSV file into a DataFrame
if local:
    df = pd.read_csv("../ANN-code/Data/features_Ar_CF4.csv")
else:
    df = pd.read_csv("../ANN-code/features_Ar_CF4.csv")

# Extract energy from file_name using regex
def extract_energy(file_name):
    match = re.search(r"(\d+\.\d+)keV", file_name)
    if match:
        return float(match.group(1))
    return np.nan

df['energy_value'] = df['file_name'].apply(extract_energy)

# Remove rows with NaN energy
df = df.dropna(subset=['energy_value'])

df_C = df[df['file_name'].str.contains('00_C_')]
df_F = df[df['file_name'].str.contains('00_F_')]
df_Ar = df[df['file_name'].str.contains('00_Ar')]  # New addition for Argon recoils

# Further split by energy (<262 keV and ≥262 keV)
df_C_low = df_C[df_C['energy_value'] < 262]
df_C_high = df_C[df_C['energy_value'] >= 262]
df_F_low = df_F[df_F['energy_value'] < 262]
df_F_high = df_F[df_F['energy_value'] >= 262]
df_Ar_low = df_Ar[df_Ar['energy_value'] < 262]
df_Ar_high = df_Ar[df_Ar['energy_value'] >= 262]

# Features to plot
columns = df.columns
features = columns[1:]

# Plot histograms for low and high energy ranges
fig, axes = plt.subplots(len(features), 2, figsize=(12, 36))
for i, feature in enumerate(features):
    axes[i, 0].hist(df_C_low[feature], bins=50, alpha=0.5, label='C <262keV', density=True)
    axes[i, 0].hist(df_F_low[feature], bins=50, alpha=0.5, label='F <262keV', density=True)
    axes[i, 0].hist(df_Ar_low[feature], bins=50, alpha=0.5, label='Ar <262keV', density=True)
    axes[i, 0].set_title(f"{feature} Distribution (<262 keV)")
    axes[i, 0].set_xlabel(feature)
    axes[i, 0].set_ylabel("Density")
    axes[i, 0].legend()
    
    axes[i, 1].hist(df_C_high[feature], bins=50, alpha=0.5, label='C >262keV', density=True)
    axes[i, 1].hist(df_F_high[feature], bins=50, alpha=0.5, label='F >262keV', density=True)
    axes[i, 1].hist(df_Ar_high[feature], bins=50, alpha=0.5, label='Ar >262keV', density=True)
    axes[i, 1].set_title(f"{feature} Distribution (>262 keV)")
    axes[i, 1].set_xlabel(feature)
    axes[i, 1].set_ylabel("Density")
    axes[i, 1].legend()

plt.tight_layout()
plt.savefig(pre_path+"feature_distributions_energy_comparison_262keV.png")  # save the plot
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the dataset
if local:
    df = pd.read_csv("../ANN-code/Data/features_Ar_CF4.csv")
else:
    df = pd.read_csv("../ANN-code/features_Ar_CF4.csv")

# Split into Carbon, Fluorine, and Argon
df_C = df[df['file_name'].str.contains('00_C_')]
df_F = df[df['file_name'].str.contains('00_F_')]
df_Ar = df[df['file_name'].str.contains('00_Ar')]

# Extract feature columns (excluding 'file_name')
features = df.columns[1:]  # Adjust if necessary

# Apply Standard Scaling
scaler = StandardScaler()

df_C_scaled = df_C.copy()
df_F_scaled = df_F.copy()
df_Ar_scaled = df_Ar.copy()

df_C_scaled[features] = scaler.fit_transform(df_C[features])
df_F_scaled[features] = scaler.transform(df_F[features])
df_Ar_scaled[features] = scaler.transform(df_Ar[features])

# Plot histograms after Standard Scaling
plt.figure(figsize=(15, 10))
for i, feature in enumerate(features):
    plt.subplot(3, 3, i + 1)
    plt.hist(df_C_scaled[feature], bins=50, alpha=0.5, label='C', density=True)
    plt.hist(df_F_scaled[feature], bins=50, alpha=0.5, label='F', density=True)
    plt.hist(df_Ar_scaled[feature], bins=50, alpha=0.5, label='Ar', density=True)
    plt.xlabel(feature)
    plt.ylabel("Density")
    plt.legend()
    plt.title(f"Distribution of {feature} after Standard Scaling")

plt.tight_layout()
plt.savefig(pre_path+"all_features_distributions_scaled.png")  # save the plot
plt.show()



In [None]:
# find greatest energy for each species

# Carbon
max_energy_C = df_C['energy_value'].max()
print("Max energy for Carbon:", max_energy_C)

# Fluorine
max_energy_F = df_F['energy_value'].max()
print("Max energy for Fluorine:", max_energy_F)

# Argon
max_energy_Ar = df_Ar['energy_value'].max()
print("Max energy for Argon:", max_energy_Ar)

Feature Preprocessing

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

local = True

# Load the CSV file into a DataFrame
if local:
    df = pd.read_csv("../ANN-code/Data/features_Ar_CF4.csv")
else:
    df = pd.read_csv("../ANN-code/features_Ar_CF4.csv")

# Get columns
columns = df.columns
print(columns)

# Split the data into three DataFrames based on 'file_name' containing 'C', 'F', or 'Ar'
df_C = df[df['file_name'].str.contains('00_C_')]
df_F = df[df['file_name'].str.contains('00_F_')]
df_Ar = df[df['file_name'].str.contains('00_Ar')]  # New addition for Argon recoils

features = df.columns[1:]  


Index(['file_name', 'sum_intensity_camera', 'max_intensity_camera',
       'recoil_angle_camera', 'recoil_length_camera',
       'mean_energy_deposition_camera', 'std_energy_deposition_camera',
       'skew_energy_deposition_camera', 'kurt_energy_deposition_camera',
       'head_tail_mean_difference_camera'],
      dtype='object')


In [2]:
# print how many nans in each df
print("Number of NaNs in each DataFrame:")
print("df_C: ", df_C.isnull().sum().sum())
print("df_F: ", df_F.isnull().sum().sum())
print("df_Ar: ", df_Ar.isnull().sum().sum())


Number of NaNs in each DataFrame:
df_C:  0
df_F:  0
df_Ar:  3


In [3]:
# Fill missing values with the mean for each DataFrame
df_C[features] = df_C[features].fillna(df_C[features].mean())
df_F[features] = df_F[features].fillna(df_F[features].mean())
df_Ar[features] = df_Ar[features].fillna(df_Ar[features].mean())

print("Missing values filled with column means.")

Missing values filled with column means.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_C[features] = df_C[features].fillna(df_C[features].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_F[features] = df_F[features].fillna(df_F[features].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_Ar[features] = df_Ar[features].fillna(df_Ar[features].mean())


In [4]:
from sklearn.preprocessing import MinMaxScaler

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Apply MinMax Scaling only to the feature columns
df_C_scaled = df_C.copy()
df_F_scaled = df_F.copy()
df_Ar_scaled = df_Ar.copy()

df_C_scaled[features] = scaler.fit_transform(df_C[features])
df_F_scaled[features] = scaler.transform(df_F[features])
df_Ar_scaled[features] = scaler.transform(df_Ar[features])

df_processed = pd.concat([df_C_scaled, df_F_scaled, df_Ar_scaled])
df_processed.to_csv("features_Ar_CF4_processed.csv", index=False) # Save the processed DataFrame

print("MinMax Scaling applied to features .")

MinMax Scaling applied to features .


In [None]:
import matplotlib.pyplot as plt

# Plot histograms after MinMax Scaling
plt.figure(figsize=(15, 10))
for i, feature in enumerate(features):
    plt.subplot(3, 3, i + 1)
    plt.hist(df_C_scaled[feature], bins=50, alpha=0.5, label='C', density=True)
    plt.hist(df_F_scaled[feature], bins=50, alpha=0.5, label='F', density=True)
    plt.hist(df_Ar_scaled[feature], bins=50, alpha=0.5, label='Ar', density=True)
    plt.xlabel(feature)
    plt.ylabel("Density")
    plt.legend()
    plt.title(f"Distribution of {feature} after MinMax Scaling")

plt.tight_layout()
plt.show()
