# Exercise 3: Test the SelectPercentile class 
 
## SIB - Intelligent Systems for Bioinformatics

Bárbara Freitas PG55693

In [9]:
import os
import sys

# --- 1. Path Configuration ---
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'src')))

from si.io.csv_file import read_csv
from si.feature_selection.select_percentile import SelectPercentile
from si.statistics.f_classification import f_classification

# --- 2. Load the Dataset ---
# The path is relative to the notebook's working directory (one level below the repo root).
filename = os.path.join('..', 'datasets', 'iris', 'iris.csv')

# features=True and label=True are essential for the iris.csv file
iris_dataset = read_csv(filename, sep=',', features=True, label=True)

print("--- Original Dataset ---")
print(f"Shape: {iris_dataset.shape()}")
print(f"Features: {iris_dataset.features}")
# The label in iris.csv is 'class', not 'species'. The code will read whatever is there.
print(f"Label: {iris_dataset.label}")

# --- 3. Initialize SelectPercentile ---
# Select 50% of features (2 out of 4)
percentile = 50

# Pass score_func explicitly so it is unambiguous that features are ranked
# with f_classification.
selector = SelectPercentile(percentile=percentile, score_func=f_classification)

# --- 4. Train (Fit) ---
# After fit(), the script reads the per-feature scores from selector.F and
# selector.p (see below).
selector.fit(iris_dataset)

print(f"\n--- Calculated Scores ({percentile}%) ---")
print(f"{'Feature':<15} | {'F-Score':<10} | {'p-Value':<10}")
print("-" * 45)

# Check and print scores if they exist
if selector.F is not None:
    if selector.p is not None:
        p_values = selector.p
    else:
        # p-values are unavailable: use NaN placeholders so the table still
        # prints. NaN (rather than 0) avoids suggesting a highly significant
        # result, and a plain list avoids depending on NumPy, which this
        # script never imports.
        p_values = [float("nan")] * len(selector.F)

    for feat, score, p_val in zip(iris_dataset.features, selector.F, p_values):
        print(f"{feat:<15} | {score:.4f}     | {p_val:.4f}")
else:
    # Make the missing-scores case visible instead of printing an empty table.
    print("No scores available: selector.F is None after fit().")

# --- 5. Transform ---
# Keep only the features retained by the selector.
iris_selected = selector.transform(iris_dataset)

print("\n--- Selection Result ---")
print(f"New Shape: {iris_selected.shape()}")
print(f"Selected Features: {iris_selected.features}")

# --- 6. Validation ---
# Compare as sets: only membership matters here, not the order of the features.
expected_features = ['petal_length', 'petal_width']

if set(iris_selected.features) == set(expected_features):
    print("\n The 'petal' features were selected.")
else:
    print(f"\n⚠️ Unexpected Result: {iris_selected.features}")

--- Original Dataset ---
Shape: (150, 4)
Features: Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], dtype='object')
Label: class

--- Calculated Scores (50%) ---
Feature         | F-Score    | p-Value   
---------------------------------------------
sepal_length    | 119.2645     | 0.0000
sepal_width     | 47.3645     | 0.0000
petal_length    | 1179.0343     | 0.0000
petal_width     | 959.3244     | 0.0000

--- Selection Result ---
New Shape: (150, 2)
Selected Features: ['petal_length', 'petal_width']

 The 'petal' features were selected.


The SelectPercentile class successfully performed feature selection on the Iris dataset. With the percentile set to 50%, the selector kept two of the four features, petal_length and petal_width, which exhibited the highest F-scores (1179.0343 and 959.3244, versus 119.2645 for sepal_length and 47.3645 for sepal_width). This indicates that the implementation correctly identifies and retains the most discriminative features for classification.