In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd


# Configure problem number here
PROBLEM_NUM = 36

X_path = f"./data_31_40/problem_{PROBLEM_NUM}/dataset_{PROBLEM_NUM}.csv"
y_path = f"./data_31_40/problem_{PROBLEM_NUM}/target_{PROBLEM_NUM}.csv"
Xeval_path = f"./data_31_40/problem_{PROBLEM_NUM}/EVAL_{PROBLEM_NUM}.csv"

X = pd.read_csv(X_path)
y = pd.read_csv(y_path)
X_eval = pd.read_csv(Xeval_path)

y1 = y["target01"]

print(f"Problem {PROBLEM_NUM}")
print(f"X: {X.shape}, y1: {y1.shape}, X_eval: {X_eval.shape}")
assert list(X.columns) == list(X_eval.columns), "Train/EVAL column mismatch!"

# Create train/validation split to detect overfitting
X_train, X_val, y_train, y_val = train_test_split(
    X, y1, test_size=0.2, random_state=42, shuffle=True
)

print(f"\nTrain/Val Split:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_val: {X_val.shape}, y_val: {y_val.shape}")


Problem 36
X: (10000, 273), y1: (10000,), X_eval: (10000, 273)

Train/Val Split:
X_train: (8000, 273), y_train: (8000,)
X_val: (2000, 273), y_val: (2000,)


In [9]:
# Feature selection: Remove noise features
from sklearn.feature_selection import VarianceThreshold, f_regression, mutual_info_regression
import numpy as np

print("=== Feature Selection to Remove Noise ===\n")

# Method 1: Remove low-variance features (near-constant)
selector = VarianceThreshold(threshold=0.005)  # Remove features with variance < 0.005
X_train_var = selector.fit_transform(X_train)
low_var_mask = selector.get_support()
print(f"Low-variance filter: {X_train.shape[1]} → {X_train_var.shape[1]} features")
print(f"Removed {(~low_var_mask).sum()} low-variance features\n")

# Method 2: Statistical test - F-test (linear relationship with target)
# Higher F-score = more likely to be signal, not noise
f_scores, p_values = f_regression(X_train, y_train)
# Keep features with p-value < 0.05 (statistically significant)
f_test_mask = p_values < 0.05
print(f"F-test filter (p<0.05): {X_train.shape[1]} → {f_test_mask.sum()} features")
print(f"Removed {(~f_test_mask).sum()} statistically insignificant features\n")

# Method 3: Mutual Information (captures non-linear relationships too)
mi_scores = mutual_info_regression(X_train, y_train, random_state=42)
# Keep features with MI > 0 (some information about target)
mi_threshold = np.percentile(mi_scores, 25)  # Remove bottom 25%
mi_mask = mi_scores > mi_threshold
print(f"Mutual Info filter (>25th percentile): kept {mi_mask.sum()} features")
print(f"Removed {(~mi_mask).sum()} low-information features\n")

# FIXED: Use AND logic - keep features that pass BOTH F-test AND MI test
combined_mask = f_test_mask & mi_mask
good_features = X_train.columns[combined_mask]

print(f"=== FINAL (F-test AND MI): {X_train.shape[1]} → {len(good_features)} features ===")
print(f"Removed {X_train.shape[1] - len(good_features)} likely noise features\n")

# Apply filtering - NOTE: X_eval is NOT filtered (keep original)
X_train_filtered = X_train[good_features]
X_val_filtered = X_val[good_features]

print(f"Filtered shapes:")
print(f"X_train: {X_train_filtered.shape}")
print(f"X_val: {X_val_filtered.shape}")
print(f"X_eval: {X_eval.shape} (unchanged)")


=== Feature Selection to Remove Noise ===

Low-variance filter: 273 → 273 features
Removed 0 low-variance features

F-test filter (p<0.05): 273 → 35 features
Removed 238 statistically insignificant features

Mutual Info filter (>25th percentile): kept 141 features
Removed 132 low-information features

=== FINAL (F-test AND MI): 273 → 24 features ===
Removed 249 likely noise features

Filtered shapes:
X_train: (8000, 24)
X_val: (2000, 24)
X_eval: (10000, 273) (unchanged)


In [None]:
X_train_filtered

In [8]:
# feature engineering and reduction
# Step 1: Scale features first (PCA is sensitive to scale!)
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_filtered)  # Use filtered features
X_val_scaled = scaler.transform(X_val_filtered)
# X_eval uses original unfiltered data
X_eval_scaled = scaler.transform(X_eval)

# Step 2: Apply PCA on scaled data
pca = PCA(n_components=0.95)  # retain 95% variance
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)
X_eval_pca = pca.transform(X_eval_scaled)

print(f"\nAfter Scaling + PCA Transformation:")
print(f"Filtered features: {X_train_filtered.shape[1]}")
print(f"PCA components: {X_train_pca.shape[1]} (retaining {pca.explained_variance_ratio_.sum():.1%} variance)")
print(f"X_train_pca: {X_train_pca.shape}")
print(f"X_val_pca: {X_val_pca.shape}")
print(f"X_eval_pca: {X_eval_pca.shape}")


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- feat_0
- feat_1
- feat_10
- feat_100
- feat_101
- ...
