# Feature Selection Analysis for Battery Capacity Prediction

This notebook performs comprehensive feature selection analysis and outputs a new dataset with only the best selected features for improved model performance.

**Output Format**: Cell ID | Average Capacity | Selected Features...


## 1. Setup and Data Loading


In [12]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Feature selection imports
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.feature_selection import SelectPercentile
from scipy.stats import pearsonr
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, train_test_split

# Import custom data loader
from src.data_loader import DataLoader

# Set up plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("✅ All imports successful!")


✅ All imports successful!


In [13]:
# Initialize paths and load data
current_dir = Path.cwd()
data_path = current_dir / "ecm_eis_merge_299.xlsx"
results_dir = current_dir / "results"
results_dir.mkdir(exist_ok=True)

print(f"📁 Data path: {data_path}")
print(f"📁 Results directory: {results_dir}")
print(f"✅ Data file exists: {data_path.exists()}")

# Load data
print("\n📊 Loading data...")
data_loader = DataLoader(data_path)
df = data_loader.load_data()
X, y = data_loader.split_features_target(df)

print(f"📊 Dataset shape: {df.shape}")
print(f"📊 Features shape: {X.shape}")
print(f"📊 Target shape: {y.shape}")
print(f"📊 Original column order: {list(df.columns[:3])}... (first 3)")

# Split for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(f"\n📊 Train/Test split: {X_train.shape[0]}/{X_test.shape[0]} samples")


📁 Data path: /Users/amirbabamahmoudi/Documents/Battery-Capacity/ecm_eis_merge_299.xlsx
📁 Results directory: /Users/amirbabamahmoudi/Documents/Battery-Capacity/results
✅ Data file exists: True

📊 Loading data...
Successfully loaded data with shape: (299, 68)
Index(['Cell ID', 'Average Capacity'], dtype='object')
📊 Dataset shape: (299, 68)
📊 Features shape: (299, 66)
📊 Target shape: (299,)
📊 Original column order: ['Cell ID', 'Average Capacity', '1.3197 Hz RE']... (first 3)

📊 Train/Test split: 209/90 samples


## 2. Feature Selection Analysis


In [14]:
# Feature selection functions
def pearson_feature_selection(X, y, n_features=20):
    """Select features based on Pearson correlation with target"""
    correlations = []
    for column in X.columns:
        corr, p_val = pearsonr(X[column], y)
        correlations.append((column, abs(corr), p_val))
    
    correlations.sort(key=lambda x: x[1], reverse=True)
    selected_features = [item[0] for item in correlations[:n_features]]
    feature_scores = pd.DataFrame(correlations[:n_features], 
                                 columns=['feature', 'abs_correlation', 'p_value'])
    return X[selected_features], feature_scores, selected_features

def mutual_info_feature_selection(X, y, n_features=20):
    """Select features based on mutual information with target"""
    selector = SelectKBest(score_func=mutual_info_regression, k=n_features)
    X_selected = selector.fit_transform(X, y)
    selected_indices = selector.get_support(indices=True)
    selected_features = X.columns[selected_indices].tolist()
    
    feature_scores = pd.DataFrame({
        'feature': selected_features,
        'mutual_info_score': selector.scores_[selected_indices]
    }).sort_values('mutual_info_score', ascending=False)
    
    return pd.DataFrame(X_selected, columns=selected_features, index=X.index), feature_scores, selected_features

def f_regression_feature_selection(X, y, n_features=20):
    """Select features based on F-regression (ANOVA F-test)"""
    selector = SelectKBest(score_func=f_regression, k=n_features)
    X_selected = selector.fit_transform(X, y)
    selected_indices = selector.get_support(indices=True)
    selected_features = X.columns[selected_indices].tolist()
    
    feature_scores = pd.DataFrame({
        'feature': selected_features,
        'f_score': selector.scores_[selected_indices],
        'p_value': selector.pvalues_[selected_indices]
    }).sort_values('f_score', ascending=False)
    
    return pd.DataFrame(X_selected, columns=selected_features, index=X.index), feature_scores, selected_features

# Run feature selection analysis
print("🔍 FEATURE SELECTION ANALYSIS")
print("="*50)

feature_counts = [10, 15, 20, 25, 30]
methods = {
    'Pearson Correlation': pearson_feature_selection,
    'Mutual Information': mutual_info_feature_selection,
    'F-Regression': f_regression_feature_selection
}

selection_results = []

# Test each method
for method_name, method_func in methods.items():
    print(f"\n📊 Testing {method_name}:")
    for n_features in feature_counts:
        if n_features > X_train.shape[1]:
            continue
        try:
            X_selected, feature_scores, selected_features = method_func(X_train, y_train, n_features)
            ridge_model = Ridge(alpha=1.0, random_state=42)
            cv_scores = cross_val_score(ridge_model, X_selected, y_train, cv=5, scoring='neg_mean_squared_error')
            cv_rmse = np.sqrt(-cv_scores.mean())
            cv_std = np.sqrt(-cv_scores).std()
            
            selection_results.append({
                'Method': method_name,
                'N_Features': n_features,
                'CV_RMSE': cv_rmse,
                'CV_STD': cv_std,
                'Selected_Features': selected_features,
                'Feature_Scores': feature_scores
            })
            print(f"   {n_features:2d} features: RMSE = {cv_rmse:.4f} (±{cv_std:.4f})")
        except Exception as e:
            print(f"   ❌ Error with {n_features} features: {e}")

# Test baseline
ridge_baseline = Ridge(alpha=1.0, random_state=42)
baseline_scores = cross_val_score(ridge_baseline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
baseline_rmse = np.sqrt(-baseline_scores.mean())
baseline_std = np.sqrt(-baseline_scores).std()

selection_results.append({
    'Method': 'No Selection (Baseline)',
    'N_Features': X_train.shape[1],
    'CV_RMSE': baseline_rmse,
    'CV_STD': baseline_std,
    'Selected_Features': X_train.columns.tolist(),
    'Feature_Scores': None
})

print(f"\n📊 Baseline (all {X_train.shape[1]} features): RMSE = {baseline_rmse:.4f}")
print("✅ Feature selection analysis completed!")


🔍 FEATURE SELECTION ANALYSIS

📊 Testing Pearson Correlation:
   10 features: RMSE = 678.4975 (±151.1540)
   15 features: RMSE = 678.1549 (±151.0860)
   20 features: RMSE = 677.7957 (±151.0112)
   25 features: RMSE = 677.6141 (±150.9718)
   30 features: RMSE = 677.3536 (±150.9105)

📊 Testing Mutual Information:
   10 features: RMSE = 678.7215 (±151.2005)
   15 features: RMSE = 678.3490 (±151.1272)
   20 features: RMSE = 677.9942 (±151.0544)
   25 features: RMSE = 677.6146 (±150.9717)
   30 features: RMSE = 676.9688 (±150.8024)

📊 Testing F-Regression:
   10 features: RMSE = 678.4975 (±151.1540)
   15 features: RMSE = 678.1549 (±151.0860)
   20 features: RMSE = 677.7957 (±151.0112)
   25 features: RMSE = 677.6141 (±150.9718)
   30 features: RMSE = 677.3536 (±150.9105)

📊 Baseline (all 66 features): RMSE = 995.5476
✅ Feature selection analysis completed!


## 3. Create Dataset with Best Selected Features


In [15]:
# Find best method and create new dataset
results_df = pd.DataFrame(selection_results)

print("\n📊 FEATURE SELECTION RESULTS:")
print("="*60)
display(results_df[['Method', 'N_Features', 'CV_RMSE', 'CV_STD']].round(6))

# Find best result
best_result = results_df.loc[results_df['CV_RMSE'].idxmin()]
improvement = ((baseline_rmse - best_result['CV_RMSE']) / baseline_rmse) * 100

print(f"\n🏆 BEST FEATURE SELECTION:")
print(f"   Method: {best_result['Method']}")
print(f"   Features: {best_result['N_Features']}")
print(f"   CV RMSE: {best_result['CV_RMSE']:.6f}")
print(f"   Improvement: {improvement:.2f}% over baseline")

best_selected_features = best_result['Selected_Features']
best_method = best_result['Method']
best_n_features = best_result['N_Features']

print(f"\n🔝 Selected features:")
for i, feature in enumerate(best_selected_features[:10], 1):
    print(f"   {i:2d}. {feature}")
if len(best_selected_features) > 10:
    print(f"   ... and {len(best_selected_features) - 10} more")

print(f"\n📊 CREATING NEW DATASET WITH CORRECT COLUMN ORDER")
print("="*60)

# 🔧 CORRECTED: Cell ID, Average Capacity, then selected features
new_dataset_columns = ['Cell ID', 'Average Capacity'] + best_selected_features

print(f"✅ Correct column order:")
print(f"   1. Cell ID (identifier)")
print(f"   2. Average Capacity (target - in 2nd position as requested)")
print(f"   3-{len(best_selected_features)+2}. Selected features ({len(best_selected_features)} features)")

# Verify columns exist
missing_columns = [col for col in new_dataset_columns if col not in df.columns]
if missing_columns:
    print(f"❌ Missing columns: {missing_columns}")
else:
    print("✅ All columns found in original dataset")

# Create new dataset with CORRECT column order
new_dataset = df[new_dataset_columns].copy()

print(f"\n📊 New dataset created:")
print(f"   Shape: {new_dataset.shape}")
print(f"   Reduction: {len(df.columns)-2} → {len(best_selected_features)} features")
print(f"   Reduction: {(1 - len(best_selected_features)/(len(df.columns)-2))*100:.1f}%")

print(f"\n📋 Verifying column order (first 3 rows):")
display(new_dataset.head(3))

print(f"\n✅ Column order verification:")
print(f"   Column 1: '{new_dataset.columns[0]}' (should be 'Cell ID') ✓")
print(f"   Column 2: '{new_dataset.columns[1]}' (should be 'Average Capacity') ✓")
print(f"   Remaining: {len(new_dataset.columns)-2} selected features ✓")



📊 FEATURE SELECTION RESULTS:


Unnamed: 0,Method,N_Features,CV_RMSE,CV_STD
0,Pearson Correlation,10,678.497451,151.154009
1,Pearson Correlation,15,678.154923,151.086001
2,Pearson Correlation,20,677.795721,151.011199
3,Pearson Correlation,25,677.614058,150.971785
4,Pearson Correlation,30,677.353613,150.910508
5,Mutual Information,10,678.721468,151.200459
6,Mutual Information,15,678.348968,151.127153
7,Mutual Information,20,677.99418,151.054438
8,Mutual Information,25,677.614596,150.971745
9,Mutual Information,30,676.96876,150.802415



🏆 BEST FEATURE SELECTION:
   Method: Mutual Information
   Features: 30
   CV RMSE: 676.968760
   Improvement: 32.00% over baseline

🔝 Selected features:
    1. 1.6496 Hz RE
    2. 1.9795 Hz RE
    3. 2.6394 Hz RE
    4. 4.6189 Hz RE
    5. 5.9386 Hz RE
    6. 21.3623 Hz RE
    7. 27.4658 Hz RE
    8. 36.6211 Hz RE
    9. 45.7764 Hz RE
   10. 64.0869 Hz RE
   ... and 20 more

📊 CREATING NEW DATASET WITH CORRECT COLUMN ORDER
✅ Correct column order:
   1. Cell ID (identifier)
   2. Average Capacity (target - in 2nd position as requested)
   3-32. Selected features (30 features)
✅ All columns found in original dataset

📊 New dataset created:
   Shape: (299, 32)
   Reduction: 66 → 30 features
   Reduction: 54.5%

📋 Verifying column order (first 3 rows):


Unnamed: 0,Cell ID,Average Capacity,1.6496 Hz RE,1.9795 Hz RE,2.6394 Hz RE,4.6189 Hz RE,5.9386 Hz RE,21.3623 Hz RE,27.4658 Hz RE,36.6211 Hz RE,...,866.6992 Hz RE,866.6992 Hz IM,1159.6680 Hz RE,1550.2930 Hz RE,2075.1953 Hz RE,2783.2031 Hz RE,3735.3516 Hz RE,3735.3516 Hz IM,5004.8828 Hz RE,R0
0,202207190092,7556.106,0.002639,0.002586,0.002612,0.002564,0.002589,0.002296,0.002192,0.00209,...,0.001001,0.000113,0.00088,0.000877,0.000835,0.000863,0.000836,-0.00055,0.000813,0.000866
1,202207190097,7812.56,0.002898,0.002884,0.002891,0.00286,0.002831,0.002544,0.002466,0.002365,...,0.001159,8.2e-05,0.001123,0.00111,0.001129,0.001096,0.001089,-0.0005,0.001108,0.001134
2,202207194006,7854.593,0.00251,0.002509,0.00253,0.002485,0.002477,0.002243,0.002129,0.002045,...,0.000996,9.9e-05,0.000939,0.000903,0.000898,0.000903,0.000871,-0.00054,0.000896,0.000912



✅ Column order verification:
   Column 1: 'Cell ID' (should be 'Cell ID') ✓
   Column 2: 'Average Capacity' (should be 'Average Capacity') ✓
   Remaining: 30 selected features ✓


## 4. Save Results and New Dataset


In [16]:
# Save all results and the new dataset
print("💾 SAVING RESULTS AND NEW DATASET")
print("="*50)

# Save feature selection results
feature_results_file = results_dir / 'feature_selection_results.csv'
results_df[['Method', 'N_Features', 'CV_RMSE', 'CV_STD']].to_csv(feature_results_file, index=False)

# Save selected features list
best_features_file = results_dir / 'best_selected_features.txt'
with open(best_features_file, 'w') as f:
    f.write(f"# Best Feature Selection Results\n")
    f.write(f"# Method: {best_method}\n")
    f.write(f"# Number of Features: {best_n_features}\n")
    f.write(f"# CV RMSE: {best_result['CV_RMSE']:.6f}\n")
    f.write(f"# Improvement: {improvement:.2f}% over baseline\n")
    f.write(f"# Dataset Column Order: Cell ID, Average Capacity, Selected Features\n\n")
    f.write(f"# Selected Features:\n")
    for i, feature in enumerate(best_selected_features, 1):
        f.write(f"{i:2d}. {feature}\n")

# Save the new dataset with CORRECT column order
output_file = current_dir / f"battery_dataset_selected_features_{best_n_features}.xlsx"
new_dataset.to_excel(output_file, index=False)

print(f"✅ Files saved:")
print(f"   📊 Results: {feature_results_file}")
print(f"   📋 Features: {best_features_file}")
print(f"   📁 Dataset: {output_file}")

print(f"\n🎉 FEATURE SELECTION COMPLETED!")
print("="*50)
print(f"🏆 Best method: {best_method} ({best_n_features} features)")
print(f"📊 Performance: {best_result['CV_RMSE']:.6f} RMSE ({improvement:.2f}% improvement)")
print(f"📁 Output file: {output_file.name}")

print(f"\n✅ READY TO USE:")
print(f"   • Column order: Cell ID → Average Capacity → Selected Features")
print(f"   • File: '{output_file.name}'")
print(f"   • Compatible with your existing battery_analysis.ipynb")

print(f"\n🚀 TO USE IN BATTERY_ANALYSIS.IPYNB:")
print(f"   Change this line:")
print(f"   data_path = current_dir / 'ecm_eis_merge_299.xlsx'")
print(f"   To this:")
print(f"   data_path = current_dir / '{output_file.name}'")

print(f"\n🔧 The capacity column is now correctly in the 2nd position!")
print(f"   Original order: Cell ID, Average Capacity, Feature1, Feature2, ...")
print(f"   New order:      Cell ID, Average Capacity, {best_selected_features[0][:20]}..., ...")


💾 SAVING RESULTS AND NEW DATASET
✅ Files saved:
   📊 Results: /Users/amirbabamahmoudi/Documents/Battery-Capacity/results/feature_selection_results.csv
   📋 Features: /Users/amirbabamahmoudi/Documents/Battery-Capacity/results/best_selected_features.txt
   📁 Dataset: /Users/amirbabamahmoudi/Documents/Battery-Capacity/battery_dataset_selected_features_30.xlsx

🎉 FEATURE SELECTION COMPLETED!
🏆 Best method: Mutual Information (30 features)
📊 Performance: 676.968760 RMSE (32.00% improvement)
📁 Output file: battery_dataset_selected_features_30.xlsx

✅ READY TO USE:
   • Column order: Cell ID → Average Capacity → Selected Features
   • File: 'battery_dataset_selected_features_30.xlsx'
   • Compatible with your existing battery_analysis.ipynb

🚀 TO USE IN BATTERY_ANALYSIS.IPYNB:
   Change this line:
   data_path = current_dir / 'ecm_eis_merge_299.xlsx'
   To this:
   data_path = current_dir / 'battery_dataset_selected_features_30.xlsx'

🔧 The capacity column is now correctly in the 2nd position