In [2]:
import pandas as pd
from sklearn.model_selection import (
    train_test_split, 
    RepeatedStratifiedKFold, 
    cross_val_score, 
    GridSearchCV,
    StratifiedKFold)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report

In [3]:
# Load the combined training table (features plus label columns) from the
# gzip-compressed CSV; pandas infers the compression from the file extension.
train_csv_path = 'EN_combined_train_with_labels.csv.gz'
train_df = pd.read_csv(train_csv_path)

# Leave the frame as the last expression so the notebook renders a preview.
train_df

Unnamed: 0,id,energy_label,psd_label_low_avse,psd_label_high_avse,psd_label_dcr,psd_label_lq,tp0,ED,HWP,LQ80,...,current_kurtosis,total_power,time_to_main_peak,time_to_peak,late_over_early,tdrift99,tfr,peak_count,gbn,bpr
0,0_train_0,582.364295,False,True,True,True,957,3409.0,2299.0,-717094.898532,...,2.117825,1.709302e+09,85,85,0.987910,85.0,0.142357,3,1.198436,0.059642
1,1_train_0,250.159995,False,True,True,True,948,3404.0,2446.0,-331957.541919,...,2.058622,2.991376e+08,87,87,0.988301,87.0,0.151160,3,1.381123,0.061975
2,2_train_0,1212.323954,False,True,False,True,965,3411.0,2262.0,-425532.152706,...,-0.067003,6.244385e+08,95,95,0.987491,95.0,0.142606,3,1.361857,0.050813
3,3_train_0,240.878110,False,True,True,False,927,3408.0,2833.0,-306980.459766,...,2.443885,2.811583e+08,116,116,0.988450,116.0,0.133192,2,1.165654,0.062954
4,4_train_0,285.124189,False,True,True,False,958,3406.0,2397.0,-362746.925366,...,-0.173890,3.885442e+08,94,94,0.988541,94.0,0.145504,7,1.390918,0.055013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1039995,1039995_train_15,210.834626,True,True,True,True,949,3403.0,2533.0,-278607.352936,...,2.026649,2.133856e+08,90,90,0.985736,90.0,0.151747,6,1.008092,0.062825
1039996,1039996_train_15,380.170340,True,True,True,True,949,3404.0,2426.0,-477777.966558,...,1.943512,6.842378e+08,82,82,0.987814,79.0,0.150305,3,1.245492,0.062706
1039997,1039997_train_15,370.109563,False,True,True,True,947,3409.0,2417.0,-466589.983952,...,1.601361,6.641180e+08,102,102,0.987833,102.0,0.142950,2,1.149456,0.062153
1039998,1039998_train_15,98.258524,True,True,True,False,921,3400.0,3799.0,-143212.214717,...,3.825219,4.630840e+07,92,92,0.989948,92.0,0.153135,5,1.192639,0.065264


# MODEL TRAINING

## Model Training - psd_label_high_avse

### Baseline Model - Logistic Regression

In [4]:
# Columns that hold targets (the four PSD labels and the energy label).
# They must never be fed to the model as inputs.
label_cols = [
    "psd_label_lq",
    "psd_label_high_avse",
    "psd_label_low_avse",
    "psd_label_dcr",
    "energy_label",
]

# Every remaining column except the row identifier is treated as a feature.
# Iterating over train_df.columns keeps the original column order.
non_feature_cols = {"id", *label_cols}
feature_cols = [name for name in train_df.columns if name not in non_feature_cols]

In [5]:
feature_cols

['tp0',
 'ED',
 'HWP',
 'LQ80',
 'PPR',
 'SC',
 'current_skewness',
 'spectral_centroid_power',
 'tail_charge_diff',
 'current_kurtosis',
 'total_power',
 'time_to_main_peak',
 'time_to_peak',
 'late_over_early',
 'tdrift99',
 'tfr',
 'peak_count',
 'gbn',
 'bpr']

In [6]:
# Relative frequency of each class of the target; used to judge how
# imbalanced the classification problem is before choosing metrics/weights.
target_share = train_df["psd_label_high_avse"].value_counts(normalize=True)
target_share

psd_label_high_avse
True     0.991884
False    0.008116
Name: proportion, dtype: float64

In [7]:
# DATA PREP
# y: the psd_label_high_avse column (target of this section).
# X: only the columns listed in feature_cols, so labels and ids cannot leak in.
y = train_df["psd_label_high_avse"]
X = train_df.loc[:, feature_cols]

In [None]:
# SPLIT
# Hold out 20% of the rows before any fitting or tuning so the test set stays
# untouched. Stratifying on y keeps the class proportions equal in both parts,
# and the fixed random_state makes the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# DEFINE THE PIPELINE (THE MODEL)
# The steps run in order: fill missing values with the per-column median,
# standardise the features, then fit a logistic regression whose class
# weights are balanced against the class frequencies.
median_imputer = SimpleImputer(strategy="median")
standardizer = StandardScaler()
logreg = LogisticRegression(
    solver="lbfgs",
    class_weight="balanced",
    max_iter=5000,
)

# Step names matter: "lr" is the prefix used when addressing the classifier's
# hyperparameters (e.g. "lr__C").
pipe = Pipeline(steps=[
    ("imputer", median_imputer),
    ("scaler", standardizer),
    ("lr", logreg),
])

In [None]:
# Baseline performance
# Fit the untuned pipeline on the training split only.
pipe.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = pipe.predict(X_test)

# Report the F1 score, then the per-class precision/recall/F1 breakdown.
baseline_f1 = f1_score(y_test, y_pred)
print("F1:", baseline_f1)
print(classification_report(y_test, y_pred))

In [None]:
# HYPERPARAMETER TUNING (Find the Best Version)
# Tune the 'lr' (Logistic Regression) step of the pipe. The whole pipeline is
# passed to the search, so the imputer and scaler are re-fit on each CV fold.

# NOTE(review): psd_label_high_avse is ~99% True (see the value_counts cell), so
# the dict weights below up-weight the *majority* class, and scoring='f1' uses
# the default positive label, which is presumably True as well. Confirm this is
# intended; otherwise weight class 0 instead and score the minority class.
class_weight_options = ['balanced', {0: 1, 1: 10}, {0: 1, 1: 100}]
c_options = [0.1, 1, 10]   # Tune regularization strength

# The grid is a list of sub-grids because penalty and solver must be paired:
# 'lbfgs' only supports l2 (or no) penalty, so searching 'l1' with it makes
# every l1 candidate fail to fit and get a NaN score. l1 is therefore run with
# 'saga', which supports it; saga is stochastic, hence the fixed random_state.
param_grid = [
    {
        'lr__penalty': ['l2'],          # "shrinkage"
        'lr__solver': ['lbfgs'],
        'lr__class_weight': class_weight_options,
        'lr__C': c_options,
    },
    {
        'lr__penalty': ['l1'],          # "feature selection"
        'lr__solver': ['saga'],
        'lr__random_state': [42],
        'lr__class_weight': class_weight_options,
        'lr__C': c_options,
    },
]

# Use Standard Stratified K-Fold for speed during the search
# NOTE(review): the saved output of this cell shows a PicklingError raised while
# sending tasks to the parallel workers. The cause is not visible here; if it
# recurs, try n_jobs=1 to rule out the multiprocessing backend.
grid = GridSearchCV(
    estimator=pipe,        # pass the pipe here
    param_grid=param_grid,
    cv=StratifiedKFold(n_splits=5),
    scoring='f1',
    n_jobs=-1
)

print("Tuning hyperparameters...")
grid.fit(X_train, y_train) # runs the internal CV loops

# the best model found (refit on all of X_train with the winning parameters)
best_model = grid.best_estimator_
print(f"Best Params found: {grid.best_params_}")

Tuning hyperparameters...




PicklingError: Could not pickle the task to send it to the workers.

In [None]:
# RESULTS VISUALIZATION
# Convert the results dictionary to a DataFrame (one row per parameter combination)
results_df = pd.DataFrame(grid.cv_results_)

# Summary columns we always want to see
summary_cols = [
    "params",
    "mean_test_score",
    "std_test_score",
    "rank_test_score",
]

# Per-fold score columns are discovered rather than hard-coded, so this cell
# keeps working if the number of CV splits in the search is changed.
split_cols = results_df.filter(regex=r"^split\d+_test_score$").columns.tolist()

columns_to_keep = summary_cols + split_cols

# Create a clean view sorted by best performance (rank 1 first)
clean_results = results_df[columns_to_keep].sort_values(by="rank_test_score")

# Show the full param dict instead of a truncated string
pd.set_option('display.max_colwidth', None)

# Display the top 10 rows; the full table remains available in clean_results
clean_results.head(10)

In [None]:
# VERIFICATION (The "Double Check" Cross-Validation)
# Re-score the winning configuration with repeated stratified CV
# (5 folds x 3 repeats = 15 fits) to see how stable its F1 is across splits.
print("\nVerifying stability with Repeated Stratified CV...")
robust_cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=5, random_state=42)

# Note: 'best_model' (the tuned pipeline) is evaluated here, not the original
# 'pipe'. Only the training split is used, so the test set stays untouched.
robust_scores = cross_val_score(
    estimator=best_model,
    X=X_train,
    y=y_train,
    scoring='f1',
    cv=robust_cv,
)

print(f"Mean F1: {robust_scores.mean():.4f}")
print(f"Std Dev: {robust_scores.std():.4f}")

In [None]:
# FINAL TEST
# Single evaluation of the tuned model on the held-out test split.
print("\nFinal Test Set Performance:")
y_pred = best_model.predict(X_test)
final_report = classification_report(y_test, y_pred)
print(final_report)