In [11]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report

In [12]:
# Load the labelled events from the .csv.gz file into a DataFrame.
df = pd.read_csv("combined_test_with_labels.csv.gz")

In [13]:
# Preview the loaded frame; the notebook renders a truncated view of its rows and columns.
df

Unnamed: 0,id,energy_label,psd_label_low_avse,psd_label_high_avse,psd_label_dcr,psd_label_lq,tp0,ED,HWP,LQ80,...,spectral_centroid_power,tail_charge_diff,late_over_early,tdrift10,tdrift50,tdrift99,tfr,peak_count,gbn,bpr
0,2395098_test_0,1167.174731,True,True,True,True,967,3407.0,2036.0,-1.300536e+06,...,108.796954,-0.621062,0.987423,34.0,55.0,76.0,0.148591,1,1.282821,0.062497
1,2395099_test_0,870.765543,False,True,True,False,960,3405.0,2019.0,-9.729822e+05,...,110.305348,-0.597317,0.987924,45.0,70.0,90.0,0.147334,1,1.203640,0.063886
2,2395100_test_0,582.980526,False,True,True,True,960,3412.0,2107.0,-6.390870e+05,...,108.213213,-0.590051,0.988065,40.0,76.0,100.0,0.139090,2,1.189320,0.057715
3,2395101_test_0,238.918902,True,True,True,True,930,3408.0,2053.0,-2.760460e+05,...,109.196803,-0.606667,0.987626,38.0,88.0,110.0,0.138597,2,1.226275,0.067184
4,2395102_test_0,214.491195,False,True,True,True,924,3406.0,1939.0,-7.611188e+04,...,109.163049,-0.662500,0.986214,0.0,83.0,135.0,0.149185,13,1.203153,0.055602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389995,2785093_test_5,238.472881,False,True,True,False,940,3406.0,2081.0,-2.667054e+05,...,106.821196,-0.610226,0.987577,23.0,69.0,111.0,0.144375,2,1.198436,0.062338
389996,2785094_test_5,452.840234,True,True,True,True,952,3406.0,2058.0,-5.194853e+05,...,108.447129,-0.581642,0.988177,36.0,59.0,83.0,0.148058,3,2.268535,0.064586
389997,2785095_test_5,344.740556,True,True,True,True,948,3406.0,2036.0,-3.684673e+05,...,108.862927,-0.632580,0.987142,0.0,80.0,125.0,0.140774,3,1.217066,0.062375
389998,2785096_test_5,163.807547,True,True,True,True,925,3406.0,2067.0,-1.903450e+05,...,110.009456,-0.538835,0.989035,1.0,94.0,116.0,0.140754,2,0.975055,0.066704


In [14]:
# Inspect the Python type of one label value (the row with index label 1).
type(df.loc[1, "psd_label_dcr"])

numpy.bool

In [15]:
# Report the frame's dimensions, then its column index, one per line.
print(df.shape, df.columns, sep="\n")


(390000, 28)
Index(['id', 'energy_label', 'psd_label_low_avse', 'psd_label_high_avse',
       'psd_label_dcr', 'psd_label_lq', 'tp0', 'ED', 'HWP', 'LQ80', 'PPR',
       'SCA', 'ND80', 'total_power', 'time_to_main_peak', 'current_skewness',
       'current_kurtosis', 'time_to_peak', 'spectral_centroid_power',
       'tail_charge_diff', 'late_over_early', 'tdrift10', 'tdrift50',
       'tdrift99', 'tfr', 'peak_count', 'gbn', 'bpr'],
      dtype='object')


Target Labels

In [7]:
# Names of the boolean PSD label columns in the dataset.
label_cols = ["psd_label_lq", "psd_label_high_avse", "psd_label_low_avse", "psd_label_dcr"]

# MODEL TRAINING

## Eunice Model Training - psd_label_lq

### Baseline Model

In [16]:
# Columns used as model inputs (order is preserved in X).
feature_cols = [
    "ED",
    "HWP",
    "LQ80",
    "PPR",
    "SCA",
    "current_skewness",
    "spectral_centroid_power",
    "tail_charge_diff",
    "current_kurtosis",
    "total_power",
    "time_to_main_peak",
    "time_to_peak",
    "late_over_early",
    "tp0",
]

# Column holding the label this section's model predicts.
y_col = "psd_label_lq"

# Feature matrix and target vector selected from the loaded frame.
X = df.loc[:, feature_cols]
y = df.loc[:, y_col]


In [17]:
# Number of missing values in each feature column, largest count first.
X.isna().sum().sort_values(ascending=False)


ED                         0
HWP                        0
LQ80                       0
PPR                        0
SCA                        0
current_skewness           0
spectral_centroid_power    0
tail_charge_diff           0
current_kurtosis           0
total_power                0
time_to_main_peak          0
time_to_peak               0
late_over_early            0
tp0                        0
dtype: int64

In [18]:
# Missing-value counts for HWP and PPR individually, then the number of rows missing either one.
hwp_ppr_na = df[["HWP", "PPR"]].isna()
hwp_ppr_na["HWP"].sum(), hwp_ppr_na["PPR"].sum(), hwp_ppr_na.any(axis=1).sum()


(np.int64(0), np.int64(0), np.int64(0))

In [19]:

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the label
# proportions the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [20]:
# Baseline classifier: median imputation -> standardisation -> logistic regression.
# The NaN checks above report no missing values in these features, so the imputer
# only acts as a safeguard. class_weight="balanced" compensates for the unequal
# label frequencies during fitting.
pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("lr", LogisticRegression(solver="lbfgs", class_weight="balanced", max_iter=2000)),
    ]
)



In [21]:
# Fit on the training partition, then predict labels for the held-out partition.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

# f1_score with its defaults scores the positive (True) class only;
# the classification report breaks precision/recall/F1 down per class.
test_f1 = f1_score(y_test, y_pred)
print("F1:", test_f1)
print(classification_report(y_test, y_pred))


F1: 0.8108300877481113
              precision    recall  f1-score   support

       False       0.55      0.64      0.59     22846
        True       0.84      0.78      0.81     55154

    accuracy                           0.74     78000
   macro avg       0.70      0.71      0.70     78000
weighted avg       0.76      0.74      0.75     78000



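- The support column shows that the classes are imbalanced: 55,154 True vs. 22,846 False events in the test set.
- Precision: when the model predicts True, it is correct 84% of the time; when it predicts False, it is correct 55% of the time.
- Recall: the model catches 78% of the True events and 64% of the False events.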



The baseline Logistic Regression model achieved an F1 score of 0.81 (True class, held-out test set) for predicting the psd_label_lq column.