In [31]:
from sklearn.base import clone

import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.feature_selection import VarianceThreshold, SelectKBest, mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import (
    roc_auc_score, average_precision_score, balanced_accuracy_score,
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, precision_recall_curve
)

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from imblearn import FunctionSampler
from collections import Counter

from joblib import Parallel, delayed


In [1]:
import pandas as pd

# Read the baseline feature table. The listed placeholder strings are parsed as
# missing values and "," is used as the decimal separator. Every column is then
# forced to numeric, with anything unparseable turned into NaN.
df = pd.read_csv(
    "excel_files/baseline_features.csv",
    na_values=["#NULL!", "NULL", "N/A"],
    decimal=",",
).apply(pd.to_numeric, errors="coerce")

# Keep only the rows that have at most 500 missing values.
df = df.loc[lambda frame: frame.isna().sum(axis=1).le(500)]

In [3]:
### Create binary clinical labels from the YSR percentile flags:
### 1 = clinical (the 93rd- or the 98th-percentile flag equals 1), 0 = normal otherwise.

# Column-layout constants (previously inline magic numbers).
N_LEADING_COLS = 60        # the combined labels are inserted right after this many columns
FEATURE_FILTER_START = 77  # from this column index on, only "baseline_" columns are kept

# Step 1: find every suffix XX for which both YSR_93_XX and YSR_98_XX exist.
prefix_93 = "YSR_93_"
existing_cols = set(df.columns)
suffixes = []
for col in df.columns:
    if not col.startswith(prefix_93):
        continue
    # Strip only the leading prefix (str.replace would remove every occurrence).
    suffix = col[len(prefix_93):]
    if f"YSR_98_{suffix}" in existing_cols:
        suffixes.append(suffix)

# Re-running this cell must not duplicate the combined columns: drop any copies
# left over from a previous run before rebuilding them.
stale_cols = [
    f"YSR_93_98_{suffix}"
    for suffix in suffixes
    if f"YSR_93_98_{suffix}" in existing_cols
]
if stale_cols:
    df = df.drop(columns=stale_cols)

# Step 2: build the combined columns.
# NOTE(review): `== 1` is False for NaN, so a row where both flags are missing is
# labelled 0 ("normal") rather than missing -- confirm this is intended.
combined_cols = {
    f"YSR_93_98_{suffix}": (
        (df[f"YSR_93_{suffix}"] == 1) | (df[f"YSR_98_{suffix}"] == 1)
    ).astype(int)
    for suffix in suffixes
}

# Step 3: insert the new columns directly after the first N_LEADING_COLS columns.
first_part = df.iloc[:, :N_LEADING_COLS].copy()
second_part = df.iloc[:, N_LEADING_COLS:].copy()

for col_name, col_data in combined_cols.items():
    # Positional assignment: col_data was derived from the same rows of df.
    first_part[col_name] = col_data.values

df = pd.concat([first_part, second_part], axis=1)

### Remove the duplicated acoustic features (origin unknown, per the original note):
### from column FEATURE_FILTER_START onward keep only columns prefixed "baseline_".
cols_to_check = df.columns[FEATURE_FILTER_START:]
cols_to_keep = [col for col in cols_to_check if col.startswith("baseline_")]
df = df[df.columns[:FEATURE_FILTER_START].tolist() + cols_to_keep]

In [4]:
# Collect the combined label columns by their shared 'YSR_93_98_' prefix.
combined_cols = [name for name in df.columns if name.startswith('YSR_93_98_')]

# Per-column summary: count of 1s, count of non-null entries, and their ratio,
# ordered from the most to the least frequent positive label.
summary = (
    df[combined_cols]
    .pipe(lambda labels: pd.DataFrame({
        'num_1s': labels.sum(),
        'total_non_null': labels.notnull().sum(),
    }))
    .assign(proportion_1s=lambda tbl: tbl['num_1s'] / tbl['total_non_null'])
    .sort_values(by='num_1s', ascending=False)
)

# Render the summary table.
import ace_tools_open as tools
tools.display_dataframe_to_user(name="YSR Combined Column Summary", dataframe=summary)


YSR Combined Column Summary


0
Loading ITables v2.4.0 from the internet...  (need help?)


In [9]:
# List the acoustic feature columns by their "baseline_" prefix instead of the fixed
# offset 88. That offset is only right once the median-label cell further down has
# inserted its 11 columns at index 77; on a top-to-bottom run it would silently skip
# the first 11 features.
# NOTE(review): assumes no column before the feature block starts with "baseline_" -- confirm.
df.columns[df.columns.str.startswith("baseline_")]

Index(['baseline_avgDF0', 'baseline_avgDDF0', 'baseline_avgJitter',
       'baseline_avgShimmer', 'baseline_avgapq', 'baseline_avgppq',
       'baseline_avglogE', 'baseline_stdDF0', 'baseline_stdDDF0',
       'baseline_stdJitter',
       ...
       'baseline_paa_deltachroma_12_min', 'baseline_paa_deltachroma_12_max',
       'baseline_paa_deltachroma_12_skew',
       'baseline_paa_deltachroma_12_kurtosis',
       'baseline_paa_deltachroma_std_mean', 'baseline_paa_deltachroma_std_std',
       'baseline_paa_deltachroma_std_min', 'baseline_paa_deltachroma_std_max',
       'baseline_paa_deltachroma_std_skew',
       'baseline_paa_deltachroma_std_kurtosis'],
      dtype='object', length=1027)

In [None]:
# Target columns for the clinical task: the combined YSR_93_98_* label columns.
targets_clinical = [
    'YSR_93_98_ad',
    'YSR_93_98_wd',
    'YSR_93_98_sc',
    'YSR_93_98_sp',
    'YSR_93_98_tp',
    'YSR_93_98_ap',
    'YSR_93_98_rb',
    'YSR_93_98_ab',
    'YSR_93_98_ip',
    'YSR_93_98_ep',
    'YSR_93_98_ts',
]

In [6]:
### Create median-split labels: for each raw score column, <target>_binary is 1 when
### the score is strictly above that column's median and 0 otherwise.
# NOTE(review): `>` is False for NaN, so rows with a missing score are labelled 0
# rather than missing -- confirm this is intended.

targets=['YSR_sad', 'YSR_swd', 'YSR_ssc', 'YSR_ssp', 'YSR_stp', 'YSR_sap',
       'YSR_srb', 'YSR_sab', 'YSR_sip', 'YSR_sep', 'YSR_sts']

# The labels are placed as one contiguous run of columns starting at this index.
insert_position = 77
for target in targets:
    label_name = f"{target}_binary"
    median_value = df[target].median()
    binary_label = (df[target] > median_value).astype(int)
    if label_name in df.columns:
        # Cell is being re-run: DataFrame.insert raises ValueError when the column
        # already exists, so remove the stale copy first and re-insert it below.
        del df[label_name]
    df.insert(loc=insert_position, column=label_name, value=binary_label)
    insert_position += 1  # Next label goes immediately after this one

In [None]:
########## up to here!!!!