In [1]:
import json

with open("LightGBM_PUB.ipynb", "r", encoding="utf-8") as f:
    try:
        data = json.load(f)
        print("Notebook file is valid JSON!")
    except json.JSONDecodeError as e:
        print(f"Notebook format error: {e}")


Notebook file is valid JSON!


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, NMF
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import LabelEncoder
import seaborn as sns

In [2]:
# Data Loader
csf_data = pd.read_csv("merged_csf_cross_sectional_call_rate_pau_data_0205.xls", low_memory=False)
plasma_data = pd.read_csv("merged_plasma_cross_sectional_call_rate_data_0130.xls", low_memory=False)

In [3]:
plasma_data['Status_at_draw_mapping'].value_counts()

Status_at_draw_mapping
CO     1282
AD      865
PD      687
DLB     122
FTD      44
Name: count, dtype: int64

In [4]:
# plasma_data

In [5]:
csf_significant_rows = pd.read_csv("csf_significant_rows_0205.csv")
plasma_significant_rows = pd.read_csv("plasma_significant_rows_0203.csv")

### Data Loader

In [35]:
data = plasma_data.copy()
print(data.shape)

(4750, 6648)


In [36]:
x_columns = [col for col in plasma_data.columns if col.startswith('X')]
len(x_columns)

6607

In [39]:
protein_list = list(plasma_significant_rows['Analytes'])
print(len(protein_list))
# protein_list

3607


In [40]:
data['Status_at_draw_mapping'].value_counts()

Status_at_draw_mapping
CO     1282
AD      865
PD      687
DLB     122
FTD      44
Name: count, dtype: int64

In [41]:
data['Project_y'].value_counts()

Project_y
MAP         3110
PD          1012
Stanford     628
Name: count, dtype: int64

In [42]:
statuses_to_keep = ['CO', 'AD', 'PD', 'FTD', 'DLB']

filtered_data = data[data['Status_at_draw_mapping'].isin(statuses_to_keep)]
 
filtered_data.shape

(3000, 6648)

In [43]:
filtered_data['Project_y'].value_counts()

Project_y
MAP         2184
PD           689
Stanford     127
Name: count, dtype: int64

In [44]:
filtered_data['Status_at_draw_mapping'].value_counts()

Status_at_draw_mapping
CO     1282
AD      865
PD      687
DLB     122
FTD      44
Name: count, dtype: int64

In [45]:
filtered_data = filtered_data.rename(columns={'final_decision': 'PET_imaging'})

columns_to_keep = ['UniquePhenoID', 'DrawDate', 'Project_x', 'Project_y', 'Age_at_draw', 'Sex', 'AT_class', 'PET_imaging', 'T1_pTau217',
       'T2_pTau181', 'Status_at_draw_mapping', 'Status_at_draw', 'Final_Status']
print(columns_to_keep)

selected_protein_columns = [col for col in data.columns if col in protein_list]

selected_columns = list(columns_to_keep) + selected_protein_columns

selected_data = filtered_data[selected_columns]

selected_data.shape

['UniquePhenoID', 'DrawDate', 'Project_x', 'Project_y', 'Age_at_draw', 'Sex', 'AT_class', 'PET_imaging', 'T1_pTau217', 'T2_pTau181', 'Status_at_draw_mapping', 'Status_at_draw', 'Final_Status']


(3000, 3620)

In [46]:
# Check which columns contain non-float values
non_float_columns = selected_data.iloc[:,13:].applymap(lambda x: isinstance(x, (float, int))).all() == False
non_float_columns_indices = non_float_columns[non_float_columns].index

if not non_float_columns_indices.empty:
    print(f"Columns with non-float values: {list(non_float_columns_indices)}")
else:
    print("All columns are float type.")

  non_float_columns = selected_data.iloc[:,13:].applymap(lambda x: isinstance(x, (float, int))).all() == False


All columns are float type.


In [47]:
# Check for NA values in selected_data
na_counts = selected_data.iloc[:,13:].isna().sum()

# Get columns with NA values
na_columns = na_counts[na_counts > 0]

# Print the total number of NA values and columns with NA values
total_na = na_counts.sum()
print(f"Total number of NA values in selected_data: {total_na}")
if not na_columns.empty:
    print("Columns with NA values and their counts:")
    print(na_columns)
else:
    print("No NA values in selected_data.")

Total number of NA values in selected_data: 356254
Columns with NA values and their counts:
X10000.28    245
X10001.7     125
X10003.15    188
X10006.25    160
X10010.10    184
            ... 
X9986.14      20
X9989.12      76
X9991.112    240
X9993.11      87
X9995.6      142
Length: 3588, dtype: int64


In [48]:
import numpy as np

np.random.seed(42)

def bootstrap_impute(series):
    observed = series.dropna()
    n_missing = series.isna().sum()
    if n_missing > 0 and len(observed) > 0:
        imputed_values = np.random.choice(observed, size=n_missing, replace=True)
        series.loc[series.isna()] = imputed_values
    return series

num_cols = selected_data.columns[13:]

selected_data[num_cols] = (
    selected_data.groupby("Status_at_draw", group_keys=False)[num_cols]
    .apply(lambda g: g.apply(bootstrap_impute, axis=0))
)

selected_data[num_cols] = selected_data[num_cols].fillna(selected_data[num_cols].median())

na_counts_after = selected_data[num_cols].isna().sum()
total_na_after = na_counts_after.sum()
print(f"Total number of NA values in selected_data after filling: {total_na_after}")

if not na_counts_after[na_counts_after > 0].empty:
    print("Columns with remaining NA values and their counts:")
    print(na_counts_after[na_counts_after > 0])
else:
    print("No NA values in selected_data after filling.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data[num_cols] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data[num_cols] = selected_data[num_cols].fillna(selected_data[num_cols].median())


Total number of NA values in selected_data after filling: 0
No NA values in selected_data after filling.


#### LightGBM

In [49]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, f1_score, balanced_accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
import numpy as np
import pandas as pd
from scipy.stats import zscore

np.random.seed(42)

X = selected_data.iloc[:, 13:]
print(X.shape)

y = selected_data['Status_at_draw_mapping']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

variance_filter = VarianceThreshold(threshold=0.01)
X_filtered = variance_filter.fit_transform(X)
print(X_filtered.shape)

k_best_selector = SelectKBest(score_func=f_classif, k=min(400, X_filtered.shape[1]))
X_selected = k_best_selector.fit_transform(X_filtered, y_encoded)
print(X_selected.shape)

X_train, X_test, y_train, y_test = train_test_split(X_selected, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded)

smote_tomek = SMOTETomek(random_state=42)
X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_train, y_train)

lgbm_model = LGBMClassifier(
    n_estimators=300, max_depth=20, learning_rate=0.05, random_state=42,
    min_child_samples=10, min_split_gain=0.0, objective='multiclass', metric='multi_logloss',
    force_col_wise=True
)

lgbm_model.fit(X_train_resampled, y_train_resampled)

class_labels = label_encoder.inverse_transform(np.unique(y_encoded))
print("Class Labels:", class_labels)

# Prediction
y_pred = lgbm_model.predict(X_test)
y_pred_proba = lgbm_model.predict_proba(X_test)

accuracy = accuracy_score(y_test, y_pred)
balanced_acc = balanced_accuracy_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average="macro")

auc_ovr_weighted = roc_auc_score(y_test, y_pred_proba, multi_class="ovr", average="weighted")
auc_ovr_macro = roc_auc_score(y_test, y_pred_proba, multi_class="ovr", average="macro")
auc_ovo_weighted = roc_auc_score(y_test, y_pred_proba, multi_class="ovo", average="weighted")
auc_ovo_macro = roc_auc_score(y_test, y_pred_proba, multi_class="ovo", average="macro")

(3000, 3607)
(3000, 3607)
(3000, 400)
[LightGBM] [Info] Total Bins 102000
[LightGBM] [Info] Number of data points in the train set: 4483, number of used features: 400
[LightGBM] [Info] Start training from score -1.610107
[LightGBM] [Info] Start training from score -1.610107
[LightGBM] [Info] Start training from score -1.608992
[LightGBM] [Info] Start training from score -1.608992
[LightGBM] [Info] Start training from score -1.608992
Class Labels: ['AD' 'CO' 'DLB' 'FTD' 'PD']


In [50]:
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Balanced Accuracy: {balanced_acc * 100:.2f}%")
print(f"Macro F1-score: {f1_macro:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred, digits=4))
print(f"OvR AUC Weighted: {auc_ovr_weighted:.4f}")
print(f"OvR AUC Macro: {auc_ovr_macro:.4f}")
print(f"OvO AUC Weighted: {auc_ovo_weighted:.4f}")
print(f"OvO AUC Macro: {auc_ovo_macro:.4f}")

Accuracy: 66.67%
Balanced Accuracy: 40.34%
Macro F1-score: 0.3972
Classification Report:
               precision    recall  f1-score   support

           0     0.6303    0.6911    0.6593       259
           1     0.6839    0.8260    0.7482       385
           2     0.0000    0.0000    0.0000        37
           3     0.0000    0.0000    0.0000        13
           4     0.6867    0.5000    0.5787       206

    accuracy                         0.6667       900
   macro avg     0.4002    0.4034    0.3972       900
weighted avg     0.6311    0.6667    0.6423       900

OvR AUC Weighted: 0.8477
OvR AUC Macro: 0.7633
OvO AUC Weighted: 0.7645
OvO AUC Macro: 0.7293


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [19]:
import joblib
import os

save_dir = "plasma_model_output"
os.makedirs(save_dir, exist_ok=True)

model_path = os.path.join(save_dir, "plasma_lgbm_model_700_0411.pkl")
joblib.dump(lgbm_model, model_path)

# label_encoder_path = os.path.join(save_dir, "label_encoder.pkl")
# joblib.dump(label_encoder, label_encoder_path)

# variance_filter_path = os.path.join(save_dir, "variance_filter.pkl")
# joblib.dump(variance_filter, variance_filter_path)

# k_best_selector_path = os.path.join(save_dir, "k_best_selector.pkl")
# joblib.dump(k_best_selector, k_best_selector_path)

selected_feature_names = X.columns[variance_filter.get_support()][k_best_selector.get_support()]
feature_names_path = os.path.join(save_dir, "plasma_selected_features_700_0411.txt")

with open(feature_names_path, "w") as f:
    for feature in selected_feature_names:
        f.write(feature + "\n")

print(f"Model and processing components saved in {save_dir}")

Model and processing components saved in plasma_model_output
