In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
#import networkx as nx

In [2]:
# Directory that holds the input data files read by this notebook.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running on another machine.
dataFolderPath = "/Users/adsriram98/Documents/PARK_LAB/PRS_calc/data"

In [8]:
import pandas as pd
import numpy as np
import os
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance

# --- Load the feature matrix (X) and the label vector (Y) ---
dataFolderPath = "/Users/adsriram98/Documents/PARK_LAB/PRS_calc/data"

# File names for the phenotype under study; change both for each phenotype.
XDataFileName = "42p_82894samples_06-07-24_AHDincidence_X.txt"
YDataFileName = '42p_82894samples_06-07-24_AHDincidence_Y.txt'

# Both files are read as tab-separated tables.
XData = pd.read_csv(os.path.join(dataFolderPath, XDataFileName), sep="\t")
YData = pd.read_csv(os.path.join(dataFolderPath, YDataFileName), sep="\t")

# Plain NumPy views of the data; the labels are flattened to one dimension.
XValues = XData.values
YValues = np.ravel(YData.values)

# Hold out 20% of the samples as a test set, using a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    XValues,
    YValues,
    random_state=42,
    test_size=0.2,
)

### 1. XGBoost Model with Automatic Feature Selection
#
# Steps: fit XGBoost on all features, keep the features whose importance is
# above the mean importance, refit on that subset, and save the selection.

# `use_label_encoder` is deprecated in XGBoost and is reported as an unused
# parameter by recent releases, so it is no longer passed.
# random_state is pinned to match the train/test split and the random forest.
model_xgb = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
model_xgb.fit(X_train, Y_train)

# Get feature importances from the XGBoost model (one value per input column).
# This array is captured now because model_xgb is refit on fewer columns below.
feature_importances_xgb = model_xgb.feature_importances_

# Select features whose importance is strictly above the mean importance
threshold_xgb = feature_importances_xgb.mean()
selected_indices_xgb = np.where(feature_importances_xgb > threshold_xgb)[0]
selected_features_xgb = XData.columns[selected_indices_xgb]

print(f"Selected features based on XGBoost automatic feature selection: {selected_features_xgb}")

# Filter the training and test data to keep only selected features for XGBoost
X_train_selected_xgb = X_train[:, selected_indices_xgb]
X_test_selected_xgb = X_test[:, selected_indices_xgb]

# Retrain XGBoost Model with selected features (the same estimator object is refit)
model_xgb.fit(X_train_selected_xgb, Y_train)

# Save selected features and importances for XGBoost.
# The importances are the ones from the first fit on all features.
importance_df_xgb = pd.DataFrame({
    "FeatureIndex": selected_indices_xgb,
    "FeatureName": selected_features_xgb,
    "Importance": feature_importances_xgb[selected_indices_xgb]
})
# Build the output path once so the file written and the message always agree.
xgb_output_path = dataFolderPath + os.path.sep + "08_28_24_XGBoost_selected_features.csv"
importance_df_xgb.to_csv(xgb_output_path, index=False)
print(f"XGBoost selected features and importances saved to {xgb_output_path}")

### 2. Random Forest with Permutation Importance
#
# Steps: fit a random forest on all features, score every feature by
# permutation importance, keep those above the mean score, refit on that
# subset, and save the selection.

model_rf = RandomForestClassifier(random_state=42, n_estimators=100)
model_rf.fit(X_train, Y_train)

# Permutation importance of every feature, with 10 repeats and a fixed seed.
# NOTE(review): the importances are measured on the training split; consider
# whether a held-out set would be more appropriate here.
result_rf = permutation_importance(
    model_rf,
    X_train,
    Y_train,
    n_repeats=10,
    random_state=42,
)

# Threshold = mean of the per-feature mean importances; keep features strictly above it.
rf_mean_importances = result_rf.importances_mean
threshold_rf = rf_mean_importances.mean()
selected_indices_rf = np.flatnonzero(rf_mean_importances > threshold_rf)
selected_features_rf = XData.columns[selected_indices_rf]

print(f"Selected features based on Random Forest permutation importance: {selected_features_rf}")

# Column-subset views of the train and test matrices for the selected features.
X_train_selected_rf = X_train[:, selected_indices_rf]
X_test_selected_rf = X_test[:, selected_indices_rf]

# Refit the same estimator object using only the selected columns.
model_rf.fit(X_train_selected_rf, Y_train)

# Persist index, name and mean permutation importance of each selected feature.
importance_df_rf = pd.DataFrame({
    "FeatureIndex": selected_indices_rf,
    "FeatureName": selected_features_rf,
    "Importance": rf_mean_importances[selected_indices_rf]
})
rf_output_path = dataFolderPath + os.path.sep + "08_28_24_RandomForest_selected_features.csv"
importance_df_rf.to_csv(rf_output_path, index=False)
print(f"Random Forest selected features and importances saved to {rf_output_path}")

### 3. Linear Regression with Lasso for Automatic Feature Selection
#
# Steps: fit a Lasso model on all features, keep the features with a non-zero
# coefficient, save the selection, and refit on that subset.

# Lasso with a fixed regularization strength (alpha=0.01).
# No cross-validation over alpha is performed in this cell.
model_lasso = Lasso(alpha=0.01)
model_lasso.fit(X_train, Y_train)

# A feature is selected when its fitted coefficient is non-zero.
lasso_coefficients = model_lasso.coef_
selected_indices_lasso = np.flatnonzero(lasso_coefficients != 0)
selected_features_lasso = XData.columns[selected_indices_lasso]

print(f"Selected features based on Lasso Regression automatic feature selection: {selected_features_lasso}")

# Persist index, name and coefficient of each selected feature.
# This table is built before the refit below, so it holds the coefficients
# from the fit on all features.
importance_df_lasso = pd.DataFrame({
    "FeatureIndex": selected_indices_lasso,
    "FeatureName": selected_features_lasso,
    "Importance": lasso_coefficients[selected_indices_lasso]
})
lasso_output_path = dataFolderPath + os.path.sep + "08_28_24_Lasso_selected_features.csv"
importance_df_lasso.to_csv(lasso_output_path, index=False)
print(f"Lasso Regression selected features and importances saved to {lasso_output_path}")

# Column-subset views of the train and test matrices for the selected features.
X_train_selected_lasso = X_train[:, selected_indices_lasso]
X_test_selected_lasso = X_test[:, selected_indices_lasso]

# Refit the same estimator object using only the selected columns.
model_lasso.fit(X_train_selected_lasso, Y_train)


Selected features based on XGBoost automatic feature selection: Index(['F31', 'F41'], dtype='object')
XGBoost selected features and importances saved to /Users/adsriram98/Documents/PARK_LAB/PRS_calc/data/08_28_24_XGBoost_selected_features.csv
Selected features based on Random Forest permutation importance: Index(['F1', 'F5', 'F6', 'F7', 'F8', 'F11', 'F12', 'F13', 'F24', 'F31', 'F34',
       'F35', 'F41'],
      dtype='object')
Random Forest selected features and importances saved to /Users/adsriram98/Documents/PARK_LAB/PRS_calc/data/08_28_24_RandomForest_selected_features.csv
Selected features based on Lasso Regression automatic feature selection: Index(['F3', 'F5', 'F6', 'F8', 'F11', 'F12', 'F13', 'F18', 'F23', 'F25', 'F26',
       'F28', 'F29', 'F30', 'F31', 'F32', 'F33', 'F35', 'F37', 'F41'],
      dtype='object')
Lasso Regression selected features and importances saved to /Users/adsriram98/Documents/PARK_LAB/PRS_calc/data/08_28_24_Lasso_selected_features.csv


In [None]:
# Apply FDR control to DNN result
#
# Runs control.controlFilter on the mortality feature file and writes one CSV
# row per selected feature with its statistic and threshold.
from DL.FDR.FDR_control import FDR_control
control = FDR_control()

XDataFileName = "42p_7110samples_06-10-24_mortalityAHD_X.txt"
# NOTE(review): `resultDir` is not assigned in any cell visible here; it must
# already exist in the kernel, otherwise this line raises NameError. Confirm
# where it is defined.
selected_features = control.controlFilter(dataFolderPath + os.path.sep + XDataFileName, resultDir, offset=1, q=0.05)

# Save the selected associations with statistic and threshold.
# Each item of selected_features is unpacked as (feature, statistic, threshold).
association_columns = ["Feature1", "Feature2", "Statistic", "Threshold"]
selected_associations = [
    {
        "Feature1": feature,
        "Feature2": "Y",
        "Statistic": statistic,
        "Threshold": threshold
    }
    for feature, statistic, threshold in selected_features
]

# Passing the columns explicitly keeps the CSV header even when no feature is
# selected; for a non-empty selection the written file is unchanged.
pd.DataFrame(selected_associations, columns=association_columns).to_csv(dataFolderPath + os.path.sep + "08_20_24_STATknockoffs_AHDmortality_UKB_DNN_selected_associations.csv")



In [None]:
# Calculate multiple W statistics for the selected features
#
# Relies on `control`, `XDataFileName`, `resultDir` and `selected_features`
# already being defined in the kernel.
num_iterations = 10  # Set the number of iterations you want
all_W_statistics = control.calculate_multiple_W_for_selected(
    dataFolderPath + os.path.sep + XDataFileName,
    resultDir,
    selected_features,
    num_iterations=num_iterations,
    offset=1,
    q=0.05
)

# Process the results: one row per (feature, iteration) pair.
# The first element of each selected_features item is used as the feature
# name, and all_W_statistics is indexed by that name.
w_columns = ["Iteration", "Feature1", "Feature2", "Statistic"]
selected_associations_with_W = [
    {
        "Iteration": iteration,
        "Feature1": feature[0],
        "Feature2": "Y",
        "Statistic": statistic
    }
    for feature in selected_features
    for iteration, statistic in enumerate(all_W_statistics[feature[0]])
]

# Save the selected associations with statistics.
# Passing the columns explicitly keeps the CSV header even when there are no
# rows; for a non-empty result the written file is unchanged.
pd.DataFrame(selected_associations_with_W, columns=w_columns).to_csv(dataFolderPath + os.path.sep + "07_08_24_STATknockoffs_AHDmortality_UKB_DNN_selected_associations_with_W.csv")
