In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=UserWarning, module="joblib")
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
warnings.filterwarnings('ignore', category=UserWarning, message="Line Search failed")

In [None]:
#load data and preprocess
data_raw = pd.read_csv('final_result1.csv')
#count missing values on the raw frame: after fillna(0) this count is always zero
missing_counts = data_raw.isnull().sum()
#NOTE(review): every missing value is imputed with 0, including in 'pvalue' where 0 reads as
#"maximally significant" -- confirm this is the intended treatment for that column
data = data_raw.fillna(0)
print(data.head())
print(missing_counts)

In [None]:
#feature selection
#the three columns used as model inputs
feature_columns = ['baseMean', 'log2FoldChange', 'pvalue']
features = data[feature_columns]
#one label column per model, keyed by its column name
targets = {label: data[label] for label in ('NSCLC', 'SCLC')}

In [None]:
#EDA
#per-feature histograms on a 2x2 grid
hist_options = dict(bins=15, figsize=(15, 6), layout=(2, 2))
features.hist(**hist_options)
plt.show()

#annotated heatmap of the pairwise feature correlations
feature_corr = features.corr()
sns.heatmap(feature_corr, annot=True)
plt.show()

In [None]:
#SMOTE and Data Splitting
def split_and_resample(features, target, test_size=0.4, random_state=42):
    """Split the data into train / test / validation parts and SMOTE-resample the training part.

    Parameters
    ----------
    features : feature table passed to ``train_test_split``.
    target : labels aligned with ``features``.
    test_size : fraction of the data held out from training; the held-out part is
        then split in half into a test set and a validation set.
    random_state : seed used for both splits and for SMOTE, so repeated calls
        produce the same result.

    Returns
    -------
    tuple
        ``(x_train_res, x_test, x_val, y_train_res, y_test, y_val)`` -- only the
        training part is resampled; test and validation keep their original rows.

    NOTE(review): the training set is oversampled before it is handed to
    cross-validation later in the notebook, so CV folds drawn from it contain
    synthetic points; moving SMOTE into the imblearn pipeline would avoid that.
    """
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    # Seed SMOTE too: an unseeded SMOTE() makes the resampled training set
    # (and everything fitted on it) differ from run to run.
    smote = SMOTE(random_state=random_state)
    x_train_res, y_train_res = smote.fit_resample(x_train, y_train)
    # Halve the held-out part into test and validation sets.
    x_test, x_val, y_test, y_val = train_test_split(
        x_holdout, y_holdout, test_size=0.5, random_state=random_state
    )
    return x_train_res, x_test, x_val, y_train_res, y_test, y_val

#each result is the 6-tuple (x_train_res, x_test, x_val, y_train_res, y_test, y_val)
data_nsclc = split_and_resample(features=features, target=targets['NSCLC'])
data_sclc = split_and_resample(features=features, target=targets['SCLC'])

In [None]:
#gradient boosting pipeline
pipeline = ImbPipeline([
    ('scaler', StandardScaler()),  #scaling for Gradient Boosting
    #fixed seed for reproducible fits; the seed is not a hyperparameter to tune
    ('gradient_boosting', GradientBoostingClassifier(random_state=42))
])

#parameter grid for gradient boosting
#Only parameters that change the fitted model are searched. Settings such as
#verbose, random_state and warm_start are not hyperparameters, and listing every
#constructor argument produced hundreds of millions of combinations, which
#GridSearchCV cannot finish. Remaining arguments stay at their defaults; add
#them one at a time if a wider search is needed.
#This grid has 2 * 3 * 3 * 3 * 3 = 162 candidates (810 fits with cv=5).
param_grid_gradient_boosting = {
    #'log_loss' replaces the older name 'deviance', which newer scikit-learn releases reject
    'gradient_boosting__loss': ['log_loss', 'exponential'],
    'gradient_boosting__learning_rate': [0.1, 0.5, 1.0],
    'gradient_boosting__n_estimators': [100, 200, 300],
    'gradient_boosting__subsample': [1.0, 0.8, 0.6],
    'gradient_boosting__max_depth': [3, 5, 7],
}


#function for GridSearchCV and model evaluation
def evaluate_gradient_boosting(data, param_grid):
    """Grid-search the gradient boosting pipeline and report on the validation set.

    Parameters
    ----------
    data : sequence indexed as ``(x_train, x_test, x_val, y_train, y_test, y_val)``;
        only the training (0, 3) and validation (2, 5) entries are used here.
    param_grid : parameter grid passed to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_params, best_score, report)`` -- the best parameter combination,
        its mean cross-validated F1 score, and the text classification report
        for the validation set.

    NOTE(review): ``scoring='f1'`` presumes a binary target -- confirm against the label columns.
    """
    x_train, y_train = data[0], data[3]
    x_val, y_val = data[2], data[5]

    grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='f1', cv=5, verbose=1, n_jobs=-1)
    grid_search.fit(x_train, y_train)  #fit on training data

    # GridSearchCV (refit=True by default) has already refit the best configuration
    # on the full training data. Using best_estimator_ avoids a second fit and avoids
    # calling set_params on the shared module-level pipeline, which would leak one
    # target's tuned settings into later uses of that pipeline.
    best_gradient_boosting = grid_search.best_estimator_

    y_val_pred = best_gradient_boosting.predict(x_val)  #predict on validation data
    report = classification_report(y_val, y_val_pred)
    return grid_search.best_params_, grid_search.best_score_, report

In [None]:
#evaluate for NSCLC
nsclc_results = evaluate_gradient_boosting(data_nsclc, param_grid_gradient_boosting)
best_params_nsclc, best_score_nsclc, report_nsclc = nsclc_results
#print each caption followed by its value
for _caption, _shown in (
    ("Best Parameters for NSCLC:", best_params_nsclc),
    ("Best F1 Score for NSCLC:", best_score_nsclc),
    ("Classification Report for NSCLC (Validation Data):\n", report_nsclc),
):
    print(_caption, _shown)

In [None]:
#evaluate for SCLC
sclc_results = evaluate_gradient_boosting(data_sclc, param_grid_gradient_boosting)
best_params_sclc, best_score_sclc, report_sclc = sclc_results
#print each caption followed by its value
for _caption, _shown in (
    ("Best Parameters for SCLC:", best_params_sclc),
    ("Best F1 Score for SCLC:", best_score_sclc),
    ("Classification Report for SCLC (Validation Data):\n", report_sclc),
):
    print(_caption, _shown)