In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
import warnings
# Silence warnings for the rest of the session. With no category or module
# restriction, this first filter matches every warning that is raised.
# NOTE(review): this also hides deprecation and convergence warnings from the
# libraries used below - consider narrowing it while developing.
warnings.filterwarnings('ignore')
# The three narrower filters below are already covered by the blanket filter
# above; they only take effect on their own if that line is removed.
warnings.filterwarnings("ignore", category=UserWarning, module="joblib")
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
# `message` is matched as a regular expression against the start of the text.
warnings.filterwarnings('ignore', category=UserWarning, message="Line Search failed")

In [None]:
#load data and preprocess
# Read the raw table first so the missing-value report describes the file
# itself; counting nulls after fillna(0) would always print zeros.
raw_data = pd.read_csv('final_result1.csv')
print(raw_data.isnull().sum())

# Replace every missing entry with 0 before modelling.
# NOTE(review): a missing `pvalue` filled with 0 would read as maximally
# significant - confirm that 0 is the intended fill value for each column.
data = raw_data.fillna(0)
print(data.head())

In [None]:
#feature selection
# Predictor columns handed to the split / resample step further down.
features = data[[
    'baseMean',
    'log2FoldChange',
    'pvalue',
]]
# One label column per target, keyed by the column name.
targets = dict(NSCLC=data['NSCLC'], SCLC=data['SCLC'])

In [None]:
#EDA
#histograms and correlation matrix
# One histogram per feature column, arranged on a 2x2 grid.
features.hist(bins=15, figsize=(15, 6), layout=(2, 2))
plt.suptitle('Feature distributions')
plt.show()

# Draw the heatmap on its own explicit axes instead of the implicit "current"
# axes: on backends where plt.show() does not close the histogram figure, the
# implicit form would paint the heatmap onto one of the histogram subplots.
fig, ax = plt.subplots()
sns.heatmap(features.corr(), annot=True, ax=ax)
ax.set_title('Feature correlation matrix')
plt.show()

In [None]:
#SMOTE and data splitting
def split_and_resample(features, target, test_size=0.4, random_state=42):
    """Split into train / test / validation sets and oversample the train set.

    The data is first split into a training part and a held-out part of
    relative size ``test_size``. The held-out part is then split in half into
    a test set and a validation set. SMOTE is fitted on, and applied to, the
    training part only, so the test and validation sets keep their original
    class balance.

    Parameters
    ----------
    features : feature table passed to ``train_test_split``.
    target : label vector aligned with ``features``.
    test_size : fraction of the data held out from training (default 0.4).
    random_state : seed used for both splits and for SMOTE, so that repeated
        calls return the same data.

    Returns
    -------
    tuple
        ``(x_train_res, x_test, x_val, y_train_res, y_test, y_val)`` where the
        ``*_res`` entries are the SMOTE-resampled training data.

    Notes
    -----
    NOTE(review): the training set is oversampled before it is handed to any
    cross-validation, so CV folds built from it will presumably share
    synthetic neighbours and report optimistic scores - verify against the
    held-out sets.
    """
    x_train, x_temp, y_train, y_temp = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    # Seed SMOTE as well; an unseeded sampler makes every run produce a
    # different synthetic training set despite the fixed split seed.
    smote = SMOTE(random_state=random_state)
    x_train_res, y_train_res = smote.fit_resample(x_train, y_train)
    # Halve the held-out part: first half is "test", second half "validation".
    x_test, x_val, y_test, y_val = train_test_split(
        x_temp, y_temp, test_size=0.5, random_state=random_state
    )
    return x_train_res, x_test, x_val, y_train_res, y_test, y_val

# Build the (x_train, x_test, x_val, y_train, y_test, y_val) tuple for each
# target; the generator is consumed in order, NSCLC first and then SCLC.
data_nsclc, data_sclc = (
    split_and_resample(features, targets[label])
    for label in ('NSCLC', 'SCLC')
)

In [None]:
#AdaBoost Pipeline
base_estimator = DecisionTreeClassifier(max_depth=1)  #base estimator (decision stump)
pipeline = ImbPipeline([
    ('scaler', StandardScaler()),
    # The weak learner is passed positionally: the keyword was renamed from
    # `base_estimator` to `estimator` in newer scikit-learn releases, while
    # the first positional slot is the weak learner under both names.
    # The seed is pinned here instead of being searched over in the grid.
    ('adaboost', AdaBoostClassifier(base_estimator, random_state=42))
])

#parameter Grid for AdaBoost
# `random_state` is deliberately not part of the grid: it is a seed, not a
# hyperparameter, and searching over it (including None) multiplies the
# number of fits and selects a lucky seed rather than a better model.
param_grid_adaboost = {
    'adaboost__n_estimators': [50, 100, 200],
    'adaboost__learning_rate': [0.1, 0.5, 1.0],
    # NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases;
    # drop it from this list if the installed version rejects it.
    'adaboost__algorithm': ['SAMME', 'SAMME.R'],
}

In [None]:
#function for GridSearchCV and Model Evaluation
def evaluate_adaboost(data, param_grid):
    """Grid-search the AdaBoost pipeline and report validation performance.

    Parameters
    ----------
    data : sequence
        Indexed positionally: ``data[0]`` / ``data[3]`` are the training
        features / labels and ``data[2]`` / ``data[5]`` are the validation
        features / labels. ``data[1]`` and ``data[4]`` are not used here.
    param_grid : dict
        Parameter grid forwarded to ``GridSearchCV`` for the module-level
        ``pipeline``.

    Returns
    -------
    tuple
        ``(best_params, best_score, report)``: the winning parameter
        combination, its mean cross-validated F1 score, and the text
        classification report computed on the validation data.
    """
    x_train, y_train = data[0], data[3]
    x_val, y_val = data[2], data[5]

    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring='f1',
        cv=5,
        verbose=1,
        n_jobs=-1,
    )
    grid_search.fit(x_train, y_train)

    # GridSearchCV (refit=True by default) has already refitted the best
    # configuration on the full training data. Using that fitted clone avoids
    # a second, redundant fit and leaves the shared module-level `pipeline`
    # unmodified between calls.
    best_adaboost = grid_search.best_estimator_
    y_val_pred = best_adaboost.predict(x_val)
    report = classification_report(y_val, y_val_pred)
    return grid_search.best_params_, grid_search.best_score_, report

In [None]:
#evaluate for NSCLC
# Run the grid search once, then unpack its three results for reporting.
nsclc_outcome = evaluate_adaboost(data_nsclc, param_grid_adaboost)
best_params_nsclc, best_score_nsclc, report_nsclc = nsclc_outcome

print("Best Parameters for NSCLC:", best_params_nsclc)
print("Best F1 Score for NSCLC:", best_score_nsclc)
print("Classification Report for NSCLC (Validation Data):\n", report_nsclc)

In [None]:
#evaluate for SCLC
# Run the grid search once, then unpack its three results for reporting.
sclc_outcome = evaluate_adaboost(data_sclc, param_grid_adaboost)
best_params_sclc, best_score_sclc, report_sclc = sclc_outcome

print("Best Parameters for SCLC:", best_params_sclc)
print("Best F1 Score for SCLC:", best_score_sclc)
print("Classification Report for SCLC (Validation Data):\n", report_sclc)