In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Set IPython's ast_node_interactivity option to "all" so that the value of every
# expression statement in a cell is displayed, rather than only the last one
# (per IPython's documented meaning of this setting).
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold
import numpy as np
from sklearn.model_selection import train_test_split
from IPython.display import display, HTML

# Load the survey responses and discard every row that has a missing value.
df = pd.read_csv('lung cancer survey.csv')
df_no_na = df.dropna()

# Keep only the rows whose AGE is strictly greater than 21.
df_age = df_no_na.loc[df_no_na["AGE"] > 21]

# Target vector is the LUNG_CANCER column; the feature matrix is every other column.
y = df_age['LUNG_CANCER'].values
X = df_age.drop(columns='LUNG_CANCER').values

# Seeded 80/20 hold-out split (no stratify argument is passed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=888,
)

# Report the mean of y in the full data and in each side of the hold-out split.
print(f"Lung cancer proportion in full dataset: {y.mean()}")
print(f"Lung cancer proportion in training dataset: {y_train.mean()}")
print(f"Lung cancer proportion in validation dataset: {y_val.mean()}", "\n")

# Fold count shared by both cross-validation splitters below.
n_splits = 5

# Plain and stratified K-fold splitters, both shuffled with the same seed.
kf = KFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=888,
)
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=888,
)

# Function to calculate the mean of y in the training and validation part of each fold
def calculate_y_means(splitter, X, y):
    """Return the per-fold mean of ``y`` for the training and validation indices.

    Parameters
    ----------
    splitter : object
        Any object exposing ``split(X, y)`` that yields
        ``(train_index, val_index)`` pairs of positional indices, such as the
        ``KFold`` / ``StratifiedKFold`` instances created in this notebook.
    X : array-like
        Passed to ``splitter.split`` unchanged.
    y : array-like
        Target values. They are viewed through ``np.asarray`` before indexing,
        so lists and pandas Series are indexed by position just like arrays.

    Returns
    -------
    tuple of (list, list)
        ``(y_train_means, y_val_means)`` with one entry per fold, in the order
        the splitter yields the folds.
    """
    # Index through an ndarray so the fold indices are always positional:
    # a plain list cannot be indexed with an index array at all, and a pandas
    # Series would interpret the indices as labels.
    y_values = np.asarray(y)

    y_train_means = []
    y_val_means = []
    for train_index, val_index in splitter.split(X, y):
        y_train_means.append(y_values[train_index].mean())  # mean of y in the training part
        y_val_means.append(y_values[val_index].mean())  # mean of y in the validation part
    return y_train_means, y_val_means

# Per-fold means of y for the plain and for the stratified splitter.
kf_y_train_means, kf_y_val_means = calculate_y_means(kf, X, y)
skf_y_train_means, skf_y_val_means = calculate_y_means(skf, X, y)

# Turn each proportion gap (stratified minus plain) into a row count by scaling
# with the number of rows in y. This replaces the hard-coded 9000; the printed
# proportions in this notebook are consistent with y having 9000 rows.
# NOTE(review): the same full-dataset scale is applied to the training and the
# validation rows; if a per-fold observation count is intended, the scale should
# be the size of the respective fold part instead -- confirm.
n_observations = len(y)
diff_train = [
    round((skf_mean - kf_mean) * n_observations, 0)
    for skf_mean, kf_mean in zip(skf_y_train_means, kf_y_train_means)
]
diff_val = [
    round((skf_mean - kf_mean) * n_observations, 0)
    for skf_mean, kf_mean in zip(skf_y_val_means, kf_y_val_means)
]

# Column data: the training rows come first, then the validation rows.
data = {
    "Kfold": kf_y_train_means + kf_y_val_means,
    "Stratified Kfold": skf_y_train_means + skf_y_val_means,
    "Difference in observation from Stratified Kfold": diff_train + diff_val
}

# Row labels are derived from the number of folds actually computed, so the
# table stays aligned if the fold count is changed from 5.
fold_numbers = range(1, len(kf_y_train_means) + 1)
index_labels = (
    [f"Training Data {fold}" for fold in fold_numbers]
    + [f"Validation Data {fold}" for fold in fold_numbers]
)

# Constructing the DataFrame.
# NOTE(review): this rebinds `df`, replacing the survey frame that was loaded
# under the same name earlier in this cell.
df = pd.DataFrame(data, index=index_labels)
print(df)



Lung cancer proportion in full dataset: 0.805
Lung cancer proportion in training dataset: 0.8073611111111111
Lung cancer proportion in validation dataset: 0.7955555555555556 

                      Kfold  Stratified Kfold  \
Training Data 1    0.807361             0.805   
Training Data 2    0.803889             0.805   
Training Data 3    0.804167             0.805   
Training Data 4    0.806111             0.805   
Training Data 5    0.803472             0.805   
Validation Data 1  0.795556             0.805   
Validation Data 2  0.809444             0.805   
Validation Data 3  0.808333             0.805   
Validation Data 4  0.800556             0.805   
Validation Data 5  0.811111             0.805   

                   Difference in observation from Stratified Kfold  
Training Data 1                                              -21.0  
Training Data 2                                               10.0  
Training Data 3                                                8.0  
Training

In [14]:
import pandas as pd
from IPython.display import display, HTML

# Column data for the model comparison table, filled in one column at a time.
# "<br>" inside a cell is an HTML line break.
data = {}
data["Machine Learning Method"] = [
    "Logistic Regression",
    "KNN Classifier (No PCA)",
    "Random Forest",
    "Neural Network (Brute Force)",
    "Best Model: Neural Network (Random Search)",
]
data["Best Parameters"] = [
    "Regress with all features except 'AGE'",
    "k = 19, with the inclusion of pairwise features",
    "Max depth: 15<br>Max features: 0.4<br>Fixed at n estimators: 300",
    "Hidden layer 1: 8, sigmoid<br>Hidden layer 2: 8, sigmoid<br>Hidden layer 3: 8, sigmoid<br>Learning rate: 0.01<br>Optimizer: Adam<br>Epoch: 200",
    "Hidden layer 1: 8, linear<br>Hidden layer 2: 23, relu<br>Hidden layer 3: 8, sigmoid<br>Learning rate: 0.02653<br>Optimizer: Adam<br>Epoch: 150",
]
data["Precision"] = [0.8594, 0.8624, 0.9832, 0.8880, 0.9097]
data["Recall"] = [0.9986, 0.9846, 0.8783, 0.9965, 1.0]
data["F1 Score"] = [0.9238, 0.9195, 0.9278, 0.9391, 0.9527]

# One row per model, one column per dictionary key.
df = pd.DataFrame(data)

# Render the frame as an HTML table; escape=False is passed so the "<br>" tags
# in the parameter strings are emitted as markup instead of being escaped.
display(
    HTML(
        df.to_html(
            escape=False,
            index=False,
            justify="center",
            border=1,
            table_id="best_model_table",
            classes="table table-striped",
        )
    )
)

# Caption displayed after the table.
display(HTML("<p style='text-align: left; font-weight: bold;'>Table. 1</p>"))

Machine Learning Method,Best Parameters,Precision,Recall,F1 Score
Logistic Regression,Regress with all features except 'AGE',0.8594,0.9986,0.9238
KNN Classifier (No PCA),"k = 19, with the inclusion of pairwise features",0.8624,0.9846,0.9195
Random Forest,Max depth: 15 Max features: 0.4 Fixed at n estimators: 300,0.9832,0.8783,0.9278
Neural Network (Brute Force),"Hidden layer 1: 8, sigmoid Hidden layer 2: 8, sigmoid Hidden layer 3: 8, sigmoid Learning rate: 0.01 Optimizer: Adam Epoch: 200",0.888,0.9965,0.9391
Best Model: Neural Network (Random Search),"Hidden layer 1: 8, linear Hidden layer 2: 23, relu Hidden layer 3: 8, sigmoid Learning rate: 0.02653 Optimizer: Adam Epoch: 150",0.9097,1.0,0.9527
