In [19]:
# Ask IPython to echo the value of every top-level expression in a cell, not only
# the last one. Later cells rely on this: they place bare expressions such as
# `df_age` or `calc_vif(...)` in the middle of a cell and expect them to render.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [35]:
# Load Dataset
import numpy as np
import pandas as pd
from IPython.display import display, HTML
# Read the raw survey export, then discard every row that has any missing value.
df = pd.read_csv('lung cancer survey.csv')
df_no_na = df.dropna(axis=0, how="any")

# Keep only respondents strictly older than 21. df_age is the frame the models
# in this notebook are built from.
# NOTE(review): the original note also mentioned KMeans, random forest and
# decision tree, on the grounds that such models pick their own age thresholds
# internally -- confirm whether those models should use df_age or the unfiltered frame.
df_age = df_no_na.loc[df_no_na["AGE"].gt(21)]
df_age

# Predictor names: every column except the LUNG_CANCER target.
original_features = df_age.columns.drop("LUNG_CANCER")
features_list = list(original_features)

print(features_list)


Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,0.0,61.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,70.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
2,1.0,59.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,54.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
4,0.0,54.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8996,1.0,62.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
8997,0.0,71.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
8998,1.0,63.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
8999,1.0,70.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0


['GENDER', 'AGE', 'SMOKING', 'YELLOW_FINGERS', 'ANXIETY', 'PEER_PRESSURE', 'CHRONIC DISEASE', 'FATIGUE ', 'ALLERGY ', 'WHEEZING', 'ALCOHOL CONSUMING', 'COUGHING', 'SHORTNESS OF BREATH', 'SWALLOWING DIFFICULTY', 'CHEST PAIN']


In [21]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Target vector and predictor matrix as plain NumPy arrays.
y = df_age['LUNG_CANCER'].to_numpy()
X = df_age.drop(columns="LUNG_CANCER").to_numpy()

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=888,
)

# Model Assumption Checks

In [22]:
# Import relevant dependencies
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt

## Linearity of independent variables and log-odds (Box-Tidwell Test)

In [23]:
# Box-Tidwell check with statsmodels: add an x * ln(x) term for the continuous
# predictor and read the term's p-value from the fitted summary.

# Work on a copy so df_age itself never gains the helper column.
df_lt = df_age.copy()

continuous_var = 'AGE'
# Name of the Box-Tidwell term, derived from continuous_var so the two cannot drift apart.
interaction_var = f'{continuous_var}:Log_{continuous_var}'

# x * ln(x), computed column-wise (np.log is the natural log).
df_lt[interaction_var] = df_lt[continuous_var] * np.log(df_lt[continuous_var])

# Design matrix: the continuous predictor and its Box-Tidwell term only.
X_lt = df_lt[[continuous_var, interaction_var]]
y_lt = df_lt['LUNG_CANCER']

# Append the intercept column as the last column.
X_lt = sm.add_constant(X_lt, prepend=False)

# Fit on all rows of df_age (this check does not use the train/validation split).
logit_model = sm.Logit(y_lt, X_lt)
result = logit_model.fit()
# Display summary results
print(result.summary())

display(HTML("<p style='text-align: left; font-weight: bold;'>Figure. 1</p>"))

Optimization terminated successfully.
         Current function value: 0.491946
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:            LUNG_CANCER   No. Observations:                 9000
Model:                          Logit   Df Residuals:                     8997
Method:                           MLE   Df Model:                            2
Date:                Sun, 10 Nov 2024   Pseudo R-squ.:                0.002932
Time:                        23:02:29   Log-Likelihood:                -4427.5
converged:                       True   LL-Null:                       -4440.5
Covariance Type:            nonrobust   LLR p-value:                 2.212e-06
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
AGE            -0.1672      0.167     -1.000      0.317      -0.495       0.161
AGE:Log_AGE     0.0351    

## Absence of multicollinearity (Variance Inflation Factor)

In [26]:
# Predictor-only frame (target removed) used as input to the variance inflation factor check.
df_notarget = df_age.drop(columns=["LUNG_CANCER"])
def calc_vif(df, add_constant=False):
    """Return the variance inflation factor (VIF) of every column in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric predictor columns (no target column).
    add_constant : bool, default False
        When True, an intercept column is added to the design matrix before the
        VIFs are computed; the intercept itself is not reported. The default
        keeps the original behaviour of passing ``df`` through unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with columns ``"variables"`` and ``"VIF"``.

    Notes
    -----
    NOTE(review): ``variance_inflation_factor`` regresses each column on the
    others exactly as supplied and does not add an intercept. Without a constant
    the VIFs are uncentred, which tends to inflate them for columns whose mean
    is far from zero (such as AGE). Consider calling with ``add_constant=True``
    before drawing conclusions about multicollinearity.
    """
    if add_constant:
        # has_constant="add" always inserts the column, so df's columns sit at 1..n.
        design = sm.add_constant(df, prepend=True, has_constant="add")
        offset = 1
    else:
        design = df
        offset = 0

    # Materialise the array once instead of once per column.
    design_values = design.values
    return pd.DataFrame({
        "variables": df.columns,
        "VIF": [
            variance_inflation_factor(design_values, i + offset)
            for i in range(df.shape[1])
        ],
    })

# Figure 2: VIF for every predictor, AGE included.
calc_vif(df_notarget)
display(HTML("<p style='text-align: left; font-weight: bold;'>Figure. 2</p>"))

# Figure 3: the same check with AGE removed from the predictors.
df_notarget_noage = df_age.drop(columns=["LUNG_CANCER", "AGE"])
calc_vif(df_notarget_noage)
display(HTML("<p style='text-align: left; font-weight: bold;'>Figure. 3</p>"))

Unnamed: 0,variables,VIF
0,GENDER,2.133998
1,AGE,12.353038
2,SMOKING,2.043608
3,YELLOW_FINGERS,2.110007
4,ANXIETY,1.850106
5,PEER_PRESSURE,2.038058
6,CHRONIC DISEASE,1.841788
7,FATIGUE,3.110911
8,ALLERGY,2.186342
9,WHEEZING,1.985015


Unnamed: 0,variables,VIF
0,GENDER,2.054497
1,SMOKING,1.957444
2,YELLOW_FINGERS,2.039804
3,ANXIETY,1.766997
4,PEER_PRESSURE,1.928564
5,CHRONIC DISEASE,1.772197
6,FATIGUE,2.790122
7,ALLERGY,2.11977
8,WHEEZING,1.932506
9,ALCOHOL CONSUMING,2.072337


## Independence of Observations
This assumption is implicitly satisfied by the nature of the dataset: each observation represents a unique individual, together with that individual's medical conditions and lifestyle habits.

# Baseline logistic regression

In [27]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import statsmodels.api as sm

# Wrap the NumPy splits in labelled DataFrames so the statsmodels summary shows
# feature names, then add the intercept column to each.
X_train_f = pd.DataFrame(X_train, columns=features_list)
X_train_f = sm.add_constant(X_train_f)
X_val_f = pd.DataFrame(X_val, columns=features_list)
X_val_f = sm.add_constant(X_val_f)

# Unregularised logistic regression fitted on the training split.
logit_model = sm.Logit(y_train, X_train_f)
result = logit_model.fit()
print(result.summary())

# Predicted probabilities on the validation split, turned into 0/1 labels at a 0.5 cut-off.
y_pred_prob = result.predict(X_val_f)
y_pred = (y_pred_prob >= 0.5).astype(int)

# F1 score of the validation predictions.
f1 = f1_score(y_val, y_pred)
print(f"F1 Score: {f1}")

display(HTML("<p style='text-align: left; font-weight: bold;'>Figure. 4</p>"))

Optimization terminated successfully.
         Current function value: 0.398573
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                 7200
Model:                          Logit   Df Residuals:                     7184
Method:                           MLE   Df Model:                           15
Date:                Sun, 10 Nov 2024   Pseudo R-squ.:                  0.1866
Time:                        23:04:24   Log-Likelihood:                -2869.7
converged:                       True   LL-Null:                       -3528.2
Covariance Type:            nonrobust   LLR p-value:                1.217e-271
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                    -0.5355      0.226     -2.370      0.018      -0.978      -0.

In [28]:
# Refit the baseline model without AGE (dropped after the multicollinearity check).
y = df_age['LUNG_CANCER'].values
X = df_age.drop(["LUNG_CANCER","AGE"], axis = 1).values

# Split the data into training and validation sets (80% training, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=888)

# Build the AGE-free name list as a filtered copy. list.remove() returns None and
# mutates in place, so the previous `features_list.remove("AGE")` left
# features_list_no_age set to None and raised ValueError when the cell was re-run.
features_list_no_age = [name for name in features_list if name != "AGE"]
# Later cells display `features_list` next to the AGE-free coefficients, so keep
# that name pointing at the reduced list, as it did before this fix.
features_list = features_list_no_age

# Add a constant to the model (for the intercept) to the training and validation data
X_train_f = sm.add_constant(pd.DataFrame(X_train, columns=features_list_no_age))
X_val_f = sm.add_constant(pd.DataFrame(X_val, columns=features_list_no_age))


# Fit the logistic regression model on the training data using statsmodels
logit_model = sm.Logit(y_train, X_train_f)
result = logit_model.fit()

# Print summary of the model
print(result.summary())

# Get the predicted probabilities for the validation set
y_pred_prob = result.predict(X_val_f)

# Apply threshold to get the class predictions
y_pred = (y_pred_prob >= 0.5).astype(int)

# Calculate the F1 score
f1 = f1_score(y_val, y_pred)

# Print the F1 score
print(f"F1 Score: {f1}")
display(HTML("<p style='text-align: left; font-weight: bold;'>Figure. 5</p>"))

Optimization terminated successfully.
         Current function value: 0.399331
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                 7200
Model:                          Logit   Df Residuals:                     7185
Method:                           MLE   Df Model:                           14
Date:                Sun, 10 Nov 2024   Pseudo R-squ.:                  0.1851
Time:                        23:04:36   Log-Likelihood:                -2875.2
converged:                       True   LL-Null:                       -3528.2
Covariance Type:            nonrobust   LLR p-value:                2.729e-270
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     0.0394      0.144      0.274      0.784      -0.242       0.

In [32]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import f1_score

# Cross-validation strategy: 10 shuffled folds with a fixed seed.
kf = KFold(n_splits=10, shuffle=True, random_state=888)

# Candidate C values: 100 log-spaced points from 10^-5 to 10^3.
C_grid = np.logspace(-5, 3, num=100)

# Define parameter grids for Lasso, Ridge, and ElasticNet
lasso_param_grid = {'C': C_grid}
ridge_param_grid = {'C': C_grid}
elasticnet_param_grid = {
    'C': C_grid,
    'l1_ratio': np.linspace(0.01, 1, num=5)  # L1_ratio from 0.01 to 1 with 5 points
}

# Initialize the models
lasso_logistic_model = LogisticRegression(penalty='l1', solver='liblinear', max_iter=10000, random_state=888)
ridge_logistic_model = LogisticRegression(penalty='l2', solver='liblinear', max_iter=10000, random_state=888)
elasticnet_logistic_model = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=10000, random_state=888)

# The three searches stay as top-level statements (not a loop) so each `.fit(...)`
# result is still echoed by the notebook, exactly as before.

# Lasso: Perform Grid Search with Cross-Validation on the training set
lasso_logistic_grid_search = GridSearchCV(lasso_logistic_model, lasso_param_grid, cv=kf, scoring='f1')
lasso_logistic_grid_search.fit(X_train, y_train)

print("Best parameters for Lasso Logistic Regression:", lasso_logistic_grid_search.best_params_)
print(f"Best cross-validated f1-score for Lasso Logistic Regression: {lasso_logistic_grid_search.best_score_:.4f}\n")

# Ridge: Perform Grid Search with Cross-Validation on the training set
ridge_logistic_grid_search = GridSearchCV(ridge_logistic_model, ridge_param_grid, cv=kf, scoring='f1')
ridge_logistic_grid_search.fit(X_train, y_train)

print("Best parameters for Ridge Logistic Regression:", ridge_logistic_grid_search.best_params_)
print(f"Best cross-validated f1-score for Ridge Logistic Regression: {ridge_logistic_grid_search.best_score_:.4f}\n")

# Elastic Net: Perform Grid Search with Cross-Validation on the training set
elasticnet_logistic_grid_search = GridSearchCV(elasticnet_logistic_model, elasticnet_param_grid, cv=kf, scoring='f1')
elasticnet_logistic_grid_search.fit(X_train, y_train)

print("Best parameters for Elastic Net Logistic Regression:", elasticnet_logistic_grid_search.best_params_)
print(f"Best cross-validated f1-score for Elastic Net Logistic Regression: {elasticnet_logistic_grid_search.best_score_:.4f}\n")

# Retrieve best model
lasso_best_model = lasso_logistic_grid_search.best_estimator_
ridge_best_model = ridge_logistic_grid_search.best_estimator_
elasticnet_best_model = elasticnet_logistic_grid_search.best_estimator_

# Use best model to predict on the validation set
y_pred_lasso = lasso_best_model.predict(X_val)
y_pred_ridge = ridge_best_model.predict(X_val)
y_pred_elasticnet = elasticnet_best_model.predict(X_val)

# Evaluate the best models on the validation set: accuracy, precision, recall and F1.
# Metric-major order (all models per metric) reproduces the original print sequence.
validation_predictions = {
    "Lasso": y_pred_lasso,
    "Ridge": y_pred_ridge,
    "Elastic Net": y_pred_elasticnet,
}
validation_metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1": f1_score,
}
for metric_name, metric_fn in validation_metrics.items():
    for model_name, y_pred_model in validation_predictions.items():
        print(f"{model_name} Logistic Regression {metric_name} Score on Validation Set: {metric_fn(y_val, y_pred_model):.4f}")

display(HTML("<p style='text-align: left; font-weight: bold;'>Figure. 6</p>"))

Best parameters for Lasso Logistic Regression: {'C': 7.924828983539186}
Best cross-validated f1-score for Lasso Logistic Regression: 0.9310



Best parameters for Ridge Logistic Regression: {'C': 327.4549162877732}
Best cross-validated f1-score for Ridge Logistic Regression: 0.9311



Best parameters for Elastic Net Logistic Regression: {'C': 24.201282647943835, 'l1_ratio': 1.0}
Best cross-validated f1-score for Elastic Net Logistic Regression: 0.9311

Lasso Logistic Regression Accuracy Score on Validation Set: 0.8689
Ridge Logistic Regression Accuracy Score on Validation Set: 0.8689
Elastic Net Logistic Regression Accuracy Score on Validation Set: 0.8689
Lasso Logistic Regression Precision Score on Validation Set: 0.8594
Ridge Logistic Regression Precision Score on Validation Set: 0.8594
Elastic Net Logistic Regression Precision Score on Validation Set: 0.8594
Lasso Logistic Regression Recall Score on Validation Set: 0.9986
Ridge Logistic Regression Recall Score on Validation Set: 0.9986
Elastic Net Logistic Regression Recall Score on Validation Set: 0.9986
Lasso Logistic Regression F1 Score on Validation Set: 0.9238
Ridge Logistic Regression F1 Score on Validation Set: 0.9238
Elastic Net Logistic Regression F1 Score on Validation Set: 0.9238


In [34]:
# Rebuild each regularised model with the hyper-parameters chosen by its grid search.
# best_params_ holds exactly the searched keys ('C', plus 'l1_ratio' for elastic net),
# so unpacking it passes the same keyword arguments as naming them one by one.
lasso_logistic_final_model = LogisticRegression(
    penalty='l1', solver='liblinear', max_iter=10000, random_state=888,
    **lasso_logistic_grid_search.best_params_,
)
ridge_logistic_final_model = LogisticRegression(
    penalty='l2', solver='liblinear', max_iter=10000, random_state=888,
    **ridge_logistic_grid_search.best_params_,
)
elasticnet_logistic_final_model = LogisticRegression(
    penalty='elasticnet', solver='saga', max_iter=10000, random_state=888,
    **elasticnet_logistic_grid_search.best_params_,
)

# Refit on the full X, y (training and validation rows together).
lasso_logistic_final_model.fit(X, y)
ridge_logistic_final_model.fit(X, y)
elasticnet_logistic_final_model.fit(X, y)

# Show the feature names, then each model's coefficients and intercept.
features_list
for final_label, final_model in (
    ("Lasso", lasso_logistic_final_model),
    ("Ridge", ridge_logistic_final_model),
    ("Elastic Net", elasticnet_logistic_final_model),
):
    print(f"{final_label} Coefficients: {final_model.coef_}")
    print(f"{final_label} Intercept: {final_model.intercept_}\n")
display(HTML("<p style='text-align: left; font-weight: bold;'>Figure. 7</p>"))

['GENDER',
 'SMOKING',
 'YELLOW_FINGERS',
 'ANXIETY',
 'PEER_PRESSURE',
 'CHRONIC DISEASE',
 'FATIGUE ',
 'ALLERGY ',
 'WHEEZING',
 'ALCOHOL CONSUMING',
 'COUGHING',
 'SHORTNESS OF BREATH',
 'SWALLOWING DIFFICULTY',
 'CHEST PAIN']

Lasso Coefficients: [[-0.86072428  0.38998326  1.36091438  0.03863578  0.25641876 -0.13759969
   0.56400809  0.71976192 -0.63376744  1.49969053 -0.02010705 -0.34658734
   0.73018915 -0.04867494]]
Lasso Intercept: [0.0608817]

Ridge Coefficients: [[-0.86133569  0.39051987  1.36164057  0.03892791  0.25683957 -0.13815733
   0.56463184  0.7203818  -0.63440242  1.50061511 -0.02051129 -0.34724646
   0.73088401 -0.04912568]]
Ridge Intercept: [0.0608094]

Elastic Net Coefficients: [[-0.86108571  0.39037844  1.36140214  0.03881236  0.25668937 -0.13799801
   0.56444717  0.72020936 -0.63422002  1.50028956 -0.02038433 -0.34704875
   0.73069583 -0.04899489]]
Elastic Net Intercept: [0.06079248]

