In [5]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [7]:
import numpy as np
import pandas as pd
df = pd.read_csv('lung cancer survey.csv')
df_no_na = df.dropna()

df_age = df_no_na[df_no_na["AGE"] > 21]
# Use df_age for all models, where clustering algorithm models, such as Kmeans, randomforest, decision tree as they would have already categorise an age threshold within the model
df_age

# Use df_cluster only for logistic regression, lasso, ridge and elastic net - cross comparison of these models with df_age
df_cluster = df_age.copy()
df_cluster['cluster'] = df_cluster['AGE'].apply(lambda x: 1 if x >= 61 else 0)
df_cluster = df_cluster.drop("AGE", axis = 1)
df_cluster

Unnamed: 0,GENDER,AGE,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER
0,0.0,61.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,70.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
2,1.0,59.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,54.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
4,0.0,54.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8996,1.0,62.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
8997,0.0,71.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0
8998,1.0,63.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
8999,1.0,70.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0


Unnamed: 0,GENDER,SMOKING,YELLOW_FINGERS,ANXIETY,PEER_PRESSURE,CHRONIC DISEASE,FATIGUE,ALLERGY,WHEEZING,ALCOHOL CONSUMING,COUGHING,SHORTNESS OF BREATH,SWALLOWING DIFFICULTY,CHEST PAIN,LUNG_CANCER,cluster
0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
1,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1
2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0
3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0
4,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8996,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1
8997,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1
8998,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1
8999,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1


In [9]:
# Cost complexity pruning, find the best alpha 
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import export_text

X = df_age.drop('LUNG_CANCER', axis=1)  # Drop the target column to get the features
y = df_age['LUNG_CANCER']  # Extract the target column

# Step 1: Split the data into training and validation sets (80% training, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=888
)

# Step 2 & 3: Get a sequence of subtrees
# Define your Decision Tree
model_tree = DecisionTreeClassifier(criterion='entropy', random_state=888)

# Use cost complexity pruning and get sequence of alphas
path = model_tree.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas  # effective alphas of each subtree

# Define the Decision Tree for each alpha
trees = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(criterion = 'entropy', random_state = 888, ccp_alpha = ccp_alpha)
    trees.append(clf)

# Step 4: Use K-fold cross-validation to choose the best alpha
kf = KFold(n_splits=10, shuffle=True, random_state=888)
mean_cv_errors = []

for clf in trees:
    # Perform cross-validation and get the average error for each alpha
    cv_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='accuracy')
    mean_cv_errors.append(np.mean(cv_scores))  # Get average accuracy for each alpha

# Step 5: Find the optimal alpha (the one that maximizes the cross-validated accuracy)
optimal_alpha_index = np.argmax(mean_cv_errors)
optimal_alpha = ccp_alphas[optimal_alpha_index]
print(f"Optimal alpha: {optimal_alpha}")

# Step 6: Prune the tree using the optimal alpha on the training set
pruned_tree = DecisionTreeClassifier(criterion='entropy', random_state=888, ccp_alpha=optimal_alpha)
pruned_tree.fit(X_train, y_train)

# Step 7: Evaluate the pruned tree on the validation set
y_pred_pruned = pruned_tree.predict(X_val)
print(f"Accuracy with pruning: {accuracy_score(y_val, y_pred_pruned):.4f}")
print(f"Precision with pruning: {precision_score(y_val, y_pred_pruned):.4f}")
print(f"Recall with pruning: {recall_score(y_val, y_pred_pruned):.4f}")
print(f"F1-Score with pruning: {f1_score(y_val, y_pred_pruned):.4f}")

Optimal alpha: 0.000805279635025675


Accuracy with pruning: 0.8728
Precision with pruning: 0.8697
Recall with pruning: 0.9881
F1-Score with pruning: 0.9251


In [36]:
from sklearn.ensemble import RandomForestClassifier

X = df_age.drop('LUNG_CANCER', axis=1)  # Drop the target column to get the features
y = df_age['LUNG_CANCER']  # Extract the target column

# Step 1: Split the data into training and validation sets (80% training, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=888
)

# Initialize the model (Note: Bagging is a special case of RF when m = p) & max_features = 1 (by default, meaning all features considered)
Bagging_classifier = RandomForestClassifier(n_estimators = 100, random_state = 42, criterion = 'entropy',) 

# Train the model on the training data
Bagging_classifier.fit(X_train, y_train)

# Make predictions on the validation data
y_pred_bagging = Bagging_classifier.predict(X_val)

# Display the results
print(f"Accuracy: {accuracy_score(y_val, y_pred_pruned):.4f}")
print(f"Precision: {precision_score(y_val, y_pred_pruned):.4f}")
print(f"Recall: {recall_score(y_val, y_pred_pruned):.4f}")
print(f"F1 Score: {f1_score(y_val, y_pred_bagging):.4f}")

# Should we do a 70-30% Spilt for training n test? Does it affect the results?

Accuracy: 0.8728
Precision: 0.8697
Recall: 0.9881
F1 Score: 0.9227


In [38]:
# With GridSearch
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

X = df_age.drop('LUNG_CANCER', axis=1)  # Drop the target column to get the features
y = df_age['LUNG_CANCER']  # Extract the target column

# Step 1: Split the data into training and validation sets (80% training, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=888
)

# Define the parameter grid for Grid Search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Initialize the Random Forest model
Bagging_classifier = RandomForestClassifier(random_state=42, criterion='entropy')

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=Bagging_classifier, param_grid=param_grid, 
                           scoring='f1', cv=5, n_jobs=-1, verbose=2)

# Train the model on the training data
grid_search.fit(X_train, y_train)

# Make predictions on the validation data using the best estimator
y_pred_bagging = grid_search.best_estimator_.predict(X_val)

# Display the results
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy_score(y_val, y_pred_bagging):.4f}")
print(f"Precision: {precision_score(y_val, y_pred_bagging):.4f}")
print(f"Recall: {recall_score(y_val, y_pred_bagging):.4f}")
print(f"Bagging F1 Score: {f1_score(y_val, y_pred_bagging):.4f}")

Fitting 5 folds for each of 36 candidates, totalling 180 fits


Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Accuracy: 0.8811
Precision: 0.8759
Recall: 0.9909
Bagging F1 Score: 0.9299


In [13]:
# CAN IGNORE - idt the pruned tree shld be used in conjunction with bagging or RF
# with ccp_alpha
# from sklearn.ensemble import RandomForestClassifier

# # Initialize the model (Note: Bagging is a special case of RF when m = p)
# Bagging_classifier = RandomForestClassifier(n_estimators = 100, random_state = 42, criterion = 'entropy', ccp_alpha=ccp_alpha) # max_features = None (by default)

# # Train the model on the training data
# Bagging_classifier.fit(X_train, y_train)

# # Make predictions on the validation data
# y_pred_bagging = Bagging_classifier.predict(X_val)

# # Display the results
# print(f"Accuracy with pruning: {accuracy_score(y_val, y_pred_pruned):.4f}")
# print(f"Precision with pruning: {precision_score(y_val, y_pred_pruned):.4f}")
# print(f"Recall with pruning: {recall_score(y_val, y_pred_pruned):.4f}")
# print(f"Bagging Validation F1 Score: {f1_score(y_val, y_pred_bagging):.4f}")

Accuracy with pruning: 0.8728
Precision with pruning: 0.8697
Recall with pruning: 0.9881
Bagging Validation F1 Score: 0.8861


In [15]:
? RandomForestClassifier

[0;31mInit signature:[0m
 [0mRandomForestClassifier[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mn_estimators[0m[0;34m=[0m[0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcriterion[0m[0;34m=[0m[0;34m'gini'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_depth[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_split[0m[0;34m=[0m[0;36m2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_leaf[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_weight_fraction_leaf[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_features[0m[0;34m=[0m[0;34m'sqrt'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_leaf_nodes[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_impurity_decrease[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbootstrap[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m

## Pairwise features included 

In [24]:
import pandas as pd
from itertools import product

feature_names = [col for col in df_cluster.columns if col != "LUNG_CANCER"]

df_pairwise = df_cluster.copy()
# Generate all combinations of interaction terms for these features
for i in range(len(feature_names)):
    for j in range(i + 1, len(feature_names)):
        new_column_name = f'{feature_names[i]}_{feature_names[j]}'
        df_pairwise[new_column_name] = df_pairwise[feature_names[i]] * df_pairwise[feature_names[j]]

print(df_pairwise)

      GENDER  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
0        0.0      0.0             0.0      1.0            1.0   
1        1.0      1.0             1.0      0.0            0.0   
2        1.0      0.0             0.0      0.0            0.0   
3        1.0      0.0             0.0      0.0            1.0   
4        0.0      1.0             0.0      0.0            1.0   
...      ...      ...             ...      ...            ...   
8996     1.0      0.0             1.0      1.0            1.0   
8997     0.0      1.0             1.0      1.0            0.0   
8998     1.0      1.0             0.0      0.0            1.0   
8999     1.0      1.0             1.0      0.0            0.0   
9099     1.0      0.0             0.0      0.0            1.0   

      CHRONIC DISEASE  FATIGUE   ALLERGY   WHEEZING  ALCOHOL CONSUMING  ...  \
0                 0.0       1.0       0.0       0.0                0.0  ...   
1                 1.0       0.0       1.0       1.0          

  df_pairwise[new_column_name] = df_pairwise[feature_names[i]] * df_pairwise[feature_names[j]]
  df_pairwise[new_column_name] = df_pairwise[feature_names[i]] * df_pairwise[feature_names[j]]
  df_pairwise[new_column_name] = df_pairwise[feature_names[i]] * df_pairwise[feature_names[j]]
  df_pairwise[new_column_name] = df_pairwise[feature_names[i]] * df_pairwise[feature_names[j]]
  df_pairwise[new_column_name] = df_pairwise[feature_names[i]] * df_pairwise[feature_names[j]]
  df_pairwise[new_column_name] = df_pairwise[feature_names[i]] * df_pairwise[feature_names[j]]
  df_pairwise[new_column_name] = df_pairwise[feature_names[i]] * df_pairwise[feature_names[j]]


In [40]:
# Cost complexity pruning, find the best alpha 
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import export_text

y = df_pairwise['LUNG_CANCER'].values
X = df_pairwise.drop("LUNG_CANCER", axis = 1).values

# Split the data into training and validation sets (80% training, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=888)

# Get a sequence of subtrees
# Define your Decision Tree
model_tree = DecisionTreeClassifier(criterion='entropy', random_state=888)

# Use cost complexity pruning and get sequence of alphas
path = model_tree.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas  # effective alphas of each subtree

# Define the Decision Tree for each alpha
trees = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(criterion = 'entropy', random_state = 888, ccp_alpha = ccp_alpha)
    trees.append(clf)

# Use K-fold cross-validation to choose the best alpha
kf = KFold(n_splits=10, shuffle=True, random_state=888)
mean_cv_errors = []

for clf in trees:
    # Perform cross-validation and get the average error for each alpha
    cv_scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='accuracy')
    mean_cv_errors.append(np.mean(cv_scores))  # Get average accuracy for each alpha

# Find the optimal alpha (the one that maximizes the cross-validated accuracy)
optimal_alpha_index = np.argmax(mean_cv_errors)
optimal_alpha = ccp_alphas[optimal_alpha_index]
print(f"Optimal alpha: {optimal_alpha}")

# Prune the tree using the optimal alpha on the training set
pruned_tree = DecisionTreeClassifier(criterion='entropy', random_state=888, ccp_alpha=optimal_alpha)
pruned_tree.fit(X_train, y_train)

# Evaluate the pruned tree on the validation set
y_pred_pruned = pruned_tree.predict(X_val)
print(f"Accuracy with pruning: {accuracy_score(y_val, y_pred_pruned):.4f}")
print(f"Precision with pruning: {precision_score(y_val, y_pred_pruned):.4f}")
print(f"Recall with pruning: {recall_score(y_val, y_pred_pruned):.4f}")
print(f"F1-Score with pruning: {f1_score(y_val, y_pred_pruned):.4f}")

[CV] END max_depth=None, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_depth=None, min_samples_split=2, n_estimators=200; total time=   1.4s
[CV] END max_depth=None, min_samples_split=5, n_estimators=200; total time=   1.1s
[CV] END max_depth=None, min_samples_split=10, n_estimators=200; total time=   1.0s
[CV] END max_depth=10, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END .max_depth=10, min_samples_split=5, n_estimators=50; total time=   0.3s
[CV] END max_depth=10, min_samples_split=5, n_estimators=100; total time=   0.5s
[CV] END max_depth=10, min_samples_split=5, n_estimators=200; total time=   1.0s
[CV] END max_depth=10, min_samples_split=10, n_estimators=200; total time=   1.0s
[CV] END max_depth=20, min_samples_split=2, n_estimators=100; total time=   0.6s
[CV] END .max_depth=20, min_samples_split=5, n_estimators=50; total time=   0.3s
[CV] END max_depth=20, min_samples_split=5, n_estimators=100; total time=   0.6s
[CV] END max_depth

Accuracy with pruning: 0.8767
Precision with pruning: 0.8735
Recall with pruning: 0.9881
F1-Score with pruning: 0.9273


In [34]:
y = df_pairwise['LUNG_CANCER'].values
X = df_pairwise.drop("LUNG_CANCER", axis = 1).values

# Split the data into training and validation sets (80% training, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=888)

# Initialize the model (Note: Bagging is a special case of RF when m = p) & max_features = 1 (by default, meaning all features considered)
Bagging_classifier = RandomForestClassifier(n_estimators = 100, random_state = 42, criterion = 'entropy',) 

# Train the model on the training data
Bagging_classifier.fit(X_train, y_train)

# Make predictions on the validation data
y_pred_bagging = Bagging_classifier.predict(X_val)

# Display the results
print(f"Accuracy: {accuracy_score(y_val, y_pred_pruned):.4f}")
print(f"Precision: {precision_score(y_val, y_pred_pruned):.4f}")
print(f"Recall: {recall_score(y_val, y_pred_pruned):.4f}")
print(f"Bagging: {f1_score(y_val, y_pred_bagging):.4f}")

Accuracy: 0.8728
Precision: 0.8697
Recall: 0.9881
Bagging: 0.9133


In [32]:
# With GridSearch
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y = df_pairwise['LUNG_CANCER'].values
X = df_pairwise.drop("LUNG_CANCER", axis = 1).values

# Split the data into training and validation sets (80% training, 20% validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=888)

# Define the parameter grid for Grid Search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Initialize the Random Forest model
Bagging_classifier = RandomForestClassifier(random_state=42, criterion='entropy')

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=Bagging_classifier, param_grid=param_grid, 
                           scoring='f1', cv=5, n_jobs=-1, verbose=2)

# Train the model on the training data
grid_search.fit(X_train, y_train)

# Make predictions on the validation data using the best estimator
y_pred_bagging = grid_search.best_estimator_.predict(X_val)

# Display the results
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy_score(y_val, y_pred_bagging):.4f}")
print(f"Precision: {precision_score(y_val, y_pred_bagging):.4f}")
print(f"Recall: {recall_score(y_val, y_pred_bagging):.4f}")
print(f"Bagging F1 Score: {f1_score(y_val, y_pred_bagging):.4f}")

Fitting 5 folds for each of 36 candidates, totalling 180 fits


Best Parameters: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 200}
Accuracy: 0.8817
Precision: 0.8760
Recall: 0.9916
Bagging F1 Score: 0.9302
