In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import numpy as np

In [4]:
# Load the test dataset
test_data = pd.read_csv("lung cancer survey_test.csv")

# Data cleaning: remove rows with missing values and standardise column names.
# DataFrame.dropna() returns a new frame (it is not in-place by default), so the
# result must be assigned back; otherwise rows containing NaN are silently kept.
test_data = test_data.dropna()
test_data.columns = test_data.columns.str.replace('_', ' ')  # underscores -> spaces in column names

# Separate the data into predictors and the target variable.
# NOTE(review): 'ALLERGY ' carries a trailing space, presumably matching the raw
# CSV header -- confirm against the file before "tidying" it.
X = test_data.drop(['LUNG CANCER', 'ALLERGY ', 'ANXIETY', 'PEER PRESSURE'], axis=1)  # drop target and excluded predictors
y = test_data['LUNG CANCER']  # target variable

# Final classifier: logistic regression with an ElasticNet penalty.
# C and l1_ratio are hard-coded to the values the author recorded as the best parameters.
elasticnet_logistic_final_model = LogisticRegression(
    C=16.681005372000556,   # recorded best value for C
    l1_ratio=0.01,          # recorded best value for l1_ratio
    penalty='elasticnet',
    solver='saga',
    max_iter=10000,
    random_state=888,       # fixed seed for reproducibility
)

# Train the final model on X, y and predict on the same X. fit() returns the
# fitted estimator, so the two steps are chained into one expression.
# NOTE(review): the model is fitted and scored on the same X, y, so the F1 below
# is an in-sample score rather than a held-out estimate -- confirm this is intended.
y_pred = elasticnet_logistic_final_model.fit(X, y).predict(X)

# Report the F1 score of the predictions against the true labels
print(f"The F1 Score of the Logistic Regression with ElasticNet Regularisation : {f1_score(y, y_pred)}")

The F1 Score of the Logistic Regression with ElasticNet Regularisation : 0.9256567915036332
