<a href="https://colab.research.google.com/github/brianhphillips/testrepo/blob/main/LogReg_LungCancer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Step 1: Download dataset
path = kagglehub.dataset_download("aagambshah/lung-cancer-dataset")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/aagambshah/lung-cancer-dataset?dataset_version_number=1...


100%|██████████| 2.00k/2.00k [00:00<00:00, 3.37MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/aagambshah/lung-cancer-dataset/versions/1





In [2]:
# Step 2: Load and inspect the data
import pandas as pd
import os

df = pd.read_csv(os.path.join(path, "survey lung cancer.csv"))
print(df.head())

  GENDER  AGE  SMOKING  YELLOW_FINGERS  ANXIETY  PEER_PRESSURE  \
0      M   69        1               2        2              1   
1      M   74        2               1        1              1   
2      F   59        1               1        1              2   
3      M   63        2               2        2              1   
4      F   63        1               2        1              1   

   CHRONIC DISEASE  FATIGUE   ALLERGY   WHEEZING  ALCOHOL CONSUMING  COUGHING  \
0                1         2         1         2                  2         2   
1                2         2         2         1                  1         1   
2                1         2         1         2                  1         2   
3                1         1         1         1                  2         1   
4                1         1         1         2                  1         2   

   SHORTNESS OF BREATH  SWALLOWING DIFFICULTY  CHEST PAIN LUNG_CANCER  
0                    2                      

In [10]:
# Step 3: Preprocessing

# Strip leading/trailing spaces from column names
df.columns = df.columns.str.strip()

# Encode gender (M = 1, F = 0)
df['GENDER'] = df['GENDER'].astype(str).str.strip().str.upper().map({'M': 1, 'F': 0})

# Encode target variable (LUNG_CANCER) from YES/NO to 1/0
df['LUNG_CANCER'] = df['LUNG_CANCER'].astype(str).str.strip().str.upper().map({'YES': 1, 'NO': 0})

# Identify YES/NO columns excluding AGE, GENDER, and LUNG_CANCER
yes_no_columns = [
    col for col in df.columns
    if col not in ['AGE', 'GENDER', 'LUNG_CANCER']
    and df[col].nunique() == 2
]

# Convert all other YES/NO categorical columns to binary (1 = YES, 0 = NO)
for col in yes_no_columns:
    df[col] = df[col].astype(str).str.strip().str.upper().map({'YES': 1, 'NO': 0})

KeyError: 'LUNG_CANCER'

In [None]:
# Step 4: Train-test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Step 5: Feature scaling (optional but improves convergence)
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Step 6: Train Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

model = LogisticRegression()
model.fit(X_train_scaled, y_train)

In [None]:
# Step 7: Evaluation
y_pred = model.predict(X_test_scaled)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, model.predict_proba(X_test_scaled)[:, 1]))

In [None]:
# Step 8: Visualize Feature Importance
import matplotlib.pyplot as plt
import numpy as np

feature_importance = model.coef_[0]
feature_names = X.columns

# Plot
plt.figure(figsize=(10, 6))
indices = np.argsort(np.abs(feature_importance))[::-1]
plt.barh(range(len(feature_names)), feature_importance[indices], align='center')
plt.yticks(range(len(feature_names)), [feature_names[i] for i in indices])
plt.title("Feature Importance (Logistic Regression Coefficients)")
plt.xlabel("Coefficient Value")
plt.gca().invert_yaxis()
plt.grid(True)
plt.tight_layout()
plt.show()