# Importing Required Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, RocCurveDisplay, confusion_matrix, ConfusionMatrixDisplay
import ipywidgets as widgets
from IPython.display import display, clear_output
import warnings
warnings.filterwarnings('ignore')

# Dataset Loading

In [2]:
# Location of the survey workbook. The default is the Colab upload directory;
# edit this single constant when the notebook is run somewhere else.
DATA_PATH = '/content/Cervical Cancer Data.xlsx'

try:
    df = pd.read_excel(DATA_PATH, engine='openpyxl')
except FileNotFoundError as exc:
    # Same exception type as before, but with a message that says what to do.
    raise FileNotFoundError(
        f"Dataset not found at {DATA_PATH!r}. "
        "Upload the workbook to that location or point DATA_PATH at the file."
    ) from exc

FileNotFoundError: [Errno 2] No such file or directory: '/content/Cervical Cancer Data.xlsx'

In [None]:
# Preview the top of the raw frame (five rows)
print(df.head(n=5))

In [None]:
# Report the (rows, columns) dimensions of the raw frame
shape_message = f"Dataset shape: {df.shape}"
print(shape_message)

In [None]:
# List every column together with the dtype pandas inferred for it
column_types = df.dtypes
print(column_types)

In [None]:
# Descriptive statistics for all columns, numeric and non-numeric alike
summary_stats = df.describe(include='all')
print(summary_stats)

# Data Preprocessing

In [None]:
# Report how many values are missing in each column
print("Missing values per column:")
missing_per_column = df.isna().sum()
print(missing_per_column)

# Keep only the rows that have no missing value in any column
df_cleaned = df.dropna(axis=0, how='any')
print("Shape after removing missing values:", df_cleaned.shape)

In [None]:
# Count rows that exactly repeat an earlier row.
# NOTE(review): 'AutoID' and 'StataID' are only dropped in a later cell; if they
# are per-row identifiers, no two rows can compare equal here — confirm the order.
duplicate_mask = df_cleaned.duplicated(keep='first')
duplicates = duplicate_mask.sum()
print("Duplicate rows:", duplicates)

# Keep the first occurrence of every row and discard the repeats
df_cleaned = df_cleaned.drop_duplicates(keep='first')
print("Shape after removing duplicates:", df_cleaned.shape)

In [None]:
# Remove the columns that are not used as model inputs
columns_to_drop = ['AutoID', 'StataID']
df_cleaned = df_cleaned.drop(labels=columns_to_drop, axis='columns')

In [None]:
# Encode categorical columns
# `label_enc` stays defined even when there is nothing to encode.
label_enc = LabelEncoder()

# Identify object (categorical) columns
cat_cols = df_cleaned.select_dtypes(include='object').columns

# Fit a separate encoder for every column and keep it. Refitting one shared
# encoder would discard each column's code -> category mapping, which is needed
# to interpret the integer codes later (see `label_encoders[col].classes_`).
label_encoders = {}
for col in cat_cols:
    label_enc = LabelEncoder()
    df_cleaned[col] = label_enc.fit_transform(df_cleaned[col])
    label_encoders[col] = label_enc

In [None]:
# Column dtypes once the object columns have been replaced by integer codes
print("Data types after encoding:")
encoded_types = df_cleaned.dtypes
print(encoded_types)

In [None]:
# First five rows of the cleaned, encoded frame
print("Sample of preprocessed data:")
print(df_cleaned.head(n=5))

# Exploratory Data Analysis (EDA)

In [None]:
# Set plot style
# Only the seaborn theme is configured here. Every plotting cell creates its own
# figure with an explicit size, so opening a figure in this cell would only
# render an empty "Figure ... with 0 Axes" output.
sns.set(style="whitegrid")

In [None]:
# Count of respondents in each SHS class, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='SHSClass', palette='Set2', ax=ax)
ax.set_title("Distribution of SHS Class")
plt.show()

In [None]:
# Share of each 'CCCause' response, shown as a pie chart
fig, ax = plt.subplots(figsize=(6, 6))
cause_counts = df['CCCause'].value_counts()
cause_counts.plot.pie(ax=ax, autopct='%1.1f%%', colors=['#66b3ff', '#ff9999'])
ax.set_title("Proportion of Correct vs Incorrect Causes")
ax.set_ylabel('')  # hide the default y-label pandas derives from the series name
plt.show()

In [None]:
# Age distribution: 10-bin histogram with a KDE overlay
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['age'], bins=10, kde=True, color='purple', ax=ax)
ax.set_title("Age Distribution of Respondents")
ax.set_xlabel("Age")
ax.set_ylabel("Frequency")
plt.show()

In [None]:
# Pairwise correlations between the domain score columns
domain_cols = ['DomainI', 'DomainII', 'DomainIII', 'DomainIV', 'DomainOverAll']
domain_corr = df[domain_cols].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(domain_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap of Domain Scores")
plt.show()

In [None]:
# Spread of respondent age within each SHS class
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='SHSClass', y='age', palette='Set3', ax=ax)
ax.set_title("Age Variation Across SHS Classes")
plt.show()

In [None]:
# Number of respondents per 'RegularScreenPapSmear' answer
fig, ax = plt.subplots(figsize=(6, 5))
sns.countplot(data=df, x='RegularScreenPapSmear', palette='pastel', ax=ax)
ax.set_title("Pap Smear Screening Regularity")
plt.show()

In [3]:
# Age of each respondent grouped by 'CCCurable', coloured by 'SexAtYoungAge'
fig, ax = plt.subplots(figsize=(8, 5))
sns.stripplot(data=df, x='CCCurable', y='age', hue='SexAtYoungAge',
              palette='cool', ax=ax)
ax.set_title("Sex at Young Age vs Belief That Cancer is Curable")
ax.legend(loc='upper right')
plt.show()

NameError: name 'df' is not defined

<Figure size 800x500 with 0 Axes>

In [4]:
# Cross-tabulate 'HPVImmunization' (rows) against 'RegularScreenPapSmear' (columns)
hpv_screen = pd.crosstab(index=df['HPVImmunization'],
                         columns=df['RegularScreenPapSmear'])

# One stacked bar per immunization answer, segmented by screening answer
ax = hpv_screen.plot.bar(stacked=True, color=['#6a0dad', '#00ced1'], figsize=(8, 5))
ax.set_title("HPV Immunization vs Pap Smear Screening")
ax.set_xlabel("HPV Immunization")
ax.set_ylabel("Count")
ax.legend(title="Regular Pap Smear")
plt.xticks(rotation=0)  # keep the category labels horizontal
plt.tight_layout()
plt.show()

NameError: name 'df' is not defined

In [5]:
# Radar (polar) chart of the mean score in each of the four domains
labels = ['Domain I', 'Domain II', 'Domain III', 'Domain IV']
radar_columns = ['DomainI', 'DomainII', 'DomainIII', 'DomainIV']
domain_means = df[radar_columns].mean().to_numpy()

# One evenly spaced angle per domain around the full circle
angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()

# Repeat the first point at the end so the outline closes on itself
domain_means = np.append(domain_means, domain_means[0])
angles = angles + [angles[0]]

fig, ax = plt.subplots(figsize=(6, 6), subplot_kw={'polar': True})
ax.plot(angles, domain_means, color='red', linewidth=2)
ax.fill(angles, domain_means, color='red', alpha=0.25)
ax.set_yticklabels([])       # hide the radial tick labels
ax.set_xticks(angles[:-1])   # one tick per domain (skip the closing duplicate)
ax.set_xticklabels(labels)
plt.show()

NameError: name 'df' is not defined

# Machine Learning Models

In [6]:
# Model inputs: the ten predictor columns, in the order the models will see them
feature_columns = [
    'age', 'SexAtYoungAge', 'HPVImmunization', 'VaginalBleeding',
    'FrequentUrination', 'CCVaginalDischarge', 'PainfulSex',
    'AvoidSmoking', 'RegularExercise', 'RegularScreenPapSmear',
]
X = df_cleaned[feature_columns]

# Model target
y = df_cleaned['DomainOverAllCat']

NameError: name 'df_cleaned' is not defined

In [7]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

NameError: name 'X' is not defined

In [8]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the test features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

NameError: name 'X_train' is not defined

## Logistic Regression

In [9]:
# Fit logistic regression on the standardized training features
lr = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Hard class predictions, plus the probability given to the second class
y_pred = lr.predict(X_test_scaled)
test_probabilities = lr.predict_proba(X_test_scaled)
y_proba = test_probabilities[:, 1]

# Evaluation on the held-out test set
accuracy = 100 * accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy:.2f}%\n")
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)
auc_score = roc_auc_score(y_test, y_proba)
print("ROC-AUC Score:", auc_score)

NameError: name 'X_train_scaled' is not defined

In [10]:
# Confusion matrix of the logistic regression predictions on the test set
cm_display = ConfusionMatrixDisplay.from_estimator(
    lr, X_test_scaled, y_test, cmap='Blues')
cm_display.ax_.set_title("Confusion Matrix - Logistic Regression")
plt.show()

NameError: name 'X_test_scaled' is not defined

## Decision Trees

In [11]:
# Model
# A fixed seed makes the fitted tree, and every metric derived from it,
# reproducible across kernel restarts (same seed as the train/test split).
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train)
y_pred = dt.predict(X_test_scaled)
# Column 1 is the probability of the second entry in dt.classes_.
# NOTE(review): this and roc_auc_score below assume a binary target — confirm.
y_proba = dt.predict_proba(X_test_scaled)[:, 1]

# Evaluation on the held-out test set
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"Decision Tree Accuracy: {accuracy:.2f}%\n")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_proba))

NameError: name 'X_train_scaled' is not defined

In [12]:
# Confusion matrix of the decision tree predictions on the test set
cm_display = ConfusionMatrixDisplay.from_estimator(
    dt, X_test_scaled, y_test, cmap='Greens')
cm_display.ax_.set_title("Confusion Matrix - Decision Tree")
plt.show()

NameError: name 'X_test_scaled' is not defined

## Random Forest

In [13]:
# Model
# Bootstrapping and feature sub-sampling are random; a fixed seed keeps the
# forest and its metrics reproducible (same seed as the train/test split).
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_scaled, y_train)
y_pred = rf.predict(X_test_scaled)
# Column 1 is the probability of the second entry in rf.classes_.
# NOTE(review): this and roc_auc_score below assume a binary target — confirm.
y_proba = rf.predict_proba(X_test_scaled)[:, 1]

# Evaluation on the held-out test set
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"Random Forest Accuracy: {accuracy:.2f}%\n")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_proba))

NameError: name 'X_train_scaled' is not defined

In [14]:
# Confusion matrix of the random forest predictions on the test set
cm_display = ConfusionMatrixDisplay.from_estimator(
    rf, X_test_scaled, y_test, cmap='Oranges')
cm_display.ax_.set_title("Confusion Matrix - Random Forest")
plt.show()

NameError: name 'X_test_scaled' is not defined

## Support Vector Machines

In [15]:
# Model
# probability=True calibrates probabilities with an internal, randomly shuffled
# cross-validation; seeding it keeps predict_proba (and the ROC-AUC) reproducible.
svm = SVC(probability=True, random_state=42)
svm.fit(X_train_scaled, y_train)
y_pred = svm.predict(X_test_scaled)
# Column 1 is the probability of the second entry in svm.classes_.
# NOTE(review): this and roc_auc_score below assume a binary target — confirm.
y_proba = svm.predict_proba(X_test_scaled)[:, 1]

# Evaluation on the held-out test set
accuracy = accuracy_score(y_test, y_pred) * 100
print(f"SVM Accuracy: {accuracy:.2f}%\n")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_proba))

NameError: name 'X_train_scaled' is not defined

In [16]:
# Confusion matrix of the SVM predictions on the test set
cm_display = ConfusionMatrixDisplay.from_estimator(
    svm, X_test_scaled, y_test, cmap='Purples')
cm_display.ax_.set_title("Confusion Matrix - SVM")
plt.show()

NameError: name 'X_test_scaled' is not defined

## Model Comparisons

### Accuracy Comparison

In [17]:
# Test-set accuracy (as a fraction) of each fitted model, in display order
models = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'SVM']
accuracies = [estimator.score(X_test_scaled, y_test)
              for estimator in (lr, dt, rf, svm)]

# One seaborn 'Set2' colour per model
colors = sns.color_palette('Set2', len(models))

# Bar chart of accuracy expressed in percent
accuracy_percent = [a * 100 for a in accuracies]
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(models, accuracy_percent, color=colors)
ax.set_ylabel('Accuracy (%)')
ax.set_title('Model Accuracy Comparison')
ax.set_ylim(0, 100)
plt.xticks(rotation=15)
ax.grid(axis='y')
plt.tight_layout()
plt.show()

NameError: name 'X_test_scaled' is not defined

### Evaluation Metrics Comparison

In [18]:
# Metrics
# Predict once per fitted model and reuse the result for all three metrics,
# instead of calling predict() again for each metric.
test_predictions = [m.predict(X_test_scaled) for m in [lr, dt, rf, svm]]
precision = [precision_score(y_test, pred) for pred in test_predictions]
recall = [recall_score(y_test, pred) for pred in test_predictions]
f1 = [f1_score(y_test, pred) for pred in test_predictions]

# `models` (the display names) is defined in the accuracy-comparison cell.
# numpy is already imported in the notebook's import cell as `np`.
x = np.arange(len(models))  # one group of bars per model
width = 0.25                # width of a single bar within a group

# Plot: three side-by-side bars (precision, recall, F1) for each model
plt.figure(figsize=(10, 5))
plt.bar(x - width, precision, width, label='Precision', color='red')
plt.bar(x, recall, width, label='Recall', color='pink')
plt.bar(x + width, f1, width, label='F1-Score', color='blue')

plt.ylabel('Score')
plt.title('Precision, Recall & F1-Score Comparison')
plt.xticks(x, models, rotation=15)
plt.ylim(0, 1)
plt.legend()
plt.tight_layout()
plt.grid(axis='y')
plt.show()


NameError: name 'y_test' is not defined

### ROC-AUC Comparison

In [19]:
# ROC curve points for each model, computed from the probability of the second class
fpr_lr, tpr_lr, _ = roc_curve(y_test, lr.predict_proba(X_test_scaled)[:, 1])
fpr_dt, tpr_dt, _ = roc_curve(y_test, dt.predict_proba(X_test_scaled)[:, 1])
fpr_rf, tpr_rf, _ = roc_curve(y_test, rf.predict_proba(X_test_scaled)[:, 1])
fpr_svm, tpr_svm, _ = roc_curve(y_test, svm.predict_proba(X_test_scaled)[:, 1])

# Draw all four curves on a single Axes, in a fixed order
fig, ax = plt.subplots(figsize=(8, 6))
roc_series = [
    (fpr_lr, tpr_lr, 'Logistic Regression'),
    (fpr_dt, tpr_dt, 'Decision Tree'),
    (fpr_rf, tpr_rf, 'Random Forest'),
    (fpr_svm, tpr_svm, 'SVM'),
]
for false_pos, true_pos, curve_label in roc_series:
    ax.plot(false_pos, true_pos, label=curve_label)

# Diagonal reference line plus labelling
ax.plot([0, 1], [0, 1], 'k--', label='Chance')
ax.set_title('ROC Curves for Classification Models')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend()
ax.grid(True)
plt.tight_layout()
plt.show()

NameError: name 'y_test' is not defined

<Figure size 800x600 with 0 Axes>

# Cervical Cancer Risk Prediction GUI

In [20]:
# Input widgets
def _yes_no_dropdown(description):
    """Return a dropdown offering 'No' (value 0) and 'Yes' (value 1)."""
    return widgets.Dropdown(options=[('No', 0), ('Yes', 1)], description=description)


# NOTE(review): the 0/1 values assume the training columns were encoded as
# No -> 0 and Yes -> 1 — confirm against the encoded data.
age = widgets.IntSlider(value=16, min=14, max=18, description='Age:')
sex_at_young_age = _yes_no_dropdown('Sex Young:')
hpv_immunization = _yes_no_dropdown('HPV Vaccine:')
vaginal_bleeding = _yes_no_dropdown('Bleeding:')
frequent_urination = _yes_no_dropdown('Urination:')
vaginal_discharge = _yes_no_dropdown('Discharge:')
painful_sex = _yes_no_dropdown('Painful Sex:')
avoid_smoking = _yes_no_dropdown('Avoid Smoking:')
regular_exercise = _yes_no_dropdown('Exercise:')
regular_screen = _yes_no_dropdown('Pap Smear:')

# Button
predict_btn = widgets.Button(description="Predict Cancer Risk", button_style='danger')
output = widgets.Output()


# Prediction logic
def on_predict_clicked(b):
    """Scale the current widget values, classify them with `lr`, and print the label.

    Args:
        b: the object passed by the button's click callback; it is not used.

    The result is printed inside the `output` widget, replacing whatever was
    shown there before.
    """
    with output:
        clear_output()

        # Gather input as a single row, in the same column order as the
        # feature matrix the scaler was fitted on
        input_data = np.array([[age.value,
                                sex_at_young_age.value,
                                hpv_immunization.value,
                                vaginal_bleeding.value,
                                frequent_urination.value,
                                vaginal_discharge.value,
                                painful_sex.value,
                                avoid_smoking.value,
                                regular_exercise.value,
                                regular_screen.value]])

        # When the scaler was fitted on a DataFrame it remembers the column
        # names. Passing a frame with those names avoids scikit-learn's
        # feature-name mismatch warning that a bare array would trigger.
        feature_names = getattr(scaler, 'feature_names_in_', None)
        if feature_names is not None:
            input_data = pd.DataFrame(input_data, columns=feature_names)

        # Scale and predict
        input_scaled = scaler.transform(input_data)
        pred = lr.predict(input_scaled)[0]

        # Label mapping
        # NOTE(review): the model's target is 'DomainOverAllCat'; confirm that
        # its codes 0 and 1 really mean "Low Risk" and "High Risk".
        label_map = {0: "Low Risk", 1: "High Risk"}
        print("Predicted Cervical Cancer Risk Level:", label_map.get(pred, "Unknown"))


# Bind and display
predict_btn.on_click(on_predict_clicked)

display(widgets.VBox([
    age, sex_at_young_age, hpv_immunization, vaginal_bleeding,
    frequent_urination, vaginal_discharge, painful_sex,
    avoid_smoking, regular_exercise, regular_screen,
    predict_btn, output
]))

VBox(children=(IntSlider(value=16, description='Age:', max=18, min=14), Dropdown(description='Sex Young:', opt…