In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
df =pd.read_csv("cervical-cancer_csv.csv")

In [3]:
df=df.dropna(subset=["Biopsy"])

In [4]:
X = df.drop('Biopsy', axis=1,inplace=False)
y = df['Biopsy']


In [5]:
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns

numeric_transformer = SimpleImputer(strategy='mean')
categorical_transformer = SimpleImputer(strategy='most_frequent')

# Create the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Fit and transform the data
df = pd.DataFrame(preprocessor.fit_transform(X), columns=X.columns)
df=pd.concat([df, y.reset_index(drop=True)], axis=1)



In [6]:
# Remove duplicate rows
df = df.drop_duplicates()
df.drop(['STDs: Time since first diagnosis', 'STDs: Time since last diagnosis'], axis=1, inplace=True)
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns

In [8]:
scaler = MinMaxScaler()
# df = scaler.fit_transform(X)
X = df.drop('Biopsy', axis=1,inplace=False)
y = df['Biopsy']
X_normalized_df = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Concatenate the normalized features with the target variable
df = pd.concat([X_normalized_df, y.reset_index(drop=True)], axis=1)
X = df.drop('Biopsy', axis=1)
y = df['Biopsy']       
missing_values=df.isnull().sum()
print(f"Missing entries: {missing_values}")

Missing entries: Age                                   0
Number of sexual partners             0
First sexual intercourse              0
Num of pregnancies                    0
Smokes                                0
Smokes (years)                        0
Smokes (packs/year)                   0
Hormonal Contraceptives               0
Hormonal Contraceptives (years)       0
IUD                                   0
IUD (years)                           0
STDs                                  0
STDs (number)                         0
STDs:condylomatosis                   0
STDs:cervical condylomatosis          0
STDs:vaginal condylomatosis           0
STDs:vulvo-perineal condylomatosis    0
STDs:syphilis                         0
STDs:pelvic inflammatory disease      0
STDs:genital herpes                   0
STDs:molluscum contagiosum            0
STDs:AIDS                             0
STDs:HIV                              0
STDs:Hepatitis B                      0
STDs:HPV               

In [9]:
model_gbc = GradientBoostingClassifier()
model_gbc.fit(X, y)

# Predictions
y_pred_gbc = model_gbc.predict(X)

# Evaluation Metrics
accuracy_gbc = accuracy_score(y, y_pred_gbc)
precision_gbc = precision_score(y, y_pred_gbc)
recall_gbc = recall_score(y, y_pred_gbc)
f1_gbc = f1_score(y, y_pred_gbc)
roc_auc_gbc = roc_auc_score(y, model_gbc.predict_proba(X)[:, 1])

# Print Metrics
print("Gradient Boosting Classifier Metrics:")
print(f"Accuracy: {accuracy_gbc:.2f}")
print(f"Precision: {precision_gbc:.2f}")
print(f"Recall: {recall_gbc:.2f}")
print(f"F1 Score: {f1_gbc:.2f}")
print(f"ROC-AUC Score: {roc_auc_gbc:.2f}")

Gradient Boosting Classifier Metrics:
Accuracy: 1.00
Precision: 0.96
Recall: 0.96
F1 Score: 0.96
ROC-AUC Score: 1.00
