In [1]:

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
# Load the raw dataset from the CSV file that sits next to the notebook.
df = pd.read_csv(
    "cervical-cancer_csv.csv",
)

In [3]:
# Keep only the rows whose 'Biopsy' target is present; rows with a missing
# label cannot be used for supervised training.
has_label = df["Biopsy"].notna()
df = df[has_label]

In [4]:
# Target: the 'Biopsy' column. Features: every other column.
y = df["Biopsy"]
X = df.drop(columns="Biopsy")


In [5]:
# Impute missing feature values: column mean for numeric columns, most
# frequent value for object/bool columns.
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns

numeric_transformer = SimpleImputer(strategy='mean')
categorical_transformer = SimpleImputer(strategy='most_frequent')

# Create the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Fit and transform the data.
# ColumnTransformer concatenates its outputs in transformer order (all numeric
# columns first, then all categorical ones) and drops columns that were not
# selected by any transformer. Labelling the result with X.columns would
# therefore attach the wrong names whenever a categorical column is not
# already last, so label it with the order that is actually produced.
imputed_cols = list(numeric_cols) + list(categorical_cols)
imputed = pd.DataFrame(preprocessor.fit_transform(X), columns=imputed_cols)

# When numeric and object blocks are stacked together the result is an object
# array; restore a float dtype for the numeric columns (a no-op when the
# output is already float).
imputed = imputed.astype({col: 'float64' for col in numeric_cols})

# Put the columns back in the order they had in X.
imputed = imputed[[col for col in X.columns if col in imputed_cols]]

# Re-attach the target. The imputed frame has a fresh 0..n-1 index, so y's
# index is reset to line the rows up positionally.
df = pd.concat([imputed, y.reset_index(drop=True)], axis=1)



In [6]:
# De-duplicate the rows first, then discard the two "time since diagnosis"
# columns; done as one chained expression instead of an in-place drop.
df = df.drop_duplicates().drop(
    columns=[
        'STDs: Time since first diagnosis',
        'STDs: Time since last diagnosis',
    ]
)

# Recompute the object/bool column list against the reduced frame.
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns

In [74]:
# Min-max scale every feature column; the 'Biopsy' target is left untouched.
y = df['Biopsy']
X = df.drop(columns='Biopsy')

# NOTE(review): the scaler is fitted on all rows here, before the train/test
# split performed in a later cell, so the test rows contribute to the fitted
# min/max. Consider fitting on the training rows only.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X)
X_normalized_df = pd.DataFrame(scaled_values, columns=X.columns)

# Rebuild df from the scaled features plus the target. X_normalized_df has a
# fresh 0..n-1 index, so y's index is reset to align the rows positionally.
df = pd.concat([X_normalized_df, y.reset_index(drop=True)], axis=1)


In [7]:
# Separate the predictors from the 'Biopsy' label.
# (train_test_split is already imported in the notebook's import cell, so the
# duplicate import that used to live here has been removed.)
X = df.drop(columns='Biopsy')
y = df['Biopsy']

# Hold out 20% of the rows for evaluation with a fixed seed. The split is
# stratified on y so that the train and test folds keep the same class
# proportions; the downstream precision/recall/ROC-AUC metrics need both
# classes to be present in the test fold.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [15]:
print(df.dtypes)

# Fit a gradient-boosting classifier on the training fold. The seed is fixed
# so that re-running the notebook reproduces the same model and metrics.
model_gbc = GradientBoostingClassifier(random_state=42)
model_gbc.fit(X_train, y_train)

# Predictions: hard labels for the threshold-based metrics, and the
# probability of the second class (model_gbc.classes_[1]) for ROC-AUC.
y_pred_gbc = model_gbc.predict(X_test)
y_proba_gbc = model_gbc.predict_proba(X_test)[:, 1]

# Evaluation Metrics
accuracy_gbc = accuracy_score(y_test, y_pred_gbc)
precision_gbc = precision_score(y_test, y_pred_gbc)
recall_gbc = recall_score(y_test, y_pred_gbc)
f1_gbc = f1_score(y_test, y_pred_gbc)
roc_auc_gbc = roc_auc_score(y_test, y_proba_gbc)

# Print Metrics
# print("Gradient Boosting Classifier Metrics:")
# print(f"Accuracy: {accuracy_gbc:.2f}")
# print(f"Precision: {precision_gbc:.2f}")
# print(f"Recall: {recall_gbc:.2f}")
# print(f"F1 Score: {f1_gbc:.2f}")
# print(f"ROC-AUC Score: {roc_auc_gbc:.2f}")

Age                                   float64
Number of sexual partners             float64
First sexual intercourse              float64
Num of pregnancies                    float64
Smokes                                float64
Smokes (years)                        float64
Smokes (packs/year)                   float64
Hormonal Contraceptives               float64
Hormonal Contraceptives (years)       float64
IUD                                   float64
IUD (years)                           float64
STDs                                  float64
STDs (number)                         float64
STDs:condylomatosis                   float64
STDs:cervical condylomatosis          float64
STDs:vaginal condylomatosis           float64
STDs:vulvo-perineal condylomatosis    float64
STDs:syphilis                         float64
STDs:pelvic inflammatory disease      float64
STDs:genital herpes                   float64
STDs:molluscum contagiosum            float64
STDs:AIDS                         