In [1]:
import tensorflow as tf
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
df =pd.read_csv("cervical-cancer_csv.csv")

In [3]:
df=df.dropna(subset=["Biopsy"])

In [4]:
X = df.drop('Biopsy', axis=1,inplace=False)
y = df['Biopsy']


In [5]:
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns

numeric_transformer = SimpleImputer(strategy='mean')
categorical_transformer = SimpleImputer(strategy='most_frequent')

# Create the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Fit and transform the data
df = pd.DataFrame(preprocessor.fit_transform(X), columns=X.columns)
df=pd.concat([df, y.reset_index(drop=True)], axis=1)



In [6]:
# Remove duplicate rows
df = df.drop_duplicates()
df.drop(['STDs: Time since first diagnosis', 'STDs: Time since last diagnosis'], axis=1, inplace=True)
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns

In [7]:
scaler = MinMaxScaler()
# df = scaler.fit_transform(X)
X = df.drop('Biopsy', axis=1,inplace=False)
y = df['Biopsy']
X_normalized_df = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Concatenate the normalized features with the target variable
df = pd.concat([X_normalized_df, y.reset_index(drop=True)], axis=1)


In [7]:
from sklearn.model_selection import train_test_split

X = df.drop('Biopsy', axis=1)
y = df['Biopsy']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [9]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [10]:
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.9316 - loss: 0.3044 - val_accuracy: 0.9552 - val_loss: 0.2021
Epoch 2/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9291 - loss: 0.2639 - val_accuracy: 0.9552 - val_loss: 0.1715
Epoch 3/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9204 - loss: 0.2715 - val_accuracy: 0.9552 - val_loss: 0.1637
Epoch 4/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9254 - loss: 0.2459 - val_accuracy: 0.9552 - val_loss: 0.1562
Epoch 5/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9286 - loss: 0.2224 - val_accuracy: 0.9552 - val_loss: 0.1624
Epoch 6/10
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9479 - loss: 0.1787 - val_accuracy: 0.9478 - val_loss: 0.1794
Epoch 7/10
[1m17/17[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x1abec1e23a0>

In [11]:
print(X_test)
y_pred_proba = model.predict(X_test)  # Probabilities for ROC-AUC
y_pred = (y_pred_proba > 0.5).astype(int)

      Age  Number of sexual partners  First sexual intercourse  \
611  20.0                        3.0                      15.0   
822  19.0                        2.0                      15.0   
290  21.0                        1.0                      20.0   
800  33.0                        3.0                      19.0   
168  18.0                        3.0                      18.0   
..    ...                        ...                       ...   
192  26.0                       10.0                      16.0   
653  42.0                        3.0                      18.0   
456  39.0                        5.0                      18.0   
777  23.0                        2.0                      16.0   
532  20.0                        3.0                      17.0   

     Num of pregnancies  Smokes  Smokes (years)  Smokes (packs/year)  \
611                 2.0     1.0             3.0             3.000000   
822                 2.0     0.0             0.0             0.0

In [12]:
# Flatten y_test and y_pred for sklearn metrics
y_test_flat = y_test.values.flatten()  # Flatten to ensure it's a 1D array
y_pred_flat = y_pred.flatten()         # Flatten to ensure it's a 1D array

# Calculate metrics
accuracy = accuracy_score(y_test_flat, y_pred_flat)
precision = precision_score(y_test_flat, y_pred_flat)
recall = recall_score(y_test_flat, y_pred_flat)
f1 = f1_score(y_test_flat, y_pred_flat)
roc_auc = roc_auc_score(y_test_flat, y_pred_proba)  # ROC-AUC score uses probabilities

# Print metrics
print(f"Predictions: {y_pred_proba}, prob {y_pred}, flat {y_pred_flat}")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"ROC-AUC Score: {roc_auc:.2f}")


Predictions: [[0.0468315 ]
 [0.05755671]
 [0.04672333]
 [0.03235854]
 [0.02220381]
 [0.04460735]
 [0.02853473]
 [0.04345462]
 [0.05354339]
 [0.06137018]
 [0.05398063]
 [0.01722453]
 [0.03769546]
 [0.19640283]
 [0.23613818]
 [0.07766646]
 [0.05954068]
 [0.12714899]
 [0.01924792]
 [0.05385594]
 [0.03054414]
 [0.13924383]
 [0.02216232]
 [0.06542933]
 [0.02754311]
 [0.04083357]
 [0.05787114]
 [0.05753014]
 [0.04432601]
 [0.08217899]
 [0.23144361]
 [0.03043295]
 [0.14012705]
 [0.06445806]
 [0.02538181]
 [0.04245108]
 [0.088355  ]
 [0.01687614]
 [0.17741323]
 [0.09213729]
 [0.2780392 ]
 [0.04252537]
 [0.3510873 ]
 [0.00686831]
 [0.09481432]
 [0.05282463]
 [0.06649683]
 [0.17244846]
 [0.05136412]
 [0.01980573]
 [0.09945942]
 [0.04545156]
 [0.2699079 ]
 [0.081095  ]
 [0.03867548]
 [0.07307199]
 [0.27090123]
 [0.09605817]
 [0.03586625]
 [0.07530254]
 [0.04073841]
 [0.04182773]
 [0.05420988]
 [0.03893667]
 [0.07302635]
 [0.07373872]
 [0.0300074 ]
 [0.03263985]
 [0.02359693]
 [0.06007965]
 [0.092

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# Save the model using the Keras format
model.save('cervical_cancer_model.keras')


In [17]:
model = tf.keras.models.load_model('cervical_cancer_model.keras')

# Convert the model to TensorFlow Lite format
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Save the TensorFlow Lite model to a file
with open('cervical_cancer_model.tflite', 'wb') as f:
    f.write(tflite_model)

  saveable.load_own_variables(weights_store.get(inner_path))


INFO:tensorflow:Assets written to: C:\Users\BoreTech\AppData\Local\Temp\tmp_2v77fyp\assets


INFO:tensorflow:Assets written to: C:\Users\BoreTech\AppData\Local\Temp\tmp_2v77fyp\assets


Saved artifact at 'C:\Users\BoreTech\AppData\Local\Temp\tmp_2v77fyp'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 33), dtype=tf.float32, name='input_layer')
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  3234034686432: TensorSpec(shape=(), dtype=tf.resource, name=None)
  3234108755808: TensorSpec(shape=(), dtype=tf.resource, name=None)
  3234108665520: TensorSpec(shape=(), dtype=tf.resource, name=None)
  3234108665168: TensorSpec(shape=(), dtype=tf.resource, name=None)
  3234108808704: TensorSpec(shape=(), dtype=tf.resource, name=None)
  3234108808880: TensorSpec(shape=(), dtype=tf.resource, name=None)


In [23]:
test1_prob1= { "age": [50],
  "num_sexual_partners": [10],
  "first_sexual_intercourse": [15],
  "num_pregnancies": [4],
  "smokes": [1],
  "smokes_years": [20],
  "smokes_packs_per_year": [10.5],
  "hormonal_contraceptives": [1],
  "hormonal_contraceptives_years": [15],
  "iud": [1],
  "iud_years": [10],
  "stds": [1],
  "stds_number": [5],
  "stds_condylomatosis": [1],
  "stds_cervical_condylomatosis": [1],
  "stds_vaginal_condylomatosis": [1],
  "stds_vulvo_perineal_condylomatosis": [1],
  "stds_syphilis": [1],
  "stds_pelvic_inflammatory_disease": [1],
  "stds_genital_herpes": [1],
  "stds_molluscum_contagiosum": [1],
  "stds_aids": [1],
  "stds_hiv": [1],
  "stds_hepatitis_b": [1],
  "stds_hpv": [1],
  "stds_number_of_diagnosis": [5],
  "dx_cancer": [1],
  "dx_cin": [1],
  "dx_hpv": [1],
  "dx": [1],
  "hinselmann": [1],
  "schiller": [1],
  "citology": [1]
}
tester_prob1 = pd.DataFrame(test1_prob1)

y_pred_proba = model.predict(tester_prob1)  # Probabilities for ROC-AUC
y_pred = (y_pred_proba > 0.5).astype(int)

print(f"Predictions: {y_pred_proba}, prob {y_pred}, flat {y_pred_flat}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Predictions: [[0.73778963]], prob [[1]], flat [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [22]:
test1_prob0= {
  "age": [25],
  "num_sexual_partners": [1],
  "first_sexual_intercourse": [20],
  "num_pregnancies": 1,
  "smokes": [0],
  "smokes_years": [0],
  "smokes_packs_per_year":[ 0.0],
  "hormonal_contraceptives": [0],
  "hormonal_contraceptives_years": [0],
  "iud": [0],
  "iud_years": [0],
  "stds": [0],
  "stds_number": [0],
  "stds_condylomatosis": [0],
  "stds_cervical_condylomatosis": [0],
  "stds_vaginal_condylomatosis": [0],
  "stds_vulvo_perineal_condylomatosis": [0],
  "stds_syphilis": [0],
  "stds_pelvic_inflammatory_disease": [0],
  "stds_genital_herpes": [0],
  "stds_molluscum_contagiosum": [0],
  "stds_aids": [0],
  "stds_hiv": [0],
  "stds_hepatitis_b": [0],
  "stds_hpv": [0],
  "stds_number_of_diagnosis": [0],
  "dx_cancer": [0],
  "dx_cin": [0],
  "dx_hpv": [0],
  "dx": [0],
  "hinselmann": [0],
  "schiller": [0],
  "citology": [0]
}


tester_prob0 = pd.DataFrame(test1_prob0)

y_pred_proba = model.predict(tester_prob0)  # Probabilities for ROC-AUC
y_pred = (y_pred_proba > 0.5).astype(int)

print(f"Predictions: {y_pred_proba}, prob {y_pred}, flat {y_pred_flat}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Predictions: [[0.04450294]], prob [[0]], flat [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
