In [29]:
import tensorflow as tf
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import json
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [30]:
# Load the dataset CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(
    "cervical-cancer_csv.csv",
)


In [31]:
# Keep only the rows whose 'Biopsy' target is present; rows without a label
# cannot be used for supervised training.
df = df[df["Biopsy"].notna()]


In [32]:
# Separate the 'Biopsy' target from the remaining feature columns.
y = df['Biopsy']
X = df.drop(columns=['Biopsy'])


In [33]:
# Impute missing feature values: column mean for numeric columns, most
# frequent value for object/bool columns.
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns

numeric_transformer = SimpleImputer(strategy='mean')
categorical_transformer = SimpleImputer(strategy='most_frequent')

# Create the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# ColumnTransformer concatenates its outputs in transformer order (all
# numeric columns first, then all categorical columns) and drops columns not
# assigned to any transformer. Label the result with that real order instead
# of X.columns, otherwise names can be attached to the wrong data (or the
# DataFrame constructor fails on a column-count mismatch).
imputed_columns = list(numeric_cols) + list(categorical_cols)

# Fit and transform the data
df = pd.DataFrame(preprocessor.fit_transform(X), columns=imputed_columns)

# Restore the original left-to-right feature order for the retained columns so
# downstream positional consumers see the same layout as the raw table.
df = df[[col for col in X.columns if col in imputed_columns]]

# Re-attach the target; its index is reset so it lines up with the fresh
# 0..n-1 index of the imputed frame.
df = pd.concat([df, y.reset_index(drop=True)], axis=1)



In [34]:
# Remove duplicate rows
df = df.drop_duplicates()
df.drop(['STDs: Time since first diagnosis', 'STDs: Time since last diagnosis'], axis=1, inplace=True)
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns

In [35]:


# Re-derive target and features from the cleaned frame.
y = df['Biopsy']
X = df.drop(columns='Biopsy')



In [36]:
# Pull the target and the feature columns out of the cleaned frame.
y = df['Biopsy']
X = df.drop(columns='Biopsy')

# Fit the scaler on every feature row and rebuild a labelled frame from the
# transformed array (which carries a fresh 0..n-1 index).
scaler = StandardScaler()
X_normalized_df = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Keep the fitted statistics as plain lists so they can be written to JSON
# and re-applied at inference time.
scaler_params = dict(
    mean=scaler.mean_.tolist(),
    scale=scaler.scale_.tolist(),
)

# Re-attach the target; its index is reset so the rows line up with the
# standardized features.
df = pd.concat([X_normalized_df, y.reset_index(drop=True)], axis=1)

# Downstream cells use the standardized X and the re-indexed y.
X = df.drop(columns='Biopsy')
y = df['Biopsy']

Resampling Techniques (SMOTE Oversampling + Random Undersampling)

In [21]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Both samplers are stochastic; a fixed seed makes the resampled data set
# identical on every run.
RESAMPLING_SEED = 42

# Define the resampling technique: oversample the minority class with SMOTE,
# then randomly undersample the majority class.
smote = SMOTE(sampling_strategy='minority', random_state=RESAMPLING_SEED)
under_sampler = RandomUnderSampler(sampling_strategy='majority', random_state=RESAMPLING_SEED)

# Combine with pipeline
pipeline = Pipeline(steps=[('o', smote), ('u', under_sampler)])

# This overwrites X and y, so re-running the cell resamples data that has
# already been resampled; re-run the standardization cell first.
# NOTE(review): this cell's execution count (In [21]) is lower than its
# neighbours and the later classification report shows supports of 781/54,
# which suggests it was not executed in the recorded run - confirm.
X, y = pipeline.fit_resample(X, y)


Cost-Sensitive Learning: An Algorithm-Level Data Balancing Technique

In [None]:
# Disabled alternative to resampling: weight the classes inside the learner
# instead of changing the data. Kept commented out; nothing below runs.
# NOTE(review): X_train / y_train are not defined in the visible cells, so a
# train/test split would have to be added before this can be enabled.

# from sklearn.ensemble import RandomForestClassifier

# # Define class weights
# class_weights = {0: 1, 1: 10}  # Example weights

# # Define and train the model
# model = RandomForestClassifier(class_weight=class_weights)
# model.fit(X_train, y_train)


In [37]:
# Small fully connected binary classifier: two ReLU hidden layers (64 and 32
# units) followed by a single sigmoid output unit.
# The input width is declared with an explicit Input object rather than an
# `input_shape=` argument on the first Dense layer, which Keras warns against
# for Sequential models.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(X.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [38]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as the tracked metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [39]:
# Persist the fitted scaler statistics so inference code can apply the same
# normalization that was used for training.
with open('real_model_scaler_params.json', 'w') as f:
    f.write(json.dumps(scaler_params))

In [40]:
# Train for 10 epochs in mini-batches of 32, with 20% of the rows held out
# for validation. The call is left as the cell's last expression so the
# returned History object is still displayed.
model.fit(
    x=X,
    y=y,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.9055 - loss: 0.5666 - val_accuracy: 0.9521 - val_loss: 0.3737
Epoch 2/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9363 - loss: 0.3375 - val_accuracy: 0.9581 - val_loss: 0.2267
Epoch 3/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9404 - loss: 0.2012 - val_accuracy: 0.9401 - val_loss: 0.1680
Epoch 4/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9666 - loss: 0.1272 - val_accuracy: 0.9521 - val_loss: 0.1493
Epoch 5/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9549 - loss: 0.1338 - val_accuracy: 0.9521 - val_loss: 0.1429
Epoch 6/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9568 - loss: 0.0949 - val_accuracy: 0.9401 - val_loss: 0.1388
Epoch 7/10
[1m21/21[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x27640501fa0>

In [41]:
from sklearn.metrics import confusion_matrix,classification_report

# Round the sigmoid outputs to hard 0/1 labels and print per-class metrics.
# Note: X and y are the same data that was passed to model.fit, so this
# report describes fit on the training data, not held-out performance.
y_preds = np.round(model.predict(X))
print("report \n", classification_report(y, y_preds))

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
report 
               precision    recall  f1-score   support

           0       0.98      0.98      0.98       781
           1       0.74      0.72      0.73        54

    accuracy                           0.97       835
   macro avg       0.86      0.85      0.86       835
weighted avg       0.96      0.97      0.97       835



In [27]:
# Write the current model to a .keras file in the working directory.
model.save(filepath='real_model_cervical_cancer_model_standardized.keras')

In [42]:

# ---------------------------------------------------------------------------
# 1. Export: reload the saved Keras model and convert it to TensorFlow Lite.
# ---------------------------------------------------------------------------
model = tf.keras.models.load_model('real_model_cervical_cancer_model_standardized.keras')
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

with open('real_model_cervical_cancer_model_standardized.tflite', 'wb') as f:
    f.write(tflite_model)

# ---------------------------------------------------------------------------
# 2. Interpreter: load the .tflite file back and look up its I/O tensors.
# ---------------------------------------------------------------------------
interpreter = tf.lite.Interpreter(model_path='real_model_cervical_cancer_model_standardized.tflite')
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# ---------------------------------------------------------------------------
# 3. Sample input: one record, one single-element list per feature.
#    The scaling below is applied by column position, so the key order here
#    presumably has to match the training column order - TODO confirm.
# ---------------------------------------------------------------------------
test1_prob1 = {
    "age": [50],
    "num_sexual_partners": [10],
    "first_sexual_intercourse": [15],
    "num_pregnancies": [4],
    "smokes": [1],
    "smokes_years": [20],
    "smokes_packs_per_year": [10.5],
    "hormonal_contraceptives": [1],
    "hormonal_contraceptives_years": [15],
    "iud": [1],
    "iud_years": [10],
    "stds": [1],
    "stds_number": [5],
    "stds_condylomatosis": [1],
    "stds_cervical_condylomatosis": [1],
    "stds_vaginal_condylomatosis": [1],
    "stds_vulvo_perineal_condylomatosis": [1],
    "stds_syphilis": [1],
    "stds_pelvic_inflammatory_disease": [1],
    "stds_genital_herpes": [1],
    "stds_molluscum_contagiosum": [1],
    "stds_aids": [1],
    "stds_hiv": [1],
    "stds_hepatitis_b": [1],
    "stds_hpv": [1],
    "stds_number_of_diagnosis": [5],
    "dx_cancer": [1],
    "dx_cin": [1],
    "dx_hpv": [1],
    "dx": [1],
    "hinselmann": [1],
    "schiller": [1],
    "citology": [1],
}
tester_prob1 = pd.DataFrame(test1_prob1)

# ---------------------------------------------------------------------------
# 4. Normalization: re-apply the saved training mean/scale to the sample.
# ---------------------------------------------------------------------------
with open('real_model_scaler_params.json', 'r') as f:
    scaler_params = json.load(f)

mean = np.array(scaler_params['mean'])
scale = np.array(scaler_params['scale'])

tester_prob1_scaled = (tester_prob1 - mean) / scale
input_data = tester_prob1_scaled.astype(np.float32).to_numpy()

# ---------------------------------------------------------------------------
# 5. Inference: feed the sample, run the interpreter, read the output tensor.
# ---------------------------------------------------------------------------
interpreter.set_tensor(input_details[0]['index'], input_data)
interpreter.invoke()
tflite_results = interpreter.get_tensor(output_details[0]['index'])

print(f"TensorFlow Lite Predictions: {tflite_results}")

# The first element of the first row is taken as the probability
# (assumes output shape is [1, 1]); the class is 1 only when it exceeds 0.5.
predicted_probability = tflite_results[0][0]
predicted_class = 1 if predicted_probability > 0.5 else 0

# ---------------------------------------------------------------------------
# 6. Report. The confidence interval and the model metrics are hard-coded
#    placeholder values, not numbers computed from this model.
# ---------------------------------------------------------------------------
confidence_interval = [0.45, 0.55]
model_metrics = dict(
    accuracy=0.85,
    precision=0.80,
    recall=0.78,
    f1_score=0.79,
)

results = {
    'raw_input_data': test1_prob1,
    'normalized_input_data': tester_prob1_scaled.to_dict(orient='records'),
    'predicted_probability': float(predicted_probability),
    'predicted_class': predicted_class,
    'confidence_interval': confidence_interval,
    'model_metrics': model_metrics,
}

print("Model Output:")
print(results)


INFO:tensorflow:Assets written to: C:\Users\BoraTech\AppData\Local\Temp\tmpuk3gg2sd\assets


INFO:tensorflow:Assets written to: C:\Users\BoraTech\AppData\Local\Temp\tmpuk3gg2sd\assets


Saved artifact at 'C:\Users\BoraTech\AppData\Local\Temp\tmpuk3gg2sd'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 33), dtype=tf.float32, name='input_layer_1')
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  2706876255744: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2706925139744: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2706882825552: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2706953243104: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2706970655728: TensorSpec(shape=(), dtype=tf.resource, name=None)
  2706953993200: TensorSpec(shape=(), dtype=tf.resource, name=None)
TensorFlow Lite Predictions: [[0.99999917]]
Model Output:
{'raw_input_data': {'age': [50], 'num_sexual_partners': [10], 'first_sexual_intercourse': [15], 'num_pregnancies': [4], 'smokes': [1], 'smokes_years': [20], 'smokes_packs_per_year': [10.5], 'hormonal_contraceptives': [1], 'hor

Metrics
