In [94]:
import os
print(os.listdir('/workspaces/diabetes_prediction-ML_Pipeline_Summative/models'))


['encoder.pkl', 'models', 'randomforest__model.pkl', 'retrained_model.pkl', 'scaler.pkl']


In [95]:
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder
from src.preprocessing import load_data, preprocess_data, split_data
from src.model import train_random_forest, evaluate_model, save_model, load_model
from src.prediction import load_model_and_scaler, predict_single, preprocess_new_data



## Preprocess Testing

#### Data Loading

In [96]:
from src.preprocessing import load_data

# Test the load_data function
file_path = "/workspaces/diabetes_prediction-ML_Pipeline_Summative/data/diabetes_prediction_dataset.csv"  # Replace with the actual file path
df = load_data(file_path)

# Display the first few rows
print(df.head())


   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  


#### Preprocessing.py Testing

In [97]:
from src.preprocessing import preprocess_data

# Test preprocess_data function
X, y, scaler, encoder = preprocess_data(df)

# Check the processed data
print("Processed Features Shape:", X.shape)
print("Processed Target Shape:", y.shape)
print("First 5 Encoded Features:")
print(X[:5])


Processed Features Shape: (100000, 12)
Processed Target Shape: (100000,)
First 5 Encoded Features:
[[-8.41046744e-01  1.69270354e+00 -2.84439447e-01  4.93637859e+00
  -3.21055792e-01  1.00170572e+00  4.77042159e-02 -3.19946365e-01
  -2.04230437e-01 -3.21198224e-01  1.35992968e+00 -2.62512493e-01]
 [-8.41046744e-01  5.38006427e-01 -2.84439447e-01 -2.02577655e-01
  -1.15583678e-04  1.00170572e+00 -1.42620999e+00 -3.19946365e-01
  -2.04230437e-01 -3.21198224e-01 -7.35332136e-01 -2.62512493e-01]
 [ 1.18723364e+00 -6.16690686e-01 -2.84439447e-01 -2.02577655e-01
  -1.15583678e-04  1.61108022e-01  4.89878478e-01 -3.19946365e-01
  -2.04230437e-01 -3.21198224e-01  1.35992968e+00 -2.62512493e-01]
 [-8.41046744e-01 -2.61399267e-01 -2.84439447e-01 -2.02577655e-01
  -5.83232300e-01 -4.92690191e-01  4.16182767e-01  3.12552386e+00
  -2.04230437e-01 -3.21198224e-01 -7.35332136e-01 -2.62512493e-01]
 [ 1.18723364e+00  1.51505783e+00  3.51568677e+00  4.93637859e+00
  -1.08197037e+00 -6.79489680e-01  4.16

#### With Pretrained Encoder

In [98]:
# Test the function with an existing encoder (use the one saved during model training)
sample_data = pd.DataFrame([{
    'gender': 'Female',
    'age': 40,
    'hypertension': 1,
    'heart_disease': 0,
    'bmi': 32.5,
    'HbA1c_level': 7.2,
    'blood_glucose_level': 150,
    'smoking_history': 'former',
    'diabetes': 0 
}])

X_sample, _, _, _ = preprocess_data(sample_data, encoder=encoder)
print("Processed Sample Features:")
print(X_sample)


Processed Sample Features:
[[0. 0. 0. 0. 0. 0. 0.]]


### Data Slitting

In [99]:
from src.preprocessing import split_data

# Split the dataset
X_train, X_test, y_train, y_test = split_data(X, y)

# Check the shapes of the splits
print("Training Features Shape:", X_train.shape)
print("Testing Features Shape:", X_test.shape)
print("Training Target Shape:", y_train.shape)
print("Testing Target Shape:", y_test.shape)


Training Features Shape: (80000, 12)
Testing Features Shape: (20000, 12)
Training Target Shape: (80000,)
Testing Target Shape: (20000,)


### model.py Testing

In [100]:
from sklearn.model_selection import train_test_split
from src.model import train_random_forest, evaluate_model, save_model, load_model, retrain_model

In [101]:
# Train the model
model = train_random_forest(X_train, y_train)


In [102]:
# Evaluate the model and plot results
acc, cm, report = evaluate_model(model, X_test, y_test)

# Print evaluation results
print(f"Accuracy: {acc}")
print(f"Confusion Matrix:\n{cm}")
print(f"Classification Report:\n{report}")

Accuracy: 0.96965
Confusion Matrix:
[[18218    82]
 [  525  1175]]
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     18300
           1       0.93      0.69      0.79      1700

    accuracy                           0.97     20000
   macro avg       0.95      0.84      0.89     20000
weighted avg       0.97      0.97      0.97     20000



In [105]:
# # Step 5: Save the trained model
# save_model(model, 'models/models/_model.pkl')

# # Step 6: Load the saved model (to simulate reloading in production or testing)
# loaded_model = load_model('models/models/random_forest_model.pkl')

In [None]:
from src.model import save_model 


model_dir = 'models/models'

# Test save_model
saved_model_path = model.save_model(model, model_dir=model_dir)

# Verify the saved file
print(f"Model was saved to: {saved_model_path}")
print("Models in directory:", os.listdir(model_dir))


AttributeError: 'RandomForestClassifier' object has no attribute 'save_model'

In [None]:
# Load the saved model
loaded_model = joblib.load(saved_model_path)

# Verify the model works (e.g., make predictions)
print("Sample predictions:", loaded_model.predict(X[:5]))


#### Model Retraining

In [None]:
# Retrain the model
retrained_model = retrain_model(X_train, y_train, model_path='models/retrained_model.pkl')

### Prediction Test

In [None]:
# Set the paths to the saved model and scaler
model_path = '/workspaces/diabetes_prediction-ML_Pipeline_Summative/models/randomforest__model.pkl'  # Path to the trained model
scaler_path = '/workspaces/diabetes_prediction-ML_Pipeline_Summative/models/scaler.pkl'  # Path to the saved scaler

# Load the model and scaler
model, scaler = load_model_and_scaler(model_path, scaler_path)

# Load the encoder
encoder = LabelEncoder()
# Optionally, if you saved the encoder, load it
encoder = joblib.load('/workspaces/diabetes_prediction-ML_Pipeline_Summative/models/encoder.pkl')

# New data for prediction
new_data = pd.DataFrame([{
    'gender': 'Female',  # Gender: Female
    'age': 55,  # Age: 55 years
    'hypertension': 1,  # Hypertension: Yes
    'heart_disease': 1,  # Heart Disease: Yes
    'bmi': 32.7,  # BMI: 32.7 (indicates overweight/obesity)
    'HbA1c_level': 8.2,  # HbA1c Level: 8.2% (indicates diabetes)
    'blood_glucose_level': 190,  # Blood Glucose Level: 190 mg/dL (high level)
    'smoking_history': 'current'  # Smoking History: Current smoker
}])


# Preprocess and predict
X_new = preprocess_new_data(new_data, scaler, encoder)
prediction = model.predict(X_new)
print("Prediction:", "Diabetes" if prediction[0] == 1 else "No Diabetes")


Prediction: Diabetes


#### Sample data

In [None]:
sample_data = pd.DataFrame([{
    'gender': 'Female',
    'age': 35,
    'hypertension': 1,
    'heart_disease': 0,
    'bmi': 28.7,
    'HbA1c_level': 6.5,
    'blood_glucose_level': 180,
    'smoking_history': 'current'
}])
