In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Train, evaluate and persist a RandomForest classifier for the diabetes dataset.
#
# Pipeline: load CSV -> impute missing values -> split -> scale (fit on the
# training rows only) -> train -> report accuracy -> save model and scaler.

# Tunable settings, kept in one place so the split and the model share a seed.
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Load the dataset
file_path = "diabetes.csv"
diabetes_df = pd.read_csv(file_path)

# Fill missing values with each numeric column's median.
# numeric_only=True keeps this working if the CSV ever carries a non-numeric
# column; reassigning (instead of inplace=True) keeps the cell safe to re-run.
# NOTE(review): the medians are computed over all rows, i.e. before the
# train/test split. Confirm whether the file actually contains NaNs; if it
# does, consider imputing from the training rows only.
diabetes_df = diabetes_df.fillna(diabetes_df.median(numeric_only=True))

# Define features and target
X = diabetes_df.drop(columns=['Outcome'])
y = diabetes_df['Outcome']

# Split BEFORE scaling so that no statistics from the held-out rows leak into
# the preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows. This is also the scaler that gets saved for inference.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix, kept under its original name for any
# later cell that refers to it. It is not used for training or evaluation here.
X_scaled = scaler.transform(X)

# Deliberately constrained forest: fewer trees, capped depth and larger
# minimum split/leaf sizes than the defaults, to rein in model complexity.
model_reduced = RandomForestClassifier(
    n_estimators=50,          # Reduce number of trees
    max_depth=10,             # Limit tree depth
    min_samples_split=10,     # Increase min samples to split
    min_samples_leaf=5,       # Increase min samples per leaf
    random_state=RANDOM_STATE
)

# Train the model
model_reduced.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred_reduced = model_reduced.predict(X_test)
accuracy_reduced = accuracy_score(y_test, y_pred_reduced)
print(f"Model Accuracy: {accuracy_reduced:.2f}")

# Save the model and scaler.
# Passing a path lets joblib open and close the file itself, instead of
# leaving an unclosed handle from a bare open(..., "wb").
joblib.dump(model_reduced, "diabetes_model.pkl")
joblib.dump(scaler, "scaler.pkl")
print("Model and Scaler saved successfully.")


Model Accuracy: 0.96
Model and Scaler saved successfully.
