In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib  # Import joblib for saving the model

# Load the dataset
data = pd.read_csv('cleaned_diabetes_103.csv')

# Print the column names to check what is available
print("Columns in the dataset:", data.columns)

# Define the prediabetes criteria
data['Prediabetes'] = ((data['Glucose'] >= 90) & (data['Glucose'] <= 125)) | \
                       (data['Insulin'] > 25) | \
                       (data['BloodPressure'] > 130)

# Convert boolean to integer (1 for prediabetes, 0 for not)
data['Prediabetes'] = data['Prediabetes'].astype(int)

# Create the Hypertension feature based on the new risk conditions
data['Hypertension'] = 0  # Initialize the column

# Condition 1: Risk of 50% for hypertension if Insulin > 25 and Glucose > 100
risk_condition_1 = (data['Insulin'] > 25) & (data['Glucose'] > 100)

# Assign Hypertension based on the first risk condition
data.loc[risk_condition_1 & (data['BloodPressure'] > 130), 'Hypertension'] = 1

# Condition 2: Risk of 80% for hypertension if Glucose > 125
risk_condition_2 = data['Glucose'] > 125

# Assign Hypertension based on the second risk condition
data.loc[risk_condition_2, 'Hypertension'] = 1

# Fill Hypertension for those not meeting the conditions
data['Hypertension'] = data['Hypertension'].fillna(0).astype(int)

# Define features and target variable
X = data.drop(['Prediabetes'], axis=1)  # Exclude the target variable
y = data['Prediabetes']  # New target variable

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler for later use
joblib.dump(scaler, 'scaler.pkl')

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier()
}

# Train models and evaluate their performance
for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)  # Train the model
    y_pred = model.predict(X_test_scaled)  # Make predictions
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
    # Save the model to a file
    joblib.dump(model, f'{model_name.replace(" ", "_").lower()}_model.pkl')
    print(f"{model_name} saved as '{model_name.replace(' ', '_').lower()}_model.pkl'\n")

# Example of making predictions on new data
# Define new data (example values)
new_data = pd.DataFrame({
    'Glucose': [130],
    'BloodPressure': [70],
    'SkinThickness': [35],
    'Insulin': [0],
    'BMI': [33.6],
    'Age': [50]
})

# Load the scaler
scaler = joblib.load('scaler.pkl')  # Load the scaler

# Scale the new data
new_data_scaled = scaler.transform(new_data)

# Make predictions on the new data using the Logistic Regression model
model = joblib.load('logistic_regression_model.pkl')  # Load the model
predictions = model.predict(new_data_scaled)

# Output the predictions
print("Predictions for new data:", predictions)


Columns in the dataset: Index(['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Age'], dtype='object')
Logistic Regression Accuracy: 0.9870
[[  0   2]
 [  0 152]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.99      1.00      0.99       152

    accuracy                           0.99       154
   macro avg       0.49      0.50      0.50       154
weighted avg       0.97      0.99      0.98       154

Logistic Regression saved as 'logistic_regression_model.pkl'

Support Vector Machine Accuracy: 0.9870
[[  0   2]
 [  0 152]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.99      1.00      0.99       152

    accuracy                           0.99       154
   macro avg       0.49      0.50      0.50       154
weighted avg       0.97      0.99      0.98       154

Support Vector Machine saved as 'support_ve

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest Accuracy: 1.0000
[[  2   0]
 [  0 152]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00       152

    accuracy                           1.00       154
   macro avg       1.00      1.00      1.00       154
weighted avg       1.00      1.00      1.00       154

Random Forest saved as 'random_forest_model.pkl'



ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Hypertension


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib  # Import joblib for saving the model

# Load the dataset
data = pd.read_csv('cleaned_diabetes_103.csv')

# Print the column names to check what is available
print("Columns in the dataset:", data.columns)

# Define the prediabetes criteria
data['Prediabetes'] = ((data['Glucose'] >= 90) & (data['Glucose'] <= 125)) | \
                       (data['Insulin'] > 25) | \
                       (data['BloodPressure'] > 130)

# Convert boolean to integer (1 for prediabetes, 0 for not)
data['Prediabetes'] = data['Prediabetes'].astype(int)

# Create the Hypertension feature based on the new risk conditions
data['Hypertension'] = 0  # Initialize the column

# Condition 1: Risk of 50% for hypertension if Insulin > 25 and Glucose > 100
risk_condition_1 = (data['Insulin'] > 25) & (data['Glucose'] > 100)

# Assign Hypertension based on the first risk condition
data.loc[risk_condition_1 & (data['BloodPressure'] > 130), 'Hypertension'] = 1

# Condition 2: Risk of 80% for hypertension if Glucose > 125
risk_condition_2 = data['Glucose'] > 125

# Assign Hypertension based on the second risk condition
data.loc[risk_condition_2, 'Hypertension'] = 1

# Fill Hypertension for those not meeting the conditions
data['Hypertension'] = data['Hypertension'].fillna(0).astype(int)

# Define features and target variable
X = data.drop(['Prediabetes'], axis=1)  # Exclude the target variable
y = data['Prediabetes']  # New target variable

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler for later use
joblib.dump(scaler, 'scaler.pkl')

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier()
}

# Train models and evaluate their performance
for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)  # Train the model
    y_pred = model.predict(X_test_scaled)  # Make predictions
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
    # Save the model to a file
    joblib.dump(model, f'{model_name.replace(" ", "_").lower()}_model.pkl')
    print(f"{model_name} saved as '{model_name.replace(' ', '_').lower()}_model.pkl'\n")

# Example of making predictions on new data
# Define new data (example values)
new_data = pd.DataFrame({
    'Glucose': [130],
    'BloodPressure': [70],
    'SkinThickness': [35],
    'Insulin': [0],
    'BMI': [33.6],
    'Age': [50],
    'Hypertension': [0]  # Include the missing feature 'Hypertension'
})

# Load the scaler
scaler = joblib.load('scaler.pkl')  # Load the scaler

# Scale the new data
new_data_scaled = scaler.transform(new_data)

# Make predictions on the new data using the Logistic Regression model
model = joblib.load('logistic_regression_model.pkl')  # Load the model
predictions = model.predict(new_data_scaled)

# Output the predictions
print("Predictions for new data:", predictions)


Columns in the dataset: Index(['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Age'], dtype='object')
Logistic Regression Accuracy: 0.9870
[[  0   2]
 [  0 152]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.99      1.00      0.99       152

    accuracy                           0.99       154
   macro avg       0.49      0.50      0.50       154
weighted avg       0.97      0.99      0.98       154

Logistic Regression saved as 'logistic_regression_model.pkl'

Support Vector Machine Accuracy: 0.9870
[[  0   2]
 [  0 152]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.99      1.00      0.99       152

    accuracy                           0.99       154
   macro avg       0.49      0.50      0.50       154
weighted avg       0.97      0.99      0.98       154

Support Vector Machine saved as 'support_ve

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest Accuracy: 1.0000
[[  2   0]
 [  0 152]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00       152

    accuracy                           1.00       154
   macro avg       1.00      1.00      1.00       154
weighted avg       1.00      1.00      1.00       154

Random Forest saved as 'random_forest_model.pkl'

Predictions for new data: [1]


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib  # Import joblib for saving the model

# Load the dataset
data = pd.read_csv('cleaned_diabetes_103.csv')

# Print the column names to check what is available
print("Columns in the dataset:", data.columns)

# Define the prediabetes criteria
data['Prediabetes'] = ((data['Glucose'] >= 90) & (data['Glucose'] <= 125)) | \
                       (data['Insulin'] > 25) | \
                       (data['BloodPressure'] > 130)

# Convert boolean to integer (1 for prediabetes, 0 for not)
data['Prediabetes'] = data['Prediabetes'].astype(int)

# Create the Hypertension feature based on the new risk conditions
data['Hypertension'] = 0  # Initialize the column

# Condition 1: Risk of 50% for hypertension if Insulin > 25 and Glucose > 100
risk_condition_1 = (data['Insulin'] > 25) & (data['Glucose'] > 100)

# Assign Hypertension based on the first risk condition
data.loc[risk_condition_1 & (data['BloodPressure'] > 130), 'Hypertension'] = 1

# Condition 2: Risk of 80% for hypertension if Glucose > 125
risk_condition_2 = data['Glucose'] > 125

# Assign Hypertension based on the second risk condition
data.loc[risk_condition_2, 'Hypertension'] = 1

# Fill Hypertension for those not meeting the conditions
data['Hypertension'] = data['Hypertension'].fillna(0).astype(int)

# Define features and target variable
X = data.drop(['Prediabetes'], axis=1)  # Exclude the target variable
y = data['Prediabetes']  # New target variable

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler for later use
joblib.dump(scaler, 'scaler.pkl')

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier()
}

# Train models and evaluate their performance
for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)  # Train the model
    y_pred = model.predict(X_test_scaled)  # Make predictions
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
    # Save the model to a file
    joblib.dump(model, f'{model_name.replace(" ", "_").lower()}_model.pkl')
    print(f"{model_name} saved as '{model_name.replace(' ', '_').lower()}_model.pkl'\n")

# Example of making predictions on new data
# Define new data (example values)
new_data = pd.DataFrame({
    'Glucose': [130],
    'BloodPressure': [70],
    'SkinThickness': [35],
    'Insulin': [0],
    'BMI': [33.6],
    'Age': [50],
    'Hypertension': [0]  # Include the missing feature 'Hypertension'
})

# Load the scaler
scaler = joblib.load('scaler.pkl')  # Load the scaler

# Scale the new data
new_data_scaled = scaler.transform(new_data)



# Make predictions on the new data using all models
for model_name in models.keys():
    model = joblib.load(f'{model_name.replace(" ", "_").lower()}_model.pkl')  # Load the model
    predictions = model.predict(new_data_scaled)  # Make predictions
    print(f"Predictions for {model_name} on new data:", predictions)


Columns in the dataset: Index(['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Age'], dtype='object')
Logistic Regression Accuracy: 0.9870
[[  0   2]
 [  0 152]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.99      1.00      0.99       152

    accuracy                           0.99       154
   macro avg       0.49      0.50      0.50       154
weighted avg       0.97      0.99      0.98       154

Logistic Regression saved as 'logistic_regression_model.pkl'

Support Vector Machine Accuracy: 0.9870
[[  0   2]
 [  0 152]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.99      1.00      0.99       152

    accuracy                           0.99       154
   macro avg       0.49      0.50      0.50       154
weighted avg       0.97      0.99      0.98       154

Support Vector Machine saved as 'support_ve

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest Accuracy: 1.0000
[[  2   0]
 [  0 152]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00       152

    accuracy                           1.00       154
   macro avg       1.00      1.00      1.00       154
weighted avg       1.00      1.00      1.00       154

Random Forest saved as 'random_forest_model.pkl'

Predictions for Logistic Regression on new data: [1]
Predictions for Support Vector Machine on new data: [1]
Predictions for Random Forest on new data: [1]


Columns in the dataset: Index(['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Age'], dtype='object')
Logistic Regression Accuracy: 0.9870
[[  0   2]
 [  0 152]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.99      1.00      0.99       152

    accuracy                           0.99       154
   macro avg       0.49      0.50      0.50       154
weighted avg       0.97      0.99      0.98       154

Logistic Regression Cross-Validation Accuracy: 0.9919 ± 0.0000
Logistic Regression saved as 'logistic_regression_model.pkl'

Support Vector Machine Accuracy: 0.9870
[[  0   2]
 [  0 152]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.99      1.00      0.99       152

    accuracy                           0.99       154
   macro avg       0.49      0.50      0.50       154
weighted avg       0.97      0.99  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest Accuracy: 0.9935
[[  1   1]
 [  0 152]]
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.99      1.00      1.00       152

    accuracy                           0.99       154
   macro avg       1.00      0.75      0.83       154
weighted avg       0.99      0.99      0.99       154

Random Forest Cross-Validation Accuracy: 0.9919 ± 0.0000
Random Forest saved as 'random_forest_model.pkl'



ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- BMI
- Hypertension


In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib  # Import joblib for saving the model

# Load the dataset
data = pd.read_csv('cleaned_diabetes_103.csv')

# Print the column names to check what is available
print("Columns in the dataset:", data.columns)

# Define the prediabetes criteria
data['Prediabetes'] = ((data['Glucose'] >= 90) & (data['Glucose'] <= 125)) | \
                       (data['Insulin'] > 25) | \
                       (data['BloodPressure'] > 130)

# Convert boolean to integer (1 for prediabetes, 0 for not)
data['Prediabetes'] = data['Prediabetes'].astype(int)

# Create the Hypertension feature based on the new risk conditions
data['Hypertension'] = 0  # Initialize the column

# Condition 1: Risk of 50% for hypertension if Insulin > 25 and Glucose > 100
risk_condition_1 = (data['Insulin'] > 25) & (data['Glucose'] > 100)

# Assign Hypertension based on the first risk condition
data.loc[risk_condition_1 & (data['BloodPressure'] > 130), 'Hypertension'] = 1

# Condition 2: Risk of 80% for hypertension if Glucose > 125
risk_condition_2 = data['Glucose'] > 125

# Assign Hypertension based on the second risk condition
data.loc[risk_condition_2, 'Hypertension'] = 1

# Fill Hypertension for those not meeting the conditions
data['Hypertension'] = data['Hypertension'].fillna(0).astype(int)

# Define features and target variable
X = data[['Glucose', 'Insulin', 'BloodPressure']]  # Only include relevant features
y = data['Prediabetes']  # New target variable

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler for later use
joblib.dump(scaler, 'scaler.pkl')

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier()
}

# Train models and evaluate their performance
for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)  # Train the model
    y_pred = model.predict(X_test_scaled)  # Make predictions
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
    # Save the model to a file
    joblib.dump(model, f'{model_name.replace(" ", "_").lower()}_model.pkl')
    print(f"{model_name} saved as '{model_name.replace(' ', '_').lower()}_model.pkl'\n")

# Function to get user input
def get_user_input():
    name = input("Enter your name: ")
    insulin = float(input("Enter Insulin level: "))
    glucose = float(input("Enter Glucose level: "))
    blood_pressure = float(input("Enter Blood Pressure level: "))
    
    # Create a DataFrame for the input
    new_data = pd.DataFrame({
        'Glucose': [glucose],
        'Insulin': [insulin],
        'BloodPressure': [blood_pressure]
    })
    
    return name, new_data

# Function to make predictions
def make_predictions(new_data):
    # Load the scaler
    scaler = joblib.load('scaler.pkl')  # Load the scaler

    # Scale the new data
    new_data_scaled = scaler.transform(new_data)

    # Make predictions on the new data using all models
    predictions = {}
    for model_name in models.keys():
        model = joblib.load(f'{model_name.replace(" ", "_").lower()}_model.pkl')  # Load the model
        pred = model.predict(new_data_scaled)  # Make predictions
        predictions[model_name] = pred[0]  # Store the prediction

    return predictions

# Main function to run the input and prediction process
def main():
    print("Please enter your details:")
    name, new_data = get_user_input()  # Get user input
    predictions = make_predictions(new_data)  # Make predictions

    # Output the predictions
    for model_name, prediction in predictions.items():
        if prediction == 1:
            print(f"{name}, Prediction from {model_name}: Prediabetes")
        else:
            print(f"{name}, Prediction from {model_name}: No Prediabetes")

# Run the main function
if __name__ == "__main__":
    main()


Columns in the dataset: Index(['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Age'], dtype='object')
Logistic Regression Accuracy: 0.9870
[[  0   2]
 [  0 152]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.99      1.00      0.99       152

    accuracy                           0.99       154
   macro avg       0.49      0.50      0.50       154
weighted avg       0.97      0.99      0.98       154

Logistic Regression saved as 'logistic_regression_model.pkl'

Support Vector Machine Accuracy: 0.9870
[[  0   2]
 [  0 152]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.99      1.00      0.99       152

    accuracy                           0.99       154
   macro avg       0.49      0.50      0.50       154
weighted avg       0.97      0.99      0.98       154

Support Vector Machine saved as 'support_ve

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest Accuracy: 1.0000
[[  2   0]
 [  0 152]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00       152

    accuracy                           1.00       154
   macro avg       1.00      1.00      1.00       154
weighted avg       1.00      1.00      1.00       154

Random Forest saved as 'random_forest_model.pkl'

Please enter your details:


Enter your name:  aditya
Enter Insulin level:  20
Enter Glucose level:  100
Enter Blood Pressure level:  110


aditya, Prediction from Logistic Regression: Prediabetes
aditya, Prediction from Support Vector Machine: Prediabetes
aditya, Prediction from Random Forest: Prediabetes


In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib  # Import joblib for saving the model

# Load the dataset
data = pd.read_csv('cleaned_diabetes_103.csv')

# Print the column names to check what is available
print("Columns in the dataset:", data.columns)

# Define the prediabetes criteria
data['Prediabetes'] = ((data['Glucose'] >= 90) & (data['Glucose'] <= 125)) | \
                       (data['Insulin'] > 25) | \
                       (data['BloodPressure'] > 130)

# Convert boolean to integer (1 for prediabetes, 0 for not)
data['Prediabetes'] = data['Prediabetes'].astype(int)

# Create the Hypertension feature based on the new risk conditions
data['Hypertension'] = 0  # Initialize the column

# Condition 1: Risk of 50% for hypertension if Insulin > 25 and Glucose > 100
risk_condition_1 = (data['Insulin'] > 25) & (data['Glucose'] > 100)

# Assign Hypertension based on the first risk condition
data.loc[risk_condition_1 & (data['BloodPressure'] > 130), 'Hypertension'] = 1

# Condition 2: Risk of 80% for hypertension if Glucose > 125
risk_condition_2 = data['Glucose'] > 125

# Assign Hypertension based on the second risk condition
data.loc[risk_condition_2, 'Hypertension'] = 1

# Fill Hypertension for those not meeting the conditions
data['Hypertension'] = data['Hypertension'].fillna(0).astype(int)

# Define features and target variable
X = data[['Glucose', 'Insulin', 'BloodPressure', 'Age']]  # Include Age as a feature
y = data[['Prediabetes', 'Hypertension']]  # New target variables

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler for later use
joblib.dump(scaler, 'scaler.pkl')

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier()
}

# Train models and evaluate their performance
for model_name, model in models.items():
    model.fit(X_train_scaled, y_train)  # Train the model
    y_pred = model.predict(X_test_scaled)  # Make predictions
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
    # Save the model to a file
    joblib.dump(model, f'{model_name.replace(" ", "_").lower()}_model.pkl')
    print(f"{model_name} saved as '{model_name.replace(' ', '_').lower()}_model.pkl'\n")

# Function to get user input
def get_user_input():
    name = input("Enter your name: ")
    insulin = float(input("Enter Insulin level: "))
    glucose = float(input("Enter Glucose level: "))
    blood_pressure = float(input("Enter Blood Pressure level: "))
    age = int(input("Enter Age: "))
    
    # Create a DataFrame for the input
    new_data = pd.DataFrame({
        'Glucose': [glucose],
        'Insulin': [insulin],
        'BloodPressure': [blood_pressure],
        'Age': [age]
    })
    
    return name, new_data

# Function to make predictions
def make_predictions(new_data):
    # Load the scaler
    scaler = joblib.load('scaler.pkl')  # Load the scaler

    # Scale the new data
    new_data_scaled = scaler.transform(new_data)

    # Make predictions on the new data using all models
    predictions = {}
    for model_name, model in models.items():
        model = joblib.load(f'{model_name.replace(" ", "_").lower()}_model.pkl')  # Load the model
        pred = model.predict(new_data_scaled)  # Make predictions
        predictions[model_name] = pred[0]  # Store the prediction

    return predictions

# Main function to run the input and prediction process
def main():
    print("Please enter your details:")
    name, new_data = get_user_input()  # Get user input
    predictions = make_predictions(new_data)  # Make predictions

    # Output the predictions
    for model_name, prediction in predictions.items():
        if prediction[0] == 1:
            print(f"{name}, Prediction from {model_name}: Prediabetes")
        else:
            print(f"{name}, Prediction from {model_name}: No Prediabetes")
        
        if prediction[1] == 1:
            print(f"{name}, Prediction from {model_name}: Hypertension")
        else:
            print(f"{name}, Prediction from {model_name}: No Hypertension")

# Run the main function
if __name__ == "__main__":
    main()


Columns in the dataset: Index(['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Age'], dtype='object')


ValueError: y should be a 1d array, got an array of shape (614, 2) instead.

In [54]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib  # Import joblib for saving the model

# Load the dataset
data = pd.read_csv('cleaned_diabetes_103.csv')

# Print the column names to check what is available
print("Columns in the dataset:", data.columns)

# Define the prediabetes criteria
data['Prediabetes'] = ((data['Glucose'] >= 90) & (data['Glucose'] <= 125)) | \
                       (data['Insulin'] > 25) | \
                       (data['BloodPressure'] > 130)

# Convert boolean to integer (1 for prediabetes, 0 for not)
data['Prediabetes'] = data['Prediabetes'].astype(int)

# Create the Hypertension feature based on the new risk conditions
data['Hypertension'] = 0  # Initialize the column

# Condition 1: Risk of 50% for hypertension if Insulin > 25 and Glucose > 100
risk_condition_1 = (data['Insulin'] > 25) & (data['Glucose'] > 100)

# Assign Hypertension based on the first risk condition
data.loc[risk_condition_1 & (data['BloodPressure'] > 130), 'Hypertension'] = 1

# Condition 2: Risk of 80% for hypertension if Glucose > 125
risk_condition_2 = data['Glucose'] > 125

# Assign Hypertension based on the second risk condition
data.loc[risk_condition_2, 'Hypertension'] = 1

# Fill Hypertension for those not meeting the conditions
data['Hypertension'] = data['Hypertension'].fillna(0).astype(int)

# Define features
X = data[['Glucose', 'Insulin', 'BloodPressure', 'Age']]  # Include Age as a feature

# Split the data for Prediabetes
y_prediabetes = data['Prediabetes']  # Target variable for Prediabetes
X_train, X_test, y_train_prediabetes, y_test_prediabetes = train_test_split(X, y_prediabetes, test_size=0.2, random_state=42)

# Split the data for Hypertension
y_hypertension = data['Hypertension']  # Target variable for Hypertension
X_train_hypertension, X_test_hypertension, y_train_hypertension, y_test_hypertension = train_test_split(X, y_hypertension, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler for later use
joblib.dump(scaler, 'scaler.pkl')

# Initialize models
models = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine': SVC(),
    'Random Forest': RandomForestClassifier()
}

# Train models for Prediabetes
for model_name, model in models.items():
    model.fit(X_train_scaled, y_train_prediabetes)  # Train the model for Prediabetes
    y_pred = model.predict(X_test_scaled)  # Make predictions
    
    # Evaluate the model
    accuracy = accuracy_score(y_test_prediabetes, y_pred)
    print(f"{model_name} (Prediabetes) Accuracy: {accuracy:.4f}")
    print(confusion_matrix(y_test_prediabetes, y_pred))
    print(classification_report(y_test_prediabetes, y_pred))
    
    # Save the model to a file
    joblib.dump(model, f'{model_name.replace(" ", "_").lower()}_prediabetes_model.pkl')
    print(f"{model_name} saved as '{model_name.replace(' ', '_').lower()}_prediabetes_model.pkl'\n")

# Train models for Hypertension
for model_name, model in models.items():
    model.fit(X_train_scaled, y_train_hypertension)  # Train the model for Hypertension
    y_pred = model.predict(X_test_scaled)  # Make predictions
    
    # Evaluate the model
    accuracy = accuracy_score(y_test_hypertension, y_pred)
    print(f"{model_name} (Hypertension) Accuracy: {accuracy:.4f}")
    print(confusion_matrix(y_test_hypertension, y_pred))
    print(classification_report(y_test_hypertension, y_pred))
    
    # Save the model to a file
    joblib.dump(model, f'{model_name.replace(" ", "_").lower()}_hypertension_model.pkl')
    print(f"{model_name} saved as '{model_name.replace(' ', '_').lower()}_hypertension_model.pkl'\n")

# Function to get user input
def get_user_input():
    name = input("Enter your name: ")
    insulin = float(input("Enter Insulin level: "))
    glucose = float(input("Enter Glucose level: "))
    blood_pressure = float(input("Enter Blood Pressure level: "))
    age = int(input("Enter Age: "))
    
    # Create a DataFrame for the input
    new_data = pd.DataFrame({
        'Glucose': [glucose],
        'Insulin': [insulin],
        'BloodPressure': [blood_pressure],
        'Age': [age]
    })
    
    return name, new_data

# Function to make predictions
def make_predictions(new_data):
    # Load the scaler
    scaler = joblib.load('scaler.pkl')  # Load the scaler

    # Scale the new data
    new_data_scaled = scaler.transform(new_data)

    # Make predictions on the new data using all models
    predictions = {}
    for model_name in models.keys():
        # Predict for Prediabetes
        model = joblib.load(f'{model_name.replace(" ", "_").lower()}_prediabetes_model.pkl')  # Load the model
        pred_prediabetes = model.predict(new_data_scaled)  # Make predictions
        predictions[model_name] = {'Prediabetes': pred_prediabetes[0]}  # Store the prediction

        # Predict for Hypertension
        model = joblib.load(f'{model_name.replace(" ", "_").lower()}_hypertension_model.pkl')  # Load the model
        pred_hypertension = model.predict(new_data_scaled)  # Make predictions
        predictions[model_name]['Hypertension'] = pred_hypertension[0]  # Store the prediction

    return predictions

# Main function to run the input and prediction process
def main():
    print("Please enter your details:")
    name, new_data = get_user_input()  # Get user input
    predictions = make_predictions(new_data)  # Make predictions

    # Output the predictions
    for model_name, prediction in predictions.items():
        if prediction['Prediabetes'] == 1:
            print(f"{name}, Prediction from {model_name}: Prediabetes")
        else:
            print(f"{name}, Prediction from {model_name}: No Prediabetes")
        
        if prediction['Hypertension'] == 1:
            print(f"{name}, Prediction from {model_name}: Hypertension")
        else:
            print(f"{name}, Prediction from {model_name}: No Hypertension")

# Run the main function
if __name__ == "__main__":
    main()


Columns in the dataset: Index(['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'Age'], dtype='object')
Logistic Regression (Prediabetes) Accuracy: 0.9870
[[  0   2]
 [  0 152]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.99      1.00      0.99       152

    accuracy                           0.99       154
   macro avg       0.49      0.50      0.50       154
weighted avg       0.97      0.99      0.98       154

Logistic Regression saved as 'logistic_regression_prediabetes_model.pkl'

Support Vector Machine (Prediabetes) Accuracy: 0.9870
[[  0   2]
 [  0 152]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.99      1.00      0.99       152

    accuracy                           0.99       154
   macro avg       0.49      0.50      0.50       154
weighted avg       0.97      0.99      0.98       154

Sup

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest (Prediabetes) Accuracy: 1.0000
[[  2   0]
 [  0 152]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         2
           1       1.00      1.00      1.00       152

    accuracy                           1.00       154
   macro avg       1.00      1.00      1.00       154
weighted avg       1.00      1.00      1.00       154

Random Forest saved as 'random_forest_prediabetes_model.pkl'

Logistic Regression (Hypertension) Accuracy: 1.0000
[[94  0]
 [ 0 60]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        94
           1       1.00      1.00      1.00        60

    accuracy                           1.00       154
   macro avg       1.00      1.00      1.00       154
weighted avg       1.00      1.00      1.00       154

Logistic Regression saved as 'logistic_regression_hypertension_model.pkl'

Support Vector Machine (Hypertension) Accuracy: 0.9935
[[93  1]
 [ 0 6

Enter your name:  aditya
Enter Insulin level:  140
Enter Glucose level:  130
Enter Blood Pressure level:  140
Enter Age:  40


aditya, Prediction from Logistic Regression: Prediabetes
aditya, Prediction from Logistic Regression: Hypertension
aditya, Prediction from Support Vector Machine: Prediabetes
aditya, Prediction from Support Vector Machine: Hypertension
aditya, Prediction from Random Forest: Prediabetes
aditya, Prediction from Random Forest: Hypertension


In [52]:
from flask import Flask, render_template_string, request
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler

app = Flask(__name__)

# Load the scaler and models
scaler = joblib.load('scaler.pkl')
models = {
    'Logistic Regression': joblib.load('logistic_regression_model.pkl'),
    'Support Vector Machine': joblib.load('support_vector_machine_model.pkl'),
    'Random Forest': joblib.load('random_forest_model.pkl')
}

# Function to make predictions
def make_predictions(new_data):
    # Scale the new data
    new_data_scaled = scaler.transform(new_data)

    # Make predictions on the new data using all models
    predictions = {}
    for model_name, model in models.items():
        # Predict for Prediabetes
        pred_prediabetes = model.predict(new_data_scaled)  # Make predictions
        predictions[model_name] = {'Prediabetes': pred_prediabetes[0]}  # Store the prediction

        # Predict for Hypertension
        model_hypertension = joblib.load(f'{model_name.replace(" ", "_").lower()}_hypertension_model.pkl')  # Load the model
        pred_hypertension = model_hypertension.predict(new_data_scaled)  # Make predictions
        predictions[model_name]['Hypertension'] = pred_hypertension[0]  # Store the prediction

    return predictions

# HTML templates
index_html = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Diabetes Prediction</title>
</head>
<body>
    <h1>Diabetes Prediction Form</h1>
    <form method="POST">
        <label for="name">Name:</label>
        <input type="text" id="name" name="name" required><br><br>
        
        <label for="insulin">Insulin Level:</label>
        <input type="number" id="insulin" name="insulin" required><br><br>
        
        <label for="glucose">Glucose Level:</label>
        <input type="number" id="glucose" name="glucose" required><br><br>
        
        <label for="blood_pressure">Blood Pressure Level:</label>
        <input type="number" id="blood_pressure" name="blood_pressure" required><br><br>
        
        <label for="age">Age:</label>
        <input type="number" id="age" name="age" required><br><br>
        
        <input type="submit" value="Submit">
    </form>
</body>
</html>
'''

results_html = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Prediction Results</title>
</head>
<body>
    <h1>Prediction Results for {{ name }}</h1>
    {% for model_name, prediction in predictions.items() %}
        <h2>{{ model_name }}</h2>
        <p>Prediabetes: {{ 'Yes' if prediction['Prediabetes'] == 1 else 'No' }}</p>
        <p>Hypertension: {{ 'Yes' if prediction['Hypertension'] == 1 else 'No' }}</p>
    {% endfor %}
    <a href="/">Go Back</a>
</body>
</html>
'''

@app.route('/', methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        name = request.form['name']
        insulin = float(request.form['insulin'])
        glucose = float(request.form['glucose'])
        blood_pressure = float(request.form['blood_pressure'])
        age = int(request.form['age'])

        # Create a DataFrame for the input
        new_data = pd.DataFrame({
            'Glucose': [glucose],
            'Insulin': [insulin],
            'BloodPressure': [blood_pressure],
            'Age': [age]
        })

        # Make predictions
        predictions = make_predictions(new_data)

        return render_template_string(results_html, name=name, predictions=predictions)

    return render_template_string(index_html)

if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with watchdog (windowsapi)


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [50]:
import streamlit as st
import pandas as pd
import joblib

# Load the scaler and models
scaler = joblib.load('scaler.pkl')
models = {
    'Logistic Regression': joblib.load('logistic_regression_model.pkl'),
    'Support Vector Machine': joblib.load('support_vector_machine_model.pkl'),
    'Random Forest': joblib.load('random_forest_model.pkl')
}

# Function to make predictions
def make_predictions(new_data):
    # Scale the new data
    new_data_scaled = scaler.transform(new_data)

    # Make predictions on the new data using all models
    predictions = {}
    for model_name, model in models.items():
        # Predict for Prediabetes
        pred_prediabetes = model.predict(new_data_scaled)  # Make predictions
        predictions[model_name] = {'Prediabetes': pred_prediabetes[0]}  # Store the prediction

        # Predict for Hypertension
        model_hypertension = joblib.load(f'{model_name.replace(" ", "_").lower()}_hypertension_model.pkl')  # Load the model
        pred_hypertension = model_hypertension.predict(new_data_scaled)  # Make predictions
        predictions[model_name]['Hypertension'] = pred_hypertension[0]  # Store the prediction

    return predictions

# Streamlit app
def main():
    st.title("Diabetes Prediction App")

    # User input
    name = st.text_input("Enter your name:")
    insulin = st.number_input("Insulin Level:", min_value=0.0)
    glucose = st.number_input("Glucose Level:", min_value=0.0)
    blood_pressure = st.number_input("Blood Pressure Level:", min_value=0.0)
    age = st.number_input("Age:", min_value=0)

    if st.button("Predict"):
        # Create a DataFrame for the input
        new_data = pd.DataFrame({
            'Glucose': [glucose],
            'Insulin': [insulin],
            'BloodPressure': [blood_pressure],
            'Age': [age]
        })

        # Make predictions
        predictions = make_predictions(new_data)

        # Display results
        st.write(f"### Prediction Results for {name}")
        for model_name, prediction in predictions.items():
            st.write(f"**{model_name}**")
            st.write(f"Prediabetes: {'Yes' if prediction['Prediabetes'] == 1 else 'No'}")
            st.write(f"Hypertension: {'Yes' if prediction['Hypertension'] == 1 else 'No'}")
            st.write("---")

if __name__ == "__main__":
    main()


2025-05-17 13:14:11.299 
  command:

    streamlit run C:\Users\HP\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
