In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

class DiabetesAnalysis:
    def __init__(self):
        self.data = None
        self.model = None
        self.scaler = StandardScaler()

    def load_data(self, file_path):
        self.data = pd.read_csv(file_path)
        return self.data

    def data_summary(self):
        return self.data.describe()

    def preprocess_data(self):
        # Separate features and target
        X = self.data.drop('Outcome', axis=1)
        y = self.data['Outcome']

        # Split data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

        # Scale the features
        self.X_train = self.scaler.fit_transform(X_train)
        self.X_test = self.scaler.transform(X_test)
        self.y_train = y_train
        self.y_test = y_test

    

    def hyperparameter_tuning(self):
        # Define the parameter grid for RandomizedSearchCV
        param_grid = {
            'n_estimators': [100, 200, 300, 400, 500],
            'max_depth': [None, 10, 20, 30, 40, 50],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': ['sqrt', 'log2', None]
        }

        # Initialize RandomizedSearchCV with RandomForestClassifier
        random_search = RandomizedSearchCV(
            estimator=RandomForestClassifier(random_state=42),
            param_distributions=param_grid,
            n_iter=15,  # Number of parameter combinations to try
            cv=3,       # 5-fold cross-validation
            n_jobs=-1,  # Use all available CPU cores
            verbose=2,
            random_state=42
        )

        # Fit to the training data
        random_search.fit(self.X_train, self.y_train)

        # Get the best parameters
        best_params = random_search.best_params_
        print("Best parameters found: ", best_params)

        # Train the final model with the best parameters
        self.model = RandomForestClassifier(**best_params, random_state=42)
        self.model.fit(self.X_train, self.y_train)

    def train_model(self):
        # Preprocess the data before training
        self.preprocess_data()

        # Perform hyperparameter tuning
        self.hyperparameter_tuning()

        # Predict on the test set
        predictions = self.model.predict(self.X_test)

        # Evaluate the model
        accuracy = accuracy_score(self.y_test, predictions)
        print(f"Model Accuracy: {accuracy:.4f}")
        print("\nClassification Report:")
        print(classification_report(self.y_test, predictions))

    def predict(self, input_data):
        # Scale the input data
        input_data_scaled = self.scaler.transform([input_data])
        prediction = self.model.predict(input_data_scaled)
        prediction_proba = self.model.predict_proba(input_data_scaled)
        print(f"Prediction: {'Diabetic' if prediction[0] == 1 else 'Non-Diabetic'}")
        print(f"Prediction Probability: {prediction_proba[0]}")
        return prediction[0]

    def plot_feature_importance(self):
        # Plot feature importance if available
        if hasattr(self.model, 'feature_importances_'):
            feature_importances = self.model.feature_importances_
            features = self.data.drop('Outcome', axis=1).columns
            plt.figure(figsize=(10, 6))
            sns.barplot(x=feature_importances, y=features)
            plt.title('Feature Importance')
            plt.xlabel('Importance')
            plt.ylabel('Feature')
            plt.show()

    def plot_correlation_matrix(self):
        # Plot the correlation matrix of the dataset
        plt.figure(figsize=(12, 8))
        sns.heatmap(self.data.corr(), annot=True, cmap='coolwarm', linewidths=0.5)
        plt.title('Correlation Matrix')
        plt.show()

In [13]:
print("\nData Summary:")
print(analyzer.data_summary())


Data Summary:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000   

In [14]:
print("\nTraining the model with hyperparameter tuning...")
analyzer.train_model()


Training the model with hyperparameter tuning...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters found:  {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40}
Model Accuracy: 0.7576

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.81      0.81       151
           1       0.65      0.66      0.65        80

    accuracy                           0.76       231
   macro avg       0.73      0.74      0.73       231
weighted avg       0.76      0.76      0.76       231



In [17]:
pip install imbalanced-learn


Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.4
Note: you may need to restart the kernel to use updated packages.


In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE  # For data balancing
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns


In [20]:

class DiabetesAnalysis:
    def __init__(self):
        self.data = None
        self.model = None
        self.scaler = StandardScaler()
        self.smote = SMOTE(random_state=42)
        self.feature_engineer = False

    def load_data(self, file_path):
        self.data = pd.read_csv(file_path)
        return self.data

    def data_summary(self):
        return self.data.describe()

    def preprocess_data(self):
        # Separate features and target
        X = self.data.drop('Outcome', axis=1)
        y = self.data['Outcome']

        # Feature Engineering: Adding Polynomial Features (Optional)
        if self.feature_engineer:
            poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
            X = poly.fit_transform(X)

        # Split data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

        # Apply SMOTE to balance classes
        X_train_balanced, y_train_balanced = self.smote.fit_resample(X_train, y_train)

        # Scale the features
        self.X_train = self.scaler.fit_transform(X_train_balanced)
        self.X_test = self.scaler.transform(X_test)
        self.y_train = y_train_balanced
        self.y_test = y_test

    def enable_feature_engineering(self):
        self.feature_engineer = True

    def train_model(self):
        # Preprocess the data before training
        self.preprocess_data()

        # Train the RandomForest model
        self.model = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
        self.model.fit(self.X_train, self.y_train)

        # Predict on the test set
        predictions = self.model.predict(self.X_test)

        # Evaluate the model
        accuracy = accuracy_score(self.y_test, predictions)
        print(f"Model Accuracy: {accuracy:.4f}")
        print("\nClassification Report:")
        print(classification_report(self.y_test, predictions))

    def predict(self, input_data):
        # Scale the input data
        input_data_scaled = self.scaler.transform([input_data])
        prediction = self.model.predict(input_data_scaled)
        prediction_proba = self.model.predict_proba(input_data_scaled)
        print(f"Prediction: {'Diabetic' if prediction[0] == 1 else 'Non-Diabetic'}")
        print(f"Prediction Probability: {prediction_proba[0]}")
        return prediction[0]

    def plot_feature_importance(self):
        # Plot feature importance if available
        if hasattr(self.model, 'feature_importances_'):
            feature_importances = self.model.feature_importances_
            features = self.data.drop('Outcome', axis=1).columns
            plt.figure(figsize=(10, 6))
            sns.barplot(x=feature_importances, y=features)
            plt.title('Feature Importance')
            plt.xlabel('Importance')
            plt.ylabel('Feature')
            plt.show()

    def plot_correlation_matrix(self):
        # Plot the correlation matrix of the dataset
        plt.figure(figsize=(12, 8))
        sns.heatmap(self.data.corr(), annot=True, cmap='coolwarm', linewidths=0.5)
        plt.title('Correlation Matrix')
        plt.show()



In [21]:
# Initialize the analyzer
analyzer = DiabetesAnalysis()


In [22]:

# Enable feature engineering
analyzer.enable_feature_engineering()




In [23]:
# Load data from your specific location
data = analyzer.load_data(r'C:\Users\PMLS\Desktop\diabetes.csv')

# Display basic data info
print("\nData Summary:")
print(analyzer.data_summary())




Data Summary:
       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000   

In [24]:
# Train the model with data balancing and feature engineering
print("\nTraining the model...")
analyzer.train_model()




Training the model...
Model Accuracy: 0.7619

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.76      0.81       151
           1       0.63      0.76      0.69        80

    accuracy                           0.76       231
   macro avg       0.74      0.76      0.75       231
weighted avg       0.78      0.76      0.77       231



In [25]:
# Make prediction with your sample input
print("\nMaking prediction for sample patient...")
sample_input = [6, 148, 72, 35, 0, 33.6, 0.627, 50]




Making prediction for sample patient...


In [29]:
# Show what these values represent
features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigree', 'Age']
print("\nInput values:")
for feature, value in zip(features, sample_input):
    print(f"{feature}: {value}")

# Make prediction
analyzer.predict(sample_input)

# Plot feature importance
print("\nGenerating feature importance plot...")
analyzer.plot_feature_importance()
plt.show()

# Optional: Show correlation heatmap
print("\nGenerating correlation matrix...")
analyzer.plot_correlation_matrix()
plt.show()


Input values:
Pregnancies: 6
Glucose: 148
BloodPressure: 72
SkinThickness: 35
Insulin: 0
BMI: 33.6
DiabetesPedigree: 0.627
Age: 50


ValueError: X has 8 features, but StandardScaler is expecting 36 features as input.