In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Load the dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'insu', 'mass', 'pedi', 'age', 'class']
data = pd.read_csv(url, names=names)

# Split data into features and target variable
X = data.drop('class', axis=1)
y = data['class']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train logistic regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Get coefficients (importance) of features
coefficients = np.abs(lr_model.coef_[0])

# Rank features based on coefficients
feature_ranking = np.argsort(coefficients)[::-1]

# Select top N features
num_features_to_select = 5
selected_features = X.columns[feature_ranking][:num_features_to_select]

# Display selected features
print("Selected Features:")
print(selected_features)

Selected Features:
Index(['pedi', 'mass', 'preg', 'age', 'plas'], dtype='object')


In [17]:
print(f'{selected_features}')

Index(['pedi', 'mass', 'preg', 'age', 'plas'], dtype='object')


In [3]:
from sklearn.neighbors import KNeighborsClassifier

# Selecting features from the dataset
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

# Train KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)  # You can choose the value of k as desired
knn_model.fit(X_train_selected, y_train)

# Evaluate model
train_accuracy = knn_model.score(X_train_selected, y_train)
test_accuracy = knn_model.score(X_test_selected, y_test)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)


Train Accuracy: 0.8192182410423453
Test Accuracy: 0.7662337662337663


In [4]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

# Train SVM model
svm_model = SVC(kernel='linear')  # You can choose different kernels (e.g., 'rbf') and parameters as desired
svm_model.fit(X_train_selected, y_train)

# Train Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train_selected, y_train)

# Train Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # You can adjust the number of trees (n_estimators) as desired
rf_model.fit(X_train_selected, y_train)

# Evaluate models
svm_train_accuracy = svm_model.score(X_train_selected, y_train)
svm_test_accuracy = svm_model.score(X_test_selected, y_test)

nb_train_accuracy = nb_model.score(X_train_selected, y_train)
nb_test_accuracy = nb_model.score(X_test_selected, y_test)

rf_train_accuracy = rf_model.score(X_train_selected, y_train)
rf_test_accuracy = rf_model.score(X_test_selected, y_test)

# Print accuracies
print("SVM Train Accuracy:", svm_train_accuracy)
print("SVM Test Accuracy:", svm_test_accuracy)
print("Naive Bayes Train Accuracy:", nb_train_accuracy)
print("Naive Bayes Test Accuracy:", nb_test_accuracy)
print("Random Forest Train Accuracy:", rf_train_accuracy)
print("Random Forest Test Accuracy:", rf_test_accuracy)


SVM Train Accuracy: 0.7687296416938111
SVM Test Accuracy: 0.7597402597402597
Naive Bayes Train Accuracy: 0.755700325732899
Naive Bayes Test Accuracy: 0.7532467532467533
Random Forest Train Accuracy: 1.0
Random Forest Test Accuracy: 0.7662337662337663


In [12]:
from sklearn.ensemble import GradientBoostingClassifier

# Train Gradient Boosting model
gbm_model = GradientBoostingClassifier()
gbm_model.fit(X_train_selected, y_train)

# Evaluate model
gbm_train_accuracy = gbm_model.score(X_train_selected, y_train)
gbm_test_accuracy = gbm_model.score(X_test_selected, y_test)

print("Gradient Boosting Train Accuracy:", gbm_train_accuracy)
print("Gradient Boosting Test Accuracy:", gbm_test_accuracy)


from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Define the Gradient Boosting Classifier
gbm_model = GradientBoostingClassifier()

# Define the hyperparameters grid
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of boosting stages
    'learning_rate': [0.1, 0.01, 0.001],  # Step size shrinkage used to prevent overfitting
    'max_depth': [3, 5, 7],  # Maximum depth of the individual estimators
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}

# Perform GridSearchCV
grid_search = GridSearchCV(gbm_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
best_gbm_model = GradientBoostingClassifier(**best_params)
best_gbm_model.fit(X_train_selected, y_train)

# Evaluate the model
train_accuracy = best_gbm_model.score(X_train_selected, y_train)
test_accuracy = best_gbm_model.score(X_test_selected, y_test)

print("Best Parameters:", best_params)
print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)


Gradient Boosting Train Accuracy: 0.9055374592833876
Gradient Boosting Test Accuracy: 0.7207792207792207
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Train Accuracy: 0.9006514657980456
Test Accuracy: 0.7467532467532467


In [6]:
from sklearn.ensemble import AdaBoostClassifier

# Train AdaBoost model
adaboost_model = AdaBoostClassifier()
adaboost_model.fit(X_train_selected, y_train)

# Evaluate model
adaboost_train_accuracy = adaboost_model.score(X_train_selected, y_train)
adaboost_test_accuracy = adaboost_model.score(X_test_selected, y_test)

print("AdaBoost Train Accuracy:", adaboost_train_accuracy)
print("AdaBoost Test Accuracy:", adaboost_test_accuracy)


AdaBoost Train Accuracy: 0.8322475570032574
AdaBoost Test Accuracy: 0.7597402597402597


In [7]:
import xgboost as xgb

# Train XGBoost model
xgb_model = xgb.XGBClassifier()
xgb_model.fit(X_train_selected, y_train)

# Evaluate model
xgb_train_accuracy = xgb_model.score(X_train_selected, y_train)
xgb_test_accuracy = xgb_model.score(X_test_selected, y_test)

print("XGBoost Train Accuracy:", xgb_train_accuracy)
print("XGBoost Test Accuracy:", xgb_test_accuracy)


XGBoost Train Accuracy: 1.0
XGBoost Test Accuracy: 0.7402597402597403


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Define Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)

# Define hyperparameters grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Regularization parameter
    'solver': ['liblinear', 'lbfgs', 'saga']  # Algorithm to use in the optimization problem
}

# Perform GridSearchCV
grid_search = GridSearchCV(lr_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
best_lr_model = LogisticRegression(**best_params, max_iter=1000)
best_lr_model.fit(X_train_selected, y_train)

# Evaluate the model
train_accuracy = best_lr_model.score(X_train_selected, y_train)
test_accuracy = best_lr_model.score(X_test_selected, y_test)

print("Best Parameters:", best_params)
print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)




Best Parameters: {'C': 1, 'solver': 'lbfgs'}
Train Accuracy: 0.7638436482084691
Test Accuracy: 0.7532467532467533




In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Define KNN model
knn_model = KNeighborsClassifier()

# Define hyperparameters grid
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13],  # Number of neighbors
    'weights': ['uniform', 'distance'],  # Weight function used in prediction
    'p': [1, 2]  # Power parameter for the Minkowski metric
}

# Perform GridSearchCV
grid_search = GridSearchCV(knn_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
best_knn_model = KNeighborsClassifier(**best_params)
best_knn_model.fit(X_train_selected, y_train)

# Evaluate the model
train_accuracy = best_knn_model.score(X_train_selected, y_train)
test_accuracy = best_knn_model.score(X_test_selected, y_test)

print("Best Parameters:", best_params)
print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)


Best Parameters: {'n_neighbors': 9, 'p': 1, 'weights': 'uniform'}
Train Accuracy: 0.8029315960912052
Test Accuracy: 0.7532467532467533


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Define the Random Forest Classifier
rf_model = RandomForestClassifier()

# Define the hyperparameters grid
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [None, 10, 20],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}

# Perform GridSearchCV
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_


# Train the model with the best hyperparameters
best_rf_model = RandomForestClassifier(**best_params)
best_rf_model.fit(X_train_selected, y_train)

# Evaluate the model
train_accuracy = best_rf_model.score(X_train_selected, y_train)
test_accuracy = best_rf_model.score(X_test_selected, y_test)

print("Best Parameters:", best_params)
print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)


Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Train Accuracy: 0.9299674267100977
Test Accuracy: 0.7402597402597403


In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Define the Decision Tree Classifier
dt_model = DecisionTreeClassifier()

# Define the hyperparameters grid
param_grid = {
    'max_depth': [None, 5, 10, 20],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required to be at a leaf node
}

# Perform GridSearchCV
grid_search = GridSearchCV(dt_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_selected, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_

# Train the model with the best hyperparameters
best_dt_model = DecisionTreeClassifier(**best_params)
best_dt_model.fit(X_train_selected, y_train)

# Evaluate the model
train_accuracy = best_dt_model.score(X_train_selected, y_train)
test_accuracy = best_dt_model.score(X_test_selected, y_test)

print("Best Parameters:", best_params)
print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)


Best Parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}
Train Accuracy: 0.8241042345276873
Test Accuracy: 0.7727272727272727


In [16]:
from joblib import dump, load

dump(best_rf_model, 'diabetes_model.joblib')


['diabetes_model.joblib']

In [49]:
loaded_model = load('diabetes_model.joblib')

input_values = ['9', '102', '32.9', '0.665', '46']

input_array = np.array(input_values).reshape(1, -1)

prediction = loaded_model.predict(input_array)

if prediction[0] == 0:
    print("Person is not diabetic")
else:
    print("Person is diabetic")


Person is not diabetic




In [46]:
json = {
  "pregnancies": "9",
  "glucose": "102",
  "bloodPressure": "76",
  "skinThickness": "97",
  "insulin": "0",
  "bmi": "32.9",
  "diabetesPedigree": "0.665",
  "age": "46"
}

elements_to_consider = ['diabetesPedigree', 'bmi', 'pregnancies','age', 'glucose']
input_list = []

for element in elements_to_consider:
    if element in json:
        input_list.append(json.pop(element))
print(input_list)

loaded_model = load('diabetes_model.joblib')

input_array = np.array(input_list).reshape(1, -1)

prediction = loaded_model.predict(input_array)

if prediction[0] == 0:
    print(prediction)
    print("Person is not diabetic")
else:
    print(prediction)
    print("Person is diabetic")

['0.665', '32.9', '9', '46', '102']
[1]
Person is diabetic


