In [1]:
# Import the required libraries and dependencies
import pandas as pd
from pathlib import Path
from sklearn.cluster import KMeans, AgglomerativeClustering, Birch
from sklearn.preprocessing import StandardScaler

# Import the modules
import numpy as np
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report


In [2]:
# Read the CSV file into a Pandas DataFrame
# Set the index using the Ticker column
diabetes_df = pd.read_csv(
    Path("diabetes.csv"))

# Review the DataFrame
diabetes_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Prepare the Data 

In [3]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = diabetes_df["Outcome"]

# Separate the X variable, the features
X = diabetes_df.drop(columns="Outcome")

In [4]:
# Review the y variable Series
y[:5]

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [5]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [6]:
# Check the balance of our target values
outcome_counts = y.value_counts()
outcome_counts

Outcome
0    500
1    268
Name: count, dtype: int64

In [7]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [8]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', random_state = 1)
# Fit the model using training data
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0])

In [10]:
# Print the balanced_accuracy score of the model
balanced_acc_score = balanced_accuracy_score(y_test, predictions)

In [11]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,109,14
Actual 1,29,40


In [12]:
# Print the classification report for the model
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score: {balanced_acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,109,14
Actual 1,29,40


Accuracy Score: 0.732944503358077
Classification Report
              precision    recall  f1-score   support

           0       0.79      0.89      0.84       123
           1       0.74      0.58      0.65        69

    accuracy                           0.78       192
   macro avg       0.77      0.73      0.74       192
weighted avg       0.77      0.78      0.77       192



## Resampled data is below

In [13]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
over_sample_model = RandomOverSampler(random_state = 1)

# Fit the original training data to the random_oversampler model
X_resampled, y_resampled = over_sample_model.fit_resample(X_train, y_train)

In [14]:
# Count the distinct values of the resampled labels data
y_resampled.value_counts()

Outcome
0    377
1    377
Name: count, dtype: int64

In [15]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', random_state=1)
# Fit the model using the resampled training data
classifier.fit(X_resampled, y_resampled)
# Make a prediction using the testing data
predictions = classifier.predict(X_resampled)

predictions_df = pd.DataFrame({"Actual": y_resampled, "Prediction": predictions})
                        
predictions_df

Unnamed: 0,Actual,Prediction
0,0,0
1,0,0
2,1,1
3,0,0
4,0,1
...,...,...
749,1,1
750,1,1
751,1,1
752,1,0


In [16]:
# Print the balanced_accuracy score of the model 
balanced_acc_score = balanced_accuracy_score(y_resampled, predictions)

In [17]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_resampled, predictions)
cm_df = pd.DataFrame(cm, index=['Actual 0', 'Actual 1'], columns =['Predicted 0', 'Predicted 1'])

In [18]:
# Print the classification report for the model
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score: {balanced_acc_score}")
print("Classification Report")
print(classification_report(y_resampled, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,283,94
Actual 1,100,277


Accuracy Score: 0.7427055702917772
Classification Report
              precision    recall  f1-score   support

           0       0.74      0.75      0.74       377
           1       0.75      0.73      0.74       377

    accuracy                           0.74       754
   macro avg       0.74      0.74      0.74       754
weighted avg       0.74      0.74      0.74       754



In [32]:
# Change Variable Display from '0' and '1' to 'Non Diabetic' and 'Diabetic'
predictions_df['Actual']= predictions_df['Actual'].replace({0: 'Non Diabetic', 1: 'Diabetic'})
predictions_df['Prediction']= predictions_df['Prediction'].replace({0: 'Non Diabetic', 1: 'Diabetic'})

In [34]:
# Create pandas dataframe with the trained data
testing_data = pd.concat([X_resampled, predictions_df], axis=1)
testing_data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Actual,Prediction
0,4,97,60,23,0,28.2,0.443,22,Non Diabetic,Non Diabetic
1,5,111,72,28,0,23.9,0.407,27,Non Diabetic,Non Diabetic
2,0,180,90,26,90,36.5,0.314,35,Diabetic,Diabetic
3,6,103,66,0,0,24.3,0.249,29,Non Diabetic,Non Diabetic
4,10,122,78,31,0,27.6,0.512,45,Non Diabetic,Diabetic
...,...,...,...,...,...,...,...,...,...,...
749,8,151,78,32,210,42.9,0.516,36,Diabetic,Diabetic
750,4,123,62,0,0,32.0,0.226,35,Diabetic,Diabetic
751,12,151,70,40,271,41.8,0.742,38,Diabetic,Diabetic
752,10,115,0,0,0,0.0,0.261,30,Diabetic,Non Diabetic


In [35]:
testing_data.to_csv('testing_data.csv')