In [85]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Lambda, Input, Flatten
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.applications.inception_v3 import preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img
from glob import glob

In [86]:
import requests
import zipfile
import os

url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
filename = "diabetes.csv"

# Download
r = requests.get(url)
open(filename, 'wb').write(r.content)



23278

In [87]:
from sklearn.model_selection import train_test_split

# Load the dataset
diabetes_df = pd.read_csv('diabetes.csv', header=None)

# Split the data into features (X) and target (y)
X = diabetes_df.iloc[:, :-1]
y = diabetes_df.iloc[:, -1]

# Split the data into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Concatenate features and target for saving
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Save the training and testing data to new CSV files
train_df.to_csv('diabetes_train.csv', index=False, header=False)
test_df.to_csv('diabetes_test.csv', index=False, header=False)

print("Training data saved to diabetes_train.csv")
print("Testing data saved to diabetes_test.csv")

Training data saved to diabetes_train.csv
Testing data saved to diabetes_test.csv


In [88]:
image_size = [224, 224]
Inception = InceptionV3(input_shape=image_size + [3], weights='imagenet', include_top=False)

In [89]:
for layer in Inception.layers:
  layer.trainable = False

In [90]:
train_path = "/content/diabetes_train.csv"
test_path = "/content/diabetes_test.csv"

In [91]:
folder = glob('/content/diabetes_train.csv')

In [92]:
x = Flatten()(Inception.output)


In [93]:
prediction = Dense(len(folder), activation='softmax')(x)
model = Model(inputs=Inception.input, outputs=prediction)
model.summary()

In [94]:
model.compile(
    loss= 'categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
    )

In [95]:
print("Missing values per column:")
print(diabetes_df.isnull().sum())

print("\nDescriptive statistics:")
display(diabetes_df.describe())

Missing values per column:
0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
dtype: int64

Descriptive statistics:


Unnamed: 0,0,1,2,3,4,5,6,7,8
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [96]:
from sklearn.preprocessing import StandardScaler

# Identify columns with potential outliers (0 values that are not plausible)
# Based on domain knowledge for diabetes, 0 glucose, blood pressure, skin thickness, insulin, and BMI are not realistic.
cols_with_outliers_as_zero = [1, 2, 3, 4, 5]

# Replace 0 values in these columns with NaN
for col in cols_with_outliers_as_zero:
    diabetes_df[col] = diabetes_df[col].replace(0, np.nan)

# Impute missing values with the mean of each column
diabetes_df.fillna(diabetes_df.mean(), inplace=True)

# Separate features (X) and target (y) again after imputation
X = diabetes_df.iloc[:, :-1]
y = diabetes_df.iloc[:, -1]

# Apply StandardScaler to the feature columns
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert the scaled features back to a DataFrame for potential future use
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Display the first few rows of the scaled features and descriptive statistics to verify
display(X_scaled_df.head())
display(X_scaled_df.describe())

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.639947,0.865108,-0.033518,0.6655021,-3.345079e-16,0.166292,0.468492,1.425995
1,-0.844885,-1.206162,-0.529859,-0.01746338,-3.345079e-16,-0.852531,-0.365061,-0.190672
2,1.23388,2.015813,-0.695306,8.087936e-16,-3.345079e-16,-1.332833,0.604397,-0.105584
3,-0.844885,-1.074652,-0.529859,-0.7004289,-0.7243887,-0.634212,-0.920763,-1.041549
4,-1.141852,0.503458,-2.680669,0.6655021,0.1465506,1.54898,5.484909,-0.020496


Unnamed: 0,0,1,2,3,4,5,6,7
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,-6.476301e-17,-3.561966e-16,6.915764e-16,7.956598e-16,-3.330669e-16,3.515706e-16,2.451743e-16,1.931325e-16
std,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652,1.000652
min,-1.141852,-2.554131,-4.004245,-2.52167,-1.665945,-2.075119,-1.189553,-1.041549
25%,-0.8448851,-0.7212214,-0.695306,-0.4727737,-0.4007289,-0.7215397,-0.6889685,-0.7862862
50%,-0.2509521,-0.1540881,-0.01675912,8.087936e-16,-3.345079e-16,-0.008363615,-0.3001282,-0.3608474
75%,0.6399473,0.610309,0.6282695,0.3240194,-3.345079e-16,0.6029301,0.4662269,0.6602056
max,3.906578,2.54185,4.102655,7.950467,8.126238,5.042087,5.883565,4.063716


In [97]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define a new Sequential model
model = Sequential()

# Add Dense layers
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))

# Add the output layer
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
    )

model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [98]:
from tensorflow.keras.optimizers import Adam

# Compile the model with a different learning rate
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy']
)

# Train the model with a different batch size and number of epochs
r = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=64
)

# Evaluate the model's performance on the test data
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")


Epoch 1/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.5323 - loss: 2.0870 - val_accuracy: 0.6039 - val_loss: 0.8420
Epoch 2/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5796 - loss: 0.8714 - val_accuracy: 0.6688 - val_loss: 0.6986
Epoch 3/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.6539 - loss: 0.6556 - val_accuracy: 0.6494 - val_loss: 0.7098
Epoch 4/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6506 - loss: 0.6241 - val_accuracy: 0.6494 - val_loss: 0.7605
Epoch 5/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6792 - loss: 0.6330 - val_accuracy: 0.4675 - val_loss: 0.8816
Epoch 6/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6334 - loss: 0.7237 - val_accuracy: 0.6753 - val_loss: 0.7551
Epoch 7/100
[1m10/10[0m [

## Regularization

### Subtask:
Implement techniques like L1 or L2 regularization or dropout to prevent overfitting.


**Reasoning**:
Implement dropout layers in the model to prevent overfitting and then compile and train the model.



In [99]:
from tensorflow.keras.layers import Dropout

# Define a new Sequential model with dropout layers
model = Sequential()

# Add Dense layers with dropout
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5)) # Add dropout with a rate of 0.5
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3)) # Add dropout with a rate of 0.3
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2)) # Add dropout with a rate of 0.2

# Add the output layer
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.summary()

# Train the regularized model
r_regularized = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=64
)

# Evaluate the performance of the regularized model
loss_regularized, accuracy_regularized = model.evaluate(X_test, y_test, verbose=0)
print(f"Regularized Model Test Loss: {loss_regularized:.4f}")
print(f"Regularized Model Test Accuracy: {accuracy_regularized:.4f}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 49ms/step - accuracy: 0.5645 - loss: 6.6219 - val_accuracy: 0.4091 - val_loss: 1.4945
Epoch 2/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5551 - loss: 3.8048 - val_accuracy: 0.5974 - val_loss: 0.7024
Epoch 3/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.5277 - loss: 3.4515 - val_accuracy: 0.5519 - val_loss: 0.8745
Epoch 4/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.5030 - loss: 3.0851 - val_accuracy: 0.5519 - val_loss: 0.8043
Epoch 5/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.5577 - loss: 1.9683 - val_accuracy: 0.5974 - val_loss: 0.7526
Epoch 6/100
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.5511 - loss: 2.0663 - val_accuracy: 0.6234 - val_loss: 0.7444
Epoch 7/100
[1m10/10[0m [

**Reasoning**:
Initialize StratifiedKFold and lists to store cross-validation scores.



In [100]:
from sklearn.model_selection import StratifiedKFold

n_splits = 5  # Number of folds
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
cvscores = []

**Reasoning**:
Loop through each fold, split the data, define, compile, fit, and evaluate the model for each fold, then store the scores.



In [101]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Assuming X_scaled and y are already available from previous steps
for train_index, test_index in skf.split(X_scaled, y):
    X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
    y_train_fold, y_test_fold = y[train_index], y[test_index]

    # Define the model architecture within the loop
    model = Sequential()
    model.add(Dense(128, activation='relu', input_shape=(X_train_fold.shape[1],)))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Fit the model
    model.fit(X_train_fold, y_train_fold, epochs=100, batch_size=64, verbose=0)

    # Evaluate the model
    scores = model.evaluate(X_test_fold, y_test_fold, verbose=0)
    print(f"Accuracy for fold: {scores[1]*100:.2f}%")
    cvscores.append(scores[1] * 100)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Accuracy for fold: 68.83%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Accuracy for fold: 72.73%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Accuracy for fold: 72.73%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Accuracy for fold: 71.90%


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Accuracy for fold: 67.32%


**Reasoning**:
Calculate and print the mean and standard deviation of the cross-validation accuracies.



In [102]:
print(f"Mean Accuracy: {np.mean(cvscores):.2f}%")
print(f"Standard Deviation of Accuracy: {np.std(cvscores):.2f}%")

Mean Accuracy: 70.70%
Standard Deviation of Accuracy: 2.22%


## Ensemble methods

### Subtask:
Explore using ensemble methods like Random Forests or Gradient Boosting.


**Reasoning**:
Import the necessary libraries for Random Forest and evaluation metrics, then instantiate, train, predict, and evaluate the Random Forest model using the previously scaled training and test data.



In [103]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Instantiate a RandomForestClassifier model
# Using default parameters initially
rf_model = RandomForestClassifier(random_state=42)

# Train the Random Forest model on the scaled training data
# Assuming X_scaled is available from previous steps and matches the original data order before train_test_split
# We will use X_train and y_train from the train_test_split performed earlier
rf_model.fit(X_train, y_train)

# Predict on the scaled test data
y_pred_rf = rf_model.predict(X_test)

# Evaluate the performance of the Random Forest model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

# Print the evaluation metrics
print("Random Forest Model Evaluation:")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"Precision: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")
print(f"F1-score: {f1_rf:.4f}")

Random Forest Model Evaluation:
Accuracy: 0.7208
Precision: 0.6071
Recall: 0.6182
F1-score: 0.6126


## Algorithm selection

### Subtask:
Try different classification algorithms to see if another model performs better on your dataset.


In [104]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

**Reasoning**:
Create instances of Logistic Regression, SVC, and K-Nearest Neighbors classifiers, train them on the training data, make predictions on the test data, and calculate the accuracy for each model.



In [106]:
# Create instances of the classifiers
lr_model = LogisticRegression(random_state=42)
svc_model = SVC(kernel='linear', random_state=42)

# Train the models
lr_model.fit(X_train, y_train)
svc_model.fit(X_train, y_train)

# Make predictions
y_pred_lr = lr_model.predict(X_test)
y_pred_svc = svc_model.predict(X_test)

# Evaluate accuracy
accuracy_lr = accuracy_score(y_test, y_pred_lr)
accuracy_svc = accuracy_score(y_test, y_pred_svc)


# Print the accuracy scores
print(f"Logistic Regression Accuracy: {accuracy_lr:.4f}")
print(f"SVC (Linear Kernel) Accuracy: {accuracy_svc:.4f}")



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.7403
SVC (Linear Kernel) Accuracy: 0.7532


In [107]:
# 1. Define a list of the 8 feature names and their hint ranges
feature_names_with_hints = [
    ('Pregnancies', (0, 17)), # Changed hint to a tuple of min/max values
    ('Glucose', (44, 199)),
    ('BloodPressure', (24, 122)),
    ('SkinThickness', (7, 99)),
    ('Insulin', (14, 846)),
    ('BMI', (18.2, 67.1)),
    ('DiabetesPedigreeFunction', (0.078, 2.42)),
    ('Age', (21, 81))
    ]

# 2. Initialize an empty list to store the user's input values
user_input_values = []

# 3. Loop through the list of feature names and prompt for input with hints
for feature_name, hint_range in feature_names_with_hints:
    min_val, max_val = hint_range
    while True:
        try:
            value_str = input(f"Enter value for {feature_name} ({min_val}-{max_val}): ")
            # 4. Convert the user's input to a numerical type (float)
            value = float(value_str)

            # Add range validation
            if value < min_val or value > max_val:
                print(f"Input out of range. Please enter a value between {min_val} and {max_val}.")
            else:
                user_input_values.append(value)
                break # Exit the while loop if input is valid and within range
        except ValueError:
            print("Invalid input. Please enter a numerical value.")

print("\nUser input collected:", user_input_values)

Enter value for Pregnancies (0-17): 0
Enter value for Glucose (44-199): 45
Enter value for BloodPressure (24-122): 25
Enter value for SkinThickness (7-99): 8
Enter value for Insulin (14-846): 15
Enter value for BMI (18.2-67.1): 18.3
Enter value for DiabetesPedigreeFunction (0.078-2.42): 0.079
Enter value for Age (21-81): 22

User input collected: [0.0, 45.0, 25.0, 8.0, 15.0, 18.3, 0.079, 22.0]


In [108]:
# 1. Convert user input list to a NumPy array and reshape
user_input_array = np.array(user_input_values).reshape(1, -1)

# 2. Identify columns with potential outliers as zero
# These indices correspond to the original diabetes_df columns (0-indexed)
cols_with_outliers_as_zero_indices = [1, 2, 3, 4, 5] # Glucose, BloodPressure, SkinThickness, Insulin, BMI

# 3. Replace 0 values in the user input array for these columns with NaN
for col_index in cols_with_outliers_as_zero_indices:
    if user_input_array[0, col_index] == 0:
        user_input_array[0, col_index] = np.nan

# 4. Impute missing values (NaNs) with the mean of each column from the training data
# Need to access the means of the original training data (X_train) for imputation
# Calculate means from X_train (assuming X_train is available from previous steps)
training_means = X_train.mean()

for col_index in cols_with_outliers_as_zero_indices:
    if np.isnan(user_input_array[0, col_index]):
        user_input_array[0, col_index] = training_means[col_index]

# 5. Use the previously fitted scaler to transform the preprocessed user input array
# Assuming 'scaler' object is available from the preprocessing step
user_input_scaled = scaler.transform(user_input_array)

print("\nPreprocessed and scaled user input:", user_input_scaled)


Preprocessed and scaled user input: [[-1.14185152 -2.52125334 -3.92152113 -2.40784264 -1.65417529 -2.06056407
  -1.18653306 -0.95646168]]


**Reasoning**:
The preprocessed and scaled user input is ready for prediction. Use the loaded model to predict on the scaled input and display the prediction result.



In [110]:
# Assuming 'model' is the trained model object (either the Keras model or the best performing sklearn model)

# For the best performing sklearn model (SVC)
# Assuming svc_model is the trained SVC model from the previous step
prediction = svc_model.predict(user_input_scaled)

# Define a function to display the prediction and suggestions
def suggestions(prediction, user_input_values, feature_names):
    """Displays the diabetes prediction and provides suggestions based on input values.

    Args:
        prediction: The prediction from the model (0 or 1).
        user_input_values: A list of the original user input values.
        feature_names: A list of the feature names.
    """
    if prediction[0] == 1:
        print("\nPrediction: Based on the provided information, the person is likely diabetic.")
        print("It is highly recommended to consult a doctor for a proper diagnosis and personalized treatment plan.")
        print("\nHere are some general suggestions based on your input values:")

        # Access the original user input values for targeted suggestions
        # Using indices as before, assuming the order is consistent

        # Suggestion for high Glucose (index 1)
        if user_input_values[1] > 120: # Using a threshold as an example
            print(f"- Your {feature_names[1][0]} level is high. Consider reducing sugar intake and refined carbohydrates.")

        # Suggestion for high BloodPressure (index 2)
        if user_input_values[2] > 80: # Using a threshold as an example
            print(f"- Your {feature_names[2][0]} is high. Focus on a low-sodium diet and regular physical activity.")

        # Suggestion for high BMI (index 5)
        if user_input_values[5] > 25: # Using a threshold as an example
            print(f"- Your {feature_names[5][0]} is in the higher range. Aim for weight management through a balanced diet and exercise.")

        # Suggestion for low BMI (though less common for diabetes, can still provide general health advice)
        if user_input_values[5] < 18.5: # Using a threshold as an example
             print(f"- Your {feature_names[5][0]} is in the lower range. Ensure you are consuming nutritious food to maintain a healthy weight.")


        # General suggestion for exercise
        print("- Incorporate regular physical activity into your routine.")
        print("- Maintain a healthy and balanced diet.")

    else:
        print("\nPrediction: Based on the provided information, the person is likely not diabetic.")
        print("Suggestion: Maintaining a healthy lifestyle with balanced diet and regular exercise is always beneficial for preventing diabetes.")

# Call the function to display the prediction and suggestions
# Assuming feature_names_with_hints is available from the input collection step
suggestions(prediction, user_input_values, feature_names_with_hints)


Prediction: Based on the provided information, the person is likely not diabetic.
Suggestion: Maintaining a healthy lifestyle with balanced diet and regular exercise is always beneficial for preventing diabetes.
