In [None]:
import joblib
import pandas as pd
from scikeras.wrappers import KerasClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential

# Load the dataset
data = pd.read_csv('app/data/diabetes_datasett.csv')

# Define features and target
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split first, so that nothing fitted below ever sees the held-out rows
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values: the column means are learned from the training rows
# only and then re-used for the test rows (fitting on the full dataset would
# leak test-set statistics into training).
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Standardize the features (again fitted on the training rows only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


def create_model():
    """Build and compile the dense binary classifier used in this cell.

    The input width is read from the module-level ``X_train`` (its number of
    columns), so ``X_train`` must exist before this function is called.

    Returns:
        A compiled ``Sequential`` model: Dense(128) -> Dropout(0.3) ->
        Dense(64) -> Dropout(0.3) -> Dense(32) -> Dense(1, sigmoid), using the
        Adam optimizer, binary cross-entropy loss and the accuracy metric.
    """
    model = Sequential([
        # An explicit Input layer replaces `input_shape=` on the first Dense
        # layer, which is what triggers the Keras warning seen in the output.
        Input(shape=(X_train.shape[1],)),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


# Create the model (same factory that cross-validation uses below, so the two
# architectures cannot drift apart)
model = create_model()

# Early stopping to avoid overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, epochs=200, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

# Report performance on the held-out test split, which is otherwise unused
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test accuracy: {test_accuracy}')

# Create a KerasClassifier
# NOTE(review): these folds train for a fixed 200 epochs without the early
# stopping used above, so the score describes a slightly different training
# procedure than the saved model - confirm this is intended.
model_cv = KerasClassifier(model=create_model, epochs=200, batch_size=32, verbose=0)

# Evaluate with cross-validation
scores = cross_val_score(model_cv, X_train, y_train, cv=5, scoring=make_scorer(accuracy_score))
print(f'Cross-validation accuracy: {scores.mean()}')

# Save the model
model.save('diabetes_model2.h5')

# Save the scaler
joblib.dump(scaler, 'scaler2.pkl')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/200
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 49ms/step - accuracy: 0.5814 - loss: 0.6755 - val_accuracy: 0.7154 - val_loss: 0.6046
Epoch 2/200
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7637 - loss: 0.5506 - val_accuracy: 0.7642 - val_loss: 0.5333
Epoch 3/200
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7872 - loss: 0.4913 - val_accuracy: 0.7561 - val_loss: 0.4827
Epoch 4/200
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.7688 - loss: 0.4687 - val_accuracy: 0.7642 - val_loss: 0.4675
Epoch 5/200
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.7887 - loss: 0.4599 - val_accuracy: 0.7805 - val_loss: 0.4594
Epoch 6/200
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7986 - loss: 0.4635 - val_accuracy: 0.7724 - val_loss: 0.4658
Epoch 7/200
[1m16/16[0m [

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Read the raw CSV into a DataFrame
data = pd.read_csv('app/data/diabetes_datasett.csv')

# Mean-impute every column except the last one and write the result back
# into those same positions (the last column is left untouched).
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(data.iloc[:, :-1])
data.iloc[:, :-1] = imputed_values


In [None]:
# 1. Importing and Exploring Data

# Display the first few rows of the dataset
print(data.head())

# Summary statistics
print(data.describe())

# Check for missing values (count of nulls per column)
print(data.isnull().sum())

# Basic info
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
data.info()


In [None]:
# 2. Data Analysis and Pattern Discovery

# Age distribution (histogram with KDE overlay)
plt.figure(figsize=(10, 6))
sns.histplot(data['Age'], bins=20, kde=True)
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()

# Glucose distribution (histogram with KDE overlay)
plt.figure(figsize=(10, 6))
sns.histplot(data['Glucose'], bins=20, kde=True)
plt.title('Glucose Distribution')
plt.xlabel('Glucose')
plt.ylabel('Count')
plt.show()

# Outcome distribution
# The column is passed by keyword as `x`: in current seaborn the first
# positional parameter of countplot is `data`, so a bare Series would not be
# counted per class. This also matches the x=/data= style of the other plots.
plt.figure(figsize=(10, 6))
sns.countplot(x='Outcome', data=data)
plt.title('Outcome Distribution')
plt.show()

# Correlation matrix over all columns, annotated with the coefficients
plt.figure(figsize=(12, 10))
sns.heatmap(data.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()


In [None]:
# 3. More Detailed Data Analysis

# Histograms with a KDE overlay for BloodPressure and BMI, one figure each.
# Each entry is (column, figure title, x-axis label).
for column, heading, axis_label in (
    ('BloodPressure', 'Blood Pressure Distribution', 'Blood Pressure'),
    ('BMI', 'BMI Distribution', 'BMI'),
):
    plt.figure(figsize=(10, 6))
    sns.histplot(data[column], bins=20, kde=True)
    plt.title(heading)
    plt.xlabel(axis_label)
    plt.ylabel('Count')
    plt.show()

# Annotated correlation heatmap restricted to a hand-picked set of columns
plt.figure(figsize=(12, 10))
key_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'BMI', 'Age', 'Outcome']
key_correlations = data[key_columns].corr()
sns.heatmap(key_correlations, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix of Key Features')
plt.show()


In [None]:
# 4. Bivariate Analysis

# Box plots of Age, Glucose and BMI grouped by Outcome, one figure each.
# Each entry is (column on the y-axis, figure title).
for feature, heading in (
    ('Age', 'Age vs Outcome'),
    ('Glucose', 'Glucose vs Outcome'),
    ('BMI', 'BMI vs Outcome'),
):
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=data, x='Outcome', y=feature)
    plt.title(heading)
    plt.show()

# Glucose against BMI, with points coloured by Outcome
plt.figure(figsize=(10, 6))
sns.scatterplot(data=data, x='Glucose', y='BMI', hue='Outcome')
plt.title('Glucose vs BMI')
plt.show()


In [None]:
# 5. Multiple Plots

# Pairwise relationships between a hand-picked set of columns, coloured by Outcome
pair_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'BMI', 'Age', 'Outcome']
sns.pairplot(data[pair_columns], hue='Outcome')
plt.show()

# Violin plot of Age and swarm plot of BMI, each grouped by Outcome.
# Each entry is (seaborn plotting function, column on the y-axis, figure title).
for draw, feature, heading in (
    (sns.violinplot, 'Age', 'Age vs Outcome'),
    (sns.swarmplot, 'BMI', 'BMI vs Outcome'),
):
    plt.figure(figsize=(10, 6))
    draw(x='Outcome', y=feature, data=data)
    plt.title(heading)
    plt.show()


In [None]:
# 6. Time Series Analysis (if applicable)

import matplotlib.pyplot as plt
import seaborn as sns

# Everything below is skipped unless the dataset actually has a Date column
if 'Date' in data.columns:
    # Parse the column to datetimes in place so it can be used as a time axis
    data['Date'] = pd.to_datetime(data['Date'])

    # One line plot per measure over Date, split by Outcome.
    # Each entry is (column on the y-axis, figure title).
    for measure, heading in (
        ('Glucose', 'Glucose over Time'),
        ('Age', 'Age over Time by Outcome'),
    ):
        plt.figure(figsize=(10, 6))
        sns.lineplot(data=data, x='Date', y=measure, hue='Outcome')
        plt.title(heading)
        plt.show()
    

In [None]:
# 7. 3D Plots

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# 3D scatter of Age (x), Glucose (y) and BMI (z); marker colour follows Outcome
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(
    data['Age'],
    data['Glucose'],
    data['BMI'],
    c=data['Outcome'],
    cmap='coolwarm',
    s=50,
)
ax.set(xlabel='Age', ylabel='Glucose', zlabel='BMI')
ax.set_title('3D Scatter Plot of Age, Glucose, and BMI')
plt.show()


In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
import joblib

# Extra imports for this cell, kept next to the cell's import block rather
# than buried between statements further down.
from sklearn.metrics import classification_report
from tensorflow.keras import Input

# Load dataset
data = pd.read_csv('app/data/diabetes_datasett.csv')

# Split features and target
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features (statistics are fitted on the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape the data for CNN input: append a trailing axis of length 1, i.e.
# (rows, columns) -> (rows, columns, 1)
X_train_scaled = X_train_scaled.reshape(X_train_scaled.shape[0], X_train_scaled.shape[1], 1)
X_test_scaled = X_test_scaled.reshape(X_test_scaled.shape[0], X_test_scaled.shape[1], 1)

# Build the CNN model
model = Sequential()
# An explicit Input layer replaces `input_shape=` on the first Conv1D layer,
# which is what triggers the Keras warning seen in this cell's output.
model.add(Input(shape=(X_train_scaled.shape[1], 1)))
model.add(Conv1D(filters=32, kernel_size=2, activation='relu'))
model.add(Dropout(0.2))
model.add(Conv1D(filters=64, kernel_size=2, activation='relu'))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define callbacks
# NOTE(review): patience=80 with epochs=100 means training can only stop early
# after 80 consecutive epochs without a val_loss improvement - confirm that
# this is intended.
early_stopping = EarlyStopping(monitor='val_loss', patience=80, restore_best_weights=True)
# Writes a checkpoint to disk only when val_loss improves
model_checkpoint = ModelCheckpoint('best_model.h5.keras', monitor='val_loss', save_best_only=True)

# Train the model
history = model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, callbacks=[early_stopping, model_checkpoint])

# Save the model and scaler
model.save('final_model.h5.keras')
joblib.dump(scaler, 'scaler.save')

# Load the best model (the checkpoint written during training)
best_model = load_model('best_model.h5.keras')

# Evaluate the model on the test set
loss, accuracy = best_model.evaluate(X_test_scaled, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

# Additional metrics: threshold the predicted probabilities at 0.5 to get
# class labels, then print the per-class report
y_pred = (best_model.predict(X_test_scaled) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 42ms/step - accuracy: 0.6633 - loss: 0.6541 - val_accuracy: 0.6585 - val_loss: 0.6080
Epoch 2/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.7265 - loss: 0.5375 - val_accuracy: 0.7805 - val_loss: 0.4950
Epoch 3/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.7652 - loss: 0.4910 - val_accuracy: 0.7642 - val_loss: 0.4925
Epoch 4/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.7401 - loss: 0.4993 - val_accuracy: 0.7724 - val_loss: 0.4711
Epoch 5/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.7801 - loss: 0.4375 - val_accuracy: 0.7398 - val_loss: 0.4693
Epoch 6/100
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.7678 - loss: 0.4694 - val_accuracy: 0.7561 - val_loss: 0.4666
Epoch 7/100
[1m16/16[0m [