<a href="https://colab.research.google.com/github/abubakar-ahmed/Formative_1_Databases/blob/main/Task3_Fetch_Data_From_API_For_Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
!pip install keras-tuner --upgrade



In [67]:
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
from tensorflow.keras import layers, regularizers, models, optimizers
from tensorflow.keras.callbacks import EarlyStopping
from keras_tuner.tuners import RandomSearch
from keras_tuner import HyperModel
from tensorflow.keras.models import load_model
import os

# --- Step 1: Fetch Data from API ---
api_url = "https://formative-1-databases.onrender.com//medical-history/"
response = requests.get(api_url)

# Process if data is fetched successfully
if response.status_code == 200:
    data = response.json()
    df = pd.DataFrame(data)
    print("Data fetched successfully!")
    print(df.head())

    features = [
        'Disease_Duration', 'Family_History', 'Substance_Use',
        'Suicide_Attempt', 'Positive_Symptom_Score',
        'Negative_Symptom_Score', 'GAF_Score'
    ]
    target = 'Diagnosis'

    X = df[features]
    y = df[target]

    # Encode target
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # --- Apply SMOTE Oversampling with adaptive k_neighbors ---
    min_class_count = np.min(np.bincount(y_encoded))
    if min_class_count > 1:
        k_neighbors = min(5, min_class_count - 1)
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_resampled, y_resampled = smote.fit_resample(X, y_encoded)
    else:
        print("Not enough samples for SMOTE. Proceeding without resampling.")
        X_resampled, y_resampled = X, y_encoded

    # Train-test split (modified to check for class count)
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled, shuffle=True
        )
    except ValueError:
        print("Stratified split failed due to insufficient samples in one of the classes. Falling back to random split.")
        X_train, X_test, y_train, y_test = train_test_split(
            X_resampled, y_resampled, test_size=0.2, random_state=42, shuffle=True
        )

    # --- Step 2: Calculate Class Weights ---
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(y_resampled),
        y=y_train
    )
    class_weights = dict(enumerate(class_weights))

    # --- Step 3: Define HyperModel ---
    class MyHyperModel(HyperModel):
        def build(self, hp):
            model = models.Sequential()
            model.add(layers.Input(shape=(X_train.shape[1],)))
            model.add(layers.Dense(
                units=hp.Int('units', min_value=32, max_value=128, step=16),
                activation='relu',
                kernel_regularizer=regularizers.l2(0.01)
            ))
            model.add(layers.BatchNormalization())
            model.add(layers.Dropout(0.5))
            model.add(layers.Dense(64, activation='relu'))
            model.add(layers.Dense(len(np.unique(y_resampled)), activation='softmax'))

            model.compile(
                optimizer=optimizers.Adam(learning_rate=0.001),
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy']
            )
            return model

    # --- Step 4: Hyperparameter Tuning ---
    tuner = RandomSearch(
        MyHyperModel(),
        objective='val_accuracy',
        max_trials=5,
        executions_per_trial=1,
        directory='my_dir',
        project_name='schizophrenia_diagnosis'
    )

    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    tuner.search(X_train, y_train, epochs=50, validation_split=0.2, callbacks=[early_stop], class_weight=class_weights)

    best_model = tuner.get_best_models(num_models=1)[0]
    print("Best model retrieved successfully.")

    # --- Step 5: Evaluate and Save the Model ---
    test_loss, test_accuracy = best_model.evaluate(X_test, y_test)
    print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

    # Make predictions
    y_pred = np.argmax(best_model.predict(X_test), axis=1)

    # Print confusion matrix
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred, labels=np.unique(y_resampled)))

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, labels=np.unique(y_resampled)))

    best_model.save('best_trained_model.keras')
    print('Model saved to best_trained_model.keras')

else:
    print(f"Failed to fetch data. Status code: {response.status_code}")
    print(response.text)
    df = None

# --- Step 6: Load Model and Make Predictions
def make_prediction(input_data):
    if not os.path.exists('best_trained_model.keras'):
        print("Model file not found. Skipping prediction.")
        return None

    model = load_model('best_trained_model.keras')
    input_array = np.array([input_data])
    prediction = model.predict(input_array)
    predicted_class = np.argmax(prediction, axis=1)
    return predicted_class

# Example prediction only if model exists
if os.path.exists('best_trained_model.keras'):
    example_input = [3, 1, 1, 0, 15, 40, 72]
    predicted = make_prediction(example_input)

    if predicted is not None:
        predicted_label = label_encoder.inverse_transform(predicted)
        print(f"Predicted Diagnosis: {predicted_label[0]}")
    else:
        print("Prediction skipped due to missing model.")
else:
    print("No trained model available. Skipping prediction.")






Data fetched successfully!
   Diagnosis  Disease_Duration  Hospitalizations  Family_History  \
0          2                 4                 2               2   
1          1                 5                 2               0   
2          1                 5                 2               0   
3          1                 5                 2               0   
4          1                 5                 2               0   

   Substance_Use  Suicide_Attempt  Positive_Symptom_Score  \
0              2                2                      50   
1              1                0                      45   
2              1                0                      45   
3              1                0                      45   
4              1                0                      45   

   Negative_Symptom_Score  GAF_Score  Patient_ID  
0                      40         60           2  
1                      70         68           3  
2                      30         56        

  saveable.load_own_variables(weights_store.get(inner_path))


Best model retrieved successfully.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 531ms/step - accuracy: 1.0000 - loss: 0.1097
Test Accuracy: 100.00%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step

Confusion Matrix:
[[1 0]
 [0 0]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       0.00      0.00      0.00         0

    accuracy                           1.00         1
   macro avg       0.50      0.50      0.50         1
weighted avg       1.00      1.00      1.00         1

Model saved to best_trained_model.keras


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  saveable.load_own_variables(weights_store.get(inner_path))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 103ms/step
Predicted Diagnosis: 1


In [89]:
import pandas as pd
import numpy as np
from keras.models import load_model

# Step 1: Load the pre-trained model
model = load_model('best_trained_model.keras')

# Step 2: Load the data you want to predict
X_to_predict = patients_to_predict.drop(columns=['Patient_ID'])  # Drop the Patient_ID or other non-feature columns

# Step 3: Ensure the input data for prediction matches the shape used in training

print(f"Shape of X_to_predict before ensuring consistency: {X_to_predict.shape}")
X_to_predict = X_to_predict.iloc[:, :7]
print(f"Shape of X_to_predict after consistency check: {X_to_predict.shape}")

# Step 4: Make predictions for the selected patients
predictions = model.predict(X_to_predict)

# Step 5: Convert the predictions to binary labels (0 or 1)
predicted_classes = (predictions > 0.5).astype(int)

# Step 6: Map predictions to human-readable labels ("No Schizophrenia" or "Schizophrenia")
predicted_labels = np.where(predicted_classes == 0, "No Schizophrenia", "Schizophrenia")

# Step 7: Add predictions to the original DataFrame
patients_to_predict['Predicted_Diagnosis'] = predicted_labels

# Step 8: Show the predictions
print(patients_to_predict[['Patient_ID', 'Predicted_Diagnosis']])

# Optional: Save the predictions to a new CSV file
patients_to_predict.to_csv('patients_with_predictions.csv', index=False)


Shape of X_to_predict before ensuring consistency: (5, 10)
Shape of X_to_predict after consistency check: (5, 7)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 323ms/step
   Patient_ID Predicted_Diagnosis
0           2       Schizophrenia
1           3       Schizophrenia
2           4       Schizophrenia
3           1       Schizophrenia
4           6       Schizophrenia


In [90]:
# Step 1: Get the raw predicted probabilities (instead of class labels)
predicted_probabilities = model.predict(X_to_predict)

# Step 2: Show the probabilities for each patient
print("Predicted Probabilities (for Schizophrenia):")
print(predicted_probabilities)

# Step 3: Convert probabilities to labels (0 for No Schizophrenia, 1 for Schizophrenia)
predicted_classes = (predicted_probabilities > 0.5).astype(int)

# Step 4: Map predictions to human-readable labels ("No Schizophrenia" or "Schizophrenia")
predicted_labels = np.where(predicted_classes == 0, "No Schizophrenia", "Schizophrenia")

# Step 5: Add predictions to the original DataFrame
patients_to_predict['Predicted_Diagnosis'] = predicted_labels

# Step 6: Show the predictions
print(patients_to_predict[['Patient_ID', 'Predicted_Diagnosis']])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step
Predicted Probabilities (for Schizophrenia):
[[9.9999547e-01 4.4941012e-06]
 [9.9998200e-01 1.7951921e-05]
 [9.9998200e-01 1.7951921e-05]
 [9.9998200e-01 1.7951921e-05]
 [9.9998200e-01 1.7951921e-05]]
   Patient_ID Predicted_Diagnosis
0           2       Schizophrenia
1           3       Schizophrenia
2           4       Schizophrenia
3           1       Schizophrenia
4           6       Schizophrenia


In [118]:
import pandas as pd

#Load the dataset from the provided CSV link
url = "https://github.com/abubakar-ahmed/Formative_1_Databases/raw/main/schizophrenia_dataset_eng_version.csv"
df_new = pd.read_csv(url)

#Check the first few rows of the new dataset
print(df_new.head())

   Patient ID  Age  Gender  Education Level  Martial Status  Occupation  \
0           1   72       1                4               2           0   
1           2   49       1                5               2           2   
2           3   53       1                5               3           2   
3           4   67       1                3               2           0   
4           5   54       0                1               2           0   

   Income level  Live Area  Diagnosis  Disease Duration  Hospitalizations  \
0             2          1          0                 0                 0   
1             1          0          1                35                 1   
2             1          0          1                32                 0   
3             2          0          0                 0                 0   
4             2          1          0                 0                 0   

   Family History  Substance use  Suicide Attempt  Positive Symptom Score  \
0        

In [119]:
# existing DataFrame from the API and df_new is the new data from the CSV
df_combined = pd.concat([df, df_new], ignore_index=True)

# Check the combined data
print(df_combined.head())


   Diagnosis  Disease_Duration  Hospitalizations  Family_History  \
0          2               4.0                 2             2.0   
1          1               5.0                 2             0.0   
2          1               5.0                 2             0.0   
3          1               5.0                 2             0.0   
4          1               5.0                 2             0.0   

   Substance_Use  Suicide_Attempt  Positive_Symptom_Score  \
0            2.0              2.0                    50.0   
1            1.0              0.0                    45.0   
2            1.0              0.0                    45.0   
3            1.0              0.0                    45.0   
4            1.0              0.0                    45.0   

   Negative_Symptom_Score  GAF_Score  Patient_ID  ...  Disease Duration  \
0                    40.0       60.0         2.0  ...               NaN   
1                    70.0       68.0         3.0  ...               NaN   

In [122]:
df_cleaned = df_combined.dropna(subset=['Diagnosis'])


In [123]:
# Check the column names in the dataset
print(df.columns)


Index(['Diagnosis', 'Disease_Duration', 'Hospitalizations', 'Family_History',
       'Substance_Use', 'Suicide_Attempt', 'Positive_Symptom_Score',
       'Negative_Symptom_Score', 'GAF_Score', 'Patient_ID'],
      dtype='object')


In [124]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler

# Define the model architecture
def build_model(input_shape):
    model = models.Sequential([
        layers.Input(shape=input_shape),  # Input layer
        layers.Dense(32, activation='relu'),  # First hidden layer
        layers.Dropout(0.3),  # Dropout for regularization
        layers.Dense(16, activation='relu'),  # Second hidden layer
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')  # Output layer (for binary classification)
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Example data
X = np.random.rand(100, 8)
y = np.random.randint(0, 2, 100)

# Normalize data for better model performance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Handle class imbalance using class weights
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Build and train the model
model = build_model((X_train.shape[1],))
history = model.fit(X_train, y_train, epochs=20, batch_size=8, validation_data=(X_val, y_val), class_weight=class_weights_dict)

# Evaluate the model on test data
X_test = np.random.rand(10, 8)
y_test = np.random.randint(0, 2, 10)

# Normalize test data
X_test_scaled = scaler.transform(X_test)

test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Predict on the test data
y_pred = (model.predict(X_test_scaled) > 0.5).astype(int)

# Confusion Matrix and Classification Report
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))



Epoch 1/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 32ms/step - accuracy: 0.4737 - loss: 0.7626 - val_accuracy: 0.5000 - val_loss: 0.7920
Epoch 2/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.4302 - loss: 0.7467 - val_accuracy: 0.5000 - val_loss: 0.7824
Epoch 3/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5749 - loss: 0.6775 - val_accuracy: 0.4500 - val_loss: 0.7837
Epoch 4/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.4910 - loss: 0.6924 - val_accuracy: 0.4500 - val_loss: 0.7839
Epoch 5/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.5043 - loss: 0.7131 - val_accuracy: 0.4500 - val_loss: 0.7775
Epoch 6/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5444 - loss: 0.7136 - val_accuracy: 0.4500 - val_loss: 0.7753
Epoch 7/20
[1m10/10[0m [32m━━━━

In [132]:
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.7000 - loss: 0.6506
Test Loss: 0.6505604386329651, Test Accuracy: 0.699999988079071


In [128]:
# Generate 5 new samples with random values
X_new = np.random.rand(5, 8)  # 5 new samples, 8 features

# Normalize the new data using the scaler
X_new_scaled = scaler.transform(X_new)

# Make predictions using the trained model
predictions = model.predict(X_new_scaled)

# Convert predictions to binary (0 for no schizophrenia, 1 for schizophrenia)
predictions_binary = (predictions > 0.5).astype(int)

# Output the predictions
for i, pred in enumerate(predictions_binary):
    print(f"Prediction for sample {i + 1}: {'Schizophrenia' if pred == 1 else 'No Schizophrenia'}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
Prediction for sample 1: No Schizophrenia
Prediction for sample 2: No Schizophrenia
Prediction for sample 3: No Schizophrenia
Prediction for sample 4: Schizophrenia
Prediction for sample 5: No Schizophrenia


In [129]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the CSV data from the provided URL
url = "https://github.com/abubakar-ahmed/Formative_1_Databases/raw/main/schizophrenia_dataset_eng_version.csv"
df = pd.read_csv(url)

# Ensure the features selected match those used during training
training_features = [
    'Disease Duration', 'Family History', 'Substance use', 'Suicide Attempt',
    'Positive Symptom Score', 'Negative Symptom Score', 'GAF Score', 'Social Support'
]

# Selecting only the relevant features for the new samples
X_new = df[training_features]

#  Drop rows with missing values
X_new = X_new.dropna()

# Normalize the data
scaler = StandardScaler()
X_new_scaled = scaler.fit_transform(X_new)

# Reshape the data to match the model input shape
X_new_scaled_reshaped = X_new_scaled[-5:].reshape(-1, 8)  # 5 samples, 8 features

# Make predictions on the reshaped last 5 samples
predictions = model.predict(X_new_scaled_reshaped)

# Convert the predictions to readable output
prediction_labels = ['Schizophrenia' if p > 0.5 else 'No Schizophrenia' for p in predictions]

# Display the predictions for the last 5 samples
for i, (sample, label) in enumerate(zip(df.index[-5:], prediction_labels)):
    print(f"Prediction for sample {sample}: {label}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Prediction for sample 9995: No Schizophrenia
Prediction for sample 9996: Schizophrenia
Prediction for sample 9997: No Schizophrenia
Prediction for sample 9998: No Schizophrenia
Prediction for sample 9999: No Schizophrenia


In [130]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the CSV data from the provided URL
url = "https://github.com/abubakar-ahmed/Formative_1_Databases/raw/main/schizophrenia_dataset_eng_version.csv"
df = pd.read_csv(url)

# Ensure the features selected match those used during training
training_features = [
    'Disease Duration', 'Family History', 'Substance use', 'Suicide Attempt',
    'Positive Symptom Score', 'Negative Symptom Score', 'GAF Score', 'Social Support'
]

# Selecting only the relevant features for the new samples
X_new = df[training_features]

#  Drop rows with missing values
X_new = X_new.dropna()

# Normalize the data
scaler = StandardScaler()
X_new_scaled = scaler.fit_transform(X_new)

# Select random 5 samples from the dataset
random_samples_scaled = X_new_scaled[np.random.choice(X_new_scaled.shape[0], 5, replace=False)]

# Make predictions on the random 5 samples
predictions = model.predict(random_samples_scaled)

# Convert the predictions to readable output
prediction_labels = ['Schizophrenia' if p > 0.5 else 'No Schizophrenia' for p in predictions]

# Display the predictions for the random 5 samples
for i, (sample, label) in enumerate(zip(df.index[np.random.choice(X_new_scaled.shape[0], 5, replace=False)], prediction_labels)):
    print(f"Prediction for sample {sample}: {label}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
Prediction for sample 7790: Schizophrenia
Prediction for sample 6977: No Schizophrenia
Prediction for sample 153: Schizophrenia
Prediction for sample 5523: No Schizophrenia
Prediction for sample 1329: No Schizophrenia
