In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the cleaned dataset from the CSV file.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which looks
# like a row index that was written into the CSV — confirm whether it should be dropped.
df = pd.read_csv("cleaned_dataset_v0.2.csv")

In [None]:
# Preview the first rows to sanity-check the loaded columns and values.
df.head()

Unnamed: 0,Country,Year,UFMR(per1000births)_Both,UFMR(per1000births)_Male,UFMR(per1000births)_Female,Number_of_deaths_UF_Both,Number_of_deaths_UF_Male,Number_of_deaths_UF_Female,Early initiation of breastfeeding (%),Infants exclusively breastfed for the first six months of life (%),Region,UFMR_SDGS_Category
0,Algeria,2012,26.49,28.25,24.65,24174.0,13188.0,10986.0,35.7,25.4,Africa,Target Not Achieved
1,Algeria,2006,32.34,34.42,30.1,22698.0,12399.0,10299.0,49.5,6.9,Africa,Target Not Achieved
2,Angola,2015,88.34,95.01,81.18,100216.0,54635.0,45581.0,48.3,37.4,Africa,Target Not Achieved
3,Benin,2014,100.13,106.74,93.1,39709.0,21615.0,18094.0,46.6,41.4,Africa,Target Not Achieved
4,Benin,2012,104.6,111.33,97.46,39244.0,21327.0,17917.0,50.4,32.5,Africa,Target Not Achieved


In [None]:
from keras.layers import Dense, Input
from keras.models import Sequential
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler

In [None]:
# Step 1: Prepare the target variable based on the SDGs threshold
# Binarise UFMR into "Target Achieved" (rate <= 25) and "Target Not Achieved" (rate > 25).
# pd.cut builds right-closed bins by default, so without include_lowest=True a rate of
# exactly 0 would fall outside the first bin and become NaN, which would then break the
# integer conversion of the labels further down.
sdgs_labels = ['Target Achieved', 'Target Not Achieved']
df['UFMR_SDGS_Category'] = pd.cut(
    df['UFMR(per1000births)_Both'],
    bins=[0, 25, float('inf')],
    labels=sdgs_labels,
    include_lowest=True,
)

In [None]:
# Step 2: One-hot encode the 'Region' feature
# drop_first=True omits the dummy column of the first region level, so that region is
# represented implicitly by all remaining Region_* columns being unset.
df_encoded_ann = pd.get_dummies(df, columns=['Region'], drop_first=True)

In [None]:
# Step 3: Define features (X) and target (y)
# The inputs are the survey year, the two breastfeeding indicators and every
# one-hot column produced for 'Region'; the target is the SDGs category label.
features_ann = [
    'Year',
    'Early initiation of breastfeeding (%)',
    'Infants exclusively breastfed for the first six months of life (%)',
    *(name for name in df_encoded_ann.columns if name.startswith("Region_")),
]
X_ann = df_encoded_ann[features_ann]
y_ann = df_encoded_ann['UFMR_SDGS_Category']

In [None]:
# Step 4: Convert target labels into binary values (0: Target Achieved, 1: Target Not Achieved)
# "Target Not Achieved" is encoded as 1, which makes it the positive class.
# NOTE(review): y_ann is reassigned from itself, so re-running this cell on the already
# encoded values would presumably fail (0/1 are not keys of the mapping) — confirm.
y_ann = y_ann.map({'Target Achieved': 0, 'Target Not Achieved': 1}).astype(int)

In [None]:
# Step 5: Split the dataset into training and test sets (80% train, 20% test)
# random_state=42 fixes the shuffle so the same split is produced on every run.
# NOTE(review): no stratify= argument is passed, so the class proportions of the two
# sets may differ — consider stratify=y_ann if the classes are imbalanced.
X_train_ann, X_test_ann, y_train_ann, y_test_ann = train_test_split(X_ann, y_ann, test_size=0.2, random_state=42)

In [None]:
# Step 6: Standardise the features for better ANN training
# The scaler is fitted on the training split only; the fitted statistics are then
# applied to both the training and the test split.
scaler = StandardScaler().fit(X_train_ann)
X_train_ann = scaler.transform(X_train_ann)
X_test_ann = scaler.transform(X_test_ann)

In [None]:
# Step 7: Build an ANN model function with customisable hidden layers
def build_model_1(input_dim=None, hidden_units=24):
    """Build and compile a one-hidden-layer binary classifier.

    Args:
        input_dim: Number of input features. When None (the default), it is taken
            from the notebook-level training matrix ``X_train_ann.shape[1]``.
        hidden_units: Number of nodes in the single hidden ReLU layer (default 24).

    Returns:
        A compiled ``Sequential`` model with a sigmoid output unit, binary
        cross-entropy loss, the Adam optimiser and accuracy as tracked metric.
    """
    if input_dim is None:
        input_dim = X_train_ann.shape[1]

    model = Sequential()
    # Declare the input shape with an explicit Input object instead of passing
    # input_dim to the first Dense layer, which Keras warns about.
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(hidden_units, activation='relu'))  # Single hidden layer
    model.add(Dense(1, activation='sigmoid'))  # Output layer with sigmoid for binary classification
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])  # Compile the model
    return model

In [None]:
# Step 8: K-Fold Cross-Validation function to evaluate the model using macro average metrics
def evaluate_model_with_kfold_macro(X_train, y_train, model_fn, n_splits=5,
                                    epochs=50, batch_size=10, random_state=42):
    """Evaluate a model factory with shuffled K-fold cross-validation.

    A fresh model is built and trained for every fold, its validation predictions
    are thresholded at 0.5, and accuracy plus macro-averaged precision, recall and
    F1 are recorded. Per-fold and averaged results are printed.

    Args:
        X_train: Feature matrix (NumPy array or any array-like, e.g. a DataFrame).
        y_train: Target labels (pandas Series, NumPy array or list).
            Assumes binary 0/1 labels — TODO confirm for other callers.
        model_fn: Zero-argument callable returning a new compiled model that
            provides ``fit`` and ``predict``.
        n_splits: Number of folds.
        epochs: Training epochs per fold.
        batch_size: Mini-batch size used while training each fold.
        random_state: Seed for the fold shuffling.

    Returns:
        dict with the per-fold lists under 'accuracy', 'precision', 'recall',
        'f1_score' and their means under 'mean_accuracy', 'mean_precision',
        'mean_recall', 'mean_f1_score'.
    """
    # Convert both inputs to arrays so positional fold indices work the same way
    # regardless of whether the caller passes pandas objects or NumPy arrays.
    X_all = np.asarray(X_train)
    y_all = np.asarray(y_train).ravel()

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_accuracies, fold_precisions, fold_recalls, fold_f1_scores = [], [], [], []

    for train_index, val_index in kf.split(X_all):
        # Split data into training and validation sets for this fold
        X_train_fold, X_val_fold = X_all[train_index], X_all[val_index]
        y_train_fold, y_val_fold = y_all[train_index], y_all[val_index]

        # Build and train a new model so no weights leak between folds
        model = model_fn()
        model.fit(X_train_fold, y_train_fold, epochs=epochs, batch_size=batch_size, verbose=0)

        # Predict silently and flatten the (n, 1) sigmoid output to 1-D class labels
        y_val_pred_prob = model.predict(X_val_fold, verbose=0)
        y_val_pred = (y_val_pred_prob > 0.5).astype(int).ravel()

        # Collect metrics; zero_division=0 scores a never-predicted class as 0
        # without emitting a warning for every fold.
        fold_accuracies.append(accuracy_score(y_val_fold, y_val_pred))
        fold_precisions.append(precision_score(y_val_fold, y_val_pred, average='macro', zero_division=0))
        fold_recalls.append(recall_score(y_val_fold, y_val_pred, average='macro', zero_division=0))
        fold_f1_scores.append(f1_score(y_val_fold, y_val_pred, average='macro', zero_division=0))

    # Step 9: Display results per fold and average metrics across folds
    print("Fold-wise Results (Macro Average):")
    per_fold = zip(fold_accuracies, fold_precisions, fold_recalls, fold_f1_scores)
    for fold_number, (acc, prec, rec, f1) in enumerate(per_fold, start=1):
        print(f"Fold {fold_number}: Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1-Score={f1:.4f}")

    mean_accuracy = np.mean(fold_accuracies)
    mean_precision = np.mean(fold_precisions)
    mean_recall = np.mean(fold_recalls)
    mean_f1_score = np.mean(fold_f1_scores)

    print("\nAverage Results Across Folds (Macro Average):")
    print(f"Accuracy: {mean_accuracy:.4f}")
    print(f"Precision: {mean_precision:.4f}")
    print(f"Recall: {mean_recall:.4f}")
    print(f"F1-Score: {mean_f1_score:.4f}")

    # Return all results
    return {
        'accuracy': fold_accuracies,
        'precision': fold_precisions,
        'recall': fold_recalls,
        'f1_score': fold_f1_scores,
        'mean_accuracy': mean_accuracy,
        'mean_precision': mean_precision,
        'mean_recall': mean_recall,
        'mean_f1_score': mean_f1_score
    }


In [None]:
# Step 10: Evaluate Model 1 using K-Fold Cross-Validation with Macro Average metrics
# build_model_1 is passed uncalled so the evaluator can build a fresh model for every fold.
model_1_results = evaluate_model_with_kfold_macro(X_train_ann, y_train_ann, build_model_1)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Fold-wise Results (Macro Average):
Fold 1: Accuracy=0.9032, Precision=0.8616, Recall=0.8616, F1-Score=0.8616
Fold 2: Accuracy=0.8197, Precision=0.7913, Recall=0.7826, F1-Score=0.7866
Fold 3: Accuracy=0.8197, Precision=0.7167, Recall=0.7836, F1-Score=0.7384
Fold 4: Accuracy=0.8525, Precision=0.7912, Recall=0.7380, F1-Score=0.7589
Fold 5: Accuracy=0.8852, Precision=0.9300, Recall=0.8056, F1-Score=0.8417

Average Results Across Folds (Macro Average):
Accuracy: 0.8561
Precision: 0.8182
Recall: 0.7943
F1-Score: 0.7974


In [None]:
# Step 11: Train final model on full training data
final_model = build_model_1()
# Keep the History object returned by fit() in a variable: it stays available for
# later inspection and the cell no longer echoes its repr as output.
final_history = final_model.fit(X_train_ann, y_train_ann, epochs=50, batch_size=10, verbose=0)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


<keras.src.callbacks.history.History at 0x781fea01b700>

In [None]:
# Step 12: Evaluate on test data (20% test set)
# Predicted probabilities are thresholded at 0.5 to obtain hard 0/1 class labels.
y_test_pred_prob = final_model.predict(X_test_ann)
y_test_pred = (y_test_pred_prob > 0.5).astype(int)

# Compute test metrics (on 20% test data); precision, recall and F1 are macro-averaged.
test_accuracy = accuracy_score(y_test_ann, y_test_pred)
test_precision, test_recall, test_f1 = (
    metric(y_test_ann, y_test_pred, average='macro')
    for metric in (precision_score, recall_score, f1_score)
)

print("\nTest Data Results (Macro Average on 20% test set):")
print("\n".join([
    f"Accuracy: {test_accuracy:.4f}",
    f"Precision: {test_precision:.4f}",
    f"Recall: {test_recall:.4f}",
    f"F1-Score: {test_f1:.4f}",
]))


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step

Test Data Results (Macro Average on 20% test set):
Accuracy: 0.9091
Precision: 0.8442
Recall: 0.8227
F1-Score: 0.8329



**
*   ⭐️Preprocessing for the mortality dataset
: (includes the sample, missing values, and the resulting finding, i.e., data scarcity occurs in developed countries and is not random; thus, it is difficult to predict the missing values from the existing data, which mainly comes from developing countries.)
*   ⭐️Preprocessing for the breastfeeding dataset (includes a search algorithm and its effect i.e., whether the cost function is 0.)
*   ⭐️Cleaning dataset (includes merging two datasets, variable selection, sample and its finding: imbalanced data.)
*   ⭐️Feature engineering (includes one-hot encoding.)
*   ⭐️Validation (imbalanced data; therefore, it requires 5-fold cross-validation not simply splitting.)
*   ⭐️Model building (includes 3 models and their parameters.)
*   ⭐️Model evaluation (includes these models' metrics and results.)

**





# ⭐️Preprocessing for the mortality dataset

## Missing values
// Purpose: There is a problem of data scarcity; thus, it is important to confirm whether there is a pattern.

// Results: Missing values are more common in developed countries than in developing countries.

// Potential approaches: 1) use the other values, or predict them, to fill in the missing values; 2) delete the rows with missing values.

// Decision: This project employs the second approach. The main reason is that the sample size after deleting these rows is still over 300, meaning it is large enough to apply machine learning. Additionally, the first approach carries the risk of introducing bias by predicting the values from the existing data (which comes mainly from developing countries).

# ⭐️Preprocessing for the breastfeeding dataset

## Missing values

// Purpose: Find problems about missing values or other data issues.

// Results: There is a problem with the year periods: some values are a range of years rather than a single year.

## Alignment of year periods
// Problem and Purpose: There is a problem with the year periods: some values are a range of years rather than a single year; thus, it is crucial to select one year from each of these ranges before merging the two datasets.

// Results: It is seen that one year exists in another row, and it can be merged into the row that has the range value.

// Potential approaches: 1)use the search algorithm that does not avoid local optima. 2)use the search algorithm that tries to avoid local optima.

// Decision: The first option (i.e., the hill climbing algorithm) is better because local optima could not occur. Moreover, it does not waste computing resources compared with simulated annealing, tabu search, and genetic algorithms.

## Results and analysis of Hill climbing algorithm

// Problem and Purpose: This is for confirming that the selected algorithm performs well in aligning the years so that the two datasets can be merged.

// Results: The performance is good.

// Evidence: The chart is given below to confirm the performance of the hill climbing algorithm.

# ⭐️Cleaning dataset

## Merge two datasets into one

## Missing values and Sample information

// Purpose: To confirm whether the dataset is completely filled and can be used for AI/ML models. This section also aims to identify potential biases.

// Results: No missing values. However, most of the data relates to developing countries; therefore, it might cause bias and should be reported in the limitations part.

## Variable selection
// Purpose: To identify the crucial and non-crucial variables. Additionally, this project considers additional variables for building a better model.


[For target variable]

// Results: XXX

// Potential approaches: 1) use "under-five mortality rates" (original data) or 2) use "UNICEF's Target Achieved" (Binary).

// Decision: The second approach is employed in this project. The main reason is that the model's output is easier to understand than raw numerical data. This understandable result helps not only technical users but also business members to grasp the results, enabling them to decide on investment in the health care sector.


[For independent variables]

// Results: XXX

// Potential approaches: 1) use "country" or 2) add "region" based on the countries given in original dataset.

// Decision: The second approach is a better choice because it reduces the number of features, making model learning faster.

# ⭐️Feature engineering (It can specify the final input features)

## ⭐️Validation

# ⭐️Building Models

// Class Labels:
"Target Achieved" (UFMR ≤ 25): This would be classified as Negative (0) because it represents a successful outcome where the under-five mortality rate is low.
"Target Not Achieved" (UFMR > 25): This would be classified as Positive (1) because it represents a failure to meet the target, where the under-five mortality rate is higher than desired.


// In Summary:
Positive Class (1): Represents "Target Not Achieved" (i.e., UFMR > 25).
Negative Class (0): Represents "Target Achieved" (i.e., UFMR ≤ 25).
Implications for Model Evaluation:
True Positives (TP): Cases where the model correctly predicts "Target Not Achieved" (UFMR > 25).
False Positives (FP): Cases where the model incorrectly predicts "Target Not Achieved" when the actual label is "Target Achieved."
True Negatives (TN): Cases where the model correctly predicts "Target Achieved" (UFMR ≤ 25).
False Negatives (FN): Cases where the model incorrectly predicts "Target Achieved" when the actual label is "Target Not Achieved."

# ⭐️Validation

## K-fold cross-validation

// Purpose: To evaluate the learning models with imbalanced dataset

// Potential Approaches: 1) Only one splitting and validating, or 2) using K-fold cross validation.

// Decision: This project implements the second approach because it uses an imbalanced dataset.

# ⭐️Model evaluation

## Accuracy, Precision, Recall, F1-score, ROC AUC, and PR AUC

// Purpose: To find the best model in the three ANNs.

// Potential Approaches: 1) use only Accuracy, Precision, Recall, and F1-score or 2) use them and ROC AUC and PR AUC.

// Decision: The second approach is employed in this project since the output data is imbalanced; therefore, these additional metrics are required so that the researchers can evaluate the models while avoiding potential bias.

// Results: XXX

// Memo: Area under curve receiver operating characteristic (ROC AUC) and Area under curve precision-recall (PR AUC)

In [None]:
# Accuracy

def calculate_accuracy(tp, tn, fp, fn):
    """Calculate Accuracy.

    Args:
        tp: Number of true positives.
        tn: Number of true negatives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        (tp + tn) / (tp + tn + fp + fn), or 0 when the four counts do not add up
        to a positive total (same fallback as the other metric helpers, instead
        of raising ZeroDivisionError).
    """
    total = tp + tn + fp + fn
    return (tp + tn) / total if total > 0 else 0

In [None]:
# Precision

def calculate_precision(tp, fp):
    """Calculate Precision.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.

    Returns:
        tp / (tp + fp), or 0 when tp + fp is not greater than zero.
    """
    predicted_positive = tp + fp
    if predicted_positive > 0:
        return tp / predicted_positive
    return 0

In [None]:
# Recall

def calculate_recall(tp, fn):
    """Calculate Recall.

    Args:
        tp: Number of true positives.
        fn: Number of false negatives.

    Returns:
        tp / (tp + fn), or 0 when tp + fn is not greater than zero.
    """
    actual_positive = tp + fn
    # Guard clause: no actual positives means the ratio is undefined; report 0.
    if not actual_positive > 0:
        return 0
    return tp / actual_positive

In [None]:
# F1-score

def calculate_f1(precision, recall):
    """Calculate F1-Score.

    Args:
        precision: Precision value.
        recall: Recall value.

    Returns:
        The harmonic mean 2 * precision * recall / (precision + recall), or 0
        when precision + recall is not greater than zero.
    """
    denominator = precision + recall
    if denominator > 0:
        numerator = 2 * (precision * recall)
        return numerator / denominator
    return 0

In [None]:
# ROC AUC

def calculate_roc_auc(tp, tn, fp, fn):
    """Approximate ROC AUC from a single confusion matrix.

    Only one operating point (TPR, FPR) is available from the four counts, so
    the value returned is (1 + TPR - FPR) / 2 — the area under the two-segment
    ROC curve through that single point — not an AUC integrated over all
    decision thresholds.

    Args:
        tp: Number of true positives.
        tn: Number of true negatives.
        fp: Number of false positives.
        fn: Number of false negatives.

    Returns:
        (1 + tpr - fpr) / 2, or 0 when there are no actual positives or no
        actual negatives (the rate is undefined; 0 matches the fallback used by
        the other metric helpers instead of raising ZeroDivisionError).
    """
    actual_positive = tp + fn
    actual_negative = fp + tn
    if actual_positive <= 0 or actual_negative <= 0:
        return 0

    tpr = tp / actual_positive  # True Positive Rate (Recall)
    fpr = fp / actual_negative  # False Positive Rate
    return (1 + tpr - fpr) / 2

In [None]:
# PR AUC

def calculate_pr_auc(precision, recall):
    """Calculate a simplified PR AUC (Area Under the Precision-Recall Curve).

    The value is the arithmetic mean of one precision/recall pair; no curve is
    integrated here.

    Args:
        precision: Precision value.
        recall: Recall value.

    Returns:
        (precision + recall) / 2.
    """
    combined = precision + recall
    return combined / 2

In [None]:
from sklearn.metrics import confusion_matrix

# Placeholder for storing results: one initially empty list per metric name,
# each list being a separate object so metrics can be appended independently.
metrics_results = {
    metric_name: []
    for metric_name in ("Accuracy", "Precision", "Recall", "F1-Score", "ROC AUC", "PR AUC")
}



In [None]:
# While compiling, k-fold cross-validation is used




In [None]:
# 8. Build and compile the model
# Create the model in this cell so it does not rely on a `model` variable left over
# in the kernel. build_model_1() returns a model that is already compiled with
# loss='binary_crossentropy', optimizer='adam' and metrics=['accuracy'].
model = build_model_1()

# 9. Train the model
model.fit(X_train_ann, y_train_ann, epochs=50, batch_size=10, verbose=1)

Epoch 1/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3518 - loss: 1.0068
Epoch 2/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.4579 - loss: 0.8492
Epoch 3/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6005 - loss: 0.7385
Epoch 4/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6521 - loss: 0.6651
Epoch 5/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6934 - loss: 0.5927
Epoch 6/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7093 - loss: 0.5551
Epoch 7/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7301 - loss: 0.5228
Epoch 8/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7670 - loss: 0.4994  
Epoch 9/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x7c3d6df5af20>

In [None]:
# To show the results of k-fold cross-validation (average is also specified)

In [None]:
# Graph so that we could compare the three models

In [None]:
# Affine function (to transform data into linear model)
def affine(X, W, b):
    """Apply the affine map X·W + b.

    Args:
        X: Input values passed as the first operand of ``np.dot``.
        W: Weights passed as the second operand of ``np.dot``.
        b: Bias added to the product (combined by NumPy broadcasting).

    Returns:
        ``np.dot(X, W) + b``.
    """
    weighted_sum = np.dot(X, W)
    return weighted_sum + b

In [None]:
# ReLU function (to transform the linear model into the non-linear model)
def relu(X):
    """Apply the ReLU activation element-wise.

    Args:
        X: Scalar or array of pre-activation values.

    Returns:
        The element-wise maximum of 0 and X (negative entries become 0).
    """
    rectified = np.maximum(0, X)
    return rectified

In [None]:
# Sigmoid function (to output probability for binary data)
def sigmoid(X):
    """Apply the logistic sigmoid 1 / (1 + exp(-X)) element-wise.

    The value is computed as exp(-log(1 + exp(-X))) through ``np.logaddexp``,
    which evaluates log(exp(0) + exp(-X)) without forming exp(-X) on its own.
    This avoids the floating-point overflow (and RuntimeWarning) that the direct
    formula triggers for large negative inputs.

    Args:
        X: Scalar, NumPy array or array-like (e.g. list) of real values.

    Returns:
        Values in [0, 1] with the same shape as X.
    """
    return np.exp(-np.logaddexp(0, np.negative(X)))

In [None]:
# Binary Crossentropy function (loss calculation)
def binary_crossentropy(y_true, y_pred):
    # Clip predictions to avoid log(0) errors
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

In [None]:
# Initialise weights and biases
def initialise_weights(input_size, layer_size):
    """Create the initial parameters of one dense layer.

    Args:
        input_size: Number of inputs feeding the layer.
        layer_size: Number of units in the layer.

    Returns:
        Tuple (W, b): W has shape (input_size, layer_size) and holds standard
        normal samples from NumPy's global random state scaled by 0.01; b is an
        all-zero array of shape (1, layer_size).
    """
    weights = np.random.randn(input_size, layer_size) * 0.01
    biases = np.zeros(shape=(1, layer_size))
    return weights, biases

In [None]:
# Forward propagation (Affine -> Activation)
def forward_propagation(X, W, b, activation='relu'):
    """Run one layer forward: affine transform followed by an activation.

    Args:
        X: Layer input.
        W: Layer weights.
        b: Layer bias.
        activation: Either 'relu' or 'sigmoid'.

    Returns:
        Tuple (A, Z): Z is the affine output and A is the activation applied to Z.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """
    # Validate first: an unknown name would otherwise leave A unassigned and
    # surface as an UnboundLocalError at the return statement.
    if activation not in ('relu', 'sigmoid'):
        raise ValueError(
            f"Unsupported activation {activation!r}; expected 'relu' or 'sigmoid'."
        )

    Z = affine(X, W, b)
    if activation == 'relu':
        A = relu(Z)
    else:
        A = sigmoid(Z)
    return A, Z

In [None]:
# Derivative of sigmoid function
def sigmoid_derivative(A):
    """Return A * (1 - A) element-wise.

    Args:
        A: Scalar or array; presumably the sigmoid output rather than the
            pre-activation value, since the formula is written in terms of it —
            confirm against the caller.

    Returns:
        A * (1 - A).
    """
    complement = 1 - A
    return A * complement