In [None]:
import pandas as pd

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load the cleaned dataset from a CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="cleaned_dataset_v0.2.csv")

In [None]:
# Preview the first five rows to sanity-check the columns
df.head(n=5)

Unnamed: 0,Country,Year,UFMR(per1000births)_Both,UFMR(per1000births)_Male,UFMR(per1000births)_Female,Number_of_deaths_UF_Both,Number_of_deaths_UF_Male,Number_of_deaths_UF_Female,Early initiation of breastfeeding (%),Infants exclusively breastfed for the first six months of life (%),Region,UFMR_SDGS_Category
0,Algeria,2012,26.49,28.25,24.65,24174.0,13188.0,10986.0,35.7,25.4,Africa,Target Not Achieved
1,Algeria,2006,32.34,34.42,30.1,22698.0,12399.0,10299.0,49.5,6.9,Africa,Target Not Achieved
2,Angola,2015,88.34,95.01,81.18,100216.0,54635.0,45581.0,48.3,37.4,Africa,Target Not Achieved
3,Benin,2014,100.13,106.74,93.1,39709.0,21615.0,18094.0,46.6,41.4,Africa,Target Not Achieved
4,Benin,2012,104.6,111.33,97.46,39244.0,21327.0,17917.0,50.4,32.5,Africa,Target Not Achieved



**
*   ⭐️Preprocessing for the mortality dataset
: (covers the sample, the missing values, and the main finding: data scarcity occurs mostly in developed countries and is not random, so it is difficult to predict the missing values from the existing data, which mainly comes from developing countries.)
*   ⭐️Preprocessing for the breastfeeding dataset (includes a search algorithm and its effect i.e., whether the cost function is 0.)
*   ⭐️Cleaning dataset (includes merging two datasets, variable selection, sample and its finding: imbalanced data.)
*   ⭐️Feature engineering (includes one-hot encoding.)
*   ⭐️Validation (imbalanced data; therefore, it requires 5-fold cross-validation not simply splitting.)
*   ⭐️Model building (includes 3 models and their parameters.)
*   ⭐️Model evaluation (includes these models' metrics and results.)

**





# ⭐️Preprocessing for the mortality dataset

## Missing values
// Purpose: There is a problem of data scarcity; thus, it is important to confirm whether there is a pattern.

// Results: There are more missing values for developed countries than for developing countries.

// Potential approaches: 1) fill in the missing values using other values or predictions, 2) delete the rows with missing values.

// Decision: This project employs the second approach. The main reason is that the sample size after deleting these rows is still over 300, which is enough to apply machine learning. Additionally, the first approach risks introducing bias, because the missing values would be predicted from existing data that comes mainly from developing countries.

# ⭐️Preprocessing for the breastfeeding dataset

## Missing values

// Purpose: Find problems about missing values or other data issues.

// Results: There is a problem with the year periods: some values are a range of years rather than a single year.

## Alignment of year periods
// Problem and Purpose: Some year values are a range of years rather than a single year; thus, it is crucial to select one year from each of these ranges before merging the two datasets.

// Results: It is seen that one year exists in another row, and it can be merged into the row that has the range value.

// Potential approaches: 1)use the search algorithm that does not avoid local optima. 2)use the search algorithm that tries to avoid local optima.

// Decision: The first option (i.e., the hill climbing algorithm) is better because local optima cannot occur in this problem. Moreover, it does not waste computing resources compared with simulated annealing, tabu search, or a genetic algorithm.

## Results and analysis of Hill climbing algorithm

// Problem and Purpose: To confirm that the selected algorithm performs well when aligning the years so that the two datasets can be merged.

// Results: The performance is good.

// Evidence: The chart below confirms the performance of the hill climbing algorithm.

# ⭐️Cleaning dataset

## Merge two datasets into one

## Missing values and Sample information

// Purpose: To confirm whether the dataset is completely filled and can be used for AI/ML models. This section also aims to identify potential biases.

// Results: No missing values. However, most of the data relates to developing countries; therefore, it might cause bias and should be reported in the limitations section.

## Variable selection
// Purpose: To identify the crucial and non-crucial variables. Additionally, this project considers extra variables for building a better model.


[For target variable]

// Results: XXX

// Potential approaches: 1) use "under-five mortality rates" (original data) or 2) use "UNICEF's Target Achieved" (Binary).

// Decision: The second approach is employed in this project. The main reason is that a binary output is easier to understand than raw numerical data. This understandable result helps not only technical users but also business members to grasp the results, enabling them to decide on investment in the health care sector.


[For independent variables]

// Results: XXX

// Potential approaches: 1) use "country" or 2) add "region" based on the countries given in original dataset.

// Decision: The second approach is the better choice because it reduces the number of features, making model learning faster.

# ⭐️Feature engineering (It can specify the final input features)

In [None]:
# 1. Binarise the target using the SDG threshold: an under-five mortality rate of
#    25 or less per 1,000 births is 'Target Achieved'; anything above 25 is
#    'Target Not Achieved'.
sdgs_labels = ['Target Achieved', 'Target Not Achieved']  # class labels, in bin order
df['UFMR_SDGS_Category'] = pd.cut(
    df['UFMR(per1000births)_Both'],
    bins=[0, 25, float('inf')],
    labels=sdgs_labels,
    # Close the first bin on the left so a rate of exactly 0 falls into
    # 'Target Achieved' instead of becoming NaN (which would break astype(int) below).
    include_lowest=True,
)

# 2. One-hot encode Region (drop_first=True drops one dummy column as the baseline)
df_encoded_ann = pd.get_dummies(df, columns=['Region'], drop_first=True)

# 3. Define the explanatory variables and the target variable
region_columns = [col for col in df_encoded_ann.columns if col.startswith("Region_")]
features_ann = ['Year', 'Early initiation of breastfeeding (%)',
                'Infants exclusively breastfed for the first six months of life (%)'] + region_columns
X_ann = df_encoded_ann[features_ann]
y_ann = df_encoded_ann['UFMR_SDGS_Category']

# 4. Encode the labels as integers (0: Target Achieved, 1: Target Not Achieved)
y_ann = y_ann.map({'Target Achieved': 0, 'Target Not Achieved': 1})
y_ann = y_ann.astype(int)  # explicitly convert y_ann to an integer dtype



In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): this import is only available inside Google Colab. The dataset
# above is read via a relative path, so presumably this mount is needed only
# when the CSV lives on Drive -- confirm, and consider moving it to the top.
from google.colab import drive
drive.mount('/content/drive')

## ⭐️Validation

In [None]:
# 5. Hold out 20% of the rows as a test set (fixed seed so the split is repeatable)
# NOTE(review): the split is not stratified; with an imbalanced target,
# consider stratify=y_ann -- confirm before changing, as it alters the results.
X_train_ann, X_test_ann, y_train_ann, y_test_ann = train_test_split(
    X_ann,
    y_ann,
    test_size=0.2,
    random_state=42,
)

# 6. Standardise the features: fit the scaler on the training rows only,
#    then apply the same transform to both splits
scaler = StandardScaler()
scaler.fit(X_train_ann)
X_train_ann = scaler.transform(X_train_ann)
X_test_ann = scaler.transform(X_test_ann)



In [None]:
# Split the dataset into two sets, i.e., a training set and a test set

In [None]:
# k-fold (5-fold) cross-validation (this function is used in the "Model evaluation" part)

# ⭐️Building Models

In [None]:
import numpy as np

In [None]:
# Affine function (linear map of the inputs plus a bias)
def affine(X, W, b):
    """Return the affine transform ``np.dot(X, W) + b``."""
    weighted_sum = np.dot(X, W)
    return weighted_sum + b

In [None]:
# ReLU function (introduces non-linearity after the affine step)
def relu(X):
    """Return the element-wise maximum of 0 and ``X``."""
    rectified = np.maximum(0, X)
    return rectified

In [None]:
# Sigmoid function (squashes values into (0, 1) for binary outputs)
def sigmoid(X):
    """Return the logistic function ``1 / (1 + exp(-X))`` element-wise."""
    exp_negative = np.exp(-X)
    return 1 / (1 + exp_negative)

In [None]:
# Binary Crossentropy function (loss calculation)
def binary_crossentropy(y_true, y_pred):
    """Return the mean binary cross-entropy between labels and predictions.

    Args:
        y_true: Array-like of binary labels (0 or 1).
        y_pred: Array-like of predicted probabilities, one per label.

    Returns:
        The mean loss over all elements as a NumPy float.
    """
    # Flatten both inputs so that (n,) labels and (n, 1) predictions pair up
    # element by element. Without this, NumPy broadcasts them to an (n, n)
    # matrix and the mean is silently wrong.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    # Clip predictions to avoid log(0) errors
    y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

In [None]:
# Initialise weights and biases
def initialise_weights(input_size, layer_size):
    """Create the parameters of one dense layer.

    Returns:
        ``(W, b)`` where ``W`` has shape ``(input_size, layer_size)`` and holds
        standard-normal draws scaled by 0.01, and ``b`` is a ``(1, layer_size)``
        array of zeros.
    """
    weight_shape = (input_size, layer_size)
    W = np.random.randn(*weight_shape) * 0.01
    b = np.zeros(shape=(1, layer_size))
    return W, b

In [None]:
# Forward propagation (Affine -> Activation)
def forward_propagation(X, W, b, activation='relu'):
    """Run one dense layer forward: affine transform, then activation.

    Args:
        X: Layer input.
        W: Weight matrix passed to ``affine``.
        b: Bias passed to ``affine``.
        activation: Either ``'relu'`` or ``'sigmoid'``.

    Returns:
        ``(A, Z)`` where ``Z`` is the pre-activation and ``A`` the activation.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """
    Z = affine(X, W, b)
    if activation == 'relu':
        A = relu(Z)
    elif activation == 'sigmoid':
        A = sigmoid(Z)
    else:
        # Fail loudly rather than hitting an UnboundLocalError on `A` below.
        raise ValueError(
            f"Unsupported activation: {activation!r} (expected 'relu' or 'sigmoid')"
        )
    return A, Z

In [None]:
# Derivative of the sigmoid, expressed in terms of its output
def sigmoid_derivative(A):
    """Return ``A * (1 - A)`` for the given value(s) ``A``."""
    complement = 1 - A
    return A * complement

In [None]:
# Backpropagation and gradient descent
def backpropagation(X, y, learning_rate=0.01, iterations=50, model_layers=[16, 8, 1]):
    """Train a three-layer network (ReLU -> ReLU -> sigmoid) by full-batch gradient descent.

    Args:
        X: Feature matrix with one row per sample (any 2-D array-like).
        y: Binary labels (0/1), one per sample. Any array-like is accepted; it
            is reshaped to a column so that it pairs element-wise with the
            output of the default single-unit output layer.
        learning_rate: Step size of each gradient-descent update.
        iterations: Number of full-batch updates to run.
        model_layers: Unit counts for hidden layer 1, hidden layer 2 and the
            output layer. The default list is only read, never mutated.

    Returns:
        ``(W1, b1, W2, b2, W3, b3, losses)``: the trained parameters and the
        list of loss values, one per iteration.
    """
    X = np.asarray(X, dtype=float)
    # A 1-D label vector (n,) would broadcast against the (n, 1) network output
    # into an (n, n) matrix and silently corrupt the loss and the gradients.
    y = np.asarray(y, dtype=float).reshape(-1, 1)

    n_samples = X.shape[0]
    input_size = X.shape[1]

    # initialise weights and biases for each layer
    W1, b1 = initialise_weights(input_size, model_layers[0])
    W2, b2 = initialise_weights(model_layers[0], model_layers[1])
    W3, b3 = initialise_weights(model_layers[1], model_layers[2])

    losses = []  # To track the loss over iterations

    # Iterative training
    for i in range(iterations):
        # Forward propagation
        A1, Z1 = forward_propagation(X, W1, b1, activation='relu')
        A2, Z2 = forward_propagation(A1, W2, b2, activation='relu')
        A3, Z3 = forward_propagation(A2, W3, b3, activation='sigmoid')

        # Compute loss (the helper defined in this notebook is `binary_crossentropy`)
        loss = binary_crossentropy(y, A3)
        losses.append(loss)

        # Backpropagation
        # Output layer: for sigmoid + binary cross-entropy, dL/dZ3 simplifies
        # to (A3 - y). This equals -(y/A3 - (1-y)/(1-A3)) * A3 * (1 - A3)
        # algebraically, but avoids dividing by zero when A3 saturates.
        dZ3 = A3 - y
        dW3 = np.dot(A2.T, dZ3) / n_samples
        db3 = np.sum(dZ3, axis=0, keepdims=True) / n_samples

        # Gradients for hidden layer 2 (ReLU)
        dA2 = np.dot(dZ3, W3.T)
        dZ2 = dA2 * (Z2 > 0)  # ReLU derivative
        dW2 = np.dot(A1.T, dZ2) / n_samples
        db2 = np.sum(dZ2, axis=0, keepdims=True) / n_samples

        # Gradients for hidden layer 1 (ReLU)
        dA1 = np.dot(dZ2, W2.T)
        dZ1 = dA1 * (Z1 > 0)  # ReLU derivative
        dW1 = np.dot(X.T, dZ1) / n_samples
        db1 = np.sum(dZ1, axis=0, keepdims=True) / n_samples

        # Gradient descent step: Update weights and biases
        W1 -= learning_rate * dW1
        b1 -= learning_rate * db1
        W2 -= learning_rate * dW2
        b2 -= learning_rate * db2
        W3 -= learning_rate * dW3
        b3 -= learning_rate * db3

        # Print loss every 10 iterations
        if i % 10 == 0:
            print(f"Iteration {i}, Loss: {loss}")

    return W1, b1, W2, b2, W3, b3, losses

# Example usage:
# W1, b1, W2, b2, W3, b3, losses = backpropagation(X_scaled, y, learning_rate=0.01, iterations=50)

In [None]:
# First model (one hidden layer of 16 nodes with ReLU; final activation function: Sigmoid)
# NOTE(review): later cells rebind `model`, so only the architecture defined
# last is the one that gets compiled and trained.

# 7. Build the model
model = Sequential([
    # Input + first hidden layer: 16 nodes, ReLU activation
    Dense(16, input_dim=X_train_ann.shape[1], activation='relu'),
    # Output layer: 1 node, Sigmoid activation
    Dense(1, activation='sigmoid'),
])

In [None]:
# Second model (two hidden layers of 16 and 8 nodes with ReLU; final activation function: Sigmoid)
# NOTE(review): a later cell rebinds `model`, so only the architecture defined
# last is the one that gets compiled and trained.

# 7. Build the model
model = Sequential([
    # Input + first hidden layer: 16 nodes, ReLU activation
    Dense(16, input_dim=X_train_ann.shape[1], activation='relu'),
    # Second hidden layer: 8 nodes, ReLU activation
    Dense(8, activation='relu'),
    # Output layer: 1 node, Sigmoid activation
    Dense(1, activation='sigmoid'),
])

In [None]:
# Third model (three hidden layers of 16, 8 and 4 nodes with ReLU; final activation function: Sigmoid)

# 7. Build the model
model = Sequential([
    # Input + first hidden layer: 16 nodes, ReLU activation
    Dense(16, input_dim=X_train_ann.shape[1], activation='relu'),
    # Second hidden layer: 8 nodes, ReLU activation
    Dense(8, activation='relu'),
    # Third hidden layer: 4 nodes, ReLU activation
    Dense(4, activation='relu'),
    # Output layer: 1 node, Sigmoid activation
    Dense(1, activation='sigmoid'),
])

# ⭐️Validation

## K-fold cross-validation

// Purpose: To evaluate the learning models with imbalanced dataset

// Potential Approaches: 1) Only one splitting and validating, or 2) using K-fold cross validation.

// Decision: This project implements the second approach because the dataset is imbalanced.

In [None]:
from sklearn.model_selection import KFold

# ⭐️Model evaluation

## Accuracy, Precision, Recall, F1-score, ROC AUC, and PR AUC

// Purpose: To find the best model in the three ANNs.

// Potential Approaches: 1) use only Accuracy, Precision, Recall, and F1-score or 2) use them and ROC AUC and PR AUC.

// Decision: The second approach is employed in this project since the output data is imbalanced; therefore, ROC AUC and PR AUC are also required so that the researchers can evaluate the models while avoiding potential bias.

// Results: XXX

// Memo: Area under curve receiver operating characteristic (ROC AUC) and Area under curve precision-recall (PR AUC)

In [None]:
# Accuracy

def calculate_accuracy(tp, tn, fp, fn):
    """Calculate Accuracy from confusion-matrix counts.

    Returns ``(tp + tn) / (tp + tn + fp + fn)``, or 0 when all counts are zero.
    The zero fallback matches the convention of the precision and recall
    helpers in this notebook instead of dividing by zero.
    """
    total = tp + tn + fp + fn
    return (tp + tn) / total if total > 0 else 0

In [None]:
# Precision

def calculate_precision(tp, fp):
    """Calculate Precision: ``tp / (tp + fp)``, or 0 when nothing was predicted positive."""
    predicted_positive = tp + fp
    if predicted_positive > 0:
        return tp / predicted_positive
    return 0

In [None]:
# Recall

def calculate_recall(tp, fn):
    """Calculate Recall: ``tp / (tp + fn)``, or 0 when there are no actual positives."""
    actual_positive = tp + fn
    if actual_positive > 0:
        return tp / actual_positive
    return 0

In [None]:
# F1-score

def calculate_f1(precision, recall):
    """Calculate F1-Score: the harmonic mean of precision and recall, or 0 when their sum is not positive."""
    denominator = precision + recall
    if not denominator > 0:
        return 0
    return 2 * (precision * recall) / denominator

In [None]:
# ROC AUC

def calculate_roc_auc(tp, tn, fp, fn):
    """Approximate ROC AUC from a single confusion matrix.

    Computes ``(1 + TPR - FPR) / 2``. This is the area under the ROC curve
    drawn through the one operating point given by the counts, not the full
    ROC AUC, which needs the predicted scores.

    A rate whose denominator is zero (no actual positives or no actual
    negatives) is treated as 0, matching the precision and recall helpers in
    this notebook instead of raising ZeroDivisionError.
    """
    actual_positive = tp + fn
    actual_negative = fp + tn
    tpr = tp / actual_positive if actual_positive > 0 else 0  # True Positive Rate (Recall)
    fpr = fp / actual_negative if actual_negative > 0 else 0  # False Positive Rate
    return (1 + tpr - fpr) / 2  # Simplified calculation for ROC AUC

In [None]:
# PR AUC

def calculate_pr_auc(precision, recall):
    """Calculate a simplified PR AUC value.

    Returns the arithmetic mean of one precision value and one recall value;
    no precision-recall curve is traced or integrated here.
    """
    combined = precision + recall
    return combined / 2  # Simplified PR AUC calculation

In [None]:
from sklearn.metrics import confusion_matrix

# Container for collected metric values: one empty list per metric name
metrics_results = {
    metric_name: []
    for metric_name in (
        "Accuracy",
        "Precision",
        "Recall",
        "F1-Score",
        "ROC AUC",
        "PR AUC",
    )
}



In [None]:
# While compiling, k-fold cross-validation is used




In [None]:
# 8. Compile the model (binary cross-entropy loss, Adam optimiser, accuracy metric)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# 9. Train the model on the training split
model.fit(
    X_train_ann,
    y_train_ann,
    epochs=50,
    batch_size=10,
    verbose=1,
)

Epoch 1/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3518 - loss: 1.0068
Epoch 2/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.4579 - loss: 0.8492
Epoch 3/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6005 - loss: 0.7385
Epoch 4/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6521 - loss: 0.6651
Epoch 5/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6934 - loss: 0.5927
Epoch 6/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7093 - loss: 0.5551
Epoch 7/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7301 - loss: 0.5228
Epoch 8/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7670 - loss: 0.4994  
Epoch 9/50
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x7c3d6df5af20>

In [None]:
# To show the results of k-fold cross-validation (the average is also specified)

In [None]:
# Graph so that we could compare the three models

In [None]:
# 10. Evaluate the model; evaluate() returns (loss, accuracy) and only the accuracy is kept
_, train_acc = model.evaluate(
    X_train_ann,
    y_train_ann,
    verbose=0,
)
_, test_acc = model.evaluate(
    X_test_ann,
    y_test_ann,
    verbose=0,
)

In [None]:

# 11. Predict on the test data, then threshold the model outputs at 0.5 to get 0/1 labels
y_test_prob_ann = model.predict(X_test_ann)
y_test_pred_ann = (y_test_prob_ann > 0.5).astype(int)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step


In [None]:
# 12. Build the confusion matrix and the classification report
conf_matrix_ann = confusion_matrix(y_true=y_test_ann, y_pred=y_test_pred_ann)
class_report_ann = classification_report(
    y_true=y_test_ann,
    y_pred=y_test_pred_ann,
    target_names=sdgs_labels,
)

# Display the results
print(f"Training Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print("\nConfusion Matrix (ANN):\n", conf_matrix_ann)
print("\nClassification Report (ANN):\n", class_report_ann)

Training Accuracy: 0.8889
Test Accuracy: 0.8961

Confusion Matrix (ANN):
 [[ 7  6]
 [ 2 62]]

Classification Report (ANN):
                      precision    recall  f1-score   support

    Target Achieved       0.78      0.54      0.64        13
Target Not Achieved       0.91      0.97      0.94        64

           accuracy                           0.90        77
          macro avg       0.84      0.75      0.79        77
       weighted avg       0.89      0.90      0.89        77

