In [50]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from tensorflow.keras.regularizers import l2
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt

In [51]:
# Read the databases from resources
health_factors = pd.read_csv("/content/Health_Risk_factors.csv")
agriculture = pd.read_csv("/content/agricultural_inputs.csv")
climate = pd.read_csv("/content/climate.csv")
freshwater = pd.read_csv("/content/freshwater.csv")
health_system = pd.read_csv("/content/health_system.csv")


In [52]:
hf = health_factors.rename(columns={
    "Incidence of tuberculosis  per 100,000 people 2018": "tuberculosis",
    "Prevalence of HIV Total % of population ages 15-49 2018": "HIV_total",
    "Prevalence of HIV Women's share of population ages 15+ living with HIV % 2018": "HIV_female",
    "Prevalence of HIV Youth, Male % of population ages 15-24 2018": "HIV_youth_male",
    "Prevalence of HIV Youth, Female % of population ages 15-24 2018": "HIV_youth_female",
    "Prevalence of diabetes  % of population ages 20 to 79 2019": "diabetes",
    "Cause of death Communicable diseases and maternal, prenatal, and nutrition conditions % of population 2016":"Diseases_death",
})
hs = health_system.rename(columns={
    'External health expenditure (% of current health expenditure)   2016': "ext_health_expend",
    "Health expenditure Public % of current 2016": "public_health_expend",
    "Health workers Physicians per 1,000 people 2009-18": "physicians",
    "Health workers Nurses and midwives per 1,000 people 2009-18": "nurses",
    "Specialist surgical workforce  per 100,000 population 2008-18": "surgeries"
     })
fw = freshwater.rename(columns={
    'Annual freshwater withdrawals  % for agriculture 2015': "water_agriculture",
    "People using at least basic drinking water services Urban % of urban population 2018": "water_urban",
    "People using at least basic drinking water services Rural % of Rural population 2018": "water_rural",})
cl = climate.rename(columns={
    'Resilience Disaster risk reduction progress score 1, worst to 5,best 2011':"Disaster_risk",
    'Exposure to impact Urban population living in areas where elevation is below 5 meters % of urban population 2010':"urban_population",
    'Exposure to impact Population affected by droughts, floods, and extreme temperatures average annual; % of total population 2009':"climet_affected_population"
})
ag = agriculture.rename(columns={
    'Fertilizer consumption  kilograms per hectare of arable land 2014-16': "fertilizer",
    "Agricultural employment  % of total employment 2000-02": "farmers_2000",
    "Agricultural employment  % of total employment 2014-16": "farmers_recent",
    "Agricultural machinery tractors per 100 sq. km of arable land 2009": "agricultur_machinery"})


In [53]:
merged_df = pd.concat([hf[[ 'Diseases_death', 'tuberculosis']],
                      hs[['ext_health_expend', 'public_health_expend', 'physicians', 'nurses', 'surgeries']],
                      fw[[ 'water_urban', 'water_rural']],
                      cl[[ 'climet_affected_population']],
                      ag[['fertilizer', 'farmers_2000', 'farmers_recent']]],
                     axis=1)

In [54]:
merged_df.columns

Index(['Diseases_death', 'tuberculosis', 'ext_health_expend',
       'public_health_expend', 'physicians', 'nurses', 'surgeries',
       'water_urban', 'water_rural', 'climet_affected_population',
       'fertilizer', 'farmers_2000', 'farmers_recent'],
      dtype='object')

In [55]:
merged_df.describe()

Unnamed: 0,Diseases_death,tuberculosis,ext_health_expend,public_health_expend,physicians,nurses,surgeries,water_urban,water_rural,climet_affected_population,fertilizer,farmers_2000,farmers_recent
count,183.0,207.0,167.0,186.0,189.0,189.0,175.0,174.0,168.0,168.0,157.0,187.0,187.0
mean,22.31694,102.89372,9.116766,52.915054,1.72328,4.139153,37.937143,94.198851,80.005952,1.171429,141.32293,31.260963,25.916043
std,20.291504,137.634271,13.868251,21.957736,1.570366,3.965282,40.87752,7.935433,22.542109,1.958312,149.705871,25.199064,23.425185
min,1.0,0.0,0.0,5.1,0.0,0.1,0.0,64.7,22.8,0.0,0.3,0.3,0.1
25%,6.0,9.5,0.2,36.825,0.3,1.1,3.1,90.775,61.65,0.0,22.9,7.45,5.0
50%,13.0,45.0,1.7,55.85,1.3,2.8,23.6,97.8,89.95,0.25,112.1,24.2,18.6
75%,36.0,144.0,13.4,70.6,2.8,6.1,62.15,99.8,99.2,1.3,196.1,49.35,42.0
max,65.0,611.0,69.2,96.0,8.2,20.3,195.6,100.0,100.0,9.2,750.7,92.2,91.8


In [56]:
#df1 = ag.replace('', np.nan)  # Replace empty values with NaN
df1 = merged_df.dropna()  # Drop rows with any missing values
#merged_df.to_csv("merged_df.csv")

In [57]:
df1["tuberculosis"].describe()

count    104.000000
mean     111.105769
std      135.983503
min        3.000000
25%       16.750000
50%       57.500000
75%      151.000000
max      554.000000
Name: tuberculosis, dtype: float64

In [58]:
# Define the thresholds for classification

threshold_tuberculosis = 100
# Create a DataFrame to store the classified values
df2 = pd.DataFrame()
# Classify every columns
df2=df1[['Diseases_death', 'ext_health_expend',
       'public_health_expend', 'physicians', 'nurses', 'surgeries',
       'water_urban', 'water_rural', 'climet_affected_population',
       'fertilizer', 'farmers_2000', 'farmers_recent']]
df2['tuberculosis'] = [0 if i < threshold_tuberculosis else 1 for i in df1['tuberculosis']]

# Extract X and y
y = df2['tuberculosis'].values
X = df2.drop(columns='tuberculosis').values
df2.shape

(104, 13)

In [59]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1,stratify=y,test_size=.2)

In [60]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
l_1 = 50
l_2 = 50
input_features = len(X_train[0])
nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=l_1, input_dim = input_features, activation = "tanh", kernel_regularizer=l2(0.01)))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=l_2, activation = "LeakyReLU", kernel_regularizer=l2(0.01)))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 50)                650       
                                                                 
 dense_4 (Dense)             (None, 50)                2550      
                                                                 
 dense_5 (Dense)             (None, 1)                 51        
                                                                 
Total params: 3,251
Trainable params: 3,251
Non-trainable params: 0
_________________________________________________________________


In [61]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [62]:
# Train the model
nn.fit(X_train, y_train, epochs=70)

Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


<keras.callbacks.History at 0x7f22eb74fc10>

In [63]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1/1 - 0s - loss: 0.9602 - accuracy: 0.7143 - 108ms/epoch - 108ms/step
Loss: 0.9601607322692871, Accuracy: 0.7142857313156128


In [64]:
# Predict on the test data
y_pred = nn.predict(X_test)

# The predictions will be in the form of probabilities, so you might want to round them to get class labels (0 or 1)
y_pred_class = y_pred.round()
# Calculate the accuracy of the predictions
correct_predictions = (y_pred_class == y_test.reshape(-1, 1)).sum()
total_samples = len(y_test)
accuracy = correct_predictions / total_samples
print("Accuracy:", accuracy)

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_class))

Accuracy: 0.7142857142857143
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.86      0.80        14
           1       0.60      0.43      0.50         7

    accuracy                           0.71        21
   macro avg       0.68      0.64      0.65        21
weighted avg       0.70      0.71      0.70        21

