In [1]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from tensorflow.keras.regularizers import l2
from sklearn.feature_selection import SelectKBest, chi2
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the databases from resources
health_factors = pd.read_csv("/content/Health_Risk_factors.csv")
agriculture = pd.read_csv("/content/agricultural_inputs.csv")
climate = pd.read_csv("/content/climate.csv")
freshwater = pd.read_csv("/content/freshwater.csv")
health_system = pd.read_csv("/content/health_system.csv")


In [3]:
hf = health_factors.rename(columns={
    "Incidence of tuberculosis  per 100,000 people 2018": "tuberculosis",
    "Prevalence of HIV Total % of population ages 15-49 2018": "HIV_total",
    "Prevalence of HIV Women's share of population ages 15+ living with HIV % 2018": "HIV_female",
    "Prevalence of HIV Youth, Male % of population ages 15-24 2018": "HIV_youth_male",
    "Prevalence of HIV Youth, Female % of population ages 15-24 2018": "HIV_youth_female",
    "Prevalence of diabetes  % of population ages 20 to 79 2019": "diabetes",
    "Cause of death Communicable diseases and maternal, prenatal, and nutrition conditions % of population 2016":"Diseases_death",
})
hs = health_system.rename(columns={
    'External health expenditure (% of current health expenditure)   2016': "ext_health_expend",
    "Health expenditure Public % of current 2016": "public_health_expend",
    "Health workers Physicians per 1,000 people 2009-18": "physicians",
    "Health workers Nurses and midwives per 1,000 people 2009-18": "nurses",
    "Specialist surgical workforce  per 100,000 population 2008-18": "surgeries"
     })
fw = freshwater.rename(columns={
    'Annual freshwater withdrawals  % for agriculture 2015': "water_agriculture",
    "People using at least basic drinking water services Urban % of urban population 2018": "water_urban",
    "People using at least basic drinking water services Rural % of Rural population 2018": "water_rural",})
cl = climate.rename(columns={
    'Resilience Disaster risk reduction progress score 1, worst to 5,best 2011':"Disaster_risk",
    'Exposure to impact Urban population living in areas where elevation is below 5 meters % of urban population 2010':"urban_population",
    'Exposure to impact Population affected by droughts, floods, and extreme temperatures average annual; % of total population 2009':"climet_affected_population"
})
ag = agriculture.rename(columns={
    'Fertilizer consumption  kilograms per hectare of arable land 2014-16': "fertilizer",
    "Agricultural employment  % of total employment 2000-02": "farmers_2000",
    "Agricultural employment  % of total employment 2014-16": "farmers_recent",
    "Agricultural machinery tractors per 100 sq. km of arable land 2009": "agricultur_machinery"})


In [4]:
merged_df = pd.concat([hf[[ "HIV_total","HIV_female", 'Diseases_death',"HIV_youth_male","HIV_youth_female", "diabetes",'tuberculosis']],
                      hs[['ext_health_expend', 'public_health_expend', 'physicians', 'nurses', 'surgeries']],
                      fw[[ "water_agriculture",'water_urban', 'water_rural']],
                      cl[[ "Disaster_risk",'climet_affected_population',"urban_population"]],
                      ag[['fertilizer', 'farmers_2000', 'farmers_recent', "agricultur_machinery"]]],
                     axis=1)

In [5]:
merged_df.columns

Index(['HIV_total', 'HIV_female', 'Diseases_death', 'HIV_youth_male',
       'HIV_youth_female', 'diabetes', 'tuberculosis', 'ext_health_expend',
       'public_health_expend', 'physicians', 'nurses', 'surgeries',
       'water_agriculture', 'water_urban', 'water_rural', 'Disaster_risk',
       'climet_affected_population', 'urban_population', 'fertilizer',
       'farmers_2000', 'farmers_recent', 'agricultur_machinery'],
      dtype='object')

In [6]:
merged_df.describe()

Unnamed: 0,HIV_total,HIV_female,Diseases_death,HIV_youth_male,HIV_youth_female,diabetes,tuberculosis,ext_health_expend,public_health_expend,physicians,...,water_agriculture,water_urban,water_rural,Disaster_risk,climet_affected_population,urban_population,fertilizer,farmers_2000,farmers_recent,agricultur_machinery
count,141.0,139.0,183.0,137.0,137.0,208.0,207.0,167.0,186.0,189.0,...,103.0,174.0,168.0,83.0,168.0,176.0,157.0,187.0,187.0,102.0
mean,1.837589,39.330935,22.31694,0.442336,0.893431,8.293269,102.89372,9.116766,52.915054,1.72328,...,48.223301,94.198851,80.005952,3.298795,1.171429,3.805114,141.32293,31.260963,25.916043,194.943137
std,4.309152,17.419957,20.291504,0.817701,2.23028,4.730419,137.634271,13.868251,21.957736,1.570366,...,35.362529,7.935433,22.542109,0.73857,1.958312,6.698113,149.705871,25.199064,23.425185,230.88089
min,0.1,6.0,1.0,0.1,0.1,1.0,0.0,0.0,5.1,0.0,...,0.0,64.7,22.8,1.0,0.0,0.0,0.3,0.3,0.1,0.1
25%,0.1,28.0,6.0,0.1,0.1,5.25,9.5,0.2,36.825,0.3,...,10.0,90.775,61.65,2.8,0.0,0.6,22.9,7.45,5.0,34.075
50%,0.4,36.0,13.0,0.1,0.1,6.85,45.0,1.7,55.85,1.3,...,57.0,97.8,89.95,3.3,0.25,1.85,112.1,24.2,18.6,112.05
75%,1.4,58.5,36.0,0.4,0.6,10.4,144.0,13.4,70.6,2.8,...,82.0,99.8,99.2,3.8,1.3,3.725,196.1,49.35,42.0,252.35
max,27.3,72.0,65.0,4.9,15.9,30.5,611.0,69.2,96.0,8.2,...,98.0,100.0,100.0,4.8,9.2,51.6,750.7,92.2,91.8,990.0


In [7]:
#df1 = ag.replace('', np.nan)  # Replace empty values with NaN
# df1 = merged_df.dropna()  # Drop rows with any missing values
column_means = merged_df.mean()
df1 = merged_df.fillna(column_means)
df1.describe()

Unnamed: 0,HIV_total,HIV_female,Diseases_death,HIV_youth_male,HIV_youth_female,diabetes,tuberculosis,ext_health_expend,public_health_expend,physicians,...,water_agriculture,water_urban,water_rural,Disaster_risk,climet_affected_population,urban_population,fertilizer,farmers_2000,farmers_recent,agricultur_machinery
count,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,...,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0
mean,1.837589,39.330935,22.31694,0.442336,0.893431,8.293269,102.89372,9.116766,52.915054,1.72328,...,48.223301,94.198851,80.005952,3.298795,1.171429,3.805114,141.32293,31.260963,25.916043,194.943137
std,3.493543,14.021576,18.75686,0.653393,1.782128,4.663317,135.353782,12.242946,20.463674,1.475333,...,24.471106,7.151611,19.960112,0.458257,1.734005,6.0713,128.118348,23.54784,21.890199,158.985997
min,0.1,6.0,1.0,0.1,0.1,1.0,0.0,0.0,5.1,0.0,...,0.0,64.7,22.8,1.0,0.0,0.0,0.3,0.3,0.1,0.1
25%,0.225,31.0,8.0,0.1,0.1,5.4,10.0,0.4,40.775,0.4,...,48.223301,94.049713,75.6,3.298795,0.025,0.8,50.65,9.775,6.825,117.8
50%,1.4,39.330935,17.0,0.421168,0.8,6.9,46.0,5.55,52.915054,1.72328,...,48.223301,96.05,80.005952,3.298795,0.7,2.4,141.32293,31.260963,25.916043,194.943137
75%,1.837589,43.75,28.25,0.442336,0.893431,10.375,137.5,9.116766,67.775,2.5,...,54.75,99.6,97.525,3.298795,1.171429,3.805114,146.025,45.975,37.8,194.943137
max,27.3,72.0,65.0,4.9,15.9,30.5,611.0,69.2,96.0,8.2,...,98.0,100.0,100.0,4.8,9.2,51.6,750.7,92.2,91.8,990.0


In [8]:
# Define the thresholds for classification

threshold_tuberculosis = 100
# Create a DataFrame to store the classified values
df2 = pd.DataFrame()
# Classify every columns
df2 = df1.drop('tuberculosis', axis=1).copy()
df2['tuberculosis'] = [0 if i < threshold_tuberculosis else 1 for i in df1['tuberculosis']]

# Extract X and y
y = df2['tuberculosis'].values
X1 = df2.drop(columns='tuberculosis').values
df2.shape

(214, 22)

In [11]:
# Split the preprocessed data into a training and testing dataset
X1_train, X1_test, y_train, y_test = train_test_split(X1, y,random_state=1,stratify=y,test_size=.2)

In [17]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression  # or any other model
from sklearn.feature_selection import RFECV

estimator = LogisticRegression()
selector = RFECV(estimator, step=1, cv=20)
X_train_selected = selector.fit(X1_train, y_train)
# Assuming X_train and y_train are your training data and labels
# n_features_to_select = 20  # Select the top 10 features
# estimator = LogisticRegression()  # You can choose any model here
# rfe = RFE(estimator=estimator, n_features_to_select=n_features_to_select)
# X_train_selected = rfe.fit_transform(X1_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [19]:
# Get the mask of selected features
# Fit the Logistic Regression model to your data
estimator = LogisticRegression()
estimator.fit(X1_train, y_train)

# Create the RFE object with the Logistic Regression estimator
rfe = RFE(estimator=estimator, n_features_to_select=5)
rfe.fit(X1_train, y_train)

# Get the absolute coefficients from the Logistic Regression model
absolute_coefficients = np.abs(estimator.coef_).flatten()

# Create a DataFrame to store the results
feature_names = df2.drop(columns='tuberculosis').columns
result_df = pd.DataFrame({
    'Feature': feature_names,
    'Selected': rfe.support_,
    'Ranking': rfe.ranking_,
    'Absolute Coefficient': absolute_coefficients
})

# Sort the DataFrame based on the absolute coefficients in descending order
result_df = result_df.sort_values(by='Absolute Coefficient', ascending=False)

# Reset the index of the DataFrame
result_df = result_df.reset_index(drop=True)

# Print the result DataFrame
print(result_df)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

                       Feature  Selected  Ranking  Absolute Coefficient
0                    HIV_total      True        1              0.573353
1             HIV_youth_female      True        1              0.302090
2                       nurses     False        8              0.223302
3   climet_affected_population      True        1              0.206175
4                   physicians      True        1              0.205479
5               HIV_youth_male     False        5              0.110882
6               Diseases_death     False        2              0.075513
7                   HIV_female     False        3              0.075053
8            ext_health_expend     False        4              0.071163
9                  water_urban     False        9              0.055585
10                farmers_2000     False        7              0.039880
11            urban_population     False       11              0.034410
12                    diabetes     False        6              0

In [33]:
X = df2[["HIV_total","HIV_youth_female", "climet_affected_population", "physicians", "Disaster_risk" ]].values
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1,stratify=y,test_size=.2)


In [34]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
l_1 = 50
l_2 = 50
input_features = len(X_train[0])
nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=l_1, input_dim = input_features, activation = "tanh"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=l_2, activation = "LeakyReLU"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 50)                300       
                                                                 
 dense_7 (Dense)             (None, 50)                2550      
                                                                 
 dense_8 (Dense)             (None, 1)                 51        
                                                                 
Total params: 2,901
Trainable params: 2,901
Non-trainable params: 0
_________________________________________________________________


In [35]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [36]:
# Train the model
nn.fit(X_train, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7a2038c70dc0>

In [37]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

2/2 - 0s - loss: 0.4157 - accuracy: 0.7442 - 154ms/epoch - 77ms/step
Loss: 0.4157443940639496, Accuracy: 0.7441860437393188


In [39]:
y_pred = nn.predict(X_test)

# The predictions will be in the form of probabilities, so you might want to round them to get class labels (0 or 1)
y_pred_class = y_pred.round()
# Calculate the accuracy of the predictions
correct_predictions = (y_pred_class == y_test.reshape(-1, 1)).sum()
total_samples = len(y_test)
accuracy = correct_predictions / total_samples
print("Accuracy:", accuracy)

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_class))

Accuracy: 0.7441860465116279
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.79      0.81        29
           1       0.60      0.64      0.62        14

    accuracy                           0.74        43
   macro avg       0.71      0.72      0.71        43
weighted avg       0.75      0.74      0.75        43

