In [8]:
#importing libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt

In [9]:
# Read the databases from resources
health_factors = pd.read_csv("/content/Health_Risk_factors.csv")
agriculture = pd.read_csv("/content/greenhouse_gas_emissions.csv")


In [10]:
merged_df = pd.concat([agriculture, health_factors['Incidence of tuberculosis  per 100,000 people 2018']], axis=1)

In [11]:
merged_df.columns

Index(['Country',
       'Total greenhouse gas emissions  thousand metric tons of carbon dioxide equivalent 2012',
       'Total greenhouse gas emissions  % change 1990-2012',
       'Methane emissions  thousand metric tons of carbon dioxide equivalent 2012',
       'Methane emissions  % change 1990-2012',
       'Methane emissions From energy processes % of total 2008',
       'Methane emissions Agricultural % of total 2008',
       'Nitrous oxide emissions  thousand metric tons of carbon dioxide equivalent 2012',
       'Nitrous oxide emissions  % change 1990-2012',
       'Nitrous oxide emissions From energy processes % of total 2008',
       'Nitrous oxide emissions Agricultural % of total 2008',
       'Other greenhouse gas emissions  thousand metric tons of carbon dioxide equivalent 2012',
       'Other greenhouse gas emissions  % change 1990-2012',
       'Incidence of tuberculosis  per 100,000 people 2018'],
      dtype='object')

In [14]:
# Renaming columns in the health_factors DataFrame to more descriptive names
ag = merged_df.rename(columns={
    "Incidence of tuberculosis  per 100,000 people 2018": "tuberculosis",
    'Nitrous oxide emissions From energy processes % of total 2008': "N2O_emission",
    'Other greenhouse gas emissions  % change 1990-2012': "other_GH_emission",
    'Total greenhouse gas emissions  % change 1990-2012': "total_GH_emission",
    'Methane emissions Agricultural % of total 2008': "CH4_emission"
    })
ag.columns
ag.describe()

Unnamed: 0,Total greenhouse gas emissions thousand metric tons of carbon dioxide equivalent 2012,total_GH_emission,Methane emissions thousand metric tons of carbon dioxide equivalent 2012,Methane emissions % change 1990-2012,Methane emissions From energy processes % of total 2008,CH4_emission,Nitrous oxide emissions thousand metric tons of carbon dioxide equivalent 2012,Nitrous oxide emissions % change 1990-2012,N2O_emission,Nitrous oxide emissions Agricultural % of total 2008,Other greenhouse gas emissions thousand metric tons of carbon dioxide equivalent 2012,other_GH_emission,tuberculosis
count,18.0,186.0,45.0,200.0,201.0,201.0,69.0,201.0,202.0,202.0,47.0,161.0,207.0
mean,381.3,71.663978,152.666667,37.1445,25.910945,42.623881,221.652174,25.647761,8.986139,63.426238,109.12766,80.692547,102.89372
std,306.474031,128.079877,211.722911,79.603798,24.831294,26.05539,266.02437,81.476506,9.038338,25.248245,462.17739,258.777001,137.634271
min,5.2,-78.0,0.0,-100.0,0.0,0.0,0.0,-100.0,0.0,0.0,-883.0,-620.0,0.0
25%,103.25,-7.525,29.0,2.025,6.8,21.9,15.0,-25.4,3.2,51.95,-72.0,-86.9,9.5
50%,337.65,42.75,46.0,24.1,16.7,42.1,81.0,14.8,6.5,69.3,55.0,-3.6,45.0
75%,590.15,112.1,215.0,60.85,40.1,64.6,344.0,49.9,12.1,82.375,357.0,168.1,144.0
max,959.3,828.9,732.0,834.7,97.6,96.8,990.0,648.2,60.7,99.3,988.0,983.4,611.0


In [16]:
df=ag[["other_GH_emission","N2O_emission","total_GH_emission","tuberculosis" , "CH4_emission"]]
#df1 = ag.replace('', np.nan)  # Replace empty values with NaN
df1 = df.dropna()  # Drop rows with any missing values

In [21]:
df1["tuberculosis"].describe()

count    149.000000
mean      96.489933
std      130.819538
min        0.000000
25%        9.000000
50%       44.000000
75%      116.000000
max      611.000000
Name: tuberculosis, dtype: float64

In [23]:
# Define the thresholds for classification

threshold_tuberculosis = 100
# Create a DataFrame to store the classified values
df2 = pd.DataFrame()
# Classify every columns
df2=df1[["other_GH_emission","N2O_emission","total_GH_emission","CH4_emission" ]]
df2['tuberculosis'] = [0 if i < threshold_tuberculosis else 1 for i in df1['tuberculosis']]

# Extract X and y
y = df2['tuberculosis'].values
X = df2.drop(columns='tuberculosis').values
df2.shape


(149, 5)

In [24]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1,stratify=y,test_size=.2)

In [25]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
l_1 = 50
l_2 = 50
input_features = len(X_train[0])
nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=l_1, input_dim = input_features, activation = "tanh"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=l_2, activation = "LeakyReLU"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                250       
                                                                 
 dense_1 (Dense)             (None, 50)                2550      
                                                                 
 dense_2 (Dense)             (None, 1)                 51        
                                                                 
Total params: 2,851
Trainable params: 2,851
Non-trainable params: 0
_________________________________________________________________


In [26]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [27]:
# Train the model
nn.fit(X_train, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7d459a012ec0>

In [28]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1/1 - 0s - loss: 0.6041 - accuracy: 0.7333 - 170ms/epoch - 170ms/step
Loss: 0.6041117906570435, Accuracy: 0.7333333492279053


In [29]:
# Predict on the test data
y_pred = nn.predict(X_test)

# The predictions will be in the form of probabilities, so you might want to round them to get class labels (0 or 1)
y_pred_class = y_pred.round()
# Calculate the accuracy of the predictions
correct_predictions = (y_pred_class == y_test.reshape(-1, 1)).sum()
total_samples = len(y_test)
accuracy = correct_predictions / total_samples
print("Accuracy:", accuracy)

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_class))

Accuracy: 0.7333333333333333
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.95      0.84        22
           1       0.50      0.12      0.20         8

    accuracy                           0.73        30
   macro avg       0.62      0.54      0.52        30
weighted avg       0.68      0.73      0.67        30

