## Import Python Libraries and Modules

In [2]:
#Import python libraries and modules
import pandas as pd
import numpy as np

#Import libraries and modules for vizualization
from pandas.plotting import scatter_matrix
from matplotlib import pyplot

## 1. Load Dataset

In [5]:
#Specify what and where is the data file
filename = "C:/Users/Aleena/Desktop/UNT/big data/FINAL PROJECT/Life Expectancy Data.csv"

#Load the data into a Pandas Dataframe
df=pd.read_csv(filename)

## 2. Preprocess Dataset

Clean Data: Find and Mark Missing Values

NOTES:

The following columns cannot contain 0 values

i.e 0 values are invalid in these columns

1. Country
2. Year
3. Status
4. Life Expectancy
5. BMI
6. GDP
7. Population
8. Expenditure

If they exist, we need to mark them as missing value or numpy NaN

In [22]:
#mark zero values as missing or NaN 
df[['Country','Year','Status','Life expectancy ',' BMI ','GDP','Population','Total expenditure']] = df[['Country','Year','Status','Life expectancy ',' BMI ','GDP','Population','Total expenditure']].replace(0,np.NaN)


In [24]:
#Count the number of NaN values in each column
print(df.isnull().sum())

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64


In [26]:
#Lets get the percentage of Nan Values
# Calculate the percentage of NaN values in each column
nan_percentage = df.isnull().mean() * 100

# Display the results
print(nan_percentage)

Country                             0.000000
Year                                0.000000
Status                              0.000000
Life expectancy                     0.340368
Adult Mortality                     0.340368
infant deaths                       0.000000
Alcohol                             6.603131
percentage expenditure              0.000000
Hepatitis B                        18.822328
Measles                             0.000000
 BMI                                1.157250
under-five deaths                   0.000000
Polio                               0.646698
Total expenditure                   7.692308
Diphtheria                          0.646698
 HIV/AIDS                           0.000000
GDP                                15.248468
Population                         22.191967
 thinness  1-19 years               1.157250
 thinness 5-9 years                 1.157250
Income composition of resources     5.684139
Schooling                           5.547992
dtype: flo

AS you can see columns like 'Life Expectancy','Adult Mortality','BMI','thinness 1-19 years','thinness 5-19 years','Polio' and 'Diphtheria' have a very small percentage of missing data for some equivalent to zero.
For these we are going to drop the rows with NAN Values


In [43]:
# List of columns to check for NaN values
columns_to_check = ['Life expectancy ',' BMI ','Adult Mortality',' thinness  1-19 years',' thinness 5-9 years','Polio','Diphtheria ']

df.dropna(subset=columns_to_check, inplace=True)

Now lets check these Nan value percentage

In [46]:
#Lets get the percentage of Nan Values
# Calculate the percentage of NaN values in each column
nan_percentage = df.isnull().mean() * 100

# Display the results
print(nan_percentage)

Country                             0.000000
Year                                0.000000
Status                              0.000000
Life expectancy                     0.000000
Adult Mortality                     0.000000
infant deaths                       0.000000
Alcohol                             6.059557
percentage expenditure              0.000000
Hepatitis B                        18.178670
Measles                             0.000000
 BMI                                0.000000
under-five deaths                   0.000000
Polio                               0.000000
Total expenditure                   7.340720
Diphtheria                          0.000000
 HIV/AIDS                           0.000000
GDP                                15.062327
Population                         22.299169
 thinness  1-19 years               0.000000
 thinness 5-9 years                 0.000000
Income composition of resources     5.540166
Schooling                           5.540166
dtype: flo

### Imputing the values
Lets find out the data types before we decide how to impute the values

In [53]:
df.dtypes

Country                             object
Year                                 int64
Status                              object
Life expectancy                    float64
Adult Mortality                    float64
infant deaths                        int64
Alcohol                            float64
percentage expenditure             float64
Hepatitis B                        float64
Measles                              int64
 BMI                               float64
under-five deaths                    int64
Polio                              float64
Total expenditure                  float64
Diphtheria                         float64
 HIV/AIDS                          float64
GDP                                float64
Population                         float64
 thinness  1-19 years              float64
 thinness 5-9 years                float64
Income composition of resources    float64
Schooling                          float64
dtype: object

So the Coulmns with missing values have the following data types
1. Alcohol - float64
2. Hepatitis B - float 64
3. Total Expenditure - float64
4. GDP- float65
5. Population -Floadt64
6. Income composition of resources - float64
7. Schooling - float64


Since all these columns are of float data type we can impute values with the mean of the columns

In [59]:
# List of columns to impute
columns_to_impute = [
    'Alcohol',
    'Hepatitis B',
    'Total expenditure',
    'GDP',
    'Population',
    'Income composition of resources',
    'Schooling'
]

# Impute missing values with the mean of each column
for column in columns_to_impute:
    df[column].fillna(df[column].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mean(), inplace=True)


Let's check the NAN Values 

In [62]:
#Lets get the percentage of Nan Values
# Calculate the percentage of NaN values in each column
nan_percentage = df.isnull().mean() * 100

# Display the results
print(nan_percentage)

Country                            0.0
Year                               0.0
Status                             0.0
Life expectancy                    0.0
Adult Mortality                    0.0
infant deaths                      0.0
Alcohol                            0.0
percentage expenditure             0.0
Hepatitis B                        0.0
Measles                            0.0
 BMI                               0.0
under-five deaths                  0.0
Polio                              0.0
Total expenditure                  0.0
Diphtheria                         0.0
 HIV/AIDS                          0.0
GDP                                0.0
Population                         0.0
 thinness  1-19 years              0.0
 thinness 5-9 years                0.0
Income composition of resources    0.0
Schooling                          0.0
dtype: float64


#### All nan values are now replaced

In [67]:
df.to_csv('cleaned_dataset.csv', index=False)

# Confirm the file is saved
print("Dataset saved as 'cleaned_dataset.csv'")

Dataset saved as 'cleaned_dataset.csv'
