# About Dataset
**Life Expectancy (WHO)** 

In this dataset we are going to predict life expectancy from the following features: Country, Year, Status, Adult Mortality, infant deaths, Alcohol, percentage expenditure, Hepatitis B, Measles, BMI, under-five deaths, Polio, Total expenditure, Diphtheria, HIV/AIDS, GDP, Population, thinness 1-19 years, thinness 5-9 years, Income composition of resources, and Schooling.

In [None]:
# import all the necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.svm import SVR
import statsmodels.api as sm
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso,Ridge,BayesianRidge,ElasticNet
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
import warnings 
warnings.filterwarnings("ignore")

In [None]:
# Load the WHO life-expectancy CSV from the Kaggle input directory and
# preview the first rows.
dataframe=pd.read_csv("/kaggle/input/life-expectancy-who/Life Expectancy Data.csv")
dataframe.head()

# Exploratory Data Analysis

In [None]:
# (rows, columns) of the raw dataset
dataframe.shape

In [None]:
# Column dtypes and non-null counts
dataframe.info()

In [None]:
# Summary statistics of the columns
dataframe.describe()

In [None]:
# Number of fully duplicated rows
dataframe.duplicated().sum()

In [None]:
# Keep the column labels in a plain list so individual labels can be
# overwritten by position in the next cell.
column=dataframe.columns.to_list()
column

In [None]:
# Overwrite selected column labels by position, then write the full label
# list back onto the dataframe.
renamed_labels = {
    9: "Measles",
    10: "BMI",
    11: "Under-5 Deaths",
    14: "Diphtheria",
    15: "HIV/AIDS",
    18: "thinness 1-19 years",
    19: "thinness 5-9 years",
}
for position, label in renamed_labels.items():
    column[position] = label

dataframe.columns = column
col = dataframe.columns.to_list()
col

In [None]:
# Missing values per column
dataframe.isnull().sum()

# Fill the missing values in the dataset using the median

In [None]:
# Replace the null values of each listed column with that column's median.
# The imputer is re-fitted per column, so every column gets its own median.
# NOTE(review): this runs on the full dataset (target included) before the
# train/test split further down.
imputer = SimpleImputer(missing_values=np.nan, strategy="median", fill_value=None)

columns_to_impute = [
    'Life expectancy ',
    'Adult Mortality',
    'Alcohol',
    'Hepatitis B',
    'BMI',
    'Polio',
    'Total expenditure',
    'Diphtheria',
    'GDP',
    'Population',
    'thinness 1-19 years',
    'thinness 5-9 years',
    'Income composition of resources',
    'Schooling',
]
for name in columns_to_impute:
    dataframe[name] = imputer.fit_transform(dataframe[[name]])

In [None]:
# Re-check the missing values per column after the median imputation
dataframe.isnull().sum()

In [None]:
# Build the correlation matrix.
# The dataframe still contains the string columns "Country" and "Status";
# recent pandas versions raise on them instead of silently skipping them, so
# the correlation is restricted to the numeric columns explicitly.
correlation_matrix = dataframe.select_dtypes(include="number").corr()
correlation_matrix

# Let's visualise the correlation matrix

In [None]:
# Draw the correlation matrix as an annotated heatmap (two decimals per cell).
plt.figure(figsize=(16, 12))
sns.heatmap(
    correlation_matrix,
    cmap="Reds",
    annot=True,
    fmt=".2f",
)
plt.show()

Here we can see that the columns "Under-5 Deaths" and "infant deaths" have a correlation of 1, so we remove one of them. This is how we deal with multicollinearity.

In [None]:
# Remove the "Under-5 Deaths" column in place
dataframe.drop("Under-5 Deaths", axis=1, inplace=True)

In [None]:
# (rows, columns) after dropping the column
dataframe.shape

# Lets make the histogram containing all the columns

In [None]:
# One 10-bin histogram per column on a shared 16x16 figure.
dataframe.hist(figsize=(16, 16), bins=10)
plt.suptitle("Data Distribution of all the columns")
plt.show()

In [None]:
# Distinct values of the "Country" column
dataframe["Country"].unique()

In [None]:
# Distinct values of the "Status" column
dataframe["Status"].unique()

# Bivariate Analysis

In [None]:
# Distribution of life expectancy for each Status value
fig = plt.figure(figsize=(5, 5))
sns.violinplot(
    data=dataframe,
    x="Status",
    y="Life expectancy ",
    color='y',
)
plt.title('Life expectancy vs Status', fontsize=16)
plt.show()

# To see the impact of the columns that are highly correlated with Life Expectancy

In [None]:
# Schooling vs Life expectancy
# jointplot is a figure-level function that creates its own figure, so the
# size is set through `height`; a preceding plt.figure() call would only
# leave an empty extra figure behind.
sns.jointplot(x=dataframe["Schooling"], y=dataframe["Life expectancy "], kind="hex", color="r", height=5)
plt.show()

In [None]:
# Income composition of resources vs Life expectancy
# jointplot is a figure-level function that creates its own figure, so the
# size is set through `height`; a preceding plt.figure() call would only
# leave an empty extra figure behind.
sns.jointplot(x=dataframe["Income composition of resources"], y=dataframe["Life expectancy "], kind="hex", color="g", height=5)
plt.show()

In [None]:
# BMI vs Life expectancy
# jointplot is a figure-level function that creates its own figure, so the
# size is set through `height`; a preceding plt.figure() call would only
# leave an empty extra figure behind.
sns.jointplot(x=dataframe["BMI"], y=dataframe["Life expectancy "], kind="hex", color="b", height=5)
plt.show()

In [None]:
# GDP vs Life expectancy
# jointplot is a figure-level function that creates its own figure, so the
# size is set through `height`; a preceding plt.figure() call would only
# leave an empty extra figure behind.
sns.jointplot(x=dataframe["GDP"], y=dataframe["Life expectancy "], kind="hex", color="c", height=5)
plt.show()

In [None]:
# Polio vs Life expectancy
# jointplot is a figure-level function that creates its own figure, so the
# size is set through `height`; a preceding plt.figure() call would only
# leave an empty extra figure behind.
sns.jointplot(x=dataframe["Polio"], y=dataframe["Life expectancy "], kind="hex", color="y", height=5)
plt.show()

In [None]:
# Alcohol vs Life expectancy
# jointplot is a figure-level function that creates its own figure, so the
# size is set through `height`; a preceding plt.figure() call would only
# leave an empty extra figure behind.
sns.jointplot(x=dataframe["Alcohol"], y=dataframe["Life expectancy "], kind="hex", color="r", height=5)
plt.show()

# Q-Q plot of Life Expectancy

From here we can see that our target variable does not follow the Normal Distribution

In [None]:
# Q-Q plot of the target against a normal distribution.
# `stats` is not among the notebook's imports, so it is imported here to keep
# the cell from failing with a NameError.
from scipy import stats

plt.figure(figsize=(10,6))
stats.probplot(dataframe["Life expectancy "], plot= plt, dist="norm")
plt.title('Life expectancy')
plt.show()

# Detecting outliers in the dataset with percentile cut-offs (Winsorization-style technique)

In [None]:
# Percentile cut-offs for the outlier detection below: values under the
# a-th percentile or over the b-th percentile of a column are flagged.
a=0.5
b=99

In [None]:
# Detect outliers in the dataset.
# A row is flagged when at least one of its numeric values lies below the
# a-th or above the b-th percentile of its column. `outliers` ends up as a
# duplicate-free list of positional (iloc-style) row numbers.
data = dataframe.drop(["Country", "Status"], axis=1)
columns = data.columns.to_list()

# One boolean mask per column, OR-ed together, replaces the per-cell
# .iloc loop over every (row, column) pair.
flagged_rows = np.zeros(len(data), dtype=bool)
for column_name in columns:
    column_values = data[column_name]
    lower_bound = np.percentile(column_values, a)
    upper_bound = np.percentile(column_values, b)
    flagged_rows |= ((column_values > upper_bound) | (column_values < lower_bound)).to_numpy()

outliers = np.flatnonzero(flagged_rows).tolist()

In [None]:
# Percentage of rows flagged as outliers, rounded to two decimals
ratio= round(len(outliers)/len(dataframe)*100,2)
ratio

# Remove the outliers from the dataset

In [None]:
# Drop the outliers from our dataset.
# `outliers` holds row positions, so they are translated to index labels
# through dataframe.index[...] before dropping.
dataframe.drop(dataframe.index[outliers], inplace=True)

# Data Preprocessing

In [None]:
# Hold out 20% of the rows for testing; the target is "Life expectancy "
# and every other column is a feature. The seed is fixed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    dataframe.drop(columns="Life expectancy "),
    dataframe["Life expectancy "],
    test_size=0.2,
    random_state=42,
)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

# Getting Numerical And Categorical Columns

In [None]:
numerical_cols = []
categorical_cols = []

def get_numerical_and_categorical_columns(dataframe):
    """Split the columns of *dataframe* into numerical and categorical names.

    The results are written into the module-level ``numerical_cols`` and
    ``categorical_cols`` lists, which other cells read directly. Both lists
    are emptied in place first, so calling the function again (for example
    by re-running the cell) does not append every column name a second time.

    Args:
        dataframe: pandas DataFrame whose columns are classified.

    Returns:
        Tuple ``(numerical_cols, categorical_cols)`` — the same module-level
        list objects, in the dataframe's column order.
    """
    numerical_cols.clear()
    categorical_cols.clear()

    for column in dataframe.columns:
        if pd.api.types.is_numeric_dtype(dataframe[column]):
            numerical_cols.append(column)
        else:
            categorical_cols.append(column)

    return numerical_cols, categorical_cols

In [None]:
# Fill numerical_cols / categorical_cols from the training features
get_numerical_and_categorical_columns(x_train)

# X_train Encoding

In [None]:
# One-hot encode the categorical training columns. The encoder is fitted on
# the training data only and reused for the test set.
ohe = OneHotEncoder(handle_unknown="ignore")

x_train_ohe = ohe.fit_transform(x_train[categorical_cols])
x_train_ohe = x_train_ohe.toarray()

# Feature names are derived from the whole categorical column list instead
# of assuming there are exactly two categorical columns.
x_train_ohe_df = pd.DataFrame(x_train_ohe, columns=ohe.get_feature_names_out(categorical_cols))

# The encoder output carries no index; restore the training index so the
# concat below aligns row by row.
x_train_ohe_df.index = x_train.index

# Join the encoded columns onto the original features
x_train = pd.concat([x_train, x_train_ohe_df], axis=1)

# Drop the original (string) categorical columns
x_train.drop(categorical_cols, axis=1, inplace=True)

# Check the result
x_train.head()

# X_test Encoding

In [None]:
# Encode the test set with the encoder that was fitted on the training data
# (transform only, no re-fit).
x_test_ohe = ohe.transform(x_test[categorical_cols])
x_test_ohe = x_test_ohe.toarray()

# Feature names are derived from the whole categorical column list instead
# of assuming there are exactly two categorical columns.
x_test_ohe_df = pd.DataFrame(x_test_ohe, columns=ohe.get_feature_names_out(categorical_cols))

# The encoder output carries no index; restore the test index so the
# concat below aligns row by row.
x_test_ohe_df.index = x_test.index

# Join the encoded columns onto the original features
x_test = pd.concat([x_test, x_test_ohe_df], axis=1)

# Drop the original (string) categorical columns
x_test.drop(categorical_cols, axis=1, inplace=True)

# Check the result
x_test.head()

In [None]:
# Data preprocessing: normalise the numerical features.
# The scaler is fitted on the training data only and applied to both sets.
# Only the numerical columns are scaled; the one-hot columns are appended
# unchanged. Selecting x_train[numerical_cols] alone would silently discard
# the encoded Country/Status features built in the encoding cells.
min_max = MinMaxScaler()
encoded_cols = [name for name in x_train.columns if name not in numerical_cols]

x_train = np.hstack([
    min_max.fit_transform(x_train[numerical_cols]),
    x_train[encoded_cols].to_numpy(),
])
x_test = np.hstack([
    min_max.transform(x_test[numerical_cols]),
    x_test[encoded_cols].to_numpy(),
])

In [None]:
# Candidate regressors for the grid search below.
# Each entry maps a display name to [estimator instance, parameter grid].
# Most grids list a single value per parameter, so for those models the
# search only cross-validates one configuration; Lasso and Ridge are the
# entries with real alternatives.
models_parameters= {

       "LinearRegression":[LinearRegression(),  {'n_jobs':[-1]}],
       "RandomForestRegressor": [RandomForestRegressor(), {'n_estimators':[100], 'max_depth':[10], 'min_samples_split':[2], 'criterion':['squared_error']}],
       "DecisionTreeRegressor": [DecisionTreeRegressor(), {'splitter':['best'], 'max_depth':[12], 'min_samples_split':[2],'criterion':['squared_error']}],
       "GradientBoostingRegressor":[GradientBoostingRegressor(), {'n_estimators':[120], 'learning_rate':[0.1],'max_depth':[12], 'min_samples_leaf':[3],'loss':['squared_error']}],
       "SupportVectorRegressor": [SVR(), {'kernel':['rbf'], 'gamma':['scale']}],
       "Lasso":[ Lasso(), {'alpha':[1.0,1.1],'max_iter':[1000,1200],'selection':['cyclic', 'random']}],
       "Ridge":[Ridge(), { 'alpha':[1.0,1.1],'max_iter':[1000,1200],'solver':['auto','svd','lsqr']}]
}

In [None]:
# Tune every candidate model with a 50-fold grid search (scored by R2),
# predict on the held-out test set and collect [RMSE, MAE, R2] per model.
result = {}
predictions = {}
for key, value in models_parameters.items():
    estimator, param_grid = value
    search = GridSearchCV(estimator, param_grid, cv=50, scoring="r2", n_jobs=-1)
    regressor = search.fit(x_train, y_train)

    y_pred = regressor.predict(x_test)
    predictions[key] = y_pred

    mse = mean_squared_error(y_test, y_pred)
    root_mse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    result_list = [root_mse, mae, r2]
    result[key] = result_list

In [None]:
# Raw metrics per model: [RMSE, MAE, R2]
result

In [None]:
# Turn the metrics dict into a table with one row per model and one named
# column per metric.
final_results = pd.DataFrame(result).T
columns = ["RootMeanSquaredError", "MeanAbsoluteError", "R2_Score"]
final_results.columns = columns
final_results

# Visualise the Final Results

In [None]:
# Horizontal bar chart of all three metrics, models ordered by RMSE
# (descending sort), with the legend anchored at the top-right corner.
ranked_results = final_results.sort_values(by='RootMeanSquaredError', ascending=False)
results_axes = ranked_results.plot(kind="barh", figsize=(10, 7), grid=True)
results_axes.legend(bbox_to_anchor=(1.0, 1.0));

# Conclusion
Among all the regression models, RandomForestRegressor performs best in terms of RootMeanSquaredError, MeanAbsoluteError and R2_Score, with values of 1.80880, 1.195059 and 0.959217 respectively.