### Importing Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score as rsquare
from sklearn import preprocessing


### Loading Data Set

In [2]:
# Location of the life-expectancy CSV on the author's machine.
# NOTE(review): this is an absolute local path -- adjust it before running elsewhere.
csv_path = "D:\\python_datascience\\data sets\\life_expectancy_data\\Life_Expectancy_Data.csv"
df = pd.read_csv(csv_path)
df.head(2)  # preview the first two rows
               
               

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0


**This dataset contains life-expectancy data for different countries. The `Life expectancy` column is the target variable and the remaining columns are the features.**

#### Getting shape of data 

In [3]:
# (n_rows, n_columns) of the loaded frame.
df.shape

(2938, 22)

### Checking missing values

In [4]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

Country                              0
Year                                 0
Status                               0
Life expectancy                     10
Adult Mortality                     10
infant deaths                        0
Alcohol                            194
percentage expenditure               0
Hepatitis B                        553
Measles                              0
 BMI                                34
under-five deaths                    0
Polio                               19
Total expenditure                  226
Diphtheria                          19
 HIV/AIDS                            0
GDP                                448
Population                         652
 thinness  1-19 years               34
 thinness 5-9 years                 34
Income composition of resources    167
Schooling                          163
dtype: int64

In [5]:
# Storage dtype of every column; `object` columns hold the string-valued features.
df.dtypes

Country                             object
Year                                 int64
Status                              object
Life expectancy                    float64
Adult Mortality                    float64
infant deaths                        int64
Alcohol                            float64
percentage expenditure             float64
Hepatitis B                        float64
Measles                              int64
 BMI                               float64
under-five deaths                    int64
Polio                              float64
Total expenditure                  float64
Diphtheria                         float64
 HIV/AIDS                          float64
GDP                                float64
Population                         float64
 thinness  1-19 years              float64
 thinness 5-9 years                float64
Income composition of resources    float64
Schooling                          float64
dtype: object

In [6]:
# Names of the float-typed columns. Several names carry leading/trailing spaces
# (see the output), which must be reproduced exactly when indexing.
df.select_dtypes("float").columns

Index(['Life expectancy ', 'Adult Mortality', 'Alcohol',
       'percentage expenditure', 'Hepatitis B', ' BMI ', 'Polio',
       'Total expenditure', 'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population',
       ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling'],
      dtype='object')

In [7]:
# Mean-impute every column that reported missing values above.
#
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form is chained assignment on a
# temporary Series, which pandas warns about and which no longer updates `df`
# once Copy-on-Write is active.
#
# NOTE(review): the target ("Life expectancy ") is imputed too, and the means are
# computed on the full data set before the train/test split -- confirm that this
# is intended.
cols_to_impute = [
    "Life expectancy ",
    "Adult Mortality",
    "Alcohol",
    "Hepatitis B",
    "Polio",
    " BMI ",
    "Total expenditure",
    "Diphtheria ",
    "GDP",
    " thinness  1-19 years",
    " thinness 5-9 years",
    "Income composition of resources",
    "Schooling",
    "Population",
]
for col in cols_to_impute:
    df[col] = df[col].fillna(df[col].mean())

In [8]:
## Re-check after imputation: every column should now report zero missing values
df.isna().sum()

Country                            0
Year                               0
Status                             0
Life expectancy                    0
Adult Mortality                    0
infant deaths                      0
Alcohol                            0
percentage expenditure             0
Hepatitis B                        0
Measles                            0
 BMI                               0
under-five deaths                  0
Polio                              0
Total expenditure                  0
Diphtheria                         0
 HIV/AIDS                          0
GDP                                0
Population                         0
 thinness  1-19 years              0
 thinness 5-9 years                0
Income composition of resources    0
Schooling                          0
dtype: int64

In [9]:
# Independent copy of the imputed frame; the LightGBM section below builds its
# features from `dff`.
dff=df.copy()

This is a regression problem: we want to predict the life expectancy of different countries from predictors such as income composition, mortality rates, immunization coverage and the human development index. Some of the features are categorical, so we use CatBoost's `CatBoostRegressor`, which handles categorical features directly.
 
 
Here, `Country`, `Year` and `Status` are treated as categorical columns.

In [10]:
from catboost import CatBoostRegressor, Pool
## Pool is CatBoost's own data container: it bundles the features, the labels and the categorical-feature names that are handed to the model

### Segregating the data into features and target

In [11]:
# Preview the imputed frame before separating features from the target.
df.head(2)

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0


In [12]:
# Exact column labels. Note the stray whitespace in several of them
# (e.g. 'Life expectancy ' ends with a space), which the code below relies on.
df.columns

Index(['Country', 'Year', 'Status', 'Life expectancy ', 'Adult Mortality',
       'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B',
       'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure',
       'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population',
       ' thinness  1-19 years', ' thinness 5-9 years',
       'Income composition of resources', 'Schooling'],
      dtype='object')

### Segregating features and target

In [13]:
# The target is the life-expectancy column (its label ends with a space);
# every remaining column is used as a predictor.
target_col = "Life expectancy "
y = df[target_col]                 # continuous target
x = df.drop(columns=[target_col])  # predictor features

### Dividing data into training and test data 

In [14]:
# Hold out 25% of the rows for testing (the size train_test_split uses when none
# is given); the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

Next, pass the training and test data, together with the names of the categorical features, to `Pool` to build the train pool and the test pool.

In [15]:
# Columns CatBoost should treat as categorical rather than numeric.
cat_cols = ["Country", "Year", "Status"]

# Training pool carries features and labels; the test pool carries features only.
pool_train = Pool(x_train, y_train, cat_features=cat_cols)
pool_test = Pool(x_test, cat_features=cat_cols)

### Fit the model using CatBoost

In [16]:
import time
# Start the wall-clock timer; it is stopped later, in the evaluation cell.
start=time.time()
cbr=CatBoostRegressor() ## run with default parameters
cbr.fit(pool_train)  ## logs the training loss for every boosting iteration (see output)

Learning rate set to 0.046385
0:	learn: 9.1570586	total: 187ms	remaining: 3m 6s
1:	learn: 8.8306608	total: 211ms	remaining: 1m 45s
2:	learn: 8.5114803	total: 234ms	remaining: 1m 17s
3:	learn: 8.2299172	total: 258ms	remaining: 1m 4s
4:	learn: 7.9446519	total: 283ms	remaining: 56.3s
5:	learn: 7.6901105	total: 307ms	remaining: 50.9s
6:	learn: 7.4359461	total: 335ms	remaining: 47.5s
7:	learn: 7.2053827	total: 361ms	remaining: 44.7s
8:	learn: 6.9795732	total: 386ms	remaining: 42.5s
9:	learn: 6.7696681	total: 414ms	remaining: 40.9s
10:	learn: 6.5567585	total: 439ms	remaining: 39.4s
11:	learn: 6.3654425	total: 479ms	remaining: 39.5s
12:	learn: 6.1690530	total: 513ms	remaining: 39s
13:	learn: 5.9915128	total: 538ms	remaining: 37.9s
14:	learn: 5.8205764	total: 563ms	remaining: 37s
15:	learn: 5.6613738	total: 589ms	remaining: 36.2s
16:	learn: 5.5022961	total: 613ms	remaining: 35.4s
17:	learn: 5.3510502	total: 641ms	remaining: 34.9s
18:	learn: 5.2039509	total: 667ms	remaining: 34.4s
19:	learn: 5.

164:	learn: 1.9767227	total: 5.18s	remaining: 26.2s
165:	learn: 1.9744800	total: 5.22s	remaining: 26.2s
166:	learn: 1.9692759	total: 5.24s	remaining: 26.1s
167:	learn: 1.9667241	total: 5.27s	remaining: 26.1s
168:	learn: 1.9653704	total: 5.29s	remaining: 26s
169:	learn: 1.9630859	total: 5.33s	remaining: 26s
170:	learn: 1.9603824	total: 5.38s	remaining: 26.1s
171:	learn: 1.9561123	total: 5.42s	remaining: 26.1s
172:	learn: 1.9542427	total: 5.45s	remaining: 26s
173:	learn: 1.9495444	total: 5.49s	remaining: 26.1s
174:	learn: 1.9470356	total: 5.52s	remaining: 26s
175:	learn: 1.9460506	total: 5.59s	remaining: 26.2s
176:	learn: 1.9423346	total: 5.61s	remaining: 26.1s
177:	learn: 1.9376010	total: 5.64s	remaining: 26.1s
178:	learn: 1.9347231	total: 5.67s	remaining: 26s
179:	learn: 1.9311760	total: 5.7s	remaining: 26s
180:	learn: 1.9286998	total: 5.73s	remaining: 25.9s
181:	learn: 1.9243131	total: 5.76s	remaining: 25.9s
182:	learn: 1.9226405	total: 5.79s	remaining: 25.9s
183:	learn: 1.9191246	tot

323:	learn: 1.6156177	total: 10.2s	remaining: 21.2s
324:	learn: 1.6153588	total: 10.2s	remaining: 21.2s
325:	learn: 1.6137948	total: 10.2s	remaining: 21.2s
326:	learn: 1.6116941	total: 10.3s	remaining: 21.1s
327:	learn: 1.6105612	total: 10.3s	remaining: 21.1s
328:	learn: 1.6095459	total: 10.3s	remaining: 21s
329:	learn: 1.6077514	total: 10.3s	remaining: 21s
330:	learn: 1.6057977	total: 10.4s	remaining: 21s
331:	learn: 1.6031928	total: 10.4s	remaining: 20.9s
332:	learn: 1.6020071	total: 10.4s	remaining: 20.9s
333:	learn: 1.6007721	total: 10.4s	remaining: 20.8s
334:	learn: 1.5989725	total: 10.5s	remaining: 20.8s
335:	learn: 1.5963443	total: 10.5s	remaining: 20.8s
336:	learn: 1.5959488	total: 10.5s	remaining: 20.7s
337:	learn: 1.5956076	total: 10.6s	remaining: 20.7s
338:	learn: 1.5951185	total: 10.6s	remaining: 20.7s
339:	learn: 1.5942985	total: 10.6s	remaining: 20.7s
340:	learn: 1.5917691	total: 10.7s	remaining: 20.6s
341:	learn: 1.5899049	total: 10.7s	remaining: 20.6s
342:	learn: 1.5885

488:	learn: 1.3841675	total: 15.4s	remaining: 16.1s
489:	learn: 1.3831306	total: 15.4s	remaining: 16s
490:	learn: 1.3818664	total: 15.4s	remaining: 16s
491:	learn: 1.3812341	total: 15.5s	remaining: 16s
492:	learn: 1.3802160	total: 15.5s	remaining: 15.9s
493:	learn: 1.3800676	total: 15.5s	remaining: 15.9s
494:	learn: 1.3797445	total: 15.6s	remaining: 15.9s
495:	learn: 1.3785163	total: 15.6s	remaining: 15.8s
496:	learn: 1.3778282	total: 15.6s	remaining: 15.8s
497:	learn: 1.3770761	total: 15.6s	remaining: 15.8s
498:	learn: 1.3747241	total: 15.7s	remaining: 15.7s
499:	learn: 1.3733984	total: 15.7s	remaining: 15.7s
500:	learn: 1.3707567	total: 15.7s	remaining: 15.7s
501:	learn: 1.3697205	total: 15.8s	remaining: 15.6s
502:	learn: 1.3690632	total: 15.8s	remaining: 15.6s
503:	learn: 1.3687430	total: 15.8s	remaining: 15.6s
504:	learn: 1.3671500	total: 15.9s	remaining: 15.6s
505:	learn: 1.3666229	total: 15.9s	remaining: 15.5s
506:	learn: 1.3657972	total: 15.9s	remaining: 15.5s
507:	learn: 1.3655

651:	learn: 1.2035854	total: 20.4s	remaining: 10.9s
652:	learn: 1.2022275	total: 20.4s	remaining: 10.9s
653:	learn: 1.2013131	total: 20.5s	remaining: 10.8s
654:	learn: 1.2008387	total: 20.5s	remaining: 10.8s
655:	learn: 1.1995110	total: 20.5s	remaining: 10.8s
656:	learn: 1.1985599	total: 20.6s	remaining: 10.7s
657:	learn: 1.1971072	total: 20.6s	remaining: 10.7s
658:	learn: 1.1959971	total: 20.6s	remaining: 10.7s
659:	learn: 1.1954004	total: 20.7s	remaining: 10.7s
660:	learn: 1.1950736	total: 20.7s	remaining: 10.6s
661:	learn: 1.1946470	total: 20.8s	remaining: 10.6s
662:	learn: 1.1940114	total: 20.8s	remaining: 10.6s
663:	learn: 1.1920823	total: 20.8s	remaining: 10.5s
664:	learn: 1.1913839	total: 20.9s	remaining: 10.5s
665:	learn: 1.1911045	total: 20.9s	remaining: 10.5s
666:	learn: 1.1890590	total: 20.9s	remaining: 10.4s
667:	learn: 1.1879923	total: 20.9s	remaining: 10.4s
668:	learn: 1.1863437	total: 21s	remaining: 10.4s
669:	learn: 1.1860360	total: 21s	remaining: 10.3s
670:	learn: 1.18

811:	learn: 1.0623939	total: 25.4s	remaining: 5.88s
812:	learn: 1.0615915	total: 25.4s	remaining: 5.84s
813:	learn: 1.0615445	total: 25.4s	remaining: 5.81s
814:	learn: 1.0610030	total: 25.5s	remaining: 5.78s
815:	learn: 1.0599921	total: 25.5s	remaining: 5.75s
816:	learn: 1.0596688	total: 25.5s	remaining: 5.71s
817:	learn: 1.0592155	total: 25.6s	remaining: 5.69s
818:	learn: 1.0580592	total: 25.6s	remaining: 5.65s
819:	learn: 1.0565838	total: 25.6s	remaining: 5.62s
820:	learn: 1.0554927	total: 25.6s	remaining: 5.59s
821:	learn: 1.0548737	total: 25.7s	remaining: 5.56s
822:	learn: 1.0537809	total: 25.7s	remaining: 5.53s
823:	learn: 1.0533991	total: 25.8s	remaining: 5.5s
824:	learn: 1.0522305	total: 25.8s	remaining: 5.47s
825:	learn: 1.0509104	total: 25.8s	remaining: 5.44s
826:	learn: 1.0500240	total: 25.8s	remaining: 5.41s
827:	learn: 1.0493298	total: 25.9s	remaining: 5.38s
828:	learn: 1.0487412	total: 25.9s	remaining: 5.35s
829:	learn: 1.0477490	total: 26s	remaining: 5.32s
830:	learn: 1.0

972:	learn: 0.9436624	total: 30.4s	remaining: 844ms
973:	learn: 0.9421182	total: 30.4s	remaining: 812ms
974:	learn: 0.9417362	total: 30.5s	remaining: 781ms
975:	learn: 0.9408957	total: 30.5s	remaining: 750ms
976:	learn: 0.9403651	total: 30.5s	remaining: 718ms
977:	learn: 0.9401397	total: 30.5s	remaining: 687ms
978:	learn: 0.9396380	total: 30.6s	remaining: 656ms
979:	learn: 0.9391662	total: 30.6s	remaining: 624ms
980:	learn: 0.9386677	total: 30.6s	remaining: 593ms
981:	learn: 0.9380060	total: 30.6s	remaining: 562ms
982:	learn: 0.9373372	total: 30.7s	remaining: 530ms
983:	learn: 0.9364961	total: 30.7s	remaining: 499ms
984:	learn: 0.9353070	total: 30.8s	remaining: 468ms
985:	learn: 0.9344379	total: 30.8s	remaining: 437ms
986:	learn: 0.9335259	total: 30.8s	remaining: 406ms
987:	learn: 0.9329825	total: 30.8s	remaining: 374ms
988:	learn: 0.9318886	total: 30.9s	remaining: 343ms
989:	learn: 0.9309171	total: 30.9s	remaining: 312ms
990:	learn: 0.9297458	total: 30.9s	remaining: 281ms
991:	learn: 

<catboost.core.CatBoostRegressor at 0x1f1fb3117c0>

By default, CatBoost runs 1000 boosting iterations.

#### Predictions

In [17]:
# Predict life expectancy for the held-out rows.
# NOTE(review): `pool_test` built earlier is not used; the raw `x_test` frame is passed instead.
y_pred=cbr.predict(x_test)


#### Evaluation metric: R-squared

In [18]:
# R^2 between the held-out targets and the predictions (`rsquare` is sklearn's r2_score).
r_square=rsquare(y_test,y_pred)
print(r_square)
# `start` was set in the fitting cell, so the elapsed time covers fitting, prediction,
# scoring and any pause between running those cells.
end=time.time()
diff=end-start
print("execution time",diff)

0.9723944085345756
execution time 32.2993381023407


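##### CatBoost achieves an R² of about 0.9724 on the test set (R² is a goodness-of-fit score for regression, not a classification accuracy)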

### With default parameters it gives good performance

#### Hyperparameter tuning of CatBoost

In [19]:
# Refit CatBoost with only 100 boosting iterations and time the whole
# fit / predict / score cycle.
start = time.time()
cbr = CatBoostRegressor(iterations=100)
cbr.fit(pool_train)
y_pred = cbr.predict(x_test)
r_square = rsquare(y_test, y_pred)  # R^2 on the held-out split
print(r_square)
# Stop the timer *before* reporting it. Previously the print ran before `diff`
# was recomputed, so it showed the elapsed time of the previously executed cell.
end = time.time()
diff = end - start
print("execution time", diff)

Learning rate set to 0.301561
0:	learn: 7.4007361	total: 21.3ms	remaining: 2.11s
1:	learn: 5.8544207	total: 44.9ms	remaining: 2.2s
2:	learn: 4.8071661	total: 76ms	remaining: 2.46s
3:	learn: 4.1704360	total: 95.6ms	remaining: 2.29s
4:	learn: 3.6749681	total: 123ms	remaining: 2.33s
5:	learn: 3.3638924	total: 145ms	remaining: 2.27s
6:	learn: 3.1536247	total: 169ms	remaining: 2.24s
7:	learn: 2.9370633	total: 203ms	remaining: 2.33s
8:	learn: 2.8153406	total: 257ms	remaining: 2.6s
9:	learn: 2.6826712	total: 281ms	remaining: 2.53s
10:	learn: 2.6224135	total: 300ms	remaining: 2.43s
11:	learn: 2.5432534	total: 325ms	remaining: 2.38s
12:	learn: 2.4876948	total: 353ms	remaining: 2.36s
13:	learn: 2.4465472	total: 379ms	remaining: 2.33s
14:	learn: 2.4031109	total: 404ms	remaining: 2.29s
15:	learn: 2.3665363	total: 439ms	remaining: 2.3s
16:	learn: 2.3355721	total: 468ms	remaining: 2.28s
17:	learn: 2.3096177	total: 495ms	remaining: 2.25s
18:	learn: 2.2629821	total: 523ms	remaining: 2.23s
19:	learn: 2

In [20]:
# Refit CatBoost with 100 boosting iterations and trees limited to depth 2,
# timing the whole fit / predict / score cycle.
start = time.time()
cbr = CatBoostRegressor(iterations=100, max_depth=2)
cbr.fit(pool_train)
y_pred = cbr.predict(x_test)  ## predictions
r_square = rsquare(y_test, y_pred)  ## R^2 on the held-out split
print(r_square)
# Stop the timer *before* reporting it. Previously the print ran before `diff`
# was recomputed, so it showed the elapsed time of the previously executed cell.
end = time.time()
diff = end - start
print("execution time", diff)

Learning rate set to 0.301561
0:	learn: 7.6814971	total: 5.6ms	remaining: 555ms
1:	learn: 6.5578279	total: 11.4ms	remaining: 558ms
2:	learn: 5.7128570	total: 15.5ms	remaining: 501ms
3:	learn: 5.0684757	total: 23.2ms	remaining: 558ms
4:	learn: 4.6063078	total: 30.5ms	remaining: 580ms
5:	learn: 4.2834599	total: 37.8ms	remaining: 593ms
6:	learn: 4.0460518	total: 44.5ms	remaining: 591ms
7:	learn: 3.8604484	total: 51.5ms	remaining: 592ms
8:	learn: 3.6783194	total: 58.6ms	remaining: 592ms
9:	learn: 3.5753738	total: 65ms	remaining: 585ms
10:	learn: 3.4232771	total: 71.6ms	remaining: 580ms
11:	learn: 3.3222338	total: 78ms	remaining: 572ms
12:	learn: 3.2785872	total: 84.5ms	remaining: 566ms
13:	learn: 3.2359858	total: 90.9ms	remaining: 558ms
14:	learn: 3.1997735	total: 97.2ms	remaining: 551ms
15:	learn: 3.1660124	total: 104ms	remaining: 545ms
16:	learn: 3.1422484	total: 110ms	remaining: 537ms
17:	learn: 3.1104894	total: 116ms	remaining: 530ms
18:	learn: 3.0653439	total: 125ms	remaining: 532ms
1

### XGBoost 

**importing xgboost**

In [21]:
import xgboost as xgb

#### Segregating the data into features and target

In [22]:
# Preview the frame that the XGBoost features are taken from.
df.head(2)

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0


#### Segregation of data into features and target

In [23]:
# The target is the life-expectancy column (its label ends with a space);
# every remaining column is used as a predictor.
target_col = "Life expectancy "
y = df[target_col]                 # continuous target
x = df.drop(columns=[target_col])  # predictor features

#### Unlike CatBoost, XGBoost (with its default `enable_categorical=False`) does not encode categorical features for us, so we encode them explicitly:

In [24]:
# Show the distinct values of each column that is about to be label-encoded.
for col in ("Year", "Country", "Status"):
    print(df[col].unique())

[2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 2005 2004 2003 2002
 2001 2000]
['Afghanistan' 'Albania' 'Algeria' 'Angola' 'Antigua and Barbuda'
 'Argentina' 'Armenia' 'Australia' 'Austria' 'Azerbaijan' 'Bahamas'
 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin'
 'Bhutan' 'Bolivia (Plurinational State of)' 'Bosnia and Herzegovina'
 'Botswana' 'Brazil' 'Brunei Darussalam' 'Bulgaria' 'Burkina Faso'
 'Burundi' "Côte d'Ivoire" 'Cabo Verde' 'Cambodia' 'Cameroon' 'Canada'
 'Central African Republic' 'Chad' 'Chile' 'China' 'Colombia' 'Comoros'
 'Congo' 'Cook Islands' 'Costa Rica' 'Croatia' 'Cuba' 'Cyprus' 'Czechia'
 "Democratic People's Republic of Korea"
 'Democratic Republic of the Congo' 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Equatorial Guinea'
 'Eritrea' 'Estonia' 'Ethiopia' 'Fiji' 'Finland' 'France' 'Gabon' 'Gambia'
 'Georgia' 'Germany' 'Ghana' 'Greece' 'Grenada' 'Guatemala' 'Guinea'
 'Guinea-Bissau' 'Guyana' 'Hait

In [32]:
# Replace each categorical column with integer codes. One encoder object is
# reused: `fit_transform` refits it on every column, so after the loop it only
# remembers the classes of the last column ("Status").
lbl = preprocessing.LabelEncoder()
for col in ("Country", "Year", "Status"):
    x[col] = lbl.fit_transform(x[col])


In [33]:
# Inspect the encoded features: Country, Year and Status now hold integer codes.
x.head(5)

Unnamed: 0,Country,Year,Status,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,0,15,1,263.0,62,0.01,71.279624,65.0,1154,19.1,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,0,14,1,271.0,64,0.01,73.523582,62.0,492,18.6,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,0,13,1,268.0,66,0.01,73.219243,64.0,430,18.1,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,0,12,1,272.0,69,0.01,78.184215,67.0,2787,17.6,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,0,11,1,275.0,71,0.01,7.097109,68.0,3013,17.2,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [27]:
#df.dtypes

Country                              int32
Year                                 int64
Status                               int32
Life expectancy                    float64
Adult Mortality                    float64
infant deaths                        int64
Alcohol                            float64
percentage expenditure             float64
Hepatitis B                        float64
Measles                              int64
 BMI                               float64
under-five deaths                    int64
Polio                              float64
Total expenditure                  float64
Diphtheria                         float64
 HIV/AIDS                          float64
GDP                                float64
Population                         float64
 thinness  1-19 years              float64
 thinness 5-9 years                float64
Income composition of resources    float64
Schooling                          float64
dtype: object

#### Dividing the data into train and test data

In [34]:
# Hold out 25% of the rows for testing (the size train_test_split uses when none
# is given); the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [35]:
# Spot-check the training features; the index shows the rows were shuffled by the split.
x_train.head(2)

Unnamed: 0,Country,Year,Status,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,BMI,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
2737,180,8,1,273.0,5,9.46,456.458666,84.0,48,57.6,...,91.0,6.63,9.0,0.4,3891.37823,462582.0,2.5,2.6,0.73,14.9
789,51,12,1,127.0,7,3.92,41.880877,88.0,72,52.1,...,85.0,6.48,87.0,0.1,572.14313,15419666.0,1.3,1.2,0.717,13.3


### Model fitting using xgboost

In [36]:
import time
# Start the wall-clock timer; it is stopped two cells further down.
start=time.time()
xgbr=xgb.XGBRegressor() ## run with default parameters
# Last expression of the cell, so the fitted estimator's repr is displayed.
xgbr.fit(x_train,y_train)  

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

#### Predictions

In [37]:
# Predict life expectancy for the held-out, label-encoded features.
y_pred=xgbr.predict(x_test)


In [38]:
# `start` was set in the fitting cell, so this interval covers fitting, prediction
# and any pause between running those cells.
end=time.time()
diff=end-start
print("execution time",diff)

execution time 6.4699225425720215


#### XGBOOST Model Evaluation 

In [39]:
# R^2 between the held-out targets and the XGBoost predictions.
r_square=rsquare(y_test,y_pred)
r_square

0.9652172214302785

### lightGBM


In [40]:
# `dff` is the copy taken after imputation; its categorical columns are still strings.
dff.head(3)

Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9


In [41]:
# Confirm that Country and Status are still `object` columns in the copy.
dff.dtypes

Country                             object
Year                                 int64
Status                              object
Life expectancy                    float64
Adult Mortality                    float64
infant deaths                        int64
Alcohol                            float64
percentage expenditure             float64
Hepatitis B                        float64
Measles                              int64
 BMI                               float64
under-five deaths                    int64
Polio                              float64
Total expenditure                  float64
Diphtheria                         float64
 HIV/AIDS                          float64
GDP                                float64
Population                         float64
 thinness  1-19 years              float64
 thinness 5-9 years                float64
Income composition of resources    float64
Schooling                          float64
dtype: object

#### Importing LightGBM

In [42]:
import lightgbm as lg

#### Segregating data into features and target

In [43]:
# The target is the life-expectancy column (its label ends with a space);
# every remaining column of the copy is used as a predictor.
target_col = "Life expectancy "
y = dff[target_col]                 # continuous target
x = dff.drop(columns=[target_col])  # predictor features

#### LightGBM can also handle categorical data; we only need to ensure those columns have the pandas `category` dtype

In [53]:
# Cast every string-like (object) column to the pandas `category` dtype;
# columns that are already categorical are passed through unchanged.
# NOTE(review): `Year` stays int64 here, whereas the CatBoost section treated it
# as categorical.
categorical_like = [
    name
    for name, dtype in x.dtypes.items()
    if dtype == "object" or dtype.name == "category"
]
x = x.astype({name: "category" for name in categorical_like})
print(x.dtypes)

Country                            category
Year                                  int64
Status                             category
Adult Mortality                     float64
infant deaths                         int64
Alcohol                             float64
percentage expenditure              float64
Hepatitis B                         float64
Measles                               int64
 BMI                                float64
under-five deaths                     int64
Polio                               float64
Total expenditure                   float64
Diphtheria                          float64
 HIV/AIDS                           float64
GDP                                 float64
Population                          float64
 thinness  1-19 years               float64
 thinness 5-9 years                 float64
Income composition of resources     float64
Schooling                           float64
dtype: object


### Dividing data into training and test data

In [51]:
# Hold out 25% of the rows for testing (the size train_test_split uses when none
# is given); the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [52]:
# Train LightGBM with default settings and time the whole
# fit / predict / score cycle inside a single cell.
start = time.time()
lgbm = lg.LGBMRegressor().fit(x_train, y_train)  # fit() returns the estimator itself
y_pred = lgbm.predict(x_test)
r_square = rsquare(y_test, y_pred)  # R^2 on the held-out split
print(r_square)
end = time.time()
diff = end - start
print("execution time", diff)

0.9732571422616872
execution time 0.3996753692626953
