In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
data = pd.read_csv(r'C:\Users\asokk\Downloads\LifeExpectancyDataset - Sheet1.csv')
print(data.head())
data.shape

     Country  Year     Status  Life expectancy  Adult Mortality  \
0  Australia  2015  Developed             82.8             59.0   
1  Australia  2014  Developed             82.7              6.0   
2  Australia  2013  Developed             82.5             61.0   
3  Australia  2012  Developed             82.3             61.0   
4  Australia  2011  Developed             82.0             63.0   

   infant deaths  Alcohol  percentage expenditure  Hepatitis B  Measles  ...  \
0              1      NaN                 0.00000         93.0       74  ...   
1              1     9.71             10769.36305         91.0      340  ...   
2              1     9.87             11734.85381         91.0      158  ...   
3              1    10.03             11714.99858         91.0      199  ...   
4              1    10.30             10986.26527         92.0      190  ...   

   Polio  Total expenditure  Diphtheria  HIV/AIDS          GDP  Population  \
0   93.0                NaN        93.

(2938, 22)

In [14]:
#Drop empty cells
data.dropna(inplace=True)

#Target and Features
target = 'Life expectancy'
features = [
    'Adult Mortality', 'infant deaths', 'Alcohol', 'percentage expenditure', 
    'Hepatitis B', 'Measles', 'BMI', 'under-five deaths', 'Polio', 
    'Total expenditure', 'Diphtheria', 'HIV/AIDS', 'GDP', 'Population', 
    'thinness  1-19 years', 'thinness 5-9 years', 
    'Income composition of resources', 'Schooling'
]

x = data[features].values
y = data[target].values

In [15]:
#Normalization
scaler = StandardScaler()
x_normalized = scaler.fit_transform(x)

In [16]:
#Splitting
x_train, x_test, y_train, y_test = train_test_split(x_normalized, y, test_size = 0.2, random_state = 42)

In [17]:
#Intercept
x_train = np.hstack([np.ones((x_train.shape[0], 1)), x_train])
x_test = np.hstack([np.ones((x_test.shape[0], 1)), x_test])

In [18]:
#Initialize weights
weights = np.zeros(x_train.shape[1])

In [19]:
#Learning rate
learning_rate = 0.01
iterations = 1000

In [20]:
#GD alg
for i in range(iterations):
    y_pred = np.dot(x_train, weights) #value prediction
    error = y_pred - y_train #error computation
    #update weight
    gradient = np.dot(x_train.T, error) / y_train.size
    weights -= learning_rate * gradient

#Print the weights
print ('Learned weights', weights)

#Predict values for test set
y_pred_test = np.dot(x_test, weights)

#Compute MSE & RMSE
mse = np.mean((y_pred_test - y_test) ** 2)
print(f"Mean Squared Error: {mse}")
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

#Compute R square value
ss_total = np.sum((y_test - np.mean(y_test)) ** 2)
ss_residual = np.sum((y_test - y_pred_test) ** 2)
r_squared = 1 - (ss_residual / ss_total)
print (f"R-squared : {r_squared}")


Learned weights [ 6.93235809e+01 -2.19882623e+00  1.46082256e-01 -5.05974462e-01
  5.42847893e-01 -1.26049548e-01  1.89805991e-01  6.22181468e-01
 -5.59497181e-01  2.40929578e-01  2.58909617e-01  4.84348697e-01
 -2.63147149e+00  2.47656853e-01  4.44816879e-02 -2.02296816e-01
 -5.10069849e-02  2.01038412e+00  2.36664461e+00]
Mean Squared Error: 13.314406490938895
Root Mean Squared Error: 3.6488911316917765
R-squared : 0.8368009199018173
