In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import  train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np

In [2]:
df = pd.read_csv("ETData.csv")

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2938 entries, 0 to 2937
Data columns (total 18 columns):
 #   Column                                                                          Non-Null Count  Dtype  
---  ------                                                                          --------------  -----  
 0   Country                                                                         2938 non-null   object 
 1   Year                                                                            2938 non-null   int64  
 2   Status                                                                          2938 non-null   object 
 3   Adult Mortality (probability of dying between 15 and 60 years per 1000 people)  2928 non-null   float64
 4   Infant Deaths (number of infant deaths per 1000 people)                         2938 non-null   int64  
 5   Alcohol (average number of litres consumed by a person)                         2744 non-null   float64
 6   Hepatitis B (per

In [20]:
df.corr()["Life Expectancy (Years)"].nlargest(50)

Life Expectancy (Years)                                                           1.000000
Schooling (average number of years completed)                                     0.751975
BMI (average BMI for the population)                                              0.567694
Diphtheria (percent of population immunized)                                      0.479495
Polio (percent of population immunized)                                           0.465556
GDP (billion US Dollars)                                                          0.461455
Alcohol (average number of litres consumed by a person)                           0.404877
Hepatitis B (percent of population immunized)                                     0.256762
Healthcare Spending (percentage of government's total budget)                     0.218086
Year                                                                              0.170033
Population                                                                       -0.021538

In [43]:
x = df[["Schooling (average number of years completed)", "BMI (average BMI for the population)", "Diphtheria (percent of population immunized)", "Polio (percent of population immunized)", "GDP (billion US Dollars)", "Adult Mortality (probability of dying between 15 and 60 years per 1000 people)", "HIV/AIDS (deaths per 1000 people)"]].values
y = df[["Life Expectancy (Years)"]].values.reshape(-1, 1)

df_x = pd.DataFrame(x)
x_filled = df_x.fillna(df_x.median())

In [44]:
class Linear_Regression():

    # Initiating the parameters.
    def __init__(self, learning_rate, no_of_itr):
        self.learning_rate = learning_rate
        self.no_of_itr = no_of_itr

    def fit(self, X, Y):

        # No. of training examples and no. of features.
        self.m, self.n = X.shape     # Number of rows and columns
        # Initiating the weight and bias
        self.w = np.zeros((self.n, 1))
        self.b = 0
        self.X = X
        self.Y = Y

        # Implementing the gradient descent.
        for i in range(self.no_of_itr):
            self.update_weights()

    def update_weights(self):
        Y_prediction = self.predict(self.X)

        # Calculating gradients
        dw = -(self.X.T).dot(self.Y - Y_prediction)/self.m

        db = -np.sum(self.Y - Y_prediction)/self.m

        # Updating weights
        self.w = self.w - self.learning_rate * dw
        self.b = self.b - self.learning_rate * db

    def predict(self, X):
        return X.dot(self.w) + self.b

    def print_weights(self):
        print('Weights for the respective features are :')
        print(self.w)
        print()

        print('Bias value for the regression is:', self.b)
        print()

    def return_weights(self):
        return(self.w)

    def return_bias(self):
        return(self.b)

In [46]:
selected_model = LinearRegression()
selected_model.fit(x_filled, y)

y_pred = selected_model.predict(x_filled)

mse = mean_squared_error(y, y_pred)

print("Coefficients: ", selected_model.coef_)
print("Intercept: ", selected_model.intercept_)
print("Mean squared error: %.3f" % mean_squared_error(y, y_pred))
print("Coefficient of determination: %.3f" % r2_score(y, y_pred))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [13]:
testdf = pd.read_csv("PredictET.csv")

In [16]:
df = df.fillna(df.median())
x_data = df[["Schooling (average number of years completed)", "BMI (average BMI for the population)", "Diphtheria (percent of population immunized)", "Polio (percent of population immunized)", "GDP (billion US Dollars)", "Adult Mortality (probability of dying between 15 and 60 years per 1000 people)", "HIV/AIDS (deaths per 1000 people)"]]
y_data = df[["Life Expectancy (Years)"]]

# x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

selected_model = LinearRegression()
selected_model.fit(x_data, y_data)

y_pred = selected_model.predict(x_test)

us_pred = selected_model.predict(testdf[["Schooling (average number of years completed)", "BMI (average BMI for the population)", "Diphtheria (percent of population immunized)", "Polio (percent of population immunized)", "GDP (billion US Dollars)", "Adult Mortality (probability of dying between 15 and 60 years per 1000 people)", "HIV/AIDS (deaths per 1000 people)"]])

mse = mean_squared_error(y_test, y_pred)

# The coefficients
print("Coefficients: ", selected_model.coef_)
# The intercept
print("Intercept: ", selected_model.intercept_)
# The mean squared error
print("Mean squared error: %.3f" % mean_squared_error(y_test, y_pred))

print("Root Mean squared error: %.3f" % mean_squared_error(y_test, y_pred, squared=False))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.3f" % r2_score(y_test, y_pred))

print(us_pred)

Coefficients:  [[ 1.04255502e+00  6.21248601e-02  4.56234548e-02  3.10954880e-02
   6.78207491e-05 -2.13456580e-02 -4.83714667e-01]]
Intercept:  [51.91173812]
Mean squared error: 16.609
Root Mean squared error: 4.075
Coefficient of determination: 0.808
[[76.44992512]]


  df = df.fillna(df.median())
